fix partial strings getting dropped when multi-byte codepoints span input buffers

This commit is contained in:
Josh Wolfe 2024-08-07 05:47:32 -04:00
parent 8bec737ca2
commit a805454dea

View File

@ -897,7 +897,7 @@ pub const Scanner = struct {
},
.number_post_dot => {
if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
switch (try self.expectByte()) {
switch (self.input[self.cursor]) {
'0'...'9' => {
self.cursor += 1;
self.state = .number_frac;
@ -1032,7 +1032,8 @@ pub const Scanner = struct {
return error.BufferUnderrun;
},
.string_backslash => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
'"', '\\', '/' => {
// Since these characters now represent themselves literally,
// we can simply begin the next plaintext slice here.
@ -1080,7 +1081,8 @@ pub const Scanner = struct {
}
},
.string_backslash_u => {
const c = try self.expectByte();
if (self.cursor >= self.input.len) return self.endOfBufferInString();
const c = self.input[self.cursor];
switch (c) {
'0'...'9' => {
self.utf16_code_units[0] = @as(u16, c - '0') << 12;
@ -1098,7 +1100,8 @@ pub const Scanner = struct {
continue :state_loop;
},
.string_backslash_u_1 => {
const c = try self.expectByte();
if (self.cursor >= self.input.len) return self.endOfBufferInString();
const c = self.input[self.cursor];
switch (c) {
'0'...'9' => {
self.utf16_code_units[0] |= @as(u16, c - '0') << 8;
@ -1116,7 +1119,8 @@ pub const Scanner = struct {
continue :state_loop;
},
.string_backslash_u_2 => {
const c = try self.expectByte();
if (self.cursor >= self.input.len) return self.endOfBufferInString();
const c = self.input[self.cursor];
switch (c) {
'0'...'9' => {
self.utf16_code_units[0] |= @as(u16, c - '0') << 4;
@ -1134,7 +1138,8 @@ pub const Scanner = struct {
continue :state_loop;
},
.string_backslash_u_3 => {
const c = try self.expectByte();
if (self.cursor >= self.input.len) return self.endOfBufferInString();
const c = self.input[self.cursor];
switch (c) {
'0'...'9' => {
self.utf16_code_units[0] |= c - '0';
@ -1160,7 +1165,8 @@ pub const Scanner = struct {
}
},
.string_surrogate_half => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
'\\' => {
self.cursor += 1;
self.state = .string_surrogate_half_backslash;
@ -1170,7 +1176,8 @@ pub const Scanner = struct {
}
},
.string_surrogate_half_backslash => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
'u' => {
self.cursor += 1;
self.state = .string_surrogate_half_backslash_u;
@ -1180,7 +1187,8 @@ pub const Scanner = struct {
}
},
.string_surrogate_half_backslash_u => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
'D', 'd' => {
self.cursor += 1;
self.utf16_code_units[1] = 0xD << 12;
@ -1191,7 +1199,8 @@ pub const Scanner = struct {
}
},
.string_surrogate_half_backslash_u_1 => {
const c = try self.expectByte();
if (self.cursor >= self.input.len) return self.endOfBufferInString();
const c = self.input[self.cursor];
switch (c) {
'C'...'F' => {
self.cursor += 1;
@ -1209,7 +1218,8 @@ pub const Scanner = struct {
}
},
.string_surrogate_half_backslash_u_2 => {
const c = try self.expectByte();
if (self.cursor >= self.input.len) return self.endOfBufferInString();
const c = self.input[self.cursor];
switch (c) {
'0'...'9' => {
self.cursor += 1;
@ -1233,7 +1243,8 @@ pub const Scanner = struct {
}
},
.string_surrogate_half_backslash_u_3 => {
const c = try self.expectByte();
if (self.cursor >= self.input.len) return self.endOfBufferInString();
const c = self.input[self.cursor];
switch (c) {
'0'...'9' => {
self.utf16_code_units[1] |= c - '0';
@ -1254,7 +1265,8 @@ pub const Scanner = struct {
},
.string_utf8_last_byte => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
0x80...0xBF => {
self.cursor += 1;
self.state = .string;
@ -1264,7 +1276,8 @@ pub const Scanner = struct {
}
},
.string_utf8_second_to_last_byte => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
0x80...0xBF => {
self.cursor += 1;
self.state = .string_utf8_last_byte;
@ -1274,7 +1287,8 @@ pub const Scanner = struct {
}
},
.string_utf8_second_to_last_byte_guard_against_overlong => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
0xA0...0xBF => {
self.cursor += 1;
self.state = .string_utf8_last_byte;
@ -1284,7 +1298,8 @@ pub const Scanner = struct {
}
},
.string_utf8_second_to_last_byte_guard_against_surrogate_half => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
0x80...0x9F => {
self.cursor += 1;
self.state = .string_utf8_last_byte;
@ -1294,7 +1309,8 @@ pub const Scanner = struct {
}
},
.string_utf8_third_to_last_byte => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
0x80...0xBF => {
self.cursor += 1;
self.state = .string_utf8_second_to_last_byte;
@ -1304,7 +1320,8 @@ pub const Scanner = struct {
}
},
.string_utf8_third_to_last_byte_guard_against_overlong => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
0x90...0xBF => {
self.cursor += 1;
self.state = .string_utf8_second_to_last_byte;
@ -1314,7 +1331,8 @@ pub const Scanner = struct {
}
},
.string_utf8_third_to_last_byte_guard_against_too_large => {
switch (try self.expectByte()) {
if (self.cursor >= self.input.len) return self.endOfBufferInString();
switch (self.input[self.cursor]) {
0x80...0x8F => {
self.cursor += 1;
self.state = .string_utf8_second_to_last_byte;
@ -1666,6 +1684,17 @@ pub const Scanner = struct {
self.value_start = self.cursor;
return slice;
}
/// Returns the pending value bytes `input[value_start .. cursor - trailing_negative_offset]`
/// and then advances `value_start` to `cursor`.
///
/// `trailing_negative_offset` is the number of bytes immediately before the cursor
/// that must be left out of the returned slice (the caller passes the length of a
/// not-yet-complete escape sequence).
///
/// Returns an empty slice, leaving `value_start` untouched, when excluding those
/// trailing bytes would leave nothing to return.
fn takeValueSliceMinusTrailingOffset(self: *@This(), trailing_negative_offset: usize) []const u8 {
    const begin = self.value_start;
    const cursor = self.cursor;

    // The comparison is phrased as an addition on the left-hand side so that we never
    // compute `cursor - trailing_negative_offset` when it would underflow. This is the
    // case when the excluded bytes started before the current input buffer, and it also
    // covers the case where nothing precedes them in this buffer.
    if (begin + trailing_negative_offset >= cursor) return "";

    const end = cursor - trailing_negative_offset;
    const pending = self.input[begin..end];

    // For a non-zero offset this assignment is not load-bearing: value_start is
    // assigned again when the corresponding .partial_string_escaped_* token is emitted.
    self.value_start = cursor;
    return pending;
}
fn endOfBufferInNumber(self: *@This(), allow_end: bool) !Token {
const slice = self.takeValueSlice();
@ -1678,6 +1707,39 @@ pub const Scanner = struct {
return Token{ .partial_number = slice };
}
/// Produces the result for running out of input buffer while scanning a string.
///
/// - Returns `error.UnexpectedEndOfInput` if `is_end_of_input` is set.
/// - Otherwise returns a `.partial_string` token holding the bytes scanned so far,
///   excluding any escape sequence that has only been partially consumed.
/// - Returns `error.BufferUnderrun` if there are no such bytes to hand out.
///
/// Must only be called while `self.state` is one of the string states listed below;
/// any other state is treated as unreachable.
fn endOfBufferInString(self: *@This()) !Token {
    if (self.is_end_of_input) return error.UnexpectedEndOfInput;

    // Number of bytes just before the cursor that belong to an unfinished escape
    // sequence and therefore must not be emitted as plain string content yet.
    const unfinished_escape_len: usize = switch (self.state) {
        // Not inside an escape sequence: everything up to the cursor can be emitted.
        // This includes the states in the middle of a multi-byte UTF-8 sequence.
        .string,
        .string_utf8_last_byte,
        .string_utf8_second_to_last_byte,
        .string_utf8_second_to_last_byte_guard_against_overlong,
        .string_utf8_second_to_last_byte_guard_against_surrogate_half,
        .string_utf8_third_to_last_byte,
        .string_utf8_third_to_last_byte_guard_against_overlong,
        .string_utf8_third_to_last_byte_guard_against_too_large,
        => 0,

        // Inside a backslash escape: hold back the bytes of the escape consumed so far.
        .string_backslash => 1,
        .string_backslash_u => 2,
        .string_backslash_u_1 => 3,
        .string_backslash_u_2 => 4,
        .string_backslash_u_3 => 5,

        // A complete first surrogate escape (6 bytes) is pending, possibly followed
        // by part of the second escape.
        .string_surrogate_half => 6,
        .string_surrogate_half_backslash => 7,
        .string_surrogate_half_backslash_u => 8,
        .string_surrogate_half_backslash_u_1 => 9,
        .string_surrogate_half_backslash_u_2 => 10,
        .string_surrogate_half_backslash_u_3 => 11,

        else => unreachable,
    };

    const pending = self.takeValueSliceMinusTrailingOffset(unfinished_escape_len);
    if (pending.len > 0) return Token{ .partial_string = pending };
    return error.BufferUnderrun;
}
fn partialStringCodepoint(code_point: u21) Token {
var buf: [4]u8 = undefined;
switch (std.unicode.utf8Encode(code_point, &buf) catch unreachable) {