zig/deps/aro/Tokenizer.zig
Veikka Tuominen 58b07ea14f sync Aro dependency
ref: 482951b0e0eb99ec5dd122e7f893a007586f83f4
2023-10-17 11:55:01 +03:00

2171 lines
67 KiB
Zig
Vendored
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

const std = @import("std");
const assert = std.debug.assert;
const Compilation = @import("Compilation.zig");
const Source = @import("Source.zig");
const LangOpts = @import("LangOpts.zig");
const Tokenizer = @This();
pub const Token = struct {
id: Id,
source: Source.Id,
start: u32 = 0,
end: u32 = 0,
line: u32 = 0,
pub const Id = enum(u8) {
invalid,
nl,
whitespace,
eof,
/// identifier containing solely basic character set characters
identifier,
/// identifier with at least one extended character
extended_identifier,
// string literals with prefixes
string_literal,
string_literal_utf_16,
string_literal_utf_8,
string_literal_utf_32,
string_literal_wide,
// <foobar> only generated by preprocessor
macro_string,
// char literals with prefixes
char_literal,
char_literal_utf_8,
char_literal_utf_16,
char_literal_utf_32,
char_literal_wide,
/// Integer literal tokens generated by preprocessor.
one,
zero,
bang,
bang_equal,
pipe,
pipe_pipe,
pipe_equal,
equal,
equal_equal,
l_paren,
r_paren,
l_brace,
r_brace,
l_bracket,
r_bracket,
period,
ellipsis,
caret,
caret_equal,
plus,
plus_plus,
plus_equal,
minus,
minus_minus,
minus_equal,
asterisk,
asterisk_equal,
percent,
percent_equal,
arrow,
colon,
colon_colon,
semicolon,
slash,
slash_equal,
comma,
ampersand,
ampersand_ampersand,
ampersand_equal,
question_mark,
angle_bracket_left,
angle_bracket_left_equal,
angle_bracket_angle_bracket_left,
angle_bracket_angle_bracket_left_equal,
angle_bracket_right,
angle_bracket_right_equal,
angle_bracket_angle_bracket_right,
angle_bracket_angle_bracket_right_equal,
tilde,
hash,
hash_hash,
/// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
macro_param,
/// Special token to signal that the argument must be replaced without expansion (e.g. in concatenation)
macro_param_no_expand,
/// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
stringify_param,
/// Same as stringify_param, but for var args
stringify_va_args,
/// Special macro whitespace, always equal to a single space
macro_ws,
/// Special token for implementing __has_attribute
macro_param_has_attribute,
/// Special token for implementing __has_declspec_attribute
macro_param_has_declspec_attribute,
/// Special token for implementing __has_warning
macro_param_has_warning,
/// Special token for implementing __has_feature
macro_param_has_feature,
/// Special token for implementing __has_extension
macro_param_has_extension,
/// Special token for implementing __has_builtin
macro_param_has_builtin,
/// Special token for implementing __has_include
macro_param_has_include,
/// Special token for implementing __has_include_next
macro_param_has_include_next,
/// Special token for implementing __is_identifier
macro_param_is_identifier,
/// Special token for implementing __FILE__
macro_file,
/// Special token for implementing __LINE__
macro_line,
/// Special token for implementing __COUNTER__
macro_counter,
/// Special token for implementing _Pragma
macro_param_pragma_operator,
/// Special identifier for implementing __func__
macro_func,
/// Special identifier for implementing __FUNCTION__
macro_function,
/// Special identifier for implementing __PRETTY_FUNCTION__
macro_pretty_func,
keyword_auto,
keyword_auto_type,
keyword_break,
keyword_case,
keyword_char,
keyword_const,
keyword_continue,
keyword_default,
keyword_do,
keyword_double,
keyword_else,
keyword_enum,
keyword_extern,
keyword_float,
keyword_for,
keyword_goto,
keyword_if,
keyword_int,
keyword_long,
keyword_register,
keyword_return,
keyword_short,
keyword_signed,
keyword_sizeof,
keyword_static,
keyword_struct,
keyword_switch,
keyword_typedef,
keyword_typeof1,
keyword_typeof2,
keyword_union,
keyword_unsigned,
keyword_void,
keyword_volatile,
keyword_while,
// ISO C99
keyword_bool,
keyword_complex,
keyword_imaginary,
keyword_inline,
keyword_restrict,
// ISO C11
keyword_alignas,
keyword_alignof,
keyword_atomic,
keyword_generic,
keyword_noreturn,
keyword_static_assert,
keyword_thread_local,
// ISO C23
keyword_bit_int,
keyword_c23_alignas,
keyword_c23_alignof,
keyword_c23_bool,
keyword_c23_static_assert,
keyword_c23_thread_local,
keyword_constexpr,
keyword_true,
keyword_false,
keyword_nullptr,
// Preprocessor directives
keyword_include,
keyword_include_next,
keyword_embed,
keyword_define,
keyword_defined,
keyword_undef,
keyword_ifdef,
keyword_ifndef,
keyword_elif,
keyword_elifdef,
keyword_elifndef,
keyword_endif,
keyword_error,
keyword_warning,
keyword_pragma,
keyword_line,
keyword_va_args,
// gcc keywords
keyword_const1,
keyword_const2,
keyword_inline1,
keyword_inline2,
keyword_volatile1,
keyword_volatile2,
keyword_restrict1,
keyword_restrict2,
keyword_alignof1,
keyword_alignof2,
keyword_typeof,
keyword_attribute1,
keyword_attribute2,
keyword_extension,
keyword_asm,
keyword_asm1,
keyword_asm2,
keyword_float80,
keyword_float128,
keyword_int128,
keyword_imag1,
keyword_imag2,
keyword_real1,
keyword_real2,
keyword_float16,
// clang keywords
keyword_fp16,
// ms keywords
keyword_declspec,
keyword_int64,
keyword_int64_2,
keyword_int32,
keyword_int32_2,
keyword_int16,
keyword_int16_2,
keyword_int8,
keyword_int8_2,
keyword_stdcall,
keyword_stdcall2,
keyword_thiscall,
keyword_thiscall2,
keyword_vectorcall,
keyword_vectorcall2,
// builtins that require special parsing
builtin_choose_expr,
builtin_va_arg,
builtin_offsetof,
builtin_bitoffsetof,
builtin_types_compatible_p,
/// Generated by #embed directive
/// Decimal value with no prefix or suffix
embed_byte,
/// preprocessor number
/// An optional period, followed by a digit 0-9, followed by any number of letters,
/// digits, underscores, periods, and exponents (e+, e-, E+, E-, p+, p-, P+, P-)
pp_num,
/// preprocessor placemarker token
/// generated if `##` is used with a zero-token argument
/// removed after substitution, so the parser should never see this
/// See C99 6.10.3.3.2
placemarker,
/// Virtual linemarker token output from preprocessor to indicate start of a new include
include_start,
/// Virtual linemarker token output from preprocessor to indicate resuming a file after
/// completion of the preceding #include
include_resume,
/// A comment token if asked to preserve comments.
comment,
/// Report whether `id` is an identifier or a keyword-like token
/// (language keyword, preprocessor directive name, or specially parsed builtin name).
pub fn isMacroIdentifier(id: Id) bool {
    return switch (id) {
        // Plain and extended identifiers.
        .identifier,
        .extended_identifier,
        => true,

        // Preprocessor directive names and `__VA_ARGS__`.
        .keyword_include,
        .keyword_include_next,
        .keyword_embed,
        .keyword_define,
        .keyword_defined,
        .keyword_undef,
        .keyword_ifdef,
        .keyword_ifndef,
        .keyword_elif,
        .keyword_elifdef,
        .keyword_elifndef,
        .keyword_endif,
        .keyword_error,
        .keyword_warning,
        .keyword_pragma,
        .keyword_line,
        .keyword_va_args,
        => true,

        // `__func__`, `__FUNCTION__` and `__PRETTY_FUNCTION__`.
        .macro_func,
        .macro_function,
        .macro_pretty_func,
        => true,

        // Core C keywords (plus `__auto_type`).
        .keyword_auto,
        .keyword_auto_type,
        .keyword_break,
        .keyword_case,
        .keyword_char,
        .keyword_const,
        .keyword_continue,
        .keyword_default,
        .keyword_do,
        .keyword_double,
        .keyword_else,
        .keyword_enum,
        .keyword_extern,
        .keyword_float,
        .keyword_for,
        .keyword_goto,
        .keyword_if,
        .keyword_int,
        .keyword_long,
        .keyword_register,
        .keyword_return,
        .keyword_short,
        .keyword_signed,
        .keyword_sizeof,
        .keyword_static,
        .keyword_struct,
        .keyword_switch,
        .keyword_typedef,
        .keyword_union,
        .keyword_unsigned,
        .keyword_void,
        .keyword_volatile,
        .keyword_while,
        => true,

        // ISO C99 keywords.
        .keyword_bool,
        .keyword_complex,
        .keyword_imaginary,
        .keyword_inline,
        .keyword_restrict,
        => true,

        // ISO C11 keywords.
        .keyword_alignas,
        .keyword_alignof,
        .keyword_atomic,
        .keyword_generic,
        .keyword_noreturn,
        .keyword_static_assert,
        .keyword_thread_local,
        => true,

        // ISO C23 keywords.
        .keyword_bit_int,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_constexpr,
        .keyword_true,
        .keyword_false,
        .keyword_nullptr,
        => true,

        // `typeof` and the double-underscore alternate spellings.
        .keyword_typeof,
        .keyword_typeof1,
        .keyword_typeof2,
        .keyword_const1,
        .keyword_const2,
        .keyword_inline1,
        .keyword_inline2,
        .keyword_volatile1,
        .keyword_volatile2,
        .keyword_restrict1,
        .keyword_restrict2,
        .keyword_alignof1,
        .keyword_alignof2,
        => true,

        // Builtins that require special parsing.
        .builtin_choose_expr,
        .builtin_va_arg,
        .builtin_offsetof,
        .builtin_bitoffsetof,
        .builtin_types_compatible_p,
        => true,

        // Remaining gcc keywords.
        .keyword_attribute1,
        .keyword_attribute2,
        .keyword_extension,
        .keyword_asm,
        .keyword_asm1,
        .keyword_asm2,
        .keyword_float80,
        .keyword_float128,
        .keyword_int128,
        .keyword_imag1,
        .keyword_imag2,
        .keyword_real1,
        .keyword_real2,
        .keyword_float16,
        => true,

        // clang keywords.
        .keyword_fp16 => true,

        // ms keywords.
        .keyword_declspec,
        .keyword_int64,
        .keyword_int64_2,
        .keyword_int32,
        .keyword_int32_2,
        .keyword_int16,
        .keyword_int16_2,
        .keyword_int8,
        .keyword_int8_2,
        .keyword_stdcall,
        .keyword_stdcall2,
        .keyword_thiscall,
        .keyword_thiscall2,
        .keyword_vectorcall,
        .keyword_vectorcall2,
        => true,

        // Punctuation, literals and preprocessor-internal tokens.
        else => false,
    };
}
/// Rewrite preprocessor-directive keywords in place to `.identifier`.
/// `keyword_defined` is only rewritten when `defined_to_identifier` is true,
/// i.e. when we are *not* inside an #if or #elif expression.
/// Every other token id is left untouched.
pub fn simplifyMacroKeywordExtra(id: *Id, defined_to_identifier: bool) void {
    const becomes_identifier = switch (id.*) {
        .keyword_include,
        .keyword_include_next,
        .keyword_embed,
        .keyword_define,
        .keyword_undef,
        .keyword_ifdef,
        .keyword_ifndef,
        .keyword_elif,
        .keyword_elifdef,
        .keyword_elifndef,
        .keyword_endif,
        .keyword_error,
        .keyword_warning,
        .keyword_pragma,
        .keyword_line,
        .keyword_va_args,
        => true,
        // `defined` keeps its keyword meaning unless the caller opts out.
        .keyword_defined => defined_to_identifier,
        else => false,
    };
    if (becomes_identifier) id.* = .identifier;
}
/// Same as `simplifyMacroKeywordExtra`, but `keyword_defined` is never
/// turned into an identifier.
pub fn simplifyMacroKeyword(id: *Id) void {
    id.simplifyMacroKeywordExtra(false);
}
/// Return the fixed source spelling of a token kind.
///
/// Returns `null` for kinds whose text is not fixed (identifiers, literals,
/// whitespace, numbers, comments, `invalid`), and an empty string for
/// preprocessor-internal tokens that have no spelling of their own.
/// Must not be called with `include_start` or `include_resume`.
pub fn lexeme(id: Id) ?[]const u8 {
    return switch (id) {
        // Virtual linemarker tokens have no lexeme at all.
        .include_start,
        .include_resume,
        => unreachable,

        // The text of these tokens varies.
        .invalid,
        .identifier,
        .extended_identifier,
        .string_literal,
        .string_literal_utf_16,
        .string_literal_utf_8,
        .string_literal_utf_32,
        .string_literal_wide,
        .char_literal,
        .char_literal_utf_8,
        .char_literal_utf_16,
        .char_literal_utf_32,
        .char_literal_wide,
        .macro_string,
        .whitespace,
        .pp_num,
        .embed_byte,
        .comment,
        => null,

        .zero => "0",
        .one => "1",

        // Tokens that map to the empty string.
        .nl,
        .eof,
        .macro_param,
        .macro_param_no_expand,
        .stringify_param,
        .stringify_va_args,
        .macro_param_has_attribute,
        .macro_param_has_declspec_attribute,
        .macro_param_has_warning,
        .macro_param_has_feature,
        .macro_param_has_extension,
        .macro_param_has_builtin,
        .macro_param_has_include,
        .macro_param_has_include_next,
        .macro_param_is_identifier,
        .macro_file,
        .macro_line,
        .macro_counter,
        .macro_param_pragma_operator,
        .placemarker,
        => "",
        .macro_ws => " ",

        .macro_func => "__func__",
        .macro_function => "__FUNCTION__",
        .macro_pretty_func => "__PRETTY_FUNCTION__",

        // Punctuators.
        .bang => "!",
        .bang_equal => "!=",
        .pipe => "|",
        .pipe_pipe => "||",
        .pipe_equal => "|=",
        .equal => "=",
        .equal_equal => "==",
        .l_paren => "(",
        .r_paren => ")",
        .l_brace => "{",
        .r_brace => "}",
        .l_bracket => "[",
        .r_bracket => "]",
        .period => ".",
        .ellipsis => "...",
        .caret => "^",
        .caret_equal => "^=",
        .plus => "+",
        .plus_plus => "++",
        .plus_equal => "+=",
        .minus => "-",
        .minus_minus => "--",
        .minus_equal => "-=",
        .asterisk => "*",
        .asterisk_equal => "*=",
        .percent => "%",
        .percent_equal => "%=",
        .arrow => "->",
        .colon => ":",
        .colon_colon => "::",
        .semicolon => ";",
        .slash => "/",
        .slash_equal => "/=",
        .comma => ",",
        .ampersand => "&",
        .ampersand_ampersand => "&&",
        .ampersand_equal => "&=",
        .question_mark => "?",
        .angle_bracket_left => "<",
        .angle_bracket_left_equal => "<=",
        .angle_bracket_angle_bracket_left => "<<",
        .angle_bracket_angle_bracket_left_equal => "<<=",
        .angle_bracket_right => ">",
        .angle_bracket_right_equal => ">=",
        .angle_bracket_angle_bracket_right => ">>",
        .angle_bracket_angle_bracket_right_equal => ">>=",
        .tilde => "~",
        .hash => "#",
        .hash_hash => "##",

        // Core C keywords.
        .keyword_auto => "auto",
        .keyword_auto_type => "__auto_type",
        .keyword_break => "break",
        .keyword_case => "case",
        .keyword_char => "char",
        .keyword_const => "const",
        .keyword_continue => "continue",
        .keyword_default => "default",
        .keyword_do => "do",
        .keyword_double => "double",
        .keyword_else => "else",
        .keyword_enum => "enum",
        .keyword_extern => "extern",
        .keyword_float => "float",
        .keyword_for => "for",
        .keyword_goto => "goto",
        .keyword_if => "if",
        .keyword_int => "int",
        .keyword_long => "long",
        .keyword_register => "register",
        .keyword_return => "return",
        .keyword_short => "short",
        .keyword_signed => "signed",
        .keyword_sizeof => "sizeof",
        .keyword_static => "static",
        .keyword_struct => "struct",
        .keyword_switch => "switch",
        .keyword_typedef => "typedef",
        .keyword_typeof => "typeof",
        .keyword_union => "union",
        .keyword_unsigned => "unsigned",
        .keyword_void => "void",
        .keyword_volatile => "volatile",
        .keyword_while => "while",

        // ISO C99
        .keyword_bool => "_Bool",
        .keyword_complex => "_Complex",
        .keyword_imaginary => "_Imaginary",
        .keyword_inline => "inline",
        .keyword_restrict => "restrict",

        // ISO C11
        .keyword_alignas => "_Alignas",
        .keyword_alignof => "_Alignof",
        .keyword_atomic => "_Atomic",
        .keyword_generic => "_Generic",
        .keyword_noreturn => "_Noreturn",
        .keyword_static_assert => "_Static_assert",
        .keyword_thread_local => "_Thread_local",

        // ISO C23
        .keyword_bit_int => "_BitInt",
        .keyword_c23_alignas => "alignas",
        .keyword_c23_alignof => "alignof",
        .keyword_c23_bool => "bool",
        .keyword_c23_static_assert => "static_assert",
        .keyword_c23_thread_local => "thread_local",
        .keyword_constexpr => "constexpr",
        .keyword_true => "true",
        .keyword_false => "false",
        .keyword_nullptr => "nullptr",

        // Preprocessor directives
        .keyword_include => "include",
        .keyword_include_next => "include_next",
        .keyword_embed => "embed",
        .keyword_define => "define",
        .keyword_defined => "defined",
        .keyword_undef => "undef",
        .keyword_ifdef => "ifdef",
        .keyword_ifndef => "ifndef",
        .keyword_elif => "elif",
        .keyword_elifdef => "elifdef",
        .keyword_elifndef => "elifndef",
        .keyword_endif => "endif",
        .keyword_error => "error",
        .keyword_warning => "warning",
        .keyword_pragma => "pragma",
        .keyword_line => "line",
        .keyword_va_args => "__VA_ARGS__",

        // gcc keywords
        .keyword_const1 => "__const",
        .keyword_const2 => "__const__",
        .keyword_inline1 => "__inline",
        .keyword_inline2 => "__inline__",
        .keyword_volatile1 => "__volatile",
        .keyword_volatile2 => "__volatile__",
        .keyword_restrict1 => "__restrict",
        .keyword_restrict2 => "__restrict__",
        .keyword_alignof1 => "__alignof",
        .keyword_alignof2 => "__alignof__",
        .keyword_typeof1 => "__typeof",
        .keyword_typeof2 => "__typeof__",
        .builtin_choose_expr => "__builtin_choose_expr",
        .builtin_va_arg => "__builtin_va_arg",
        .builtin_offsetof => "__builtin_offsetof",
        .builtin_bitoffsetof => "__builtin_bitoffsetof",
        .builtin_types_compatible_p => "__builtin_types_compatible_p",
        .keyword_attribute1 => "__attribute",
        .keyword_attribute2 => "__attribute__",
        .keyword_extension => "__extension__",
        .keyword_asm => "asm",
        .keyword_asm1 => "__asm",
        .keyword_asm2 => "__asm__",
        .keyword_float80 => "__float80",
        // Must match the spelling registered in the keyword table.
        .keyword_float128 => "__float128",
        .keyword_int128 => "__int128",
        .keyword_imag1 => "__imag",
        .keyword_imag2 => "__imag__",
        .keyword_real1 => "__real",
        .keyword_real2 => "__real__",
        .keyword_float16 => "_Float16",

        // clang keywords
        .keyword_fp16 => "__fp16",

        // ms keywords
        .keyword_declspec => "__declspec",
        .keyword_int64 => "__int64",
        .keyword_int64_2 => "_int64",
        .keyword_int32 => "__int32",
        .keyword_int32_2 => "_int32",
        .keyword_int16 => "__int16",
        .keyword_int16_2 => "_int16",
        .keyword_int8 => "__int8",
        .keyword_int8_2 => "_int8",
        .keyword_stdcall => "__stdcall",
        .keyword_stdcall2 => "_stdcall",
        .keyword_thiscall => "__thiscall",
        .keyword_thiscall2 => "_thiscall",
        .keyword_vectorcall => "__vectorcall",
        .keyword_vectorcall2 => "_vectorcall",
    };
}
/// Return a short human-readable description of a token kind
/// (presumably for use in diagnostics — confirm against callers).
///
/// Identifier-like tokens and literals are described generically
/// ("an identifier", "a string literal", ...); every other kind is described
/// by its lexeme. Must not be called with `macro_string` or `invalid`, nor
/// with any kind for which `lexeme` returns `null` or is unreachable.
pub fn symbol(id: Id) []const u8 {
    return switch (id) {
        .macro_string, .invalid => unreachable,
        .identifier,
        .extended_identifier,
        .macro_func,
        .macro_function,
        .macro_pretty_func,
        .builtin_choose_expr,
        .builtin_va_arg,
        .builtin_offsetof,
        .builtin_bitoffsetof,
        .builtin_types_compatible_p,
        => "an identifier",
        .string_literal,
        .string_literal_utf_16,
        .string_literal_utf_8,
        .string_literal_utf_32,
        .string_literal_wide,
        => "a string literal",
        .char_literal,
        .char_literal_utf_8,
        .char_literal_utf_16,
        .char_literal_utf_32,
        .char_literal_wide,
        => "a character literal",
        // Lower-case like the sibling descriptions above.
        .pp_num, .embed_byte => "a number",
        else => id.lexeme().?,
    };
}
/// Report whether `id` may begin an expression parsed by Preprocessor.expr.
/// eof, r_paren, and string literals cannot really start a preprocessor
/// expression; they are accepted anyway so that the parser gets the chance
/// to produce a nicer error message for them.
pub fn validPreprocessorExprStart(id: Id) bool {
    switch (id) {
        // Accepted only so the parser can diagnose them.
        .eof,
        .r_paren,
        .string_literal,
        .string_literal_utf_8,
        .string_literal_utf_16,
        .string_literal_utf_32,
        .string_literal_wide,
        => return true,

        // Character constants and numbers.
        .char_literal,
        .char_literal_utf_8,
        .char_literal_utf_16,
        .char_literal_utf_32,
        .char_literal_wide,
        .one,
        .zero,
        .pp_num,
        .keyword_true,
        .keyword_false,
        => return true,

        // Parenthesized expressions and unary operators.
        .l_paren,
        .plus,
        .minus,
        .tilde,
        .bang,
        => return true,

        // Names and the `defined` operator.
        .identifier,
        .extended_identifier,
        .keyword_defined,
        => return true,

        else => return false,
    }
}
/// Report whether `id` is a bracket, brace or hash token and the
/// compilation's language options have digraphs enabled.
pub fn allowsDigraphs(id: Id, comp: *const Compilation) bool {
    switch (id) {
        .l_bracket,
        .r_bracket,
        .l_brace,
        .r_brace,
        .hash,
        .hash_hash,
        => {},
        // Any other token is rejected without consulting the language options.
        else => return false,
    }
    return comp.langopts.hasDigraphs();
}
/// Report whether `id` is one of the tokens accepted here as opening a
/// GCC-style asm statement: any spelling of `volatile` or `inline`,
/// `goto`, or an opening parenthesis.
pub fn canOpenGCCAsmStmt(id: Id) bool {
    switch (id) {
        .keyword_volatile,
        .keyword_volatile1,
        .keyword_volatile2,
        .keyword_inline,
        .keyword_inline1,
        .keyword_inline2,
        .keyword_goto,
        .l_paren,
        => return true,
        else => return false,
    }
}
/// Report whether `id` is a string literal token, with or without an
/// encoding prefix. `macro_string` is not counted.
pub fn isStringLiteral(id: Id) bool {
    switch (id) {
        .string_literal,
        .string_literal_utf_8,
        .string_literal_utf_16,
        .string_literal_utf_32,
        .string_literal_wide,
        => return true,
        else => return false,
    }
}
};
/// Classify the identifier text `str` as a keyword or a plain identifier.
///
/// Text that is not in the keyword table is always `.identifier`. Keywords
/// that depend on the language standard, GNU mode, declspec attributes or
/// MS extensions fall back to `.identifier` when that feature is disabled.
/// Double underscore and underscore + capital letter identifiers belong to
/// the implementation namespace, so they are always converted to keywords.
pub fn getTokenId(comp: *const Compilation, str: []const u8) Token.Id {
    const kw = all_kws.get(str) orelse return .identifier;
    const standard = comp.langopts.standard;

    const enabled = switch (kw) {
        .keyword_inline => standard.isGNU() or standard.atLeast(.c99),
        .keyword_restrict => standard.atLeast(.c99),
        .keyword_typeof => standard.isGNU() or standard.atLeast(.c2x),
        .keyword_asm => standard.isGNU(),
        .keyword_declspec => comp.langopts.declspec_attrs,

        // Spellings that only become keywords in C23.
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_constexpr,
        .keyword_true,
        .keyword_false,
        .keyword_nullptr,
        .keyword_elifdef,
        .keyword_elifndef,
        => standard.atLeast(.c2x),

        // Keywords gated on MS extensions.
        .keyword_int64,
        .keyword_int64_2,
        .keyword_int32,
        .keyword_int32_2,
        .keyword_int16,
        .keyword_int16_2,
        .keyword_int8,
        .keyword_int8_2,
        .keyword_stdcall2,
        .keyword_thiscall2,
        .keyword_vectorcall2,
        => comp.langopts.ms_extensions,

        else => true,
    };
    return if (enabled) kw else .identifier;
}
/// Compile-time map from keyword spelling to its token id.
/// Lookups that miss are handled by the caller (`getTokenId` treats them as
/// plain identifiers); language-mode gating also happens there, not here.
const all_kws = std.ComptimeStringMap(Id, .{
// The labeled block exists only to raise the comptime branch quota before
// the map is built (presumably the default quota is too small for a table
// of this size — confirm); it evaluates to `.keyword_auto`.
.{ "auto", auto: {
@setEvalBranchQuota(3000);
break :auto .keyword_auto;
} },
.{ "break", .keyword_break },
.{ "case", .keyword_case },
.{ "char", .keyword_char },
.{ "const", .keyword_const },
.{ "continue", .keyword_continue },
.{ "default", .keyword_default },
.{ "do", .keyword_do },
.{ "double", .keyword_double },
.{ "else", .keyword_else },
.{ "enum", .keyword_enum },
.{ "extern", .keyword_extern },
.{ "float", .keyword_float },
.{ "for", .keyword_for },
.{ "goto", .keyword_goto },
.{ "if", .keyword_if },
.{ "int", .keyword_int },
.{ "long", .keyword_long },
.{ "register", .keyword_register },
.{ "return", .keyword_return },
.{ "short", .keyword_short },
.{ "signed", .keyword_signed },
.{ "sizeof", .keyword_sizeof },
.{ "static", .keyword_static },
.{ "struct", .keyword_struct },
.{ "switch", .keyword_switch },
.{ "typedef", .keyword_typedef },
.{ "union", .keyword_union },
.{ "unsigned", .keyword_unsigned },
.{ "void", .keyword_void },
.{ "volatile", .keyword_volatile },
.{ "while", .keyword_while },
.{ "__typeof__", .keyword_typeof2 },
.{ "__typeof", .keyword_typeof1 },
// ISO C99
.{ "_Bool", .keyword_bool },
.{ "_Complex", .keyword_complex },
.{ "_Imaginary", .keyword_imaginary },
.{ "inline", .keyword_inline },
.{ "restrict", .keyword_restrict },
// ISO C11
.{ "_Alignas", .keyword_alignas },
.{ "_Alignof", .keyword_alignof },
.{ "_Atomic", .keyword_atomic },
.{ "_Generic", .keyword_generic },
.{ "_Noreturn", .keyword_noreturn },
.{ "_Static_assert", .keyword_static_assert },
.{ "_Thread_local", .keyword_thread_local },
// ISO C23
.{ "_BitInt", .keyword_bit_int },
.{ "alignas", .keyword_c23_alignas },
.{ "alignof", .keyword_c23_alignof },
.{ "bool", .keyword_c23_bool },
.{ "static_assert", .keyword_c23_static_assert },
.{ "thread_local", .keyword_c23_thread_local },
.{ "constexpr", .keyword_constexpr },
.{ "true", .keyword_true },
.{ "false", .keyword_false },
.{ "nullptr", .keyword_nullptr },
// Preprocessor directives
.{ "include", .keyword_include },
.{ "include_next", .keyword_include_next },
.{ "embed", .keyword_embed },
.{ "define", .keyword_define },
.{ "defined", .keyword_defined },
.{ "undef", .keyword_undef },
.{ "ifdef", .keyword_ifdef },
.{ "ifndef", .keyword_ifndef },
.{ "elif", .keyword_elif },
.{ "elifdef", .keyword_elifdef },
.{ "elifndef", .keyword_elifndef },
.{ "endif", .keyword_endif },
.{ "error", .keyword_error },
.{ "warning", .keyword_warning },
.{ "pragma", .keyword_pragma },
.{ "line", .keyword_line },
.{ "__VA_ARGS__", .keyword_va_args },
.{ "__func__", .macro_func },
.{ "__FUNCTION__", .macro_function },
.{ "__PRETTY_FUNCTION__", .macro_pretty_func },
// gcc keywords
.{ "__auto_type", .keyword_auto_type },
.{ "__const", .keyword_const1 },
.{ "__const__", .keyword_const2 },
.{ "__inline", .keyword_inline1 },
.{ "__inline__", .keyword_inline2 },
.{ "__volatile", .keyword_volatile1 },
.{ "__volatile__", .keyword_volatile2 },
.{ "__restrict", .keyword_restrict1 },
.{ "__restrict__", .keyword_restrict2 },
.{ "__alignof", .keyword_alignof1 },
.{ "__alignof__", .keyword_alignof2 },
.{ "typeof", .keyword_typeof },
.{ "__attribute", .keyword_attribute1 },
.{ "__attribute__", .keyword_attribute2 },
.{ "__extension__", .keyword_extension },
.{ "asm", .keyword_asm },
.{ "__asm", .keyword_asm1 },
.{ "__asm__", .keyword_asm2 },
.{ "__float80", .keyword_float80 },
.{ "__float128", .keyword_float128 },
.{ "__int128", .keyword_int128 },
.{ "__imag", .keyword_imag1 },
.{ "__imag__", .keyword_imag2 },
.{ "__real", .keyword_real1 },
.{ "__real__", .keyword_real2 },
.{ "_Float16", .keyword_float16 },
// clang keywords
.{ "__fp16", .keyword_fp16 },
// ms keywords
.{ "__declspec", .keyword_declspec },
.{ "__int64", .keyword_int64 },
.{ "_int64", .keyword_int64_2 },
.{ "__int32", .keyword_int32 },
.{ "_int32", .keyword_int32_2 },
.{ "__int16", .keyword_int16 },
.{ "_int16", .keyword_int16_2 },
.{ "__int8", .keyword_int8 },
.{ "_int8", .keyword_int8_2 },
.{ "__stdcall", .keyword_stdcall },
.{ "_stdcall", .keyword_stdcall2 },
.{ "__thiscall", .keyword_thiscall },
.{ "_thiscall", .keyword_thiscall2 },
.{ "__vectorcall", .keyword_vectorcall },
.{ "_vectorcall", .keyword_vectorcall2 },
// builtins that require special parsing
.{ "__builtin_choose_expr", .builtin_choose_expr },
.{ "__builtin_va_arg", .builtin_va_arg },
.{ "__builtin_offsetof", .builtin_offsetof },
.{ "__builtin_bitoffsetof", .builtin_bitoffsetof },
.{ "__builtin_types_compatible_p", .builtin_types_compatible_p },
});
};
buf: []const u8,
index: u32 = 0,
source: Source.Id,
comp: *const Compilation,
line: u32 = 1,
/// Used to parse include strings with Windows style paths.
path_escapes: bool = false,
/// Scan one token starting at `self.index` and return it.
///
/// The scanner is a byte-at-a-time state machine. On return `self.index`
/// points just past the token and the token records `start`, `end`,
/// the tokenizer's current `line` counter and `source`.
/// - Returns `.eof` when no input remains.
/// - Newlines are returned as `.nl` tokens and bump `self.line`.
/// - Unterminated literals, bad escapes and unterminated block comments
///   yield `.invalid`.
/// - Comments are returned as `.comment` only when
///   `langopts.preserve_comments` is set; otherwise a block comment lexes as
///   `.whitespace` (or merges into a following newline/whitespace token).
/// - The input must not contain '\r' inside literals or after comments
///   (those arms are `unreachable`).
pub fn next(self: *Tokenizer) Token {
// States of the scanner; `.start` is the state between tokens.
var state: enum {
start,
whitespace,
u,
u8,
U,
L,
string_literal,
path_escape,
char_literal_start,
char_literal,
char_escape_sequence,
escape_sequence,
octal_escape,
hex_escape,
unicode_escape,
identifier,
extended_identifier,
equal,
bang,
pipe,
colon,
percent,
asterisk,
plus,
angle_bracket_left,
angle_bracket_angle_bracket_left,
angle_bracket_right,
angle_bracket_angle_bracket_right,
caret,
period,
period2,
minus,
slash,
ampersand,
hash,
hash_digraph,
hash_hash_digraph_partial,
line_comment,
multi_line_comment,
multi_line_comment_asterisk,
multi_line_comment_done,
pp_num,
pp_num_exponent,
pp_num_digit_separator,
} = .start;
var start = self.index;
var id: Token.Id = .eof;
// State to resume once an escape sequence inside a string has been consumed.
var return_state = state;
// Digit counter for octal escapes (counts up to 3) and unicode escapes
// (counts down from 4 or 8).
var counter: u32 = 0;
// Every `break` leaves `self.index` at the first byte after the token; arms
// that consume the current byte therefore bump the index before breaking.
while (self.index < self.buf.len) : (self.index += 1) {
const c = self.buf[self.index];
switch (state) {
.start => switch (c) {
'\n' => {
id = .nl;
self.index += 1;
self.line += 1;
break;
},
'"' => {
id = .string_literal;
state = .string_literal;
},
'\'' => {
id = .char_literal;
state = .char_literal_start;
},
// 'u', 'U' and 'L' may be literal prefixes, so they get their own states
// and are excluded from the identifier ranges below.
'u' => state = .u,
'U' => state = .U,
'L' => state = .L,
'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
'=' => state = .equal,
'!' => state = .bang,
'|' => state = .pipe,
'(' => {
id = .l_paren;
self.index += 1;
break;
},
')' => {
id = .r_paren;
self.index += 1;
break;
},
'[' => {
id = .l_bracket;
self.index += 1;
break;
},
']' => {
id = .r_bracket;
self.index += 1;
break;
},
';' => {
id = .semicolon;
self.index += 1;
break;
},
',' => {
id = .comma;
self.index += 1;
break;
},
'?' => {
id = .question_mark;
self.index += 1;
break;
},
':' => state = .colon,
'%' => state = .percent,
'*' => state = .asterisk,
'+' => state = .plus,
'<' => state = .angle_bracket_left,
'>' => state = .angle_bracket_right,
'^' => state = .caret,
'{' => {
id = .l_brace;
self.index += 1;
break;
},
'}' => {
id = .r_brace;
self.index += 1;
break;
},
'~' => {
id = .tilde;
self.index += 1;
break;
},
'.' => state = .period,
'-' => state = .minus,
'/' => state = .slash,
'&' => state = .ampersand,
'#' => state = .hash,
'0'...'9' => state = .pp_num,
'\t', '\x0B', '\x0C', ' ' => state = .whitespace,
'$' => if (self.comp.langopts.dollars_in_identifiers) {
state = .extended_identifier;
} else {
id = .invalid;
self.index += 1;
break;
},
// With MS extensions byte 0x1A ends the input: it yields `.eof` and is not
// consumed. Otherwise it is an invalid one-byte token.
0x1A => if (self.comp.langopts.ms_extensions) {
id = .eof;
break;
} else {
id = .invalid;
self.index += 1;
break;
},
// Any non-ASCII byte starts an extended identifier.
0x80...0xFF => state = .extended_identifier,
else => {
id = .invalid;
self.index += 1;
break;
},
},
.whitespace => switch (c) {
'\t', '\x0B', '\x0C', ' ' => {},
else => {
id = .whitespace;
break;
},
},
// In the prefix states below, `self.index -= 1` undoes the loop's increment
// so the current byte is examined again in the `.identifier` state.
.u => switch (c) {
'8' => {
state = .u8;
},
'\'' => {
id = .char_literal_utf_16;
state = .char_literal_start;
},
'\"' => {
id = .string_literal_utf_16;
state = .string_literal;
},
else => {
self.index -= 1;
state = .identifier;
},
},
.u8 => switch (c) {
'\"' => {
id = .string_literal_utf_8;
state = .string_literal;
},
'\'' => {
id = .char_literal_utf_8;
state = .char_literal_start;
},
else => {
self.index -= 1;
state = .identifier;
},
},
.U => switch (c) {
'\'' => {
id = .char_literal_utf_32;
state = .char_literal_start;
},
'\"' => {
id = .string_literal_utf_32;
state = .string_literal;
},
else => {
self.index -= 1;
state = .identifier;
},
},
.L => switch (c) {
'\'' => {
id = .char_literal_wide;
state = .char_literal_start;
},
'\"' => {
id = .string_literal_wide;
state = .string_literal;
},
else => {
self.index -= 1;
state = .identifier;
},
},
.string_literal => switch (c) {
'\\' => {
return_state = .string_literal;
// With `path_escapes` set the byte after a backslash is skipped without
// being validated as an escape sequence.
state = if (self.path_escapes) .path_escape else .escape_sequence;
},
'"' => {
self.index += 1;
break;
},
// A newline ends the literal as `.invalid`; the newline is not consumed.
'\n' => {
id = .invalid;
break;
},
'\r' => unreachable,
else => {},
},
.path_escape => {
state = .string_literal;
},
// First byte after the opening quote: an immediate closing quote (empty
// character literal) or a newline is invalid.
.char_literal_start => switch (c) {
'\\' => {
state = .char_escape_sequence;
},
'\'', '\n' => {
id = .invalid;
break;
},
else => {
state = .char_literal;
},
},
.char_literal => switch (c) {
'\\' => {
state = .char_escape_sequence;
},
'\'' => {
self.index += 1;
break;
},
'\n' => {
id = .invalid;
break;
},
else => {},
},
// In character literals the byte after a backslash is skipped unvalidated.
.char_escape_sequence => switch (c) {
'\r', '\n' => unreachable, // removed by line splicing
else => state = .char_literal,
},
.escape_sequence => switch (c) {
'\'', '"', '?', '\\', 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v' => {
state = return_state;
},
'\r', '\n' => unreachable, // removed by line splicing
'0'...'7' => {
counter = 1;
state = .octal_escape;
},
'x' => state = .hex_escape,
'u' => {
counter = 4;
state = .unicode_escape;
},
'U' => {
counter = 8;
state = .unicode_escape;
},
else => {
id = .invalid;
break;
},
},
// An octal escape is at most three digits; a non-octal byte ends it early
// and is re-examined in the enclosing literal state.
.octal_escape => switch (c) {
'0'...'7' => {
counter += 1;
if (counter == 3) state = return_state;
},
else => {
self.index -= 1;
state = return_state;
},
},
// A hex escape consumes hex digits until the first non-hex byte, which is
// re-examined in the enclosing literal state.
.hex_escape => switch (c) {
'0'...'9', 'a'...'f', 'A'...'F' => {},
else => {
self.index -= 1;
state = return_state;
},
},
// A unicode escape needs exactly `counter` hex digits (4 for \u, 8 for \U).
.unicode_escape => switch (c) {
'0'...'9', 'a'...'f', 'A'...'F' => {
counter -= 1;
if (counter == 0) state = return_state;
},
else => {
id = .invalid;
break;
},
},
// Plain identifiers are looked up in the keyword table when they end;
// identifiers containing '$' or non-ASCII bytes are `.extended_identifier`.
.identifier, .extended_identifier => switch (c) {
'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
'$' => if (self.comp.langopts.dollars_in_identifiers) {
state = .extended_identifier;
} else {
id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
break;
},
0x80...0xFF => state = .extended_identifier,
else => {
id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
break;
},
},
.equal => switch (c) {
'=' => {
id = .equal_equal;
self.index += 1;
break;
},
else => {
id = .equal;
break;
},
},
.bang => switch (c) {
'=' => {
id = .bang_equal;
self.index += 1;
break;
},
else => {
id = .bang;
break;
},
},
.pipe => switch (c) {
'=' => {
id = .pipe_equal;
self.index += 1;
break;
},
'|' => {
id = .pipe_pipe;
self.index += 1;
break;
},
else => {
id = .pipe;
break;
},
},
.colon => switch (c) {
// `:>` is the digraph for `]`.
'>' => {
if (self.comp.langopts.hasDigraphs()) {
id = .r_bracket;
self.index += 1;
} else {
id = .colon;
}
break;
},
// `::` is only a single token from C2x onwards.
':' => {
if (self.comp.langopts.standard.atLeast(.c2x)) {
id = .colon_colon;
self.index += 1;
break;
} else {
id = .colon;
break;
}
},
else => {
id = .colon;
break;
},
},
.percent => switch (c) {
'=' => {
id = .percent_equal;
self.index += 1;
break;
},
// `%>` is the digraph for `}`.
'>' => {
if (self.comp.langopts.hasDigraphs()) {
id = .r_brace;
self.index += 1;
} else {
id = .percent;
}
break;
},
// `%:` is the digraph for `#`; keep scanning in case it is `%:%:` (`##`).
':' => {
if (self.comp.langopts.hasDigraphs()) {
state = .hash_digraph;
} else {
id = .percent;
break;
}
},
else => {
id = .percent;
break;
},
},
.asterisk => switch (c) {
'=' => {
id = .asterisk_equal;
self.index += 1;
break;
},
else => {
id = .asterisk;
break;
},
},
.plus => switch (c) {
'=' => {
id = .plus_equal;
self.index += 1;
break;
},
'+' => {
id = .plus_plus;
self.index += 1;
break;
},
else => {
id = .plus;
break;
},
},
.angle_bracket_left => switch (c) {
'<' => state = .angle_bracket_angle_bracket_left,
'=' => {
id = .angle_bracket_left_equal;
self.index += 1;
break;
},
// `<:` is the digraph for `[`.
':' => {
if (self.comp.langopts.hasDigraphs()) {
id = .l_bracket;
self.index += 1;
} else {
id = .angle_bracket_left;
}
break;
},
// `<%` is the digraph for `{`.
'%' => {
if (self.comp.langopts.hasDigraphs()) {
id = .l_brace;
self.index += 1;
} else {
id = .angle_bracket_left;
}
break;
},
else => {
id = .angle_bracket_left;
break;
},
},
.angle_bracket_angle_bracket_left => switch (c) {
'=' => {
id = .angle_bracket_angle_bracket_left_equal;
self.index += 1;
break;
},
else => {
id = .angle_bracket_angle_bracket_left;
break;
},
},
.angle_bracket_right => switch (c) {
'>' => state = .angle_bracket_angle_bracket_right,
'=' => {
id = .angle_bracket_right_equal;
self.index += 1;
break;
},
else => {
id = .angle_bracket_right;
break;
},
},
.angle_bracket_angle_bracket_right => switch (c) {
'=' => {
id = .angle_bracket_angle_bracket_right_equal;
self.index += 1;
break;
},
else => {
id = .angle_bracket_angle_bracket_right;
break;
},
},
.caret => switch (c) {
'=' => {
id = .caret_equal;
self.index += 1;
break;
},
else => {
id = .caret;
break;
},
},
.period => switch (c) {
'.' => state = .period2,
'0'...'9' => state = .pp_num,
else => {
id = .period;
break;
},
},
.period2 => switch (c) {
'.' => {
id = .ellipsis;
self.index += 1;
break;
},
// `..` is not a token: emit the first period only and back up so the second
// one is scanned again by the next call.
else => {
id = .period;
self.index -= 1;
break;
},
},
.minus => switch (c) {
'>' => {
id = .arrow;
self.index += 1;
break;
},
'=' => {
id = .minus_equal;
self.index += 1;
break;
},
'-' => {
id = .minus_minus;
self.index += 1;
break;
},
else => {
id = .minus;
break;
},
},
.ampersand => switch (c) {
'&' => {
id = .ampersand_ampersand;
self.index += 1;
break;
},
'=' => {
id = .ampersand_equal;
self.index += 1;
break;
},
else => {
id = .ampersand;
break;
},
},
.hash => switch (c) {
'#' => {
id = .hash_hash;
self.index += 1;
break;
},
else => {
id = .hash;
break;
},
},
// Seen `%:`; a following `%` might begin a second `%:`.
.hash_digraph => switch (c) {
'%' => state = .hash_hash_digraph_partial,
else => {
id = .hash;
break;
},
},
// Seen `%:%`; only a following `:` completes the `##` digraph.
.hash_hash_digraph_partial => switch (c) {
':' => {
id = .hash_hash;
self.index += 1;
break;
},
else => {
id = .hash;
self.index -= 1; // re-tokenize the percent
break;
},
},
.slash => switch (c) {
'/' => state = .line_comment,
'*' => state = .multi_line_comment,
'=' => {
id = .slash_equal;
self.index += 1;
break;
},
else => {
id = .slash;
break;
},
},
.line_comment => switch (c) {
'\n' => {
if (self.comp.langopts.preserve_comments) {
id = .comment;
break;
}
// Comment discarded: step back so the newline is scanned again from the
// `.start` state (`start` is not reset here).
self.index -= 1;
state = .start;
},
else => {},
},
.multi_line_comment => switch (c) {
'*' => state = .multi_line_comment_asterisk,
'\n' => self.line += 1,
else => {},
},
.multi_line_comment_asterisk => switch (c) {
'/' => {
if (self.comp.langopts.preserve_comments) {
self.index += 1;
id = .comment;
break;
}
state = .multi_line_comment_done;
},
'\n' => {
self.line += 1;
state = .multi_line_comment;
},
'*' => {},
else => state = .multi_line_comment,
},
// A discarded block comment acts as whitespace. If a newline or more
// whitespace follows, `start` is moved so the resulting token begins after
// the comment; otherwise the comment itself becomes a `.whitespace` token.
.multi_line_comment_done => switch (c) {
'\n' => {
start = self.index;
id = .nl;
self.index += 1;
self.line += 1;
break;
},
'\r' => unreachable,
'\t', '\x0B', '\x0C', ' ' => {
start = self.index;
state = .whitespace;
},
else => {
id = .whitespace;
break;
},
},
// Preprocessor numbers: 'e', 'E', 'p' and 'P' are split out of the letter
// ranges because a sign may follow them.
.pp_num => switch (c) {
'a'...'d',
'A'...'D',
'f'...'o',
'F'...'O',
'q'...'z',
'Q'...'Z',
'0'...'9',
'_',
'.',
=> {},
'e', 'E', 'p', 'P' => state = .pp_num_exponent,
// `'` is accepted as a digit separator from C2x onwards.
'\'' => if (self.comp.langopts.standard.atLeast(.c2x)) {
state = .pp_num_digit_separator;
} else {
id = .pp_num;
break;
},
else => {
id = .pp_num;
break;
},
},
// A separator must be followed by one of the listed bytes; otherwise back up
// so the `'` is not part of the number.
.pp_num_digit_separator => switch (c) {
'a'...'d',
'A'...'D',
'f'...'o',
'F'...'O',
'q'...'z',
'Q'...'Z',
'0'...'9',
'_',
=> state = .pp_num,
else => {
self.index -= 1;
id = .pp_num;
break;
},
},
.pp_num_exponent => switch (c) {
'a'...'z',
'A'...'Z',
'0'...'9',
'_',
'.',
'+',
'-',
=> state = .pp_num,
else => {
id = .pp_num;
break;
},
},
}
// The `else` of the loop runs only when the input ran out without a `break`.
// Finish whatever token was in progress; `id` is still `.eof` if none was.
} else if (self.index == self.buf.len) {
switch (state) {
.start, .line_comment => {},
.u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.comp, self.buf[start..self.index]),
.extended_identifier => id = .extended_identifier,
// Constructs cut short by the end of input.
.period2,
.string_literal,
.path_escape,
.char_literal_start,
.char_literal,
.escape_sequence,
.char_escape_sequence,
.octal_escape,
.hex_escape,
.unicode_escape,
.multi_line_comment,
.multi_line_comment_asterisk,
=> id = .invalid,
.whitespace => id = .whitespace,
.multi_line_comment_done => id = .whitespace,
.equal => id = .equal,
.bang => id = .bang,
.minus => id = .minus,
.slash => id = .slash,
.ampersand => id = .ampersand,
.hash => id = .hash,
.period => id = .period,
.pipe => id = .pipe,
.angle_bracket_angle_bracket_right => id = .angle_bracket_angle_bracket_right,
.angle_bracket_right => id = .angle_bracket_right,
.angle_bracket_angle_bracket_left => id = .angle_bracket_angle_bracket_left,
.angle_bracket_left => id = .angle_bracket_left,
.plus => id = .plus,
.colon => id = .colon,
.percent => id = .percent,
.caret => id = .caret,
.asterisk => id = .asterisk,
.hash_digraph => id = .hash,
.hash_hash_digraph_partial => {
id = .hash;
self.index -= 1; // re-tokenize the percent
},
.pp_num, .pp_num_exponent, .pp_num_digit_separator => id = .pp_num,
}
}
return .{
.id = id,
.start = start,
.end = self.index,
.line = self.line,
.source = self.source,
};
}
/// Return the next token that is neither whitespace nor a comment.
pub fn nextNoWS(self: *Tokenizer) Token {
    while (true) {
        const tok = self.next();
        switch (tok.id) {
            .whitespace, .comment => continue,
            else => return tok,
        }
    }
}
/// Returns the next token whose id is not `.whitespace`.
/// Only whitespace is filtered here, so any other token id produced by
/// `next` (including `.comment`) is handed back to the caller.
pub fn nextNoWSComments(self: *Tokenizer) Token {
    while (true) {
        const candidate = self.next();
        if (candidate.id != .whitespace) return candidate;
    }
}
test "operators" {
    // One spelling of each operator/punctuator, separated by spaces.
    // Each line break in the input is expected to produce one `.nl`.
    const contents =
        \\ ! != | || |= = ==
        \\ ( ) { } [ ] . .. ...
        \\ ^ ^= + ++ += - -- -=
        \\ * *= % %= -> : ; / /=
        \\ , & && &= ? < <= <<
        \\ <<= > >= >> >>= ~ # ##
        \\
    ;
    const expected_tokens = [_]Token.Id{
        // ! != | || |= = ==
        .bang,
        .bang_equal,
        .pipe,
        .pipe_pipe,
        .pipe_equal,
        .equal,
        .equal_equal,
        .nl,
        // ( ) { } [ ] . .. ...   (".." is expected as two separate periods)
        .l_paren,
        .r_paren,
        .l_brace,
        .r_brace,
        .l_bracket,
        .r_bracket,
        .period,
        .period,
        .period,
        .ellipsis,
        .nl,
        // ^ ^= + ++ += - -- -=
        .caret,
        .caret_equal,
        .plus,
        .plus_plus,
        .plus_equal,
        .minus,
        .minus_minus,
        .minus_equal,
        .nl,
        // * *= % %= -> : ; / /=
        .asterisk,
        .asterisk_equal,
        .percent,
        .percent_equal,
        .arrow,
        .colon,
        .semicolon,
        .slash,
        .slash_equal,
        .nl,
        // , & && &= ? < <= <<
        .comma,
        .ampersand,
        .ampersand_ampersand,
        .ampersand_equal,
        .question_mark,
        .angle_bracket_left,
        .angle_bracket_left_equal,
        .angle_bracket_angle_bracket_left,
        .nl,
        // <<= > >= >> >>= ~ # ##
        .angle_bracket_angle_bracket_left_equal,
        .angle_bracket_right,
        .angle_bracket_right_equal,
        .angle_bracket_angle_bracket_right,
        .angle_bracket_angle_bracket_right_equal,
        .tilde,
        .hash,
        .hash_hash,
        .nl,
    };
    try expectTokens(contents, &expected_tokens);
}
test "keywords" {
    // Keyword spellings, several per line; each line break is expected as `.nl`.
    const contents =
        \\auto __auto_type break case char const continue default do
        \\double else enum extern float for goto if int
        \\long register return short signed sizeof static
        \\struct switch typedef union unsigned void volatile
        \\while _Bool _Complex _Imaginary inline restrict _Alignas
        \\_Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        \\__attribute __attribute__
        \\
    ;
    const expected_tokens = [_]Token.Id{
        // auto __auto_type break case char const continue default do
        .keyword_auto,
        .keyword_auto_type,
        .keyword_break,
        .keyword_case,
        .keyword_char,
        .keyword_const,
        .keyword_continue,
        .keyword_default,
        .keyword_do,
        .nl,
        // double else enum extern float for goto if int
        .keyword_double,
        .keyword_else,
        .keyword_enum,
        .keyword_extern,
        .keyword_float,
        .keyword_for,
        .keyword_goto,
        .keyword_if,
        .keyword_int,
        .nl,
        // long register return short signed sizeof static
        .keyword_long,
        .keyword_register,
        .keyword_return,
        .keyword_short,
        .keyword_signed,
        .keyword_sizeof,
        .keyword_static,
        .nl,
        // struct switch typedef union unsigned void volatile
        .keyword_struct,
        .keyword_switch,
        .keyword_typedef,
        .keyword_union,
        .keyword_unsigned,
        .keyword_void,
        .keyword_volatile,
        .nl,
        // while _Bool _Complex _Imaginary inline restrict _Alignas
        .keyword_while,
        .keyword_bool,
        .keyword_complex,
        .keyword_imaginary,
        .keyword_inline,
        .keyword_restrict,
        .keyword_alignas,
        .nl,
        // _Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        .keyword_alignof,
        .keyword_atomic,
        .keyword_generic,
        .keyword_noreturn,
        .keyword_static_assert,
        .keyword_thread_local,
        .nl,
        // __attribute __attribute__
        .keyword_attribute1,
        .keyword_attribute2,
        .nl,
    };
    try expectTokens(contents, &expected_tokens);
}
test "preprocessor keywords" {
    // Each directive line is expected as `.hash`, the directive keyword, then `.nl`.
    const contents =
        \\#include
        \\#include_next
        \\#embed
        \\#define
        \\#ifdef
        \\#ifndef
        \\#error
        \\#pragma
        \\
    ;
    const expected_tokens = [_]Token.Id{
        // #include
        .hash,
        .keyword_include,
        .nl,
        // #include_next
        .hash,
        .keyword_include_next,
        .nl,
        // #embed
        .hash,
        .keyword_embed,
        .nl,
        // #define
        .hash,
        .keyword_define,
        .nl,
        // #ifdef
        .hash,
        .keyword_ifdef,
        .nl,
        // #ifndef
        .hash,
        .keyword_ifndef,
        .nl,
        // #error
        .hash,
        .keyword_error,
        .nl,
        // #pragma
        .hash,
        .keyword_pragma,
        .nl,
    };
    try expectTokens(contents, &expected_tokens);
}
test "line continuation" {
    // A backslash directly before a line break is expected to join the two
    // lines (no `.nl` is listed at those positions), both between tokens and
    // inside a string literal. The input has no trailing newline.
    const contents =
        \\#define foo \
        \\ bar
        \\"foo\
        \\ bar"
        \\#define "foo"
        \\ "bar"
        \\#define "foo" \
        \\ "bar"
    ;
    const expected_tokens = [_]Token.Id{
        // #define foo \<newline> bar
        .hash,
        .keyword_define,
        .identifier,
        .identifier,
        .nl,
        // "foo\<newline> bar"
        .string_literal,
        .nl,
        // #define "foo"
        .hash,
        .keyword_define,
        .string_literal,
        .nl,
        // "bar"
        .string_literal,
        .nl,
        // #define "foo" \<newline> "bar"
        .hash,
        .keyword_define,
        .string_literal,
        .string_literal,
    };
    try expectTokens(contents, &expected_tokens);
}
test "string prefix" {
    // String and character literals with each encoding prefix, one per line.
    const contents =
        \\"foo"
        \\u"foo"
        \\u8"foo"
        \\U"foo"
        \\L"foo"
        \\'foo'
        \\u8'A'
        \\u'foo'
        \\U'foo'
        \\L'foo'
        \\
    ;
    const expected_tokens = [_]Token.Id{
        // "foo"
        .string_literal,
        .nl,
        // u"foo"
        .string_literal_utf_16,
        .nl,
        // u8"foo"
        .string_literal_utf_8,
        .nl,
        // U"foo"
        .string_literal_utf_32,
        .nl,
        // L"foo"
        .string_literal_wide,
        .nl,
        // 'foo'
        .char_literal,
        .nl,
        // u8'A'
        .char_literal_utf_8,
        .nl,
        // u'foo'
        .char_literal_utf_16,
        .nl,
        // U'foo'
        .char_literal_utf_32,
        .nl,
        // L'foo'
        .char_literal_wide,
        .nl,
    };
    try expectTokens(contents, &expected_tokens);
}
test "num suffixes" {
    // Every numeric spelling below, whatever its suffix, is expected to come
    // out as a single `.pp_num` token.
    const contents =
        \\ 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        \\ 0l 0lu 0ll 0llu 0
        \\ 1u 1ul 1ull 1
        \\ 1.0i 1.0I
        \\ 1.0if 1.0If 1.0fi 1.0fI
        \\ 1.0il 1.0Il 1.0li 1.0lI
        \\
    ;
    const expected_tokens = [_]Token.Id{
        // 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        // 0l 0lu 0ll 0llu 0
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        // 1u 1ul 1ull 1
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        // 1.0i 1.0I
        .pp_num,
        .pp_num,
        .nl,
        // 1.0if 1.0If 1.0fi 1.0fI
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        // 1.0il 1.0Il 1.0li 1.0lI
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
    };
    try expectTokens(contents, &expected_tokens);
}
test "comments" {
    // The `//foo` line is expected to contribute nothing but its `.nl`;
    // the following `#foo` is expected as `.hash` then `.identifier`.
    const contents =
        \\//foo
        \\#foo
    ;
    const expected_tokens = [_]Token.Id{
        .nl,
        .hash,
        .identifier,
    };
    try expectTokens(contents, &expected_tokens);
}
test "extended identifiers" {
    // Table of (input, expected ids); cases run in order and the first
    // mismatch aborts the test.
    const Case = struct {
        contents: []const u8,
        expected_tokens: []const Token.Id,
    };
    const cases = [_]Case{
        // Identifiers made of, or continued by, non-basic characters,
        // including after the `u`, `u8`, `U` and `L` prefixes.
        .{ .contents = "𝓪𝓻𝓸𝓬𝓬", .expected_tokens = &.{.extended_identifier} },
        .{ .contents = "u𝓪𝓻𝓸𝓬𝓬", .expected_tokens = &.{.extended_identifier} },
        .{ .contents = "u8𝓪𝓻𝓸𝓬𝓬", .expected_tokens = &.{.extended_identifier} },
        .{ .contents = "U𝓪𝓻𝓸𝓬𝓬", .expected_tokens = &.{.extended_identifier} },
        .{ .contents = "L𝓪𝓻𝓸𝓬𝓬", .expected_tokens = &.{.extended_identifier} },
        // An extended character ends a preceding number or punctuator run
        // and starts a separate extended identifier.
        .{ .contents = "1™", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        .{ .contents = "1.™", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        .{ .contents = "..™", .expected_tokens = &.{ .period, .period, .extended_identifier } },
        .{ .contents = "0™", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        .{ .contents = "0b\u{E0000}", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        .{ .contents = "0b0\u{E0000}", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        .{ .contents = "01\u{E0000}", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        .{ .contents = "010\u{E0000}", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        .{ .contents = "0x\u{E0000}", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        .{ .contents = "0x0\u{E0000}", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        // Inside string literals: after an octal or hex escape the literal
        // stays one token; after `\u` the expectation is three tokens.
        .{ .contents = "\"\\0\u{E0000}\"", .expected_tokens = &.{.string_literal} },
        .{ .contents = "\"\\x\u{E0000}\"", .expected_tokens = &.{.string_literal} },
        .{ .contents = "\"\\u\u{E0000}\"", .expected_tokens = &.{ .invalid, .extended_identifier, .invalid } },
        // After an exponent marker.
        .{ .contents = "1e\u{E0000}", .expected_tokens = &.{ .pp_num, .extended_identifier } },
        .{ .contents = "1e1\u{E0000}", .expected_tokens = &.{ .pp_num, .extended_identifier } },
    };
    for (cases) |case| {
        try expectTokens(case.contents, case.expected_tokens);
    }
}
test "digraphs" {
    // Table of (input, expected ids); cases run in order.
    const Case = struct {
        contents: []const u8,
        expected_tokens: []const Token.Id,
    };
    const cases = [_]Case{
        // Each digraph is expected to yield the id of the punctuator it spells.
        .{
            .contents = "%:<::><%%>%:%:",
            .expected_tokens = &.{ .hash, .l_bracket, .r_bracket, .l_brace, .r_brace, .hash_hash },
        },
        // The same characters inside a string literal stay part of the literal.
        .{
            .contents = "\"%:<::><%%>%:%:\"",
            .expected_tokens = &.{.string_literal},
        },
        // `%:%` not completed by `:` is expected as `.hash` followed by `.percent`,
        // both mid-input and at end of input.
        .{
            .contents = "%:%42 %:%",
            .expected_tokens = &.{ .hash, .percent, .pp_num, .hash, .percent },
        },
    };
    for (cases) |case| {
        try expectTokens(case.contents, case.expected_tokens);
    }
}
test "C23 keywords" {
    // Tokenized with the language standard set to `.c2x`; each spelling is
    // expected to map to its dedicated keyword id.
    const contents = "true false alignas alignof bool static_assert thread_local nullptr";
    const expected_tokens = [_]Token.Id{
        .keyword_true,
        .keyword_false,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_nullptr,
    };
    try expectTokensExtra(contents, &expected_tokens, .c2x);
}
/// Test helper: tokenizes `contents` and checks the resulting token ids
/// against `expected_tokens`, ignoring `.whitespace` tokens between them.
///
/// `standard`, when non-null, overrides the compilation's language standard
/// before tokenizing.
///
/// Returns `error.TokensDoNotEqual` (after printing the mismatch) on the first
/// token whose id differs from the expectation. After all expected tokens are
/// consumed, the very next token must be `.eof`.
fn expectTokensExtra(contents: []const u8, expected_tokens: []const Token.Id, standard: ?LangOpts.Standard) !void {
    var comp = Compilation.init(std.testing.allocator);
    defer comp.deinit();
    if (standard) |provided| comp.langopts.standard = provided;

    const source = try comp.addSourceFromBuffer("path", contents);
    var tokenizer = Tokenizer{
        .buf = source.buf,
        .source = source.id,
        .comp = &comp,
    };

    for (expected_tokens) |want| {
        // Whitespace is not part of the expectation; skip ahead to the next
        // token that is.
        var got = tokenizer.next();
        while (got.id == .whitespace) got = tokenizer.next();

        if (got.id != want) {
            std.debug.print("expected {s}, found {s}\n", .{ @tagName(want), @tagName(got.id) });
            return error.TokensDoNotEqual;
        }
    }

    // Nothing may follow the expected tokens.
    const trailing = tokenizer.next();
    try std.testing.expect(trailing.id == .eof);
}
/// Test helper: same as `expectTokensExtra` with no language-standard
/// override (`null` is passed for `standard`).
fn expectTokens(contents: []const u8, expected_tokens: []const Token.Id) !void {
    try expectTokensExtra(contents, expected_tokens, null);
}