mirror of
https://github.com/ziglang/zig.git
synced 2025-02-10 22:50:18 +00:00
2171 lines
67 KiB
Zig
Vendored
2171 lines
67 KiB
Zig
Vendored
const std = @import("std");
|
||
const assert = std.debug.assert;
|
||
const Compilation = @import("Compilation.zig");
|
||
const Source = @import("Source.zig");
|
||
const LangOpts = @import("LangOpts.zig");
|
||
|
||
const Tokenizer = @This();
|
||
|
||
pub const Token = struct {
|
||
id: Id,
|
||
source: Source.Id,
|
||
start: u32 = 0,
|
||
end: u32 = 0,
|
||
line: u32 = 0,
|
||
|
||
pub const Id = enum(u8) {
|
||
invalid,
|
||
nl,
|
||
whitespace,
|
||
eof,
|
||
/// identifier containing solely basic character set characters
|
||
identifier,
|
||
/// identifier with at least one extended character
|
||
extended_identifier,
|
||
|
||
// string literals with prefixes
|
||
string_literal,
|
||
string_literal_utf_16,
|
||
string_literal_utf_8,
|
||
string_literal_utf_32,
|
||
string_literal_wide,
|
||
|
||
// <foobar> only generated by preprocessor
|
||
macro_string,
|
||
|
||
// char literals with prefixes
|
||
char_literal,
|
||
char_literal_utf_8,
|
||
char_literal_utf_16,
|
||
char_literal_utf_32,
|
||
char_literal_wide,
|
||
|
||
/// Integer literal tokens generated by preprocessor.
|
||
one,
|
||
zero,
|
||
|
||
bang,
|
||
bang_equal,
|
||
pipe,
|
||
pipe_pipe,
|
||
pipe_equal,
|
||
equal,
|
||
equal_equal,
|
||
l_paren,
|
||
r_paren,
|
||
l_brace,
|
||
r_brace,
|
||
l_bracket,
|
||
r_bracket,
|
||
period,
|
||
ellipsis,
|
||
caret,
|
||
caret_equal,
|
||
plus,
|
||
plus_plus,
|
||
plus_equal,
|
||
minus,
|
||
minus_minus,
|
||
minus_equal,
|
||
asterisk,
|
||
asterisk_equal,
|
||
percent,
|
||
percent_equal,
|
||
arrow,
|
||
colon,
|
||
colon_colon,
|
||
semicolon,
|
||
slash,
|
||
slash_equal,
|
||
comma,
|
||
ampersand,
|
||
ampersand_ampersand,
|
||
ampersand_equal,
|
||
question_mark,
|
||
angle_bracket_left,
|
||
angle_bracket_left_equal,
|
||
angle_bracket_angle_bracket_left,
|
||
angle_bracket_angle_bracket_left_equal,
|
||
angle_bracket_right,
|
||
angle_bracket_right_equal,
|
||
angle_bracket_angle_bracket_right,
|
||
angle_bracket_angle_bracket_right_equal,
|
||
tilde,
|
||
hash,
|
||
hash_hash,
|
||
|
||
/// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
|
||
macro_param,
|
||
/// Special token to signal that the argument must be replaced without expansion (e.g. in concatenation)
|
||
macro_param_no_expand,
|
||
/// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
|
||
stringify_param,
|
||
/// Same as stringify_param, but for var args
|
||
stringify_va_args,
|
||
/// Special macro whitespace, always equal to a single space
|
||
macro_ws,
|
||
/// Special token for implementing __has_attribute
|
||
macro_param_has_attribute,
|
||
/// Special token for implementing __has_declspec_attribute
|
||
macro_param_has_declspec_attribute,
|
||
/// Special token for implementing __has_warning
|
||
macro_param_has_warning,
|
||
/// Special token for implementing __has_feature
|
||
macro_param_has_feature,
|
||
/// Special token for implementing __has_extension
|
||
macro_param_has_extension,
|
||
/// Special token for implementing __has_builtin
|
||
macro_param_has_builtin,
|
||
/// Special token for implementing __has_include
|
||
macro_param_has_include,
|
||
/// Special token for implementing __has_include_next
|
||
macro_param_has_include_next,
|
||
/// Special token for implementing __is_identifier
|
||
macro_param_is_identifier,
|
||
/// Special token for implementing __FILE__
|
||
macro_file,
|
||
/// Special token for implementing __LINE__
|
||
macro_line,
|
||
/// Special token for implementing __COUNTER__
|
||
macro_counter,
|
||
/// Special token for implementing _Pragma
|
||
macro_param_pragma_operator,
|
||
|
||
/// Special identifier for implementing __func__
|
||
macro_func,
|
||
/// Special identifier for implementing __FUNCTION__
|
||
macro_function,
|
||
/// Special identifier for implementing __PRETTY_FUNCTION__
|
||
macro_pretty_func,
|
||
|
||
keyword_auto,
|
||
keyword_auto_type,
|
||
keyword_break,
|
||
keyword_case,
|
||
keyword_char,
|
||
keyword_const,
|
||
keyword_continue,
|
||
keyword_default,
|
||
keyword_do,
|
||
keyword_double,
|
||
keyword_else,
|
||
keyword_enum,
|
||
keyword_extern,
|
||
keyword_float,
|
||
keyword_for,
|
||
keyword_goto,
|
||
keyword_if,
|
||
keyword_int,
|
||
keyword_long,
|
||
keyword_register,
|
||
keyword_return,
|
||
keyword_short,
|
||
keyword_signed,
|
||
keyword_sizeof,
|
||
keyword_static,
|
||
keyword_struct,
|
||
keyword_switch,
|
||
keyword_typedef,
|
||
keyword_typeof1,
|
||
keyword_typeof2,
|
||
keyword_union,
|
||
keyword_unsigned,
|
||
keyword_void,
|
||
keyword_volatile,
|
||
keyword_while,
|
||
|
||
// ISO C99
|
||
keyword_bool,
|
||
keyword_complex,
|
||
keyword_imaginary,
|
||
keyword_inline,
|
||
keyword_restrict,
|
||
|
||
// ISO C11
|
||
keyword_alignas,
|
||
keyword_alignof,
|
||
keyword_atomic,
|
||
keyword_generic,
|
||
keyword_noreturn,
|
||
keyword_static_assert,
|
||
keyword_thread_local,
|
||
|
||
// ISO C23
|
||
keyword_bit_int,
|
||
keyword_c23_alignas,
|
||
keyword_c23_alignof,
|
||
keyword_c23_bool,
|
||
keyword_c23_static_assert,
|
||
keyword_c23_thread_local,
|
||
keyword_constexpr,
|
||
keyword_true,
|
||
keyword_false,
|
||
keyword_nullptr,
|
||
|
||
// Preprocessor directives
|
||
keyword_include,
|
||
keyword_include_next,
|
||
keyword_embed,
|
||
keyword_define,
|
||
keyword_defined,
|
||
keyword_undef,
|
||
keyword_ifdef,
|
||
keyword_ifndef,
|
||
keyword_elif,
|
||
keyword_elifdef,
|
||
keyword_elifndef,
|
||
keyword_endif,
|
||
keyword_error,
|
||
keyword_warning,
|
||
keyword_pragma,
|
||
keyword_line,
|
||
keyword_va_args,
|
||
|
||
// gcc keywords
|
||
keyword_const1,
|
||
keyword_const2,
|
||
keyword_inline1,
|
||
keyword_inline2,
|
||
keyword_volatile1,
|
||
keyword_volatile2,
|
||
keyword_restrict1,
|
||
keyword_restrict2,
|
||
keyword_alignof1,
|
||
keyword_alignof2,
|
||
keyword_typeof,
|
||
keyword_attribute1,
|
||
keyword_attribute2,
|
||
keyword_extension,
|
||
keyword_asm,
|
||
keyword_asm1,
|
||
keyword_asm2,
|
||
keyword_float80,
|
||
keyword_float128,
|
||
keyword_int128,
|
||
keyword_imag1,
|
||
keyword_imag2,
|
||
keyword_real1,
|
||
keyword_real2,
|
||
keyword_float16,
|
||
|
||
// clang keywords
|
||
keyword_fp16,
|
||
|
||
// ms keywords
|
||
keyword_declspec,
|
||
keyword_int64,
|
||
keyword_int64_2,
|
||
keyword_int32,
|
||
keyword_int32_2,
|
||
keyword_int16,
|
||
keyword_int16_2,
|
||
keyword_int8,
|
||
keyword_int8_2,
|
||
keyword_stdcall,
|
||
keyword_stdcall2,
|
||
keyword_thiscall,
|
||
keyword_thiscall2,
|
||
keyword_vectorcall,
|
||
keyword_vectorcall2,
|
||
|
||
// builtins that require special parsing
|
||
builtin_choose_expr,
|
||
builtin_va_arg,
|
||
builtin_offsetof,
|
||
builtin_bitoffsetof,
|
||
builtin_types_compatible_p,
|
||
|
||
/// Generated by #embed directive
|
||
/// Decimal value with no prefix or suffix
|
||
embed_byte,
|
||
|
||
/// preprocessor number
|
||
/// An optional period, followed by a digit 0-9, followed by any number of letters,
|
||
/// digits, underscores, periods, and exponents (e+, e-, E+, E-, p+, p-, P+, P-)
|
||
pp_num,
|
||
|
||
/// preprocessor placemarker token
|
||
/// generated if `##` is used with a zero-token argument
|
||
/// removed after substitution, so the parser should never see this
|
||
/// See C99 6.10.3.3.2
|
||
placemarker,
|
||
|
||
/// Virtual linemarker token output from preprocessor to indicate start of a new include
|
||
include_start,
|
||
|
||
/// Virtual linemarker token output from preprocessor to indicate resuming a file after
|
||
/// completion of the preceding #include
|
||
include_resume,
|
||
|
||
/// A comment token if asked to preserve comments.
|
||
comment,
|
||
|
||
/// Return true if token is identifier or keyword.
|
||
pub fn isMacroIdentifier(id: Id) bool {
|
||
switch (id) {
|
||
.keyword_include,
|
||
.keyword_include_next,
|
||
.keyword_embed,
|
||
.keyword_define,
|
||
.keyword_defined,
|
||
.keyword_undef,
|
||
.keyword_ifdef,
|
||
.keyword_ifndef,
|
||
.keyword_elif,
|
||
.keyword_elifdef,
|
||
.keyword_elifndef,
|
||
.keyword_endif,
|
||
.keyword_error,
|
||
.keyword_warning,
|
||
.keyword_pragma,
|
||
.keyword_line,
|
||
.keyword_va_args,
|
||
.macro_func,
|
||
.macro_function,
|
||
.macro_pretty_func,
|
||
.keyword_auto,
|
||
.keyword_auto_type,
|
||
.keyword_break,
|
||
.keyword_case,
|
||
.keyword_char,
|
||
.keyword_const,
|
||
.keyword_continue,
|
||
.keyword_default,
|
||
.keyword_do,
|
||
.keyword_double,
|
||
.keyword_else,
|
||
.keyword_enum,
|
||
.keyword_extern,
|
||
.keyword_float,
|
||
.keyword_for,
|
||
.keyword_goto,
|
||
.keyword_if,
|
||
.keyword_int,
|
||
.keyword_long,
|
||
.keyword_register,
|
||
.keyword_return,
|
||
.keyword_short,
|
||
.keyword_signed,
|
||
.keyword_sizeof,
|
||
.keyword_static,
|
||
.keyword_struct,
|
||
.keyword_switch,
|
||
.keyword_typedef,
|
||
.keyword_union,
|
||
.keyword_unsigned,
|
||
.keyword_void,
|
||
.keyword_volatile,
|
||
.keyword_while,
|
||
.keyword_bool,
|
||
.keyword_complex,
|
||
.keyword_imaginary,
|
||
.keyword_inline,
|
||
.keyword_restrict,
|
||
.keyword_alignas,
|
||
.keyword_alignof,
|
||
.keyword_atomic,
|
||
.keyword_generic,
|
||
.keyword_noreturn,
|
||
.keyword_static_assert,
|
||
.keyword_thread_local,
|
||
.identifier,
|
||
.extended_identifier,
|
||
.keyword_typeof,
|
||
.keyword_typeof1,
|
||
.keyword_typeof2,
|
||
.keyword_const1,
|
||
.keyword_const2,
|
||
.keyword_inline1,
|
||
.keyword_inline2,
|
||
.keyword_volatile1,
|
||
.keyword_volatile2,
|
||
.keyword_restrict1,
|
||
.keyword_restrict2,
|
||
.keyword_alignof1,
|
||
.keyword_alignof2,
|
||
.builtin_choose_expr,
|
||
.builtin_va_arg,
|
||
.builtin_offsetof,
|
||
.builtin_bitoffsetof,
|
||
.builtin_types_compatible_p,
|
||
.keyword_attribute1,
|
||
.keyword_attribute2,
|
||
.keyword_extension,
|
||
.keyword_asm,
|
||
.keyword_asm1,
|
||
.keyword_asm2,
|
||
.keyword_float80,
|
||
.keyword_float128,
|
||
.keyword_int128,
|
||
.keyword_imag1,
|
||
.keyword_imag2,
|
||
.keyword_real1,
|
||
.keyword_real2,
|
||
.keyword_float16,
|
||
.keyword_fp16,
|
||
.keyword_declspec,
|
||
.keyword_int64,
|
||
.keyword_int64_2,
|
||
.keyword_int32,
|
||
.keyword_int32_2,
|
||
.keyword_int16,
|
||
.keyword_int16_2,
|
||
.keyword_int8,
|
||
.keyword_int8_2,
|
||
.keyword_stdcall,
|
||
.keyword_stdcall2,
|
||
.keyword_thiscall,
|
||
.keyword_thiscall2,
|
||
.keyword_vectorcall,
|
||
.keyword_vectorcall2,
|
||
.keyword_bit_int,
|
||
.keyword_c23_alignas,
|
||
.keyword_c23_alignof,
|
||
.keyword_c23_bool,
|
||
.keyword_c23_static_assert,
|
||
.keyword_c23_thread_local,
|
||
.keyword_constexpr,
|
||
.keyword_true,
|
||
.keyword_false,
|
||
.keyword_nullptr,
|
||
=> return true,
|
||
else => return false,
|
||
}
|
||
}
|
||
|
||
/// Turn macro keywords into identifiers.
|
||
/// `keyword_defined` is special since it should only turn into an identifier if
|
||
/// we are *not* in an #if or #elif expression
|
||
pub fn simplifyMacroKeywordExtra(id: *Id, defined_to_identifier: bool) void {
|
||
switch (id.*) {
|
||
.keyword_include,
|
||
.keyword_include_next,
|
||
.keyword_embed,
|
||
.keyword_define,
|
||
.keyword_undef,
|
||
.keyword_ifdef,
|
||
.keyword_ifndef,
|
||
.keyword_elif,
|
||
.keyword_elifdef,
|
||
.keyword_elifndef,
|
||
.keyword_endif,
|
||
.keyword_error,
|
||
.keyword_warning,
|
||
.keyword_pragma,
|
||
.keyword_line,
|
||
.keyword_va_args,
|
||
=> id.* = .identifier,
|
||
.keyword_defined => if (defined_to_identifier) {
|
||
id.* = .identifier;
|
||
},
|
||
else => {},
|
||
}
|
||
}
|
||
|
||
pub fn simplifyMacroKeyword(id: *Id) void {
|
||
simplifyMacroKeywordExtra(id, false);
|
||
}
|
||
|
||
pub fn lexeme(id: Id) ?[]const u8 {
|
||
return switch (id) {
|
||
.include_start,
|
||
.include_resume,
|
||
=> unreachable,
|
||
|
||
.invalid,
|
||
.identifier,
|
||
.extended_identifier,
|
||
.string_literal,
|
||
.string_literal_utf_16,
|
||
.string_literal_utf_8,
|
||
.string_literal_utf_32,
|
||
.string_literal_wide,
|
||
.char_literal,
|
||
.char_literal_utf_8,
|
||
.char_literal_utf_16,
|
||
.char_literal_utf_32,
|
||
.char_literal_wide,
|
||
.macro_string,
|
||
.whitespace,
|
||
.pp_num,
|
||
.embed_byte,
|
||
.comment,
|
||
=> null,
|
||
|
||
.zero => "0",
|
||
.one => "1",
|
||
|
||
.nl,
|
||
.eof,
|
||
.macro_param,
|
||
.macro_param_no_expand,
|
||
.stringify_param,
|
||
.stringify_va_args,
|
||
.macro_param_has_attribute,
|
||
.macro_param_has_declspec_attribute,
|
||
.macro_param_has_warning,
|
||
.macro_param_has_feature,
|
||
.macro_param_has_extension,
|
||
.macro_param_has_builtin,
|
||
.macro_param_has_include,
|
||
.macro_param_has_include_next,
|
||
.macro_param_is_identifier,
|
||
.macro_file,
|
||
.macro_line,
|
||
.macro_counter,
|
||
.macro_param_pragma_operator,
|
||
.placemarker,
|
||
=> "",
|
||
.macro_ws => " ",
|
||
|
||
.macro_func => "__func__",
|
||
.macro_function => "__FUNCTION__",
|
||
.macro_pretty_func => "__PRETTY_FUNCTION__",
|
||
|
||
.bang => "!",
|
||
.bang_equal => "!=",
|
||
.pipe => "|",
|
||
.pipe_pipe => "||",
|
||
.pipe_equal => "|=",
|
||
.equal => "=",
|
||
.equal_equal => "==",
|
||
.l_paren => "(",
|
||
.r_paren => ")",
|
||
.l_brace => "{",
|
||
.r_brace => "}",
|
||
.l_bracket => "[",
|
||
.r_bracket => "]",
|
||
.period => ".",
|
||
.ellipsis => "...",
|
||
.caret => "^",
|
||
.caret_equal => "^=",
|
||
.plus => "+",
|
||
.plus_plus => "++",
|
||
.plus_equal => "+=",
|
||
.minus => "-",
|
||
.minus_minus => "--",
|
||
.minus_equal => "-=",
|
||
.asterisk => "*",
|
||
.asterisk_equal => "*=",
|
||
.percent => "%",
|
||
.percent_equal => "%=",
|
||
.arrow => "->",
|
||
.colon => ":",
|
||
.colon_colon => "::",
|
||
.semicolon => ";",
|
||
.slash => "/",
|
||
.slash_equal => "/=",
|
||
.comma => ",",
|
||
.ampersand => "&",
|
||
.ampersand_ampersand => "&&",
|
||
.ampersand_equal => "&=",
|
||
.question_mark => "?",
|
||
.angle_bracket_left => "<",
|
||
.angle_bracket_left_equal => "<=",
|
||
.angle_bracket_angle_bracket_left => "<<",
|
||
.angle_bracket_angle_bracket_left_equal => "<<=",
|
||
.angle_bracket_right => ">",
|
||
.angle_bracket_right_equal => ">=",
|
||
.angle_bracket_angle_bracket_right => ">>",
|
||
.angle_bracket_angle_bracket_right_equal => ">>=",
|
||
.tilde => "~",
|
||
.hash => "#",
|
||
.hash_hash => "##",
|
||
|
||
.keyword_auto => "auto",
|
||
.keyword_auto_type => "__auto_type",
|
||
.keyword_break => "break",
|
||
.keyword_case => "case",
|
||
.keyword_char => "char",
|
||
.keyword_const => "const",
|
||
.keyword_continue => "continue",
|
||
.keyword_default => "default",
|
||
.keyword_do => "do",
|
||
.keyword_double => "double",
|
||
.keyword_else => "else",
|
||
.keyword_enum => "enum",
|
||
.keyword_extern => "extern",
|
||
.keyword_float => "float",
|
||
.keyword_for => "for",
|
||
.keyword_goto => "goto",
|
||
.keyword_if => "if",
|
||
.keyword_int => "int",
|
||
.keyword_long => "long",
|
||
.keyword_register => "register",
|
||
.keyword_return => "return",
|
||
.keyword_short => "short",
|
||
.keyword_signed => "signed",
|
||
.keyword_sizeof => "sizeof",
|
||
.keyword_static => "static",
|
||
.keyword_struct => "struct",
|
||
.keyword_switch => "switch",
|
||
.keyword_typedef => "typedef",
|
||
.keyword_typeof => "typeof",
|
||
.keyword_union => "union",
|
||
.keyword_unsigned => "unsigned",
|
||
.keyword_void => "void",
|
||
.keyword_volatile => "volatile",
|
||
.keyword_while => "while",
|
||
.keyword_bool => "_Bool",
|
||
.keyword_complex => "_Complex",
|
||
.keyword_imaginary => "_Imaginary",
|
||
.keyword_inline => "inline",
|
||
.keyword_restrict => "restrict",
|
||
.keyword_alignas => "_Alignas",
|
||
.keyword_alignof => "_Alignof",
|
||
.keyword_atomic => "_Atomic",
|
||
.keyword_generic => "_Generic",
|
||
.keyword_noreturn => "_Noreturn",
|
||
.keyword_static_assert => "_Static_assert",
|
||
.keyword_thread_local => "_Thread_local",
|
||
.keyword_bit_int => "_BitInt",
|
||
.keyword_c23_alignas => "alignas",
|
||
.keyword_c23_alignof => "alignof",
|
||
.keyword_c23_bool => "bool",
|
||
.keyword_c23_static_assert => "static_assert",
|
||
.keyword_c23_thread_local => "thread_local",
|
||
.keyword_constexpr => "constexpr",
|
||
.keyword_true => "true",
|
||
.keyword_false => "false",
|
||
.keyword_nullptr => "nullptr",
|
||
.keyword_include => "include",
|
||
.keyword_include_next => "include_next",
|
||
.keyword_embed => "embed",
|
||
.keyword_define => "define",
|
||
.keyword_defined => "defined",
|
||
.keyword_undef => "undef",
|
||
.keyword_ifdef => "ifdef",
|
||
.keyword_ifndef => "ifndef",
|
||
.keyword_elif => "elif",
|
||
.keyword_elifdef => "elifdef",
|
||
.keyword_elifndef => "elifndef",
|
||
.keyword_endif => "endif",
|
||
.keyword_error => "error",
|
||
.keyword_warning => "warning",
|
||
.keyword_pragma => "pragma",
|
||
.keyword_line => "line",
|
||
.keyword_va_args => "__VA_ARGS__",
|
||
.keyword_const1 => "__const",
|
||
.keyword_const2 => "__const__",
|
||
.keyword_inline1 => "__inline",
|
||
.keyword_inline2 => "__inline__",
|
||
.keyword_volatile1 => "__volatile",
|
||
.keyword_volatile2 => "__volatile__",
|
||
.keyword_restrict1 => "__restrict",
|
||
.keyword_restrict2 => "__restrict__",
|
||
.keyword_alignof1 => "__alignof",
|
||
.keyword_alignof2 => "__alignof__",
|
||
.keyword_typeof1 => "__typeof",
|
||
.keyword_typeof2 => "__typeof__",
|
||
.builtin_choose_expr => "__builtin_choose_expr",
|
||
.builtin_va_arg => "__builtin_va_arg",
|
||
.builtin_offsetof => "__builtin_offsetof",
|
||
.builtin_bitoffsetof => "__builtin_bitoffsetof",
|
||
.builtin_types_compatible_p => "__builtin_types_compatible_p",
|
||
.keyword_attribute1 => "__attribute",
|
||
.keyword_attribute2 => "__attribute__",
|
||
.keyword_extension => "__extension__",
|
||
.keyword_asm => "asm",
|
||
.keyword_asm1 => "__asm",
|
||
.keyword_asm2 => "__asm__",
|
||
.keyword_float80 => "__float80",
|
||
.keyword_float128 => "__float18",
|
||
.keyword_int128 => "__int128",
|
||
.keyword_imag1 => "__imag",
|
||
.keyword_imag2 => "__imag__",
|
||
.keyword_real1 => "__real",
|
||
.keyword_real2 => "__real__",
|
||
.keyword_float16 => "_Float16",
|
||
.keyword_fp16 => "__fp16",
|
||
.keyword_declspec => "__declspec",
|
||
.keyword_int64 => "__int64",
|
||
.keyword_int64_2 => "_int64",
|
||
.keyword_int32 => "__int32",
|
||
.keyword_int32_2 => "_int32",
|
||
.keyword_int16 => "__int16",
|
||
.keyword_int16_2 => "_int16",
|
||
.keyword_int8 => "__int8",
|
||
.keyword_int8_2 => "_int8",
|
||
.keyword_stdcall => "__stdcall",
|
||
.keyword_stdcall2 => "_stdcall",
|
||
.keyword_thiscall => "__thiscall",
|
||
.keyword_thiscall2 => "_thiscall",
|
||
.keyword_vectorcall => "__vectorcall",
|
||
.keyword_vectorcall2 => "_vectorcall",
|
||
};
|
||
}
|
||
|
||
pub fn symbol(id: Id) []const u8 {
|
||
return switch (id) {
|
||
.macro_string, .invalid => unreachable,
|
||
.identifier,
|
||
.extended_identifier,
|
||
.macro_func,
|
||
.macro_function,
|
||
.macro_pretty_func,
|
||
.builtin_choose_expr,
|
||
.builtin_va_arg,
|
||
.builtin_offsetof,
|
||
.builtin_bitoffsetof,
|
||
.builtin_types_compatible_p,
|
||
=> "an identifier",
|
||
.string_literal,
|
||
.string_literal_utf_16,
|
||
.string_literal_utf_8,
|
||
.string_literal_utf_32,
|
||
.string_literal_wide,
|
||
=> "a string literal",
|
||
.char_literal,
|
||
.char_literal_utf_8,
|
||
.char_literal_utf_16,
|
||
.char_literal_utf_32,
|
||
.char_literal_wide,
|
||
=> "a character literal",
|
||
.pp_num, .embed_byte => "A number",
|
||
else => id.lexeme().?,
|
||
};
|
||
}
|
||
|
||
/// tokens that can start an expression parsed by Preprocessor.expr
|
||
/// Note that eof, r_paren, and string literals cannot actually start a
|
||
/// preprocessor expression, but we include them here so that a nicer
|
||
/// error message can be generated by the parser.
|
||
pub fn validPreprocessorExprStart(id: Id) bool {
|
||
return switch (id) {
|
||
.eof,
|
||
.r_paren,
|
||
.string_literal,
|
||
.string_literal_utf_16,
|
||
.string_literal_utf_8,
|
||
.string_literal_utf_32,
|
||
.string_literal_wide,
|
||
|
||
.char_literal,
|
||
.char_literal_utf_8,
|
||
.char_literal_utf_16,
|
||
.char_literal_utf_32,
|
||
.char_literal_wide,
|
||
.l_paren,
|
||
.plus,
|
||
.minus,
|
||
.tilde,
|
||
.bang,
|
||
.identifier,
|
||
.extended_identifier,
|
||
.keyword_defined,
|
||
.one,
|
||
.zero,
|
||
.pp_num,
|
||
.keyword_true,
|
||
.keyword_false,
|
||
=> true,
|
||
else => false,
|
||
};
|
||
}
|
||
|
||
pub fn allowsDigraphs(id: Id, comp: *const Compilation) bool {
|
||
return switch (id) {
|
||
.l_bracket,
|
||
.r_bracket,
|
||
.l_brace,
|
||
.r_brace,
|
||
.hash,
|
||
.hash_hash,
|
||
=> comp.langopts.hasDigraphs(),
|
||
else => false,
|
||
};
|
||
}
|
||
|
||
pub fn canOpenGCCAsmStmt(id: Id) bool {
|
||
return switch (id) {
|
||
.keyword_volatile, .keyword_volatile1, .keyword_volatile2, .keyword_inline, .keyword_inline1, .keyword_inline2, .keyword_goto, .l_paren => true,
|
||
else => false,
|
||
};
|
||
}
|
||
|
||
pub fn isStringLiteral(id: Id) bool {
|
||
return switch (id) {
|
||
.string_literal, .string_literal_utf_16, .string_literal_utf_8, .string_literal_utf_32, .string_literal_wide => true,
|
||
else => false,
|
||
};
|
||
}
|
||
};
|
||
|
||
/// double underscore and underscore + capital letter identifiers
|
||
/// belong to the implementation namespace, so we always convert them
|
||
/// to keywords.
|
||
pub fn getTokenId(comp: *const Compilation, str: []const u8) Token.Id {
|
||
const kw = all_kws.get(str) orelse return .identifier;
|
||
const standard = comp.langopts.standard;
|
||
return switch (kw) {
|
||
.keyword_inline => if (standard.isGNU() or standard.atLeast(.c99)) kw else .identifier,
|
||
.keyword_restrict => if (standard.atLeast(.c99)) kw else .identifier,
|
||
.keyword_typeof => if (standard.isGNU() or standard.atLeast(.c2x)) kw else .identifier,
|
||
.keyword_asm => if (standard.isGNU()) kw else .identifier,
|
||
.keyword_declspec => if (comp.langopts.declspec_attrs) kw else .identifier,
|
||
|
||
.keyword_c23_alignas,
|
||
.keyword_c23_alignof,
|
||
.keyword_c23_bool,
|
||
.keyword_c23_static_assert,
|
||
.keyword_c23_thread_local,
|
||
.keyword_constexpr,
|
||
.keyword_true,
|
||
.keyword_false,
|
||
.keyword_nullptr,
|
||
.keyword_elifdef,
|
||
.keyword_elifndef,
|
||
=> if (standard.atLeast(.c2x)) kw else .identifier,
|
||
|
||
.keyword_int64,
|
||
.keyword_int64_2,
|
||
.keyword_int32,
|
||
.keyword_int32_2,
|
||
.keyword_int16,
|
||
.keyword_int16_2,
|
||
.keyword_int8,
|
||
.keyword_int8_2,
|
||
.keyword_stdcall2,
|
||
.keyword_thiscall2,
|
||
.keyword_vectorcall2,
|
||
=> if (comp.langopts.ms_extensions) kw else .identifier,
|
||
else => kw,
|
||
};
|
||
}
|
||
|
||
const all_kws = std.ComptimeStringMap(Id, .{
|
||
.{ "auto", auto: {
|
||
@setEvalBranchQuota(3000);
|
||
break :auto .keyword_auto;
|
||
} },
|
||
.{ "break", .keyword_break },
|
||
.{ "case", .keyword_case },
|
||
.{ "char", .keyword_char },
|
||
.{ "const", .keyword_const },
|
||
.{ "continue", .keyword_continue },
|
||
.{ "default", .keyword_default },
|
||
.{ "do", .keyword_do },
|
||
.{ "double", .keyword_double },
|
||
.{ "else", .keyword_else },
|
||
.{ "enum", .keyword_enum },
|
||
.{ "extern", .keyword_extern },
|
||
.{ "float", .keyword_float },
|
||
.{ "for", .keyword_for },
|
||
.{ "goto", .keyword_goto },
|
||
.{ "if", .keyword_if },
|
||
.{ "int", .keyword_int },
|
||
.{ "long", .keyword_long },
|
||
.{ "register", .keyword_register },
|
||
.{ "return", .keyword_return },
|
||
.{ "short", .keyword_short },
|
||
.{ "signed", .keyword_signed },
|
||
.{ "sizeof", .keyword_sizeof },
|
||
.{ "static", .keyword_static },
|
||
.{ "struct", .keyword_struct },
|
||
.{ "switch", .keyword_switch },
|
||
.{ "typedef", .keyword_typedef },
|
||
.{ "union", .keyword_union },
|
||
.{ "unsigned", .keyword_unsigned },
|
||
.{ "void", .keyword_void },
|
||
.{ "volatile", .keyword_volatile },
|
||
.{ "while", .keyword_while },
|
||
.{ "__typeof__", .keyword_typeof2 },
|
||
.{ "__typeof", .keyword_typeof1 },
|
||
|
||
// ISO C99
|
||
.{ "_Bool", .keyword_bool },
|
||
.{ "_Complex", .keyword_complex },
|
||
.{ "_Imaginary", .keyword_imaginary },
|
||
.{ "inline", .keyword_inline },
|
||
.{ "restrict", .keyword_restrict },
|
||
|
||
// ISO C11
|
||
.{ "_Alignas", .keyword_alignas },
|
||
.{ "_Alignof", .keyword_alignof },
|
||
.{ "_Atomic", .keyword_atomic },
|
||
.{ "_Generic", .keyword_generic },
|
||
.{ "_Noreturn", .keyword_noreturn },
|
||
.{ "_Static_assert", .keyword_static_assert },
|
||
.{ "_Thread_local", .keyword_thread_local },
|
||
|
||
// ISO C23
|
||
.{ "_BitInt", .keyword_bit_int },
|
||
.{ "alignas", .keyword_c23_alignas },
|
||
.{ "alignof", .keyword_c23_alignof },
|
||
.{ "bool", .keyword_c23_bool },
|
||
.{ "static_assert", .keyword_c23_static_assert },
|
||
.{ "thread_local", .keyword_c23_thread_local },
|
||
.{ "constexpr", .keyword_constexpr },
|
||
.{ "true", .keyword_true },
|
||
.{ "false", .keyword_false },
|
||
.{ "nullptr", .keyword_nullptr },
|
||
|
||
// Preprocessor directives
|
||
.{ "include", .keyword_include },
|
||
.{ "include_next", .keyword_include_next },
|
||
.{ "embed", .keyword_embed },
|
||
.{ "define", .keyword_define },
|
||
.{ "defined", .keyword_defined },
|
||
.{ "undef", .keyword_undef },
|
||
.{ "ifdef", .keyword_ifdef },
|
||
.{ "ifndef", .keyword_ifndef },
|
||
.{ "elif", .keyword_elif },
|
||
.{ "elifdef", .keyword_elifdef },
|
||
.{ "elifndef", .keyword_elifndef },
|
||
.{ "endif", .keyword_endif },
|
||
.{ "error", .keyword_error },
|
||
.{ "warning", .keyword_warning },
|
||
.{ "pragma", .keyword_pragma },
|
||
.{ "line", .keyword_line },
|
||
.{ "__VA_ARGS__", .keyword_va_args },
|
||
.{ "__func__", .macro_func },
|
||
.{ "__FUNCTION__", .macro_function },
|
||
.{ "__PRETTY_FUNCTION__", .macro_pretty_func },
|
||
|
||
// gcc keywords
|
||
.{ "__auto_type", .keyword_auto_type },
|
||
.{ "__const", .keyword_const1 },
|
||
.{ "__const__", .keyword_const2 },
|
||
.{ "__inline", .keyword_inline1 },
|
||
.{ "__inline__", .keyword_inline2 },
|
||
.{ "__volatile", .keyword_volatile1 },
|
||
.{ "__volatile__", .keyword_volatile2 },
|
||
.{ "__restrict", .keyword_restrict1 },
|
||
.{ "__restrict__", .keyword_restrict2 },
|
||
.{ "__alignof", .keyword_alignof1 },
|
||
.{ "__alignof__", .keyword_alignof2 },
|
||
.{ "typeof", .keyword_typeof },
|
||
.{ "__attribute", .keyword_attribute1 },
|
||
.{ "__attribute__", .keyword_attribute2 },
|
||
.{ "__extension__", .keyword_extension },
|
||
.{ "asm", .keyword_asm },
|
||
.{ "__asm", .keyword_asm1 },
|
||
.{ "__asm__", .keyword_asm2 },
|
||
.{ "__float80", .keyword_float80 },
|
||
.{ "__float128", .keyword_float128 },
|
||
.{ "__int128", .keyword_int128 },
|
||
.{ "__imag", .keyword_imag1 },
|
||
.{ "__imag__", .keyword_imag2 },
|
||
.{ "__real", .keyword_real1 },
|
||
.{ "__real__", .keyword_real2 },
|
||
.{ "_Float16", .keyword_float16 },
|
||
|
||
// clang keywords
|
||
.{ "__fp16", .keyword_fp16 },
|
||
|
||
// ms keywords
|
||
.{ "__declspec", .keyword_declspec },
|
||
.{ "__int64", .keyword_int64 },
|
||
.{ "_int64", .keyword_int64_2 },
|
||
.{ "__int32", .keyword_int32 },
|
||
.{ "_int32", .keyword_int32_2 },
|
||
.{ "__int16", .keyword_int16 },
|
||
.{ "_int16", .keyword_int16_2 },
|
||
.{ "__int8", .keyword_int8 },
|
||
.{ "_int8", .keyword_int8_2 },
|
||
.{ "__stdcall", .keyword_stdcall },
|
||
.{ "_stdcall", .keyword_stdcall2 },
|
||
.{ "__thiscall", .keyword_thiscall },
|
||
.{ "_thiscall", .keyword_thiscall2 },
|
||
.{ "__vectorcall", .keyword_vectorcall },
|
||
.{ "_vectorcall", .keyword_vectorcall2 },
|
||
|
||
// builtins that require special parsing
|
||
.{ "__builtin_choose_expr", .builtin_choose_expr },
|
||
.{ "__builtin_va_arg", .builtin_va_arg },
|
||
.{ "__builtin_offsetof", .builtin_offsetof },
|
||
.{ "__builtin_bitoffsetof", .builtin_bitoffsetof },
|
||
.{ "__builtin_types_compatible_p", .builtin_types_compatible_p },
|
||
});
|
||
};
|
||
|
||
buf: []const u8,
|
||
index: u32 = 0,
|
||
source: Source.Id,
|
||
comp: *const Compilation,
|
||
line: u32 = 1,
|
||
/// Used to parse include strings with Windows style paths.
|
||
path_escapes: bool = false,
|
||
|
||
/// Scans `buf` starting at `index` and returns the next token, leaving `index`
/// positioned at the first byte that is not part of that token.
///
/// * Returns `.eof` (with an empty span) once `index` has reached the end of `buf`.
/// * Runs of horizontal whitespace are returned as one `.whitespace` token and each
///   '\n' as a `.nl` token; `line` is incremented for every '\n' consumed.
/// * When `langopts.preserve_comments` is set comments are returned as `.comment`.
///   Otherwise a line comment is folded into the `.nl` token that ends it and a
///   block comment is reported as `.whitespace` (or as the `.nl` that directly follows it).
/// * Malformed input (unterminated literal or block comment, bad escape, unknown byte)
///   yields `.invalid`.
///
/// The returned token's `line` is the value of `self.line` *after* the token was
/// scanned, so a `.nl` token already carries the number of the following line.
///
/// NOTE(review): a line comment that runs to the end of `buf` is reported as `.eof`
/// with a non-empty span, even when comments are preserved.
pub fn next(self: *Tokenizer) Token {
    var state: enum {
        start,
        whitespace,
        u,
        u8,
        U,
        L,
        string_literal,
        path_escape,
        char_literal_start,
        char_literal,
        char_escape_sequence,
        escape_sequence,
        octal_escape,
        hex_escape,
        unicode_escape,
        identifier,
        extended_identifier,
        equal,
        bang,
        pipe,
        colon,
        percent,
        asterisk,
        plus,
        angle_bracket_left,
        angle_bracket_angle_bracket_left,
        angle_bracket_right,
        angle_bracket_angle_bracket_right,
        caret,
        period,
        period2,
        minus,
        slash,
        ampersand,
        hash,
        hash_digraph,
        hash_hash_digraph_partial,
        line_comment,
        multi_line_comment,
        multi_line_comment_asterisk,
        multi_line_comment_done,
        pp_num,
        pp_num_exponent,
        pp_num_digit_separator,
    } = .start;

    var start = self.index;
    var id: Token.Id = .eof;

    // State to resume once a string escape sequence has been consumed.
    var return_state = state;
    // Digits seen so far (octal escapes) or digits still required (unicode escapes).
    var counter: u32 = 0;
    // Convention inside the loop: `break` without touching `index` leaves the current
    // byte for the next call; `self.index += 1; break;` makes it part of this token;
    // `self.index -= 1` followed by a state change re-examines the current byte in the
    // new state (the loop's continue expression re-increments `index`).
    while (self.index < self.buf.len) : (self.index += 1) {
        const c = self.buf[self.index];
        switch (state) {
            .start => switch (c) {
                '\n' => {
                    id = .nl;
                    self.index += 1;
                    self.line += 1;
                    break;
                },
                '"' => {
                    id = .string_literal;
                    state = .string_literal;
                },
                '\'' => {
                    id = .char_literal;
                    state = .char_literal_start;
                },
                // `u`, `U` and `L` may be literal prefixes, so they get their own states.
                'u' => state = .u,
                'U' => state = .U,
                'L' => state = .L,
                'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
                '=' => state = .equal,
                '!' => state = .bang,
                '|' => state = .pipe,
                '(' => {
                    id = .l_paren;
                    self.index += 1;
                    break;
                },
                ')' => {
                    id = .r_paren;
                    self.index += 1;
                    break;
                },
                '[' => {
                    id = .l_bracket;
                    self.index += 1;
                    break;
                },
                ']' => {
                    id = .r_bracket;
                    self.index += 1;
                    break;
                },
                ';' => {
                    id = .semicolon;
                    self.index += 1;
                    break;
                },
                ',' => {
                    id = .comma;
                    self.index += 1;
                    break;
                },
                '?' => {
                    id = .question_mark;
                    self.index += 1;
                    break;
                },
                ':' => state = .colon,
                '%' => state = .percent,
                '*' => state = .asterisk,
                '+' => state = .plus,
                '<' => state = .angle_bracket_left,
                '>' => state = .angle_bracket_right,
                '^' => state = .caret,
                '{' => {
                    id = .l_brace;
                    self.index += 1;
                    break;
                },
                '}' => {
                    id = .r_brace;
                    self.index += 1;
                    break;
                },
                '~' => {
                    id = .tilde;
                    self.index += 1;
                    break;
                },
                '.' => state = .period,
                '-' => state = .minus,
                '/' => state = .slash,
                '&' => state = .ampersand,
                '#' => state = .hash,
                '0'...'9' => state = .pp_num,
                '\t', '\x0B', '\x0C', ' ' => state = .whitespace,
                '$' => if (self.comp.langopts.dollars_in_identifiers) {
                    state = .extended_identifier;
                } else {
                    id = .invalid;
                    self.index += 1;
                    break;
                },
                // 0x1A ends the input under MS extensions; `index` is deliberately not
                // advanced, so every later call reports `.eof` at this byte again.
                0x1A => if (self.comp.langopts.ms_extensions) {
                    id = .eof;
                    break;
                } else {
                    id = .invalid;
                    self.index += 1;
                    break;
                },
                0x80...0xFF => state = .extended_identifier,
                else => {
                    id = .invalid;
                    self.index += 1;
                    break;
                },
            },
            .whitespace => switch (c) {
                '\t', '\x0B', '\x0C', ' ' => {},
                else => {
                    id = .whitespace;
                    break;
                },
            },
            .u => switch (c) {
                '8' => {
                    state = .u8;
                },
                '\'' => {
                    id = .char_literal_utf_16;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_utf_16;
                    state = .string_literal;
                },
                else => {
                    // Not a literal prefix after all: re-examine this byte as part of an identifier.
                    self.index -= 1;
                    state = .identifier;
                },
            },
            .u8 => switch (c) {
                '\"' => {
                    id = .string_literal_utf_8;
                    state = .string_literal;
                },
                '\'' => {
                    id = .char_literal_utf_8;
                    state = .char_literal_start;
                },
                else => {
                    self.index -= 1;
                    state = .identifier;
                },
            },
            .U => switch (c) {
                '\'' => {
                    id = .char_literal_utf_32;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_utf_32;
                    state = .string_literal;
                },
                else => {
                    self.index -= 1;
                    state = .identifier;
                },
            },
            .L => switch (c) {
                '\'' => {
                    id = .char_literal_wide;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_wide;
                    state = .string_literal;
                },
                else => {
                    self.index -= 1;
                    state = .identifier;
                },
            },
            .string_literal => switch (c) {
                '\\' => {
                    return_state = .string_literal;
                    state = if (self.path_escapes) .path_escape else .escape_sequence;
                },
                '"' => {
                    self.index += 1;
                    break;
                },
                '\n' => {
                    // Unterminated string; the newline is left for the next call.
                    id = .invalid;
                    break;
                },
                '\r' => unreachable,
                else => {},
            },
            // The byte after a backslash is skipped without being interpreted.
            .path_escape => {
                state = .string_literal;
            },
            .char_literal_start => switch (c) {
                '\\' => {
                    state = .char_escape_sequence;
                },
                '\'', '\n' => {
                    // Empty or unterminated character literal.
                    id = .invalid;
                    break;
                },
                else => {
                    state = .char_literal;
                },
            },
            .char_literal => switch (c) {
                '\\' => {
                    state = .char_escape_sequence;
                },
                '\'' => {
                    self.index += 1;
                    break;
                },
                '\n' => {
                    id = .invalid;
                    break;
                },
                else => {},
            },
            // Escapes in character literals are not validated here; one byte is skipped.
            .char_escape_sequence => switch (c) {
                '\r', '\n' => unreachable, // removed by line splicing
                else => state = .char_literal,
            },
            .escape_sequence => switch (c) {
                '\'', '"', '?', '\\', 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v' => {
                    state = return_state;
                },
                '\r', '\n' => unreachable, // removed by line splicing
                '0'...'7' => {
                    counter = 1;
                    state = .octal_escape;
                },
                'x' => state = .hex_escape,
                'u' => {
                    counter = 4;
                    state = .unicode_escape;
                },
                'U' => {
                    counter = 8;
                    state = .unicode_escape;
                },
                else => {
                    id = .invalid;
                    break;
                },
            },
            // At most three octal digits belong to one escape.
            .octal_escape => switch (c) {
                '0'...'7' => {
                    counter += 1;
                    if (counter == 3) state = return_state;
                },
                else => {
                    self.index -= 1;
                    state = return_state;
                },
            },
            // Any number of hex digits; the first other byte is re-examined in `return_state`.
            .hex_escape => switch (c) {
                '0'...'9', 'a'...'f', 'A'...'F' => {},
                else => {
                    self.index -= 1;
                    state = return_state;
                },
            },
            // Exactly `counter` hex digits are required (4 after `\u`, 8 after `\U`).
            .unicode_escape => switch (c) {
                '0'...'9', 'a'...'f', 'A'...'F' => {
                    counter -= 1;
                    if (counter == 0) state = return_state;
                },
                else => {
                    id = .invalid;
                    break;
                },
            },
            .identifier, .extended_identifier => switch (c) {
                'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                '$' => if (self.comp.langopts.dollars_in_identifiers) {
                    state = .extended_identifier;
                } else {
                    // `$` is not allowed here, so the identifier ends before it.
                    id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
                    break;
                },
                0x80...0xFF => state = .extended_identifier,
                else => {
                    // Only spellings made of basic characters are passed to `getTokenId`.
                    id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
                    break;
                },
            },
            .equal => switch (c) {
                '=' => {
                    id = .equal_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .equal;
                    break;
                },
            },
            .bang => switch (c) {
                '=' => {
                    id = .bang_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .bang;
                    break;
                },
            },
            .pipe => switch (c) {
                '=' => {
                    id = .pipe_equal;
                    self.index += 1;
                    break;
                },
                '|' => {
                    id = .pipe_pipe;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .pipe;
                    break;
                },
            },
            .colon => switch (c) {
                '>' => {
                    // `:>` is the digraph spelling of `]`.
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .r_bracket;
                        self.index += 1;
                    } else {
                        id = .colon;
                    }
                    break;
                },
                ':' => {
                    // `::` is only a single token from C2x onwards.
                    if (self.comp.langopts.standard.atLeast(.c2x)) {
                        id = .colon_colon;
                        self.index += 1;
                        break;
                    } else {
                        id = .colon;
                        break;
                    }
                },
                else => {
                    id = .colon;
                    break;
                },
            },
            .percent => switch (c) {
                '=' => {
                    id = .percent_equal;
                    self.index += 1;
                    break;
                },
                '>' => {
                    // `%>` is the digraph spelling of `}`.
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .r_brace;
                        self.index += 1;
                    } else {
                        id = .percent;
                    }
                    break;
                },
                ':' => {
                    // `%:` is the digraph spelling of `#`; `%:%:` may still follow.
                    if (self.comp.langopts.hasDigraphs()) {
                        state = .hash_digraph;
                    } else {
                        id = .percent;
                        break;
                    }
                },
                else => {
                    id = .percent;
                    break;
                },
            },
            .asterisk => switch (c) {
                '=' => {
                    id = .asterisk_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .asterisk;
                    break;
                },
            },
            .plus => switch (c) {
                '=' => {
                    id = .plus_equal;
                    self.index += 1;
                    break;
                },
                '+' => {
                    id = .plus_plus;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .plus;
                    break;
                },
            },
            .angle_bracket_left => switch (c) {
                '<' => state = .angle_bracket_angle_bracket_left,
                '=' => {
                    id = .angle_bracket_left_equal;
                    self.index += 1;
                    break;
                },
                ':' => {
                    // `<:` is the digraph spelling of `[`.
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .l_bracket;
                        self.index += 1;
                    } else {
                        id = .angle_bracket_left;
                    }
                    break;
                },
                '%' => {
                    // `<%` is the digraph spelling of `{`.
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .l_brace;
                        self.index += 1;
                    } else {
                        id = .angle_bracket_left;
                    }
                    break;
                },
                else => {
                    id = .angle_bracket_left;
                    break;
                },
            },
            .angle_bracket_angle_bracket_left => switch (c) {
                '=' => {
                    id = .angle_bracket_angle_bracket_left_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_angle_bracket_left;
                    break;
                },
            },
            .angle_bracket_right => switch (c) {
                '>' => state = .angle_bracket_angle_bracket_right,
                '=' => {
                    id = .angle_bracket_right_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_right;
                    break;
                },
            },
            .angle_bracket_angle_bracket_right => switch (c) {
                '=' => {
                    id = .angle_bracket_angle_bracket_right_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_angle_bracket_right;
                    break;
                },
            },
            .caret => switch (c) {
                '=' => {
                    id = .caret_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .caret;
                    break;
                },
            },
            .period => switch (c) {
                '.' => state = .period2,
                '0'...'9' => state = .pp_num,
                else => {
                    id = .period;
                    break;
                },
            },
            .period2 => switch (c) {
                '.' => {
                    id = .ellipsis;
                    self.index += 1;
                    break;
                },
                else => {
                    // `..` is not a token: emit the first period and re-tokenize the second.
                    id = .period;
                    self.index -= 1;
                    break;
                },
            },
            .minus => switch (c) {
                '>' => {
                    id = .arrow;
                    self.index += 1;
                    break;
                },
                '=' => {
                    id = .minus_equal;
                    self.index += 1;
                    break;
                },
                '-' => {
                    id = .minus_minus;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .minus;
                    break;
                },
            },
            .ampersand => switch (c) {
                '&' => {
                    id = .ampersand_ampersand;
                    self.index += 1;
                    break;
                },
                '=' => {
                    id = .ampersand_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .ampersand;
                    break;
                },
            },
            .hash => switch (c) {
                '#' => {
                    id = .hash_hash;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .hash;
                    break;
                },
            },
            // Seen `%:`; a following `%:` would complete the `##` digraph.
            .hash_digraph => switch (c) {
                '%' => state = .hash_hash_digraph_partial,
                else => {
                    id = .hash;
                    break;
                },
            },
            .hash_hash_digraph_partial => switch (c) {
                ':' => {
                    id = .hash_hash;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .hash;
                    self.index -= 1; // re-tokenize the percent
                    break;
                },
            },
            .slash => switch (c) {
                '/' => state = .line_comment,
                '*' => state = .multi_line_comment,
                '=' => {
                    id = .slash_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .slash;
                    break;
                },
            },
            .line_comment => switch (c) {
                '\n' => {
                    if (self.comp.langopts.preserve_comments) {
                        id = .comment;
                        break;
                    }
                    // Re-examine the newline in `.start`; `start` is not reset, so the
                    // resulting `.nl` token spans the comment text as well.
                    self.index -= 1;
                    state = .start;
                },
                else => {},
            },
            .multi_line_comment => switch (c) {
                '*' => state = .multi_line_comment_asterisk,
                '\n' => self.line += 1,
                else => {},
            },
            .multi_line_comment_asterisk => switch (c) {
                '/' => {
                    if (self.comp.langopts.preserve_comments) {
                        self.index += 1;
                        id = .comment;
                        break;
                    }
                    state = .multi_line_comment_done;
                },
                '\n' => {
                    self.line += 1;
                    state = .multi_line_comment;
                },
                '*' => {},
                else => state = .multi_line_comment,
            },
            // A discarded block comment is reported as whitespace unless a newline follows directly.
            .multi_line_comment_done => switch (c) {
                '\n' => {
                    start = self.index;
                    id = .nl;
                    self.index += 1;
                    self.line += 1;
                    break;
                },
                '\r' => unreachable,
                '\t', '\x0B', '\x0C', ' ' => {
                    start = self.index;
                    state = .whitespace;
                },
                else => {
                    id = .whitespace;
                    break;
                },
            },
            .pp_num => switch (c) {
                'a'...'d',
                'A'...'D',
                'f'...'o',
                'F'...'O',
                'q'...'z',
                'Q'...'Z',
                '0'...'9',
                '_',
                '.',
                => {},
                // An exponent marker may be followed by a sign that belongs to the number.
                'e', 'E', 'p', 'P' => state = .pp_num_exponent,
                '\'' => if (self.comp.langopts.standard.atLeast(.c2x)) {
                    state = .pp_num_digit_separator;
                } else {
                    id = .pp_num;
                    break;
                },
                else => {
                    id = .pp_num;
                    break;
                },
            },
            .pp_num_digit_separator => switch (c) {
                'a'...'d',
                'A'...'D',
                'f'...'o',
                'F'...'O',
                'q'...'z',
                'Q'...'Z',
                '0'...'9',
                '_',
                => state = .pp_num,
                else => {
                    // The `'` did not separate digits: end the number before it.
                    self.index -= 1;
                    id = .pp_num;
                    break;
                },
            },
            .pp_num_exponent => switch (c) {
                'a'...'z',
                'A'...'Z',
                '0'...'9',
                '_',
                '.',
                '+',
                '-',
                => state = .pp_num,
                else => {
                    id = .pp_num;
                    break;
                },
            },
        }
    } else if (self.index == self.buf.len) {
        // The buffer ended in the middle of (or right before) a token.
        switch (state) {
            .start, .line_comment => {},
            .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.comp, self.buf[start..self.index]),
            .extended_identifier => id = .extended_identifier,
            .period2 => {
                // `..` at end of input is two periods, exactly as when any other
                // non-period byte follows; previously this produced `.invalid`.
                id = .period;
                self.index -= 1; // re-tokenize the second period
            },
            .string_literal,
            .path_escape,
            .char_literal_start,
            .char_literal,
            .escape_sequence,
            .char_escape_sequence,
            .octal_escape,
            .hex_escape,
            .unicode_escape,
            .multi_line_comment,
            .multi_line_comment_asterisk,
            => id = .invalid,

            .whitespace => id = .whitespace,
            .multi_line_comment_done => id = .whitespace,

            .equal => id = .equal,
            .bang => id = .bang,
            .minus => id = .minus,
            .slash => id = .slash,
            .ampersand => id = .ampersand,
            .hash => id = .hash,
            .period => id = .period,
            .pipe => id = .pipe,
            .angle_bracket_angle_bracket_right => id = .angle_bracket_angle_bracket_right,
            .angle_bracket_right => id = .angle_bracket_right,
            .angle_bracket_angle_bracket_left => id = .angle_bracket_angle_bracket_left,
            .angle_bracket_left => id = .angle_bracket_left,
            .plus => id = .plus,
            .colon => id = .colon,
            .percent => id = .percent,
            .caret => id = .caret,
            .asterisk => id = .asterisk,
            .hash_digraph => id = .hash,
            .hash_hash_digraph_partial => {
                id = .hash;
                self.index -= 1; // re-tokenize the percent
            },
            .pp_num, .pp_num_exponent, .pp_num_digit_separator => id = .pp_num,
        }
    }

    return .{
        .id = id,
        .start = start,
        .end = self.index,
        .line = self.line,
        .source = self.source,
    };
}
|
||
|
||
/// Returns the next token that is neither `.whitespace` nor `.comment`.
/// Newline (`.nl`) tokens are not skipped.
pub fn nextNoWS(self: *Tokenizer) Token {
    while (true) {
        const candidate = self.next();
        switch (candidate.id) {
            .whitespace, .comment => continue,
            else => return candidate,
        }
    }
}
|
||
|
||
/// Returns the next token that is not `.whitespace`.
/// Unlike `nextNoWS`, `.comment` tokens are returned to the caller rather than skipped.
pub fn nextNoWSComments(self: *Tokenizer) Token {
    while (true) {
        const candidate = self.next();
        if (candidate.id != .whitespace) return candidate;
    }
}
|
||
|
||
test "operators" {
    // One `.nl` is expected at the end of every input line.
    const input =
        \\ ! != | || |= = ==
        \\ ( ) { } [ ] . .. ...
        \\ ^ ^= + ++ += - -- -=
        \\ * *= % %= -> : ; / /=
        \\ , & && &= ? < <= <<
        \\ <<= > >= >> >>= ~ # ##
        \\
    ;
    const expected = [_]Token.Id{
        .bang,
        .bang_equal,
        .pipe,
        .pipe_pipe,
        .pipe_equal,
        .equal,
        .equal_equal,
        .nl,
        .l_paren,
        .r_paren,
        .l_brace,
        .r_brace,
        .l_bracket,
        .r_bracket,
        .period,
        .period,
        .period,
        .ellipsis,
        .nl,
        .caret,
        .caret_equal,
        .plus,
        .plus_plus,
        .plus_equal,
        .minus,
        .minus_minus,
        .minus_equal,
        .nl,
        .asterisk,
        .asterisk_equal,
        .percent,
        .percent_equal,
        .arrow,
        .colon,
        .semicolon,
        .slash,
        .slash_equal,
        .nl,
        .comma,
        .ampersand,
        .ampersand_ampersand,
        .ampersand_equal,
        .question_mark,
        .angle_bracket_left,
        .angle_bracket_left_equal,
        .angle_bracket_angle_bracket_left,
        .nl,
        .angle_bracket_angle_bracket_left_equal,
        .angle_bracket_right,
        .angle_bracket_right_equal,
        .angle_bracket_angle_bracket_right,
        .angle_bracket_angle_bracket_right_equal,
        .tilde,
        .hash,
        .hash_hash,
        .nl,
    };
    try expectTokens(input, &expected);
}
|
||
|
||
test "keywords" {
    // Keywords are checked under the default language options of a fresh `Compilation`.
    const input =
        \\auto __auto_type break case char const continue default do
        \\double else enum extern float for goto if int
        \\long register return short signed sizeof static
        \\struct switch typedef union unsigned void volatile
        \\while _Bool _Complex _Imaginary inline restrict _Alignas
        \\_Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        \\__attribute __attribute__
        \\
    ;
    const expected = [_]Token.Id{
        .keyword_auto,
        .keyword_auto_type,
        .keyword_break,
        .keyword_case,
        .keyword_char,
        .keyword_const,
        .keyword_continue,
        .keyword_default,
        .keyword_do,
        .nl,
        .keyword_double,
        .keyword_else,
        .keyword_enum,
        .keyword_extern,
        .keyword_float,
        .keyword_for,
        .keyword_goto,
        .keyword_if,
        .keyword_int,
        .nl,
        .keyword_long,
        .keyword_register,
        .keyword_return,
        .keyword_short,
        .keyword_signed,
        .keyword_sizeof,
        .keyword_static,
        .nl,
        .keyword_struct,
        .keyword_switch,
        .keyword_typedef,
        .keyword_union,
        .keyword_unsigned,
        .keyword_void,
        .keyword_volatile,
        .nl,
        .keyword_while,
        .keyword_bool,
        .keyword_complex,
        .keyword_imaginary,
        .keyword_inline,
        .keyword_restrict,
        .keyword_alignas,
        .nl,
        .keyword_alignof,
        .keyword_atomic,
        .keyword_generic,
        .keyword_noreturn,
        .keyword_static_assert,
        .keyword_thread_local,
        .nl,
        .keyword_attribute1,
        .keyword_attribute2,
        .nl,
    };
    try expectTokens(input, &expected);
}
|
||
|
||
test "preprocessor keywords" {
    // Each directive line should produce `#`, the directive keyword and a newline.
    const input =
        \\#include
        \\#include_next
        \\#embed
        \\#define
        \\#ifdef
        \\#ifndef
        \\#error
        \\#pragma
        \\
    ;
    const expected = [_]Token.Id{
        .hash,
        .keyword_include,
        .nl,
        .hash,
        .keyword_include_next,
        .nl,
        .hash,
        .keyword_embed,
        .nl,
        .hash,
        .keyword_define,
        .nl,
        .hash,
        .keyword_ifdef,
        .nl,
        .hash,
        .keyword_ifndef,
        .nl,
        .hash,
        .keyword_error,
        .nl,
        .hash,
        .keyword_pragma,
        .nl,
    };
    try expectTokens(input, &expected);
}
|
||
|
||
test "line continuation" {
    // A backslash directly before a newline must not produce an `.nl` token,
    // neither between tokens nor inside a string literal.
    const input =
        \\#define foo \
        \\ bar
        \\"foo\
        \\ bar"
        \\#define "foo"
        \\ "bar"
        \\#define "foo" \
        \\ "bar"
    ;
    const expected = [_]Token.Id{
        .hash,
        .keyword_define,
        .identifier,
        .identifier,
        .nl,
        .string_literal,
        .nl,
        .hash,
        .keyword_define,
        .string_literal,
        .nl,
        .string_literal,
        .nl,
        .hash,
        .keyword_define,
        .string_literal,
        .string_literal,
    };
    try expectTokens(input, &expected);
}
|
||
|
||
test "string prefix" {
    // Every encoding prefix must select the matching string / character literal id.
    const input =
        \\"foo"
        \\u"foo"
        \\u8"foo"
        \\U"foo"
        \\L"foo"
        \\'foo'
        \\u8'A'
        \\u'foo'
        \\U'foo'
        \\L'foo'
        \\
    ;
    const expected = [_]Token.Id{
        .string_literal,
        .nl,
        .string_literal_utf_16,
        .nl,
        .string_literal_utf_8,
        .nl,
        .string_literal_utf_32,
        .nl,
        .string_literal_wide,
        .nl,
        .char_literal,
        .nl,
        .char_literal_utf_8,
        .nl,
        .char_literal_utf_16,
        .nl,
        .char_literal_utf_32,
        .nl,
        .char_literal_wide,
        .nl,
    };
    try expectTokens(input, &expected);
}
|
||
|
||
test "num suffixes" {
    // Every number, whatever its suffix, must come out as a single `.pp_num`.
    const input =
        \\ 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        \\ 0l 0lu 0ll 0llu 0
        \\ 1u 1ul 1ull 1
        \\ 1.0i 1.0I
        \\ 1.0if 1.0If 1.0fi 1.0fI
        \\ 1.0il 1.0Il 1.0li 1.0lI
        \\
    ;
    const expected = [_]Token.Id{
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
    };
    try expectTokens(input, &expected);
}
|
||
|
||
test "comments" {
    // The line comment itself yields no token; only the newline that ends it does.
    const input =
        \\//foo
        \\#foo
    ;
    const expected = [_]Token.Id{
        .nl,
        .hash,
        .identifier,
    };
    try expectTokens(input, &expected);
}
|
||
|
||
test "extended identifiers" {
    // Table of inputs containing non-ASCII bytes and the token ids they must produce.
    // Cases run in order and the first failure aborts the test.
    const Case = struct {
        input: []const u8,
        expected: []const Token.Id,
    };
    const cases = [_]Case{
        .{ .input = "𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        .{ .input = "u𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        .{ .input = "u8𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        .{ .input = "U𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        .{ .input = "L𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        .{ .input = "1™", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "1.™", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "..™", .expected = &.{ .period, .period, .extended_identifier } },
        .{ .input = "0™", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0b\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0b0\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "01\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "010\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0x\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0x0\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "\"\\0\u{E0000}\"", .expected = &.{.string_literal} },
        .{ .input = "\"\\x\u{E0000}\"", .expected = &.{.string_literal} },
        .{ .input = "\"\\u\u{E0000}\"", .expected = &.{ .invalid, .extended_identifier, .invalid } },
        .{ .input = "1e\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "1e1\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
    };
    for (cases) |case| try expectTokens(case.input, case.expected);
}
|
||
|
||
test "digraphs" {
    // Digraphs map to their punctuator ids outside string literals, stay untouched
    // inside them, and an incomplete `%:%` falls back to `#` followed by `%`.
    const Case = struct {
        input: []const u8,
        expected: []const Token.Id,
    };
    const cases = [_]Case{
        .{
            .input = "%:<::><%%>%:%:",
            .expected = &.{ .hash, .l_bracket, .r_bracket, .l_brace, .r_brace, .hash_hash },
        },
        .{
            .input = "\"%:<::><%%>%:%:\"",
            .expected = &.{.string_literal},
        },
        .{
            .input = "%:%42 %:%",
            .expected = &.{ .hash, .percent, .pp_num, .hash, .percent },
        },
    };
    for (cases) |case| try expectTokens(case.input, case.expected);
}
|
||
|
||
test "C23 keywords" {
    // These spellings are checked with the language standard set to `.c2x`.
    const input = "true false alignas alignof bool static_assert thread_local nullptr";
    const expected = [_]Token.Id{
        .keyword_true,
        .keyword_false,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_nullptr,
    };
    try expectTokensExtra(input, &expected, .c2x);
}
|
||
|
||
/// Tokenizes `contents` and checks that the non-whitespace tokens are exactly
/// `expected_tokens`, followed immediately by `.eof`.
///
/// `standard`, when non-null, overrides the language standard of the temporary
/// `Compilation` used for the run. On the first mismatch both tag names are
/// printed and `error.TokensDoNotEqual` is returned.
fn expectTokensExtra(contents: []const u8, expected_tokens: []const Token.Id, standard: ?LangOpts.Standard) !void {
    var comp = Compilation.init(std.testing.allocator);
    defer comp.deinit();
    if (standard) |provided| comp.langopts.standard = provided;

    const source = try comp.addSourceFromBuffer("path", contents);
    var tokenizer = Tokenizer{
        .buf = source.buf,
        .source = source.id,
        .comp = &comp,
    };

    for (expected_tokens) |expected_token_id| {
        // Whitespace tokens are never part of the expectation.
        var token = tokenizer.next();
        while (token.id == .whitespace) token = tokenizer.next();

        if (token.id != expected_token_id) {
            std.debug.print("expected {s}, found {s}\n", .{ @tagName(expected_token_id), @tagName(token.id) });
            return error.TokensDoNotEqual;
        }
    }

    // Nothing, not even whitespace, may follow the last expected token.
    const last_token = tokenizer.next();
    try std.testing.expect(last_token.id == .eof);
}
|
||
|
||
/// Same as `expectTokensExtra` without overriding the language standard.
fn expectTokens(contents: []const u8, expected_tokens: []const Token.Id) !void {
    try expectTokensExtra(contents, expected_tokens, null);
}
|