fix(perf): remove LUT

This makes it so that we no longer use a LUT (Look-Up Table):
* The code is much simpler and easier to understand now.
* Using a LUT means we rely on a warm cache.
  Relying on the cache like this results in inconsistent performance and in many cases codegen will be worse.
  Also as @topolarity once pointed out, in some cases while it seems like the code may branch, it actually doesn't:
  https://github.com/ziglang/zig/pull/11629#issuecomment-1213641429
* Other languages' standard libraries don't do this either.
  JFF I wanted to see what other languages codegen compared to us now:
  https://rust.godbolt.org/z/Te4ax9Edf, https://zig.godbolt.org/z/nTbYedWKv
  So we are pretty much on par or better than other languages now.
This commit is contained in:
r00ster91 2022-10-16 17:44:31 +02:00
parent 626e02a429
commit 19dbc5805c

View File

@ -12,7 +12,7 @@ const std = @import("std");
/// The C0 control codes of the ASCII encoding.
///
/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`.
/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`
pub const control_code = struct {
/// Null.
pub const nul = 0x00;
@ -88,188 +88,63 @@ pub const control_code = struct {
pub const xoff = dc3;
};
const tIndex = enum(u3) {
Alpha,
Hex,
Space,
Digit,
Lower,
Upper,
// Ctrl, < 0x20 || == DEL
// Print, = Graph || == ' '. NOT '\t' et cetera
Punct,
Graph,
//ASCII, | ~0b01111111
//isBlank, == ' ' || == '\x09'
};
const combinedTable = init: {
comptime var table: [256]u8 = undefined;
const mem = std.mem;
const alpha = [_]u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
};
const lower = [_]u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
};
const upper = [_]u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const digit = [_]u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const hex = [_]u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const space = [_]u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const punct = [_]u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
};
const graph = [_]u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
};
comptime var i = 0;
inline while (i < 128) : (i += 1) {
table[i] =
@as(u8, alpha[i]) << @enumToInt(tIndex.Alpha) |
@as(u8, hex[i]) << @enumToInt(tIndex.Hex) |
@as(u8, space[i]) << @enumToInt(tIndex.Space) |
@as(u8, digit[i]) << @enumToInt(tIndex.Digit) |
@as(u8, lower[i]) << @enumToInt(tIndex.Lower) |
@as(u8, upper[i]) << @enumToInt(tIndex.Upper) |
@as(u8, punct[i]) << @enumToInt(tIndex.Punct) |
@as(u8, graph[i]) << @enumToInt(tIndex.Graph);
}
mem.set(u8, table[128..256], 0);
break :init table;
};
fn inTable(c: u8, t: tIndex) bool {
return (combinedTable[c] & (@as(u8, 1) << @enumToInt(t))) != 0;
}
/// Returns whether the character is alphanumeric.
/// Returns whether the character is alphanumeric: A-Z, a-z, or 0-9.
pub fn isAlphanumeric(c: u8) bool {
return (combinedTable[c] & ((@as(u8, 1) << @enumToInt(tIndex.Alpha)) |
@as(u8, 1) << @enumToInt(tIndex.Digit))) != 0;
return switch (c) {
'A'...'Z', 'a'...'z', '0'...'9' => true,
else => false,
};
}
/// Returns whether the character is alphabetic.
/// Returns whether the character is alphabetic: A-Z or a-z.
pub fn isAlphabetic(c: u8) bool {
return inTable(c, tIndex.Alpha);
return switch (c) {
'A'...'Z', 'a'...'z' => true,
else => false,
};
}
/// Returns whether the character is a control character.
/// This is the same as `!isPrint(c)`.
///
/// See also: `control_code`.
/// See also: `control_code`
pub fn isControl(c: u8) bool {
return c <= control_code.us or c == control_code.del;
}
/// Returns whether the character is a digit.
pub fn isDigit(c: u8) bool {
return inTable(c, tIndex.Digit);
return switch (c) {
'0'...'9' => true,
else => false,
};
}
/// Returns whether the character is a lowercased letter.
/// Returns whether the character is a lowercase letter.
pub fn isLower(c: u8) bool {
return inTable(c, tIndex.Lower);
return switch (c) {
'a'...'z' => true,
else => false,
};
}
/// Returns whether the character is printable and has some graphical representation.
/// This also returns `true` for the space character.
/// This is the same as `!isControl(c)`.
/// Returns whether the character is printable and has some graphical representation,
/// including the space character.
pub fn isPrint(c: u8) bool {
return inTable(c, tIndex.Graph) or c == ' ';
return isASCII(c) and !isControl(c);
}
/// Returns whether this character is included in `whitespace`.
pub fn isWhitespace(c: u8) bool {
return inTable(c, tIndex.Space);
return for (whitespace) |other| {
if (c == other)
break true;
} else false;
}
/// Whitespace for general use.
/// This may be used with e.g. `std.mem.trim` to trim whitespace.
///
/// See also: `isWhitespace`.
/// See also: `isWhitespace`
pub const whitespace = [_]u8{ ' ', '\t', '\n', '\r', control_code.vt, control_code.ff };
test "whitespace" {
@ -281,14 +156,20 @@ test "whitespace" {
}
}
/// Returns whether the character is an uppercased letter.
/// Returns whether the character is an uppercase letter.
pub fn isUpper(c: u8) bool {
return inTable(c, tIndex.Upper);
return switch (c) {
'A'...'Z' => true,
else => false,
};
}
/// Returns whether the character is a hexadecimal digit. Case-insensitive.
/// Returns whether the character is a hexadecimal digit: A-F, a-f, or 0-9.
pub fn isHex(c: u8) bool {
return inTable(c, tIndex.Hex);
return switch (c) {
'A'...'F', 'a'...'f', '0'...'9' => true,
else => false,
};
}
/// Returns whether the character is a 7-bit ASCII character.
@ -322,6 +203,8 @@ test "ASCII character classes" {
try testing.expect(isControl(control_code.nul));
try testing.expect(isControl(control_code.ff));
try testing.expect(isControl(control_code.us));
try testing.expect(!isControl(0x80));
try testing.expect(!isControl(0xff));
try testing.expect('C' == toUpper('c'));
try testing.expect(':' == toUpper(':'));
@ -351,6 +234,7 @@ test "ASCII character classes" {
try testing.expect(!isHex('g'));
try testing.expect(isHex('b'));
try testing.expect(isHex('F'));
try testing.expect(isHex('9'));
try testing.expect(!isDigit('~'));
@ -361,6 +245,8 @@ test "ASCII character classes" {
try testing.expect(isPrint('@'));
try testing.expect(isPrint('~'));
try testing.expect(!isPrint(control_code.esc));
try testing.expect(!isPrint(0x80));
try testing.expect(!isPrint(0xff));
}
/// Writes a lower case copy of `ascii_string` to `output`.