From 67bd45f0cf1b452cf8de5a016bc6ff2f85393d70 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 4 Feb 2019 15:24:06 -0500 Subject: [PATCH] adjustments to std.mem split / separate * rename std.mem.split to std.mem.tokenize * add future deprecation notice to docs * (unrelated) add note to std.os.path.resolve docs * std.mem.separate - assert delimiter.len not zero * fix implementation of std.mem.separate to respect the delimiter * separate the two iterators to different structs --- build.zig | 16 ++-- src-self-hosted/libc_installation.zig | 8 +- src-self-hosted/main.zig | 2 +- std/build.zig | 6 +- std/mem.zig | 129 ++++++++++++++------------ std/os/child_process.zig | 2 +- std/os/index.zig | 2 +- std/os/path.zig | 44 ++++----- 8 files changed, 112 insertions(+), 97 deletions(-) diff --git a/build.zig b/build.zig index d99165a6de..a41a5f808b 100644 --- a/build.zig +++ b/build.zig @@ -189,14 +189,14 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep { const prefix_output = try b.exec([][]const u8{ llvm_config_exe, "--prefix" }); var result = LibraryDep{ - .prefix = mem.split(prefix_output, " \r\n").next().?, + .prefix = mem.tokenize(prefix_output, " \r\n").next().?, .libs = ArrayList([]const u8).init(b.allocator), .system_libs = ArrayList([]const u8).init(b.allocator), .includes = ArrayList([]const u8).init(b.allocator), .libdirs = ArrayList([]const u8).init(b.allocator), }; { - var it = mem.split(libs_output, " \r\n"); + var it = mem.tokenize(libs_output, " \r\n"); while (it.next()) |lib_arg| { if (mem.startsWith(u8, lib_arg, "-l")) { try result.system_libs.append(lib_arg[2..]); @@ -210,7 +210,7 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep { } } { - var it = mem.split(includes_output, " \r\n"); + var it = mem.tokenize(includes_output, " \r\n"); while (it.next()) |include_arg| { if (mem.startsWith(u8, include_arg, "-I")) { try result.includes.append(include_arg[2..]); @@ -220,7 +220,7 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep { } } { - var it = mem.split(libdir_output, " \r\n"); + var it = mem.tokenize(libdir_output, " \r\n"); while (it.next()) |libdir| { if (mem.startsWith(u8, libdir, "-L")) { try result.libdirs.append(libdir[2..]); @@ -233,7 +233,7 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep { } pub fn installStdLib(b: *Builder, stdlib_files: []const u8) void { - var it = mem.split(stdlib_files, ";"); + var it = mem.tokenize(stdlib_files, ";"); while (it.next()) |stdlib_file| { const src_path = os.path.join(b.allocator, "std", stdlib_file) catch unreachable; const dest_path = os.path.join(b.allocator, "lib", "zig", "std", stdlib_file) catch unreachable; @@ -242,7 +242,7 @@ pub fn installStdLib(b: *Builder, stdlib_files: []const u8) void { } pub fn installCHeaders(b: *Builder, c_header_files: []const u8) void { - var it = mem.split(c_header_files, ";"); + var it = mem.tokenize(c_header_files, ";"); while (it.next()) |c_header_file| { const src_path = os.path.join(b.allocator, "c_headers", c_header_file) catch unreachable; const dest_path = os.path.join(b.allocator, "lib", "zig", "include", c_header_file) catch unreachable; @@ -277,7 +277,7 @@ fn configureStage2(b: *Builder, exe: var, ctx: Context) !void { addCppLib(b, exe, ctx.cmake_binary_dir, "zig_cpp"); if (ctx.lld_include_dir.len != 0) { exe.addIncludeDir(ctx.lld_include_dir); - var it = mem.split(ctx.lld_libraries, ";"); + var it = mem.tokenize(ctx.lld_libraries, ";"); while (it.next()) |lib| { exe.addObjectFile(lib); } @@ -334,7 +334,7 @@ fn addCxxKnownPath( ctx.cxx_compiler, b.fmt("-print-file-name={}", objname), }); - const path_unpadded = mem.split(path_padded, "\r\n").next().?; + const path_unpadded = mem.tokenize(path_padded, "\r\n").next().?; if (mem.eql(u8, path_unpadded, objname)) { if (errtxt) |msg| { warn("{}", msg); diff --git a/src-self-hosted/libc_installation.zig b/src-self-hosted/libc_installation.zig index 1c5d111c5a..18d2daf0c2 100644 --- a/src-self-hosted/libc_installation.zig +++ b/src-self-hosted/libc_installation.zig @@ -57,10 +57,10 @@ pub const LibCInstallation = struct { const contents = try std.io.readFileAlloc(allocator, libc_file); defer allocator.free(contents); - var it = std.mem.split(contents, "\n"); + var it = std.mem.tokenize(contents, "\n"); while (it.next()) |line| { if (line.len == 0 or line[0] == '#') continue; - var line_it = std.mem.split(line, "="); + var line_it = std.mem.separate(line, "="); const name = line_it.next() orelse { try stderr.print("missing equal sign after field name\n"); return error.ParseError; @@ -213,7 +213,7 @@ pub const LibCInstallation = struct { }, } - var it = std.mem.split(exec_result.stderr, "\n\r"); + var it = std.mem.tokenize(exec_result.stderr, "\n\r"); var search_paths = std.ArrayList([]const u8).init(loop.allocator); defer search_paths.deinit(); while (it.next()) |line| { @@ -410,7 +410,7 @@ async fn ccPrintFileName(loop: *event.Loop, o_file: []const u8, want_dirname: bo return error.CCompilerCrashed; }, } - var it = std.mem.split(exec_result.stdout, "\n\r"); + var it = std.mem.tokenize(exec_result.stdout, "\n\r"); const line = it.next() orelse return error.LibCRuntimeNotFound; const dirname = std.os.path.dirname(line) orelse return error.LibCRuntimeNotFound; diff --git a/src-self-hosted/main.zig b/src-self-hosted/main.zig index 1403ab860d..f6ee9a0513 100644 --- a/src-self-hosted/main.zig +++ b/src-self-hosted/main.zig @@ -351,7 +351,7 @@ fn buildOutputType(allocator: *Allocator, args: []const []const u8, out_type: Co const root_name = if (provided_name) |n| n else blk: { if (root_source_file) |file| { const basename = os.path.basename(file); - var it = mem.split(basename, "."); + var it = mem.separate(basename, "."); break :blk it.next() orelse basename; } else { try stderr.write("--name [name] not provided and unable to infer\n"); diff --git a/std/build.zig b/std/build.zig index 6f58594190..5246d97339 100644 --- a/std/build.zig +++ b/std/build.zig @@ -324,7 +324,7 @@ pub const Builder = struct { fn processNixOSEnvVars(self: *Builder) void { if (os.getEnvVarOwned(self.allocator, "NIX_CFLAGS_COMPILE")) |nix_cflags_compile| { - var it = mem.split(nix_cflags_compile, " "); + var it = mem.tokenize(nix_cflags_compile, " "); while (true) { const word = it.next() orelse break; if (mem.eql(u8, word, "-isystem")) { @@ -342,7 +342,7 @@ pub const Builder = struct { assert(err == error.EnvironmentVariableNotFound); } if (os.getEnvVarOwned(self.allocator, "NIX_LDFLAGS")) |nix_ldflags| { - var it = mem.split(nix_ldflags, " "); + var it = mem.tokenize(nix_ldflags, " "); while (true) { const word = it.next() orelse break; if (mem.eql(u8, word, "-rpath")) { @@ -689,7 +689,7 @@ pub const Builder = struct { if (os.path.isAbsolute(name)) { return name; } - var it = mem.split(PATH, []u8{os.path.delimiter}); + var it = mem.tokenize(PATH, []u8{os.path.delimiter}); while (it.next()) |path| { const full_path = try os.path.join(self.allocator, path, self.fmt("{}{}", name, exe_extension)); if (os.path.real(self.allocator, full_path)) |real_path| { diff --git a/std/mem.zig b/std/mem.zig index bec3816d88..26ae4ef089 100644 --- a/std/mem.zig +++ b/std/mem.zig @@ -689,58 +689,57 @@ pub fn eql_slice_u8(a: []const u8, b: []const u8) bool { } /// Returns an iterator that iterates over the slices of `buffer` that are not -/// any of the bytes in `split_bytes`. -/// split(" abc def ghi ", " ") +/// any of the bytes in `delimiter_bytes`. +/// tokenize(" abc def ghi ", " ") /// Will return slices for "abc", "def", "ghi", null, in that order. -/// If `split_bytes` does not exist in buffer, +/// If `buffer` is empty, the iterator will return null. +/// If `delimiter_bytes` does not exist in buffer, /// the iterator will return `buffer`, null, in that order. -pub fn split(buffer: []const u8, split_bytes: []const u8) SplitIterator { - return SplitIterator{ +/// See also the related function `separate`. +pub fn tokenize(buffer: []const u8, delimiter_bytes: []const u8) TokenIterator { + return TokenIterator{ .index = 0, .buffer = buffer, - .split_bytes = split_bytes, - .glob = true, - .spun = false, + .delimiter_bytes = delimiter_bytes, }; } -test "mem.split" { - var it = split(" abc def ghi ", " "); +test "mem.tokenize" { + var it = tokenize(" abc def ghi ", " "); assert(eql(u8, it.next().?, "abc")); assert(eql(u8, it.next().?, "def")); assert(eql(u8, it.next().?, "ghi")); assert(it.next() == null); - it = split("..\\bob", "\\"); + it = tokenize("..\\bob", "\\"); assert(eql(u8, it.next().?, "..")); assert(eql(u8, "..", "..\\bob"[0..it.index])); assert(eql(u8, it.next().?, "bob")); assert(it.next() == null); - it = split("//a/b", "/"); + it = tokenize("//a/b", "/"); assert(eql(u8, it.next().?, "a")); assert(eql(u8, it.next().?, "b")); assert(eql(u8, "//a/b", "//a/b"[0..it.index])); assert(it.next() == null); - it = split("|", "|"); + it = tokenize("|", "|"); assert(it.next() == null); - it = split("", "|"); - assert(eql(u8, it.next().?, "")); + it = tokenize("", "|"); assert(it.next() == null); - it = split("hello", ""); + it = tokenize("hello", ""); assert(eql(u8, it.next().?, "hello")); assert(it.next() == null); - it = split("hello", " "); + it = tokenize("hello", " "); assert(eql(u8, it.next().?, "hello")); assert(it.next() == null); } -test "mem.split (multibyte)" { - var it = split("a|b,c/d e", " /,|"); +test "mem.tokenize (multibyte)" { + var it = tokenize("a|b,c/d e", " /,|"); assert(eql(u8, it.next().?, "a")); assert(eql(u8, it.next().?, "b")); assert(eql(u8, it.next().?, "c")); @@ -750,18 +749,21 @@ test "mem.split (multibyte)" { } /// Returns an iterator that iterates over the slices of `buffer` that -/// seperates by bytes in `delimiter`. +/// are separated by bytes in `delimiter`. /// separate("abc|def||ghi", "|") -/// Will return slices for "abc", "def", "", "ghi", null, in that order. +/// will return slices for "abc", "def", "", "ghi", null, in that order. /// If `delimiter` does not exist in buffer, /// the iterator will return `buffer`, null, in that order. +/// The delimiter length must not be zero. +/// See also the related function `tokenize`. +/// It is planned to rename this function to `split` before 1.0.0, like this: +/// pub fn split(buffer: []const u8, delimiter: []const u8) SplitIterator { pub fn separate(buffer: []const u8, delimiter: []const u8) SplitIterator { + assert(delimiter.len != 0); return SplitIterator{ .index = 0, .buffer = buffer, - .split_bytes = delimiter, - .glob = false, - .spun = false, + .delimiter = delimiter, }; } @@ -782,19 +784,15 @@ test "mem.separate" { assert(eql(u8, it.next().?, "")); assert(it.next() == null); - it = separate("hello", ""); - assert(eql(u8, it.next().?, "hello")); - assert(it.next() == null); - it = separate("hello", " "); assert(eql(u8, it.next().?, "hello")); assert(it.next() == null); } test "mem.separate (multibyte)" { - var it = separate("a|b,c/d e", " /,|"); + var it = separate("a, b ,, c, d, e", ", "); assert(eql(u8, it.next().?, "a")); - assert(eql(u8, it.next().?, "b")); + assert(eql(u8, it.next().?, "b ,")); assert(eql(u8, it.next().?, "c")); assert(eql(u8, it.next().?, "d")); assert(eql(u8, it.next().?, "e")); @@ -819,49 +817,38 @@ test "mem.endsWith" { assert(!endsWith(u8, "Bob", "Bo")); } -pub const SplitIterator = struct { +pub const TokenIterator = struct { buffer: []const u8, - split_bytes: []const u8, + delimiter_bytes: []const u8, index: usize, - glob: bool, - spun: bool, - /// Iterates and returns null or optionally a slice the next split segment - pub fn next(self: *SplitIterator) ?[]const u8 { - if (self.spun) { - if (self.index + 1 > self.buffer.len) return null; - self.index += 1; + /// Returns a slice of the next token, or null if tokenization is complete. + pub fn next(self: *TokenIterator) ?[]const u8 { + // move to beginning of token + while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {} + const start = self.index; + if (start == self.buffer.len) { + return null; } - self.spun = true; + // move to end of token + while (self.index < self.buffer.len and !self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {} + const end = self.index; - if (self.glob) { - while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {} - } - - var cursor = self.index; - while (cursor < self.buffer.len and !self.isSplitByte(self.buffer[cursor])) : (cursor += 1) {} - - defer self.index = cursor; - - if (cursor == self.buffer.len) { - return if (self.glob and self.index == cursor and self.index > 0) null else self.buffer[self.index..]; - } - - return self.buffer[self.index..cursor]; + return self.buffer[start..end]; } /// Returns a slice of the remaining bytes. Does not affect iterator state. - pub fn rest(self: *const SplitIterator) []const u8 { + pub fn rest(self: TokenIterator) []const u8 { // move to beginning of token var index: usize = self.index; while (index < self.buffer.len and self.isSplitByte(self.buffer[index])) : (index += 1) {} return self.buffer[index..]; } - fn isSplitByte(self: *const SplitIterator, byte: u8) bool { - for (self.split_bytes) |split_byte| { - if (byte == split_byte) { + fn isSplitByte(self: TokenIterator, byte: u8) bool { + for (self.delimiter_bytes) |delimiter_byte| { + if (byte == delimiter_byte) { return true; } } @@ -869,6 +856,32 @@ pub const SplitIterator = struct { } }; +pub const SplitIterator = struct { + buffer: []const u8, + index: ?usize, + delimiter: []const u8, + + /// Returns a slice of the next field, or null if splitting is complete. + pub fn next(self: *SplitIterator) ?[]const u8 { + const start = self.index orelse return null; + const end = if (indexOfPos(u8, self.buffer, start, self.delimiter)) |delim_start| blk: { + self.index = delim_start + self.delimiter.len; + break :blk delim_start; + } else blk: { + self.index = null; + break :blk self.buffer.len; + }; + return self.buffer[start..end]; + } + + /// Returns a slice of the remaining bytes. Does not affect iterator state. + pub fn rest(self: SplitIterator) []const u8 { + const end = self.buffer.len; + const start = self.index orelse end; + return self.buffer[start..end]; + } +}; + /// Naively combines a series of strings with a separator. /// Allocates memory for the result, which must be freed by the caller. pub fn join(allocator: *Allocator, sep: u8, strings: ...) ![]u8 { diff --git a/std/os/child_process.zig b/std/os/child_process.zig index 9f33bee905..7aa8582369 100644 --- a/std/os/child_process.zig +++ b/std/os/child_process.zig @@ -595,7 +595,7 @@ pub const ChildProcess = struct { const PATH = try os.getEnvVarOwned(self.allocator, "PATH"); defer self.allocator.free(PATH); - var it = mem.split(PATH, ";"); + var it = mem.tokenize(PATH, ";"); while (it.next()) |search_path| { const joined_path = try os.path.join(self.allocator, search_path, app_name); defer self.allocator.free(joined_path); diff --git a/std/os/index.zig b/std/os/index.zig index 0d0e07bfa3..451c0a3436 100644 --- a/std/os/index.zig +++ b/std/os/index.zig @@ -608,7 +608,7 @@ pub fn posixExecve(argv: []const []const u8, env_map: *const BufMap, allocator: // +1 for the null terminating byte const path_buf = try allocator.alloc(u8, PATH.len + exe_path.len + 2); defer allocator.free(path_buf); - var it = mem.split(PATH, ":"); + var it = mem.tokenize(PATH, ":"); var seen_eacces = false; var err: usize = undefined; while (it.next()) |search_path| { diff --git a/std/os/path.zig b/std/os/path.zig index 4d3d3d6a8b..0b960fa2da 100644 --- a/std/os/path.zig +++ b/std/os/path.zig @@ -184,7 +184,7 @@ pub fn windowsParsePath(path: []const u8) WindowsPath { return relative_path; } - var it = mem.split(path, []u8{this_sep}); + var it = mem.tokenize(path, []u8{this_sep}); _ = (it.next() orelse return relative_path); _ = (it.next() orelse return relative_path); return WindowsPath{ @@ -202,7 +202,7 @@ pub fn windowsParsePath(path: []const u8) WindowsPath { return relative_path; } - var it = mem.split(path, []u8{this_sep}); + var it = mem.tokenize(path, []u8{this_sep}); _ = (it.next() orelse return relative_path); _ = (it.next() orelse return relative_path); return WindowsPath{ @@ -264,8 +264,8 @@ fn networkShareServersEql(ns1: []const u8, ns2: []const u8) bool { const sep1 = ns1[0]; const sep2 = ns2[0]; - var it1 = mem.split(ns1, []u8{sep1}); - var it2 = mem.split(ns2, []u8{sep2}); + var it1 = mem.tokenize(ns1, []u8{sep1}); + var it2 = mem.tokenize(ns2, []u8{sep2}); // TODO ASCII is wrong, we actually need full unicode support to compare paths. return asciiEqlIgnoreCase(it1.next().?, it2.next().?); @@ -285,8 +285,8 @@ fn compareDiskDesignators(kind: WindowsPath.Kind, p1: []const u8, p2: []const u8 const sep1 = p1[0]; const sep2 = p2[0]; - var it1 = mem.split(p1, []u8{sep1}); - var it2 = mem.split(p2, []u8{sep2}); + var it1 = mem.tokenize(p1, []u8{sep1}); + var it2 = mem.tokenize(p2, []u8{sep2}); // TODO ASCII is wrong, we actually need full unicode support to compare paths. return asciiEqlIgnoreCase(it1.next().?, it2.next().?) and asciiEqlIgnoreCase(it1.next().?, it2.next().?); @@ -337,6 +337,8 @@ pub fn resolveSlice(allocator: *Allocator, paths: []const []const u8) ![]u8 { /// If all paths are relative it uses the current working directory as a starting point. /// Each drive has its own current working directory. /// Path separators are canonicalized to '\\' and drives are canonicalized to capital letters. +/// Note: all usage of this function should be audited due to the existence of symlinks. +/// Without performing actual syscalls, resolving `..` could be incorrect. pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 { if (paths.len == 0) { assert(is_windows); // resolveWindows called on non windows can't use getCwd @@ -416,7 +418,7 @@ pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 { }, WindowsPath.Kind.NetworkShare => { result = try allocator.alloc(u8, max_size); - var it = mem.split(paths[first_index], "/\\"); + var it = mem.tokenize(paths[first_index], "/\\"); const server_name = it.next().?; const other_name = it.next().?; @@ -483,7 +485,7 @@ pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 { if (!correct_disk_designator) { continue; } - var it = mem.split(p[parsed.disk_designator.len..], "/\\"); + var it = mem.tokenize(p[parsed.disk_designator.len..], "/\\"); while (it.next()) |component| { if (mem.eql(u8, component, ".")) { continue; @@ -516,6 +518,8 @@ pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 { /// It resolves "." and "..". /// The result does not have a trailing path separator. /// If all paths are relative it uses the current working directory as a starting point. +/// Note: all usage of this function should be audited due to the existence of symlinks. +/// Without performing actual syscalls, resolving `..` could be incorrect. pub fn resolvePosix(allocator: *Allocator, paths: []const []const u8) ![]u8 { if (paths.len == 0) { assert(!is_windows); // resolvePosix called on windows can't use getCwd @@ -550,7 +554,7 @@ pub fn resolvePosix(allocator: *Allocator, paths: []const []const u8) ![]u8 { errdefer allocator.free(result); for (paths[first_index..]) |p, i| { - var it = mem.split(p, "/"); + var it = mem.tokenize(p, "/"); while (it.next()) |component| { if (mem.eql(u8, component, ".")) { continue; @@ -937,8 +941,8 @@ pub fn relativeWindows(allocator: *Allocator, from: []const u8, to: []const u8) return resolved_to; } - var from_it = mem.split(resolved_from, "/\\"); - var to_it = mem.split(resolved_to, "/\\"); + var from_it = mem.tokenize(resolved_from, "/\\"); + var to_it = mem.tokenize(resolved_to, "/\\"); while (true) { const from_component = from_it.next() orelse return mem.dupe(allocator, u8, to_it.rest()); const to_rest = to_it.rest(); @@ -967,14 +971,12 @@ pub fn relativeWindows(allocator: *Allocator, from: []const u8, to: []const u8) // shave off the trailing slash result_index -= 1; - if (to_rest.len > 0) { - var rest_it = mem.split(to_rest, "/\\"); - while (rest_it.next()) |to_component| { - result[result_index] = '\\'; - result_index += 1; - mem.copy(u8, result[result_index..], to_component); - result_index += to_component.len; - } + var rest_it = mem.tokenize(to_rest, "/\\"); + while (rest_it.next()) |to_component| { + result[result_index] = '\\'; + result_index += 1; + mem.copy(u8, result[result_index..], to_component); + result_index += to_component.len; } return result[0..result_index]; @@ -990,8 +992,8 @@ pub fn relativePosix(allocator: *Allocator, from: []const u8, to: []const u8) ![ const resolved_to = try resolvePosix(allocator, [][]const u8{to}); defer allocator.free(resolved_to); - var from_it = mem.split(resolved_from, "/"); - var to_it = mem.split(resolved_to, "/"); + var from_it = mem.tokenize(resolved_from, "/"); + var to_it = mem.tokenize(resolved_to, "/"); while (true) { const from_component = from_it.next() orelse return mem.dupe(allocator, u8, to_it.rest()); const to_rest = to_it.rest();