adjustments to std.mem split / separate

* rename std.mem.split to std.mem.tokenize
* add future deprecation notice to docs
* (unrelated) add note to std.os.path.resolve docs
* std.mem.separate - assert delimiter.len not zero
* fix implementation of std.mem.separate to respect the delimiter
* separate the two iterators to different structs
2025-01-08 11:12:14 +00:00 · 2019-02-04 15:24:06 -05:00 · 2019-02-04 15:24:06 -05:00 · 67bd45f0cf
commit 67bd45f0cf
parent f44ce7836a
8 changed files with 112 additions and 97 deletions
--- a/build.zig
+++ b/build.zig
@ -189,14 +189,14 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep {
    const prefix_output = try b.exec([][]const u8{ llvm_config_exe, "--prefix" });

    var result = LibraryDep{
-        .prefix = mem.split(prefix_output, " \r\n").next().?,
+        .prefix = mem.tokenize(prefix_output, " \r\n").next().?,
        .libs = ArrayList([]const u8).init(b.allocator),
        .system_libs = ArrayList([]const u8).init(b.allocator),
        .includes = ArrayList([]const u8).init(b.allocator),
        .libdirs = ArrayList([]const u8).init(b.allocator),
    };
    {
-        var it = mem.split(libs_output, " \r\n");
+        var it = mem.tokenize(libs_output, " \r\n");
        while (it.next()) |lib_arg| {
            if (mem.startsWith(u8, lib_arg, "-l")) {
                try result.system_libs.append(lib_arg[2..]);
@ -210,7 +210,7 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep {
        }
    }
    {
-        var it = mem.split(includes_output, " \r\n");
+        var it = mem.tokenize(includes_output, " \r\n");
        while (it.next()) |include_arg| {
            if (mem.startsWith(u8, include_arg, "-I")) {
                try result.includes.append(include_arg[2..]);
@ -220,7 +220,7 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep {
        }
    }
    {
-        var it = mem.split(libdir_output, " \r\n");
+        var it = mem.tokenize(libdir_output, " \r\n");
        while (it.next()) |libdir| {
            if (mem.startsWith(u8, libdir, "-L")) {
                try result.libdirs.append(libdir[2..]);
@ -233,7 +233,7 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep {
 }

 pub fn installStdLib(b: *Builder, stdlib_files: []const u8) void {
-    var it = mem.split(stdlib_files, ";");
+    var it = mem.tokenize(stdlib_files, ";");
    while (it.next()) |stdlib_file| {
        const src_path = os.path.join(b.allocator, "std", stdlib_file) catch unreachable;
        const dest_path = os.path.join(b.allocator, "lib", "zig", "std", stdlib_file) catch unreachable;
@ -242,7 +242,7 @@ pub fn installStdLib(b: *Builder, stdlib_files: []const u8) void {
 }

 pub fn installCHeaders(b: *Builder, c_header_files: []const u8) void {
-    var it = mem.split(c_header_files, ";");
+    var it = mem.tokenize(c_header_files, ";");
    while (it.next()) |c_header_file| {
        const src_path = os.path.join(b.allocator, "c_headers", c_header_file) catch unreachable;
        const dest_path = os.path.join(b.allocator, "lib", "zig", "include", c_header_file) catch unreachable;
@ -277,7 +277,7 @@ fn configureStage2(b: *Builder, exe: var, ctx: Context) !void {
    addCppLib(b, exe, ctx.cmake_binary_dir, "zig_cpp");
    if (ctx.lld_include_dir.len != 0) {
        exe.addIncludeDir(ctx.lld_include_dir);
-        var it = mem.split(ctx.lld_libraries, ";");
+        var it = mem.tokenize(ctx.lld_libraries, ";");
        while (it.next()) |lib| {
            exe.addObjectFile(lib);
        }
@ -334,7 +334,7 @@ fn addCxxKnownPath(
        ctx.cxx_compiler,
        b.fmt("-print-file-name={}", objname),
    });
-    const path_unpadded = mem.split(path_padded, "\r\n").next().?;
+    const path_unpadded = mem.tokenize(path_padded, "\r\n").next().?;
    if (mem.eql(u8, path_unpadded, objname)) {
        if (errtxt) |msg| {
            warn("{}", msg);
--- a/src-self-hosted/libc_installation.zig
+++ b/src-self-hosted/libc_installation.zig
@ -57,10 +57,10 @@ pub const LibCInstallation = struct {
        const contents = try std.io.readFileAlloc(allocator, libc_file);
        defer allocator.free(contents);

-        var it = std.mem.split(contents, "\n");
+        var it = std.mem.tokenize(contents, "\n");
        while (it.next()) |line| {
            if (line.len == 0 or line[0] == '#') continue;
-            var line_it = std.mem.split(line, "=");
+            var line_it = std.mem.separate(line, "=");
            const name = line_it.next() orelse {
                try stderr.print("missing equal sign after field name\n");
                return error.ParseError;
@ -213,7 +213,7 @@ pub const LibCInstallation = struct {
            },
        }

-        var it = std.mem.split(exec_result.stderr, "\n\r");
+        var it = std.mem.tokenize(exec_result.stderr, "\n\r");
        var search_paths = std.ArrayList([]const u8).init(loop.allocator);
        defer search_paths.deinit();
        while (it.next()) |line| {
@ -410,7 +410,7 @@ async fn ccPrintFileName(loop: *event.Loop, o_file: []const u8, want_dirname: bo
            return error.CCompilerCrashed;
        },
    }
-    var it = std.mem.split(exec_result.stdout, "\n\r");
+    var it = std.mem.tokenize(exec_result.stdout, "\n\r");
    const line = it.next() orelse return error.LibCRuntimeNotFound;
    const dirname = std.os.path.dirname(line) orelse return error.LibCRuntimeNotFound;

--- a/src-self-hosted/main.zig
+++ b/src-self-hosted/main.zig
@ -351,7 +351,7 @@ fn buildOutputType(allocator: *Allocator, args: []const []const u8, out_type: Co
    const root_name = if (provided_name) |n| n else blk: {
        if (root_source_file) |file| {
            const basename = os.path.basename(file);
-            var it = mem.split(basename, ".");
+            var it = mem.separate(basename, ".");
            break :blk it.next() orelse basename;
        } else {
            try stderr.write("--name [name] not provided and unable to infer\n");
--- a/std/build.zig
+++ b/std/build.zig
@ -324,7 +324,7 @@ pub const Builder = struct {

    fn processNixOSEnvVars(self: *Builder) void {
        if (os.getEnvVarOwned(self.allocator, "NIX_CFLAGS_COMPILE")) |nix_cflags_compile| {
-            var it = mem.split(nix_cflags_compile, " ");
+            var it = mem.tokenize(nix_cflags_compile, " ");
            while (true) {
                const word = it.next() orelse break;
                if (mem.eql(u8, word, "-isystem")) {
@ -342,7 +342,7 @@ pub const Builder = struct {
            assert(err == error.EnvironmentVariableNotFound);
        }
        if (os.getEnvVarOwned(self.allocator, "NIX_LDFLAGS")) |nix_ldflags| {
-            var it = mem.split(nix_ldflags, " ");
+            var it = mem.tokenize(nix_ldflags, " ");
            while (true) {
                const word = it.next() orelse break;
                if (mem.eql(u8, word, "-rpath")) {
@ -689,7 +689,7 @@ pub const Builder = struct {
                if (os.path.isAbsolute(name)) {
                    return name;
                }
-                var it = mem.split(PATH, []u8{os.path.delimiter});
+                var it = mem.tokenize(PATH, []u8{os.path.delimiter});
                while (it.next()) |path| {
                    const full_path = try os.path.join(self.allocator, path, self.fmt("{}{}", name, exe_extension));
                    if (os.path.real(self.allocator, full_path)) |real_path| {
--- a/std/mem.zig
+++ b/std/mem.zig
@ -689,58 +689,57 @@ pub fn eql_slice_u8(a: []const u8, b: []const u8) bool {
 }

 /// Returns an iterator that iterates over the slices of `buffer` that are not
-/// any of the bytes in `split_bytes`.
-/// split("   abc def    ghi  ", " ")
+/// any of the bytes in `delimiter_bytes`.
+/// tokenize("   abc def    ghi  ", " ")
 /// Will return slices for "abc", "def", "ghi", null, in that order.
-/// If `split_bytes` does not exist in buffer,
+/// If `buffer` is empty, the iterator will return null.
+/// If `delimiter_bytes` does not exist in buffer,
 /// the iterator will return `buffer`, null, in that order.
-pub fn split(buffer: []const u8, split_bytes: []const u8) SplitIterator {
-    return SplitIterator{
+/// See also the related function `separate`.
+pub fn tokenize(buffer: []const u8, delimiter_bytes: []const u8) TokenIterator {
+    return TokenIterator{
        .index = 0,
        .buffer = buffer,
-        .split_bytes = split_bytes,
-        .glob = true,
-        .spun = false,
+        .delimiter_bytes = delimiter_bytes,
    };
 }

-test "mem.split" {
-    var it = split("   abc def   ghi  ", " ");
+test "mem.tokenize" {
+    var it = tokenize("   abc def   ghi  ", " ");
    assert(eql(u8, it.next().?, "abc"));
    assert(eql(u8, it.next().?, "def"));
    assert(eql(u8, it.next().?, "ghi"));
    assert(it.next() == null);

-    it = split("..\\bob", "\\");
+    it = tokenize("..\\bob", "\\");
    assert(eql(u8, it.next().?, ".."));
    assert(eql(u8, "..", "..\\bob"[0..it.index]));
    assert(eql(u8, it.next().?, "bob"));
    assert(it.next() == null);

-    it = split("//a/b", "/");
+    it = tokenize("//a/b", "/");
    assert(eql(u8, it.next().?, "a"));
    assert(eql(u8, it.next().?, "b"));
    assert(eql(u8, "//a/b", "//a/b"[0..it.index]));
    assert(it.next() == null);

-    it = split("|", "|");
+    it = tokenize("|", "|");
    assert(it.next() == null);

-    it = split("", "|");
-    assert(eql(u8, it.next().?, ""));
+    it = tokenize("", "|");
    assert(it.next() == null);

-    it = split("hello", "");
+    it = tokenize("hello", "");
    assert(eql(u8, it.next().?, "hello"));
    assert(it.next() == null);

-    it = split("hello", " ");
+    it = tokenize("hello", " ");
    assert(eql(u8, it.next().?, "hello"));
    assert(it.next() == null);
 }

-test "mem.split (multibyte)" {
-    var it = split("a|b,c/d e", " /,|");
+test "mem.tokenize (multibyte)" {
+    var it = tokenize("a|b,c/d e", " /,|");
    assert(eql(u8, it.next().?, "a"));
    assert(eql(u8, it.next().?, "b"));
    assert(eql(u8, it.next().?, "c"));
@ -750,18 +749,21 @@ test "mem.split (multibyte)" {
 }

 /// Returns an iterator that iterates over the slices of `buffer` that
-/// seperates by bytes in `delimiter`.
+/// are separated by bytes in `delimiter`.
 /// separate("abc|def||ghi", "|")
-/// Will return slices for "abc", "def", "", "ghi", null, in that order.
+/// will return slices for "abc", "def", "", "ghi", null, in that order.
 /// If `delimiter` does not exist in buffer,
 /// the iterator will return `buffer`, null, in that order.
+/// The delimiter length must not be zero.
+/// See also the related function `tokenize`.
+/// It is planned to rename this function to `split` before 1.0.0, like this:
+/// pub fn split(buffer: []const u8, delimiter: []const u8) SplitIterator {
 pub fn separate(buffer: []const u8, delimiter: []const u8) SplitIterator {
+    assert(delimiter.len != 0);
    return SplitIterator{
        .index = 0,
        .buffer = buffer,
-        .split_bytes = delimiter,
-        .glob = false,
-        .spun = false,
+        .delimiter = delimiter,
    };
 }

@ -782,19 +784,15 @@ test "mem.separate" {
    assert(eql(u8, it.next().?, ""));
    assert(it.next() == null);

-    it = separate("hello", "");
-    assert(eql(u8, it.next().?, "hello"));
-    assert(it.next() == null);
-
    it = separate("hello", " ");
    assert(eql(u8, it.next().?, "hello"));
    assert(it.next() == null);
 }

 test "mem.separate (multibyte)" {
-    var it = separate("a|b,c/d e", " /,|");
+    var it = separate("a, b ,, c, d, e", ", ");
    assert(eql(u8, it.next().?, "a"));
-    assert(eql(u8, it.next().?, "b"));
+    assert(eql(u8, it.next().?, "b ,"));
    assert(eql(u8, it.next().?, "c"));
    assert(eql(u8, it.next().?, "d"));
    assert(eql(u8, it.next().?, "e"));
@ -819,49 +817,38 @@ test "mem.endsWith" {
    assert(!endsWith(u8, "Bob", "Bo"));
 }

-pub const SplitIterator = struct {
+pub const TokenIterator = struct {
    buffer: []const u8,
-    split_bytes: []const u8,
+    delimiter_bytes: []const u8,
    index: usize,
-    glob: bool,
-    spun: bool,

-    /// Iterates and returns null or optionally a slice the next split segment
-    pub fn next(self: *SplitIterator) ?[]const u8 {
-        if (self.spun) {
-            if (self.index + 1 > self.buffer.len) return null;
-            self.index += 1;
+    /// Returns a slice of the next token, or null if tokenization is complete.
+    pub fn next(self: *TokenIterator) ?[]const u8 {
+        // move to beginning of token
+        while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
+        const start = self.index;
+        if (start == self.buffer.len) {
+            return null;
        }

-        self.spun = true;
+        // move to end of token
+        while (self.index < self.buffer.len and !self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
+        const end = self.index;

-        if (self.glob) {
-            while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
-        }
-
-        var cursor = self.index;
-        while (cursor < self.buffer.len and !self.isSplitByte(self.buffer[cursor])) : (cursor += 1) {}
-
-        defer self.index = cursor;
-
-        if (cursor == self.buffer.len) {
-            return if (self.glob and self.index == cursor and self.index > 0) null else self.buffer[self.index..];
-        }
-
-        return self.buffer[self.index..cursor];
+        return self.buffer[start..end];
    }

    /// Returns a slice of the remaining bytes. Does not affect iterator state.
-    pub fn rest(self: *const SplitIterator) []const u8 {
+    pub fn rest(self: TokenIterator) []const u8 {
        // move to beginning of token
        var index: usize = self.index;
        while (index < self.buffer.len and self.isSplitByte(self.buffer[index])) : (index += 1) {}
        return self.buffer[index..];
    }

-    fn isSplitByte(self: *const SplitIterator, byte: u8) bool {
-        for (self.split_bytes) |split_byte| {
-            if (byte == split_byte) {
+    fn isSplitByte(self: TokenIterator, byte: u8) bool {
+        for (self.delimiter_bytes) |delimiter_byte| {
+            if (byte == delimiter_byte) {
                return true;
            }
        }
@ -869,6 +856,32 @@ pub const SplitIterator = struct {
    }
 };

+pub const SplitIterator = struct {
+    buffer: []const u8,
+    index: ?usize,
+    delimiter: []const u8,
+
+    /// Returns a slice of the next field, or null if splitting is complete.
+    pub fn next(self: *SplitIterator) ?[]const u8 {
+        const start = self.index orelse return null;
+        const end = if (indexOfPos(u8, self.buffer, start, self.delimiter)) |delim_start| blk: {
+            self.index = delim_start + self.delimiter.len;
+            break :blk delim_start;
+        } else blk: {
+            self.index = null;
+            break :blk self.buffer.len;
+        };
+        return self.buffer[start..end];
+    }
+
+    /// Returns a slice of the remaining bytes. Does not affect iterator state.
+    pub fn rest(self: SplitIterator) []const u8 {
+        const end = self.buffer.len;
+        const start = self.index orelse end;
+        return self.buffer[start..end];
+    }
+};
+
 /// Naively combines a series of strings with a separator.
 /// Allocates memory for the result, which must be freed by the caller.
 pub fn join(allocator: *Allocator, sep: u8, strings: ...) ![]u8 {
--- a/std/os/child_process.zig
+++ b/std/os/child_process.zig
@ -595,7 +595,7 @@ pub const ChildProcess = struct {
            const PATH = try os.getEnvVarOwned(self.allocator, "PATH");
            defer self.allocator.free(PATH);

-            var it = mem.split(PATH, ";");
+            var it = mem.tokenize(PATH, ";");
            while (it.next()) |search_path| {
                const joined_path = try os.path.join(self.allocator, search_path, app_name);
                defer self.allocator.free(joined_path);
--- a/std/os/index.zig
+++ b/std/os/index.zig
@ -608,7 +608,7 @@ pub fn posixExecve(argv: []const []const u8, env_map: *const BufMap, allocator:
    // +1 for the null terminating byte
    const path_buf = try allocator.alloc(u8, PATH.len + exe_path.len + 2);
    defer allocator.free(path_buf);
-    var it = mem.split(PATH, ":");
+    var it = mem.tokenize(PATH, ":");
    var seen_eacces = false;
    var err: usize = undefined;
    while (it.next()) |search_path| {
--- a/std/os/path.zig
+++ b/std/os/path.zig
@ -184,7 +184,7 @@ pub fn windowsParsePath(path: []const u8) WindowsPath {
                return relative_path;
            }

-            var it = mem.split(path, []u8{this_sep});
+            var it = mem.tokenize(path, []u8{this_sep});
            _ = (it.next() orelse return relative_path);
            _ = (it.next() orelse return relative_path);
            return WindowsPath{
@ -202,7 +202,7 @@ pub fn windowsParsePath(path: []const u8) WindowsPath {
                return relative_path;
            }

-            var it = mem.split(path, []u8{this_sep});
+            var it = mem.tokenize(path, []u8{this_sep});
            _ = (it.next() orelse return relative_path);
            _ = (it.next() orelse return relative_path);
            return WindowsPath{
@ -264,8 +264,8 @@ fn networkShareServersEql(ns1: []const u8, ns2: []const u8) bool {
    const sep1 = ns1[0];
    const sep2 = ns2[0];

-    var it1 = mem.split(ns1, []u8{sep1});
-    var it2 = mem.split(ns2, []u8{sep2});
+    var it1 = mem.tokenize(ns1, []u8{sep1});
+    var it2 = mem.tokenize(ns2, []u8{sep2});

    // TODO ASCII is wrong, we actually need full unicode support to compare paths.
    return asciiEqlIgnoreCase(it1.next().?, it2.next().?);
@ -285,8 +285,8 @@ fn compareDiskDesignators(kind: WindowsPath.Kind, p1: []const u8, p2: []const u8
            const sep1 = p1[0];
            const sep2 = p2[0];

-            var it1 = mem.split(p1, []u8{sep1});
-            var it2 = mem.split(p2, []u8{sep2});
+            var it1 = mem.tokenize(p1, []u8{sep1});
+            var it2 = mem.tokenize(p2, []u8{sep2});

            // TODO ASCII is wrong, we actually need full unicode support to compare paths.
            return asciiEqlIgnoreCase(it1.next().?, it2.next().?) and asciiEqlIgnoreCase(it1.next().?, it2.next().?);
@ -337,6 +337,8 @@ pub fn resolveSlice(allocator: *Allocator, paths: []const []const u8) ![]u8 {
 /// If all paths are relative it uses the current working directory as a starting point.
 /// Each drive has its own current working directory.
 /// Path separators are canonicalized to '\\' and drives are canonicalized to capital letters.
+/// Note: all usage of this function should be audited due to the existence of symlinks.
+/// Without performing actual syscalls, resolving `..` could be incorrect.
 pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 {
    if (paths.len == 0) {
        assert(is_windows); // resolveWindows called on non windows can't use getCwd
@ -416,7 +418,7 @@ pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 {
            },
            WindowsPath.Kind.NetworkShare => {
                result = try allocator.alloc(u8, max_size);
-                var it = mem.split(paths[first_index], "/\\");
+                var it = mem.tokenize(paths[first_index], "/\\");
                const server_name = it.next().?;
                const other_name = it.next().?;

@ -483,7 +485,7 @@ pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 {
        if (!correct_disk_designator) {
            continue;
        }
-        var it = mem.split(p[parsed.disk_designator.len..], "/\\");
+        var it = mem.tokenize(p[parsed.disk_designator.len..], "/\\");
        while (it.next()) |component| {
            if (mem.eql(u8, component, ".")) {
                continue;
@ -516,6 +518,8 @@ pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 {
 /// It resolves "." and "..".
 /// The result does not have a trailing path separator.
 /// If all paths are relative it uses the current working directory as a starting point.
+/// Note: all usage of this function should be audited due to the existence of symlinks.
+/// Without performing actual syscalls, resolving `..` could be incorrect.
 pub fn resolvePosix(allocator: *Allocator, paths: []const []const u8) ![]u8 {
    if (paths.len == 0) {
        assert(!is_windows); // resolvePosix called on windows can't use getCwd
@ -550,7 +554,7 @@ pub fn resolvePosix(allocator: *Allocator, paths: []const []const u8) ![]u8 {
    errdefer allocator.free(result);

    for (paths[first_index..]) |p, i| {
-        var it = mem.split(p, "/");
+        var it = mem.tokenize(p, "/");
        while (it.next()) |component| {
            if (mem.eql(u8, component, ".")) {
                continue;
@ -937,8 +941,8 @@ pub fn relativeWindows(allocator: *Allocator, from: []const u8, to: []const u8)
        return resolved_to;
    }

-    var from_it = mem.split(resolved_from, "/\\");
-    var to_it = mem.split(resolved_to, "/\\");
+    var from_it = mem.tokenize(resolved_from, "/\\");
+    var to_it = mem.tokenize(resolved_to, "/\\");
    while (true) {
        const from_component = from_it.next() orelse return mem.dupe(allocator, u8, to_it.rest());
        const to_rest = to_it.rest();
@ -967,14 +971,12 @@ pub fn relativeWindows(allocator: *Allocator, from: []const u8, to: []const u8)
        // shave off the trailing slash
        result_index -= 1;

-        if (to_rest.len > 0) {
-            var rest_it = mem.split(to_rest, "/\\");
-            while (rest_it.next()) |to_component| {
-                result[result_index] = '\\';
-                result_index += 1;
-                mem.copy(u8, result[result_index..], to_component);
-                result_index += to_component.len;
-            }
+        var rest_it = mem.tokenize(to_rest, "/\\");
+        while (rest_it.next()) |to_component| {
+            result[result_index] = '\\';
+            result_index += 1;
+            mem.copy(u8, result[result_index..], to_component);
+            result_index += to_component.len;
        }

        return result[0..result_index];
@ -990,8 +992,8 @@ pub fn relativePosix(allocator: *Allocator, from: []const u8, to: []const u8) ![
    const resolved_to = try resolvePosix(allocator, [][]const u8{to});
    defer allocator.free(resolved_to);

-    var from_it = mem.split(resolved_from, "/");
-    var to_it = mem.split(resolved_to, "/");
+    var from_it = mem.tokenize(resolved_from, "/");
+    var to_it = mem.tokenize(resolved_to, "/");
    while (true) {
        const from_component = from_it.next() orelse return mem.dupe(allocator, u8, to_it.rest());
        const to_rest = to_it.rest();