mirror of
https://github.com/ziglang/zig.git
synced 2025-02-14 08:30:21 +00:00
compress: add a deflate compressor
Replaces the inflate API from `inflateStream(reader: anytype, window_slice: []u8)` to `decompressor(allocator: mem.Allocator, reader: anytype, dictionary: ?[]const u8)` and `compressor(allocator: mem.Allocator, writer: anytype, options: CompressorOptions)`
This commit is contained in:
parent
dba04a272a
commit
490f067de8
17
build.zig
17
build.zig
@ -93,12 +93,25 @@ pub fn build(b: *Builder) !void {
|
||||
.install_dir = .lib,
|
||||
.install_subdir = "zig",
|
||||
.exclude_extensions = &[_][]const u8{
|
||||
"README.md",
|
||||
// exclude files from lib/std/compress/
|
||||
".gz",
|
||||
".z.0",
|
||||
".z.9",
|
||||
".gz",
|
||||
"rfc1951.txt",
|
||||
"rfc1952.txt",
|
||||
// exclude files from lib/std/compress/deflate/testdata
|
||||
".expect",
|
||||
".expect-noinput",
|
||||
".golden",
|
||||
".input",
|
||||
"compress-e.txt",
|
||||
"compress-gettysburg.txt",
|
||||
"compress-pi.txt",
|
||||
"rfc1951.txt",
|
||||
// exclude files from lib/std/tz/
|
||||
".tzif",
|
||||
// others
|
||||
"README.md",
|
||||
},
|
||||
.blank_extensions = &[_][]const u8{
|
||||
"test.zig",
|
||||
|
@ -5,6 +5,7 @@ pub const gzip = @import("compress/gzip.zig");
|
||||
pub const zlib = @import("compress/zlib.zig");
|
||||
|
||||
test {
|
||||
_ = deflate;
|
||||
_ = gzip;
|
||||
_ = zlib;
|
||||
}
|
||||
|
@ -1,738 +1,29 @@
|
||||
//
|
||||
// Decompressor for DEFLATE data streams (RFC1951)
|
||||
//
|
||||
// Heavily inspired by the simple decompressor puff.c by Mark Adler
|
||||
//! The deflate package is a translation of the Go code of the compress/flate package from
|
||||
//! https://go.googlesource.com/go/+/refs/tags/go1.17/src/compress/flate/
|
||||
|
||||
const std = @import("std");
|
||||
const io = std.io;
|
||||
const math = std.math;
|
||||
const mem = std.mem;
|
||||
const deflate = @import("deflate/compressor.zig");
|
||||
const inflate = @import("deflate/decompressor.zig");
|
||||
|
||||
const assert = std.debug.assert;
|
||||
pub const Compression = deflate.Compression;
|
||||
pub const Compressor = deflate.Compressor;
|
||||
pub const Decompressor = inflate.Decompressor;
|
||||
|
||||
const MAXBITS = 15;
|
||||
const MAXLCODES = 286;
|
||||
const MAXDCODES = 30;
|
||||
const MAXCODES = MAXLCODES + MAXDCODES;
|
||||
const FIXLCODES = 288;
|
||||
pub const compressor = deflate.compressor;
|
||||
pub const decompressor = inflate.decompressor;
|
||||
|
||||
// The maximum length of a Huffman code's prefix we can decode using the fast
|
||||
// path. The factor 9 is inherited from Zlib, tweaking the value showed little
|
||||
// or no changes in the profiler output.
|
||||
const PREFIX_LUT_BITS = 9;
|
||||
test {
|
||||
_ = @import("deflate/token.zig");
|
||||
_ = @import("deflate/bits_utils.zig");
|
||||
_ = @import("deflate/dict_decoder.zig");
|
||||
|
||||
const Huffman = struct {
|
||||
const LUTEntry = packed struct { symbol: u16 align(4), len: u16 };
|
||||
_ = @import("deflate/huffman_code.zig");
|
||||
_ = @import("deflate/huffman_bit_writer.zig");
|
||||
|
||||
// Number of codes for each possible length
|
||||
count: [MAXBITS + 1]u16,
|
||||
// Mapping between codes and symbols
|
||||
symbol: [MAXCODES]u16,
|
||||
_ = @import("deflate/compressor.zig");
|
||||
_ = @import("deflate/compressor_test.zig");
|
||||
|
||||
// The decoding process uses a trick explained by Mark Adler in [1].
|
||||
// We basically precompute for a fixed number of codes (0 <= x <= 2^N-1)
|
||||
// the symbol and the effective code length we'd get if the decoder was run
|
||||
// on the given N-bit sequence.
|
||||
// A code with length 0 means the sequence is not a valid prefix for this
|
||||
// canonical Huffman code and we have to decode it using a slower method.
|
||||
//
|
||||
// [1] https://github.com/madler/zlib/blob/v1.2.11/doc/algorithm.txt#L58
|
||||
prefix_lut: [1 << PREFIX_LUT_BITS]LUTEntry,
|
||||
// The following info refer to the codes of length PREFIX_LUT_BITS+1 and are
|
||||
// used to bootstrap the bit-by-bit reading method if the fast-path fails.
|
||||
last_code: u16,
|
||||
last_index: u16,
|
||||
_ = @import("deflate/deflate_fast.zig");
|
||||
_ = @import("deflate/deflate_fast_test.zig");
|
||||
|
||||
min_code_len: u16,
|
||||
|
||||
const ConstructError = error{ Oversubscribed, IncompleteSet };
|
||||
|
||||
fn construct(self: *Huffman, code_length: []const u16) ConstructError!void {
|
||||
for (self.count) |*val| {
|
||||
val.* = 0;
|
||||
}
|
||||
|
||||
self.min_code_len = math.maxInt(u16);
|
||||
for (code_length) |len| {
|
||||
if (len != 0 and len < self.min_code_len)
|
||||
self.min_code_len = len;
|
||||
self.count[len] += 1;
|
||||
}
|
||||
|
||||
// All zero.
|
||||
if (self.count[0] == code_length.len) {
|
||||
self.min_code_len = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
var left: isize = 1;
|
||||
for (self.count[1..]) |val| {
|
||||
// Each added bit doubles the amount of codes.
|
||||
left *= 2;
|
||||
// Make sure the number of codes with this length isn't too high.
|
||||
left -= @as(isize, @bitCast(i16, val));
|
||||
if (left < 0)
|
||||
return error.Oversubscribed;
|
||||
}
|
||||
|
||||
// Compute the offset of the first symbol represented by a code of a
|
||||
// given length in the symbol table, together with the first canonical
|
||||
// Huffman code for that length.
|
||||
var offset: [MAXBITS + 1]u16 = undefined;
|
||||
var codes: [MAXBITS + 1]u16 = undefined;
|
||||
{
|
||||
offset[1] = 0;
|
||||
codes[1] = 0;
|
||||
var len: usize = 1;
|
||||
while (len < MAXBITS) : (len += 1) {
|
||||
offset[len + 1] = offset[len] + self.count[len];
|
||||
codes[len + 1] = (codes[len] + self.count[len]) << 1;
|
||||
}
|
||||
}
|
||||
|
||||
self.prefix_lut = mem.zeroes(@TypeOf(self.prefix_lut));
|
||||
|
||||
for (code_length) |len, symbol| {
|
||||
if (len != 0) {
|
||||
// Fill the symbol table.
|
||||
// The symbols are assigned sequentially for each length.
|
||||
self.symbol[offset[len]] = @truncate(u16, symbol);
|
||||
// Track the last assigned offset.
|
||||
offset[len] += 1;
|
||||
}
|
||||
|
||||
if (len == 0 or len > PREFIX_LUT_BITS)
|
||||
continue;
|
||||
|
||||
// Given a Huffman code of length N we transform it into an index
|
||||
// into the lookup table by reversing its bits and filling the
|
||||
// remaining bits (PREFIX_LUT_BITS - N) with every possible
|
||||
// combination of bits to act as a wildcard.
|
||||
const bits_to_fill = @intCast(u5, PREFIX_LUT_BITS - len);
|
||||
const rev_code = bitReverse(u16, codes[len], len);
|
||||
|
||||
// Track the last used code, but only for lengths < PREFIX_LUT_BITS.
|
||||
codes[len] += 1;
|
||||
|
||||
var j: usize = 0;
|
||||
while (j < @as(usize, 1) << bits_to_fill) : (j += 1) {
|
||||
const index = rev_code | (j << @intCast(u5, len));
|
||||
assert(self.prefix_lut[index].len == 0);
|
||||
self.prefix_lut[index] = .{
|
||||
.symbol = @truncate(u16, symbol),
|
||||
.len = @truncate(u16, len),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
self.last_code = codes[PREFIX_LUT_BITS + 1];
|
||||
self.last_index = offset[PREFIX_LUT_BITS + 1] - self.count[PREFIX_LUT_BITS + 1];
|
||||
|
||||
if (left > 0)
|
||||
return error.IncompleteSet;
|
||||
}
|
||||
};
|
||||
|
||||
// Reverse bit-by-bit a N-bit code.
|
||||
fn bitReverse(comptime T: type, value: T, N: usize) T {
|
||||
const r = @bitReverse(T, value);
|
||||
return r >> @intCast(math.Log2Int(T), @typeInfo(T).Int.bits - N);
|
||||
}
|
||||
|
||||
pub fn InflateStream(comptime ReaderType: type) type {
|
||||
return struct {
|
||||
const Self = @This();
|
||||
|
||||
pub const Error = ReaderType.Error || error{
|
||||
EndOfStream,
|
||||
BadCounts,
|
||||
InvalidBlockType,
|
||||
InvalidDistance,
|
||||
InvalidFixedCode,
|
||||
InvalidLength,
|
||||
InvalidStoredSize,
|
||||
InvalidSymbol,
|
||||
InvalidTree,
|
||||
MissingEOBCode,
|
||||
NoLastLength,
|
||||
OutOfCodes,
|
||||
};
|
||||
pub const Reader = io.Reader(*Self, Error, read);
|
||||
|
||||
inner_reader: ReaderType,
|
||||
|
||||
// True if the decoder met the end of the compressed stream, no further
|
||||
// data can be decompressed
|
||||
seen_eos: bool,
|
||||
|
||||
state: union(enum) {
|
||||
// Parse a compressed block header and set up the internal state for
|
||||
// decompressing its contents.
|
||||
DecodeBlockHeader: void,
|
||||
// Decode all the symbols in a compressed block.
|
||||
DecodeBlockData: void,
|
||||
// Copy N bytes of uncompressed data from the underlying stream into
|
||||
// the window.
|
||||
Copy: usize,
|
||||
// Copy 1 byte into the window.
|
||||
CopyLit: u8,
|
||||
// Copy L bytes from the window itself, starting from D bytes
|
||||
// behind.
|
||||
CopyFrom: struct { distance: u16, length: u16 },
|
||||
},
|
||||
|
||||
// Sliding window for the LZ77 algorithm
|
||||
window: struct {
|
||||
const WSelf = @This();
|
||||
|
||||
// invariant: buffer length is always a power of 2
|
||||
buf: []u8,
|
||||
// invariant: ri <= wi
|
||||
wi: usize = 0, // Write index
|
||||
ri: usize = 0, // Read index
|
||||
el: usize = 0, // Number of readable elements
|
||||
total_written: usize = 0,
|
||||
|
||||
fn readable(self: *WSelf) usize {
|
||||
return self.el;
|
||||
}
|
||||
|
||||
fn writable(self: *WSelf) usize {
|
||||
return self.buf.len - self.el;
|
||||
}
|
||||
|
||||
// Insert a single byte into the window.
|
||||
// Returns 1 if there's enough space for the new byte and 0
|
||||
// otherwise.
|
||||
fn append(self: *WSelf, value: u8) usize {
|
||||
if (self.writable() < 1) return 0;
|
||||
self.appendUnsafe(value);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Insert a single byte into the window.
|
||||
// Assumes there's enough space.
|
||||
inline fn appendUnsafe(self: *WSelf, value: u8) void {
|
||||
self.buf[self.wi] = value;
|
||||
self.wi = (self.wi + 1) & (self.buf.len - 1);
|
||||
self.el += 1;
|
||||
self.total_written += 1;
|
||||
}
|
||||
|
||||
// Fill dest[] with data from the window, starting from the read
|
||||
// position. This updates the read pointer.
|
||||
// Returns the number of read bytes or 0 if there's nothing to read
|
||||
// yet.
|
||||
fn read(self: *WSelf, dest: []u8) usize {
|
||||
const N = math.min(dest.len, self.readable());
|
||||
|
||||
if (N == 0) return 0;
|
||||
|
||||
if (self.ri + N < self.buf.len) {
|
||||
// The data doesn't wrap around
|
||||
mem.copy(u8, dest, self.buf[self.ri .. self.ri + N]);
|
||||
} else {
|
||||
// The data wraps around the buffer, split the copy
|
||||
std.mem.copy(u8, dest, self.buf[self.ri..]);
|
||||
// How much data we've copied from `ri` to the end
|
||||
const r = self.buf.len - self.ri;
|
||||
std.mem.copy(u8, dest[r..], self.buf[0 .. N - r]);
|
||||
}
|
||||
|
||||
self.ri = (self.ri + N) & (self.buf.len - 1);
|
||||
self.el -= N;
|
||||
|
||||
return N;
|
||||
}
|
||||
|
||||
// Copy `length` bytes starting from `distance` bytes behind the
|
||||
// write pointer.
|
||||
// Be careful as the length may be greater than the distance, that's
|
||||
// how the compressor encodes run-length encoded sequences.
|
||||
fn copyFrom(self: *WSelf, distance: usize, length: usize) usize {
|
||||
const N = math.min(length, self.writable());
|
||||
|
||||
if (N == 0) return 0;
|
||||
|
||||
// TODO: Profile and, if needed, replace with smarter juggling
|
||||
// of the window memory for the non-overlapping case.
|
||||
var i: usize = 0;
|
||||
while (i < N) : (i += 1) {
|
||||
const index = (self.wi -% distance) & (self.buf.len - 1);
|
||||
self.appendUnsafe(self.buf[index]);
|
||||
}
|
||||
|
||||
return N;
|
||||
}
|
||||
},
|
||||
|
||||
// Compressor-local Huffman tables used to decompress blocks with
|
||||
// dynamic codes.
|
||||
huffman_tables: [2]Huffman = undefined,
|
||||
|
||||
// Huffman tables used for decoding length/distance pairs.
|
||||
hdist: *Huffman,
|
||||
hlen: *Huffman,
|
||||
|
||||
// Temporary buffer for the bitstream.
|
||||
// Bits 0..`bits_left` are filled with data, the remaining ones are zeros.
|
||||
bits: u32,
|
||||
bits_left: usize,
|
||||
|
||||
fn peekBits(self: *Self, bits: usize) !u32 {
|
||||
while (self.bits_left < bits) {
|
||||
const byte = try self.inner_reader.readByte();
|
||||
self.bits |= @as(u32, byte) << @intCast(u5, self.bits_left);
|
||||
self.bits_left += 8;
|
||||
}
|
||||
const mask = (@as(u32, 1) << @intCast(u5, bits)) - 1;
|
||||
return self.bits & mask;
|
||||
}
|
||||
fn readBits(self: *Self, bits: usize) !u32 {
|
||||
const val = try self.peekBits(bits);
|
||||
self.discardBits(bits);
|
||||
return val;
|
||||
}
|
||||
fn discardBits(self: *Self, bits: usize) void {
|
||||
self.bits >>= @intCast(u5, bits);
|
||||
self.bits_left -= bits;
|
||||
}
|
||||
|
||||
fn stored(self: *Self) !void {
|
||||
// Discard the remaining bits, the length field is always
|
||||
// byte-aligned (and so is the data).
|
||||
self.discardBits(self.bits_left);
|
||||
|
||||
const length = try self.inner_reader.readIntLittle(u16);
|
||||
const length_cpl = try self.inner_reader.readIntLittle(u16);
|
||||
|
||||
if (length != ~length_cpl)
|
||||
return error.InvalidStoredSize;
|
||||
|
||||
self.state = .{ .Copy = length };
|
||||
}
|
||||
|
||||
fn fixed(self: *Self) !void {
|
||||
comptime var lencode: Huffman = undefined;
|
||||
comptime var distcode: Huffman = undefined;
|
||||
|
||||
// The Huffman codes are specified in the RFC1951, section 3.2.6
|
||||
comptime {
|
||||
@setEvalBranchQuota(100000);
|
||||
|
||||
const len_lengths =
|
||||
[_]u16{8} ** 144 ++
|
||||
[_]u16{9} ** 112 ++
|
||||
[_]u16{7} ** 24 ++
|
||||
[_]u16{8} ** 8;
|
||||
assert(len_lengths.len == FIXLCODES);
|
||||
try lencode.construct(len_lengths[0..]);
|
||||
|
||||
const dist_lengths = [_]u16{5} ** MAXDCODES;
|
||||
distcode.construct(dist_lengths[0..]) catch |err| switch (err) {
|
||||
// This error is expected because we only compute distance codes
|
||||
// 0-29, which is fine since "distance codes 30-31 will never actually
|
||||
// occur in the compressed data" (from section 3.2.6 of RFC1951).
|
||||
error.IncompleteSet => {},
|
||||
else => return err,
|
||||
};
|
||||
}
|
||||
|
||||
self.hlen = &lencode;
|
||||
self.hdist = &distcode;
|
||||
self.state = .DecodeBlockData;
|
||||
}
|
||||
|
||||
fn dynamic(self: *Self) !void {
|
||||
// Number of length codes
|
||||
const nlen = (try self.readBits(5)) + 257;
|
||||
// Number of distance codes
|
||||
const ndist = (try self.readBits(5)) + 1;
|
||||
// Number of code length codes
|
||||
const ncode = (try self.readBits(4)) + 4;
|
||||
|
||||
if (nlen > MAXLCODES or ndist > MAXDCODES)
|
||||
return error.BadCounts;
|
||||
|
||||
// Permutation of code length codes
|
||||
const ORDER = [19]u16{
|
||||
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4,
|
||||
12, 3, 13, 2, 14, 1, 15,
|
||||
};
|
||||
|
||||
// Build the Huffman table to decode the code length codes
|
||||
var lencode: Huffman = undefined;
|
||||
{
|
||||
var lengths = std.mem.zeroes([19]u16);
|
||||
|
||||
// Read the code lengths, missing ones are left as zero
|
||||
for (ORDER[0..ncode]) |val| {
|
||||
lengths[val] = @intCast(u16, try self.readBits(3));
|
||||
}
|
||||
|
||||
lencode.construct(lengths[0..]) catch return error.InvalidTree;
|
||||
}
|
||||
|
||||
// Read the length/literal and distance code length tables.
|
||||
// Zero the table by default so we can avoid explicitly writing out
|
||||
// zeros for codes 17 and 18
|
||||
var lengths = std.mem.zeroes([MAXCODES]u16);
|
||||
|
||||
var i: usize = 0;
|
||||
while (i < nlen + ndist) {
|
||||
const symbol = try self.decode(&lencode);
|
||||
|
||||
switch (symbol) {
|
||||
0...15 => {
|
||||
lengths[i] = symbol;
|
||||
i += 1;
|
||||
},
|
||||
16 => {
|
||||
// repeat last length 3..6 times
|
||||
if (i == 0) return error.NoLastLength;
|
||||
|
||||
const last_length = lengths[i - 1];
|
||||
const repeat = 3 + (try self.readBits(2));
|
||||
const last_index = i + repeat;
|
||||
if (last_index > lengths.len)
|
||||
return error.InvalidLength;
|
||||
while (i < last_index) : (i += 1) {
|
||||
lengths[i] = last_length;
|
||||
}
|
||||
},
|
||||
17 => {
|
||||
// repeat zero 3..10 times
|
||||
i += 3 + (try self.readBits(3));
|
||||
},
|
||||
18 => {
|
||||
// repeat zero 11..138 times
|
||||
i += 11 + (try self.readBits(7));
|
||||
},
|
||||
else => return error.InvalidSymbol,
|
||||
}
|
||||
}
|
||||
|
||||
if (i > nlen + ndist)
|
||||
return error.InvalidLength;
|
||||
|
||||
// Check if the end of block code is present
|
||||
if (lengths[256] == 0)
|
||||
return error.MissingEOBCode;
|
||||
|
||||
self.huffman_tables[0].construct(lengths[0..nlen]) catch |err| switch (err) {
|
||||
error.Oversubscribed => return error.InvalidTree,
|
||||
error.IncompleteSet => {
|
||||
// incomplete code ok only for single length 1 code
|
||||
if (nlen != self.huffman_tables[0].count[0] + self.huffman_tables[0].count[1]) {
|
||||
return error.InvalidTree;
|
||||
}
|
||||
},
|
||||
};
|
||||
self.huffman_tables[1].construct(lengths[nlen .. nlen + ndist]) catch |err| switch (err) {
|
||||
error.Oversubscribed => return error.InvalidTree,
|
||||
error.IncompleteSet => {
|
||||
// incomplete code ok only for single length 1 code
|
||||
if (ndist != self.huffman_tables[1].count[0] + self.huffman_tables[1].count[1]) {
|
||||
return error.InvalidTree;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
self.hlen = &self.huffman_tables[0];
|
||||
self.hdist = &self.huffman_tables[1];
|
||||
self.state = .DecodeBlockData;
|
||||
}
|
||||
|
||||
fn codes(self: *Self, lencode: *Huffman, distcode: *Huffman) !bool {
|
||||
// Size base for length codes 257..285
|
||||
const LENS = [29]u16{
|
||||
3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
|
||||
35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258,
|
||||
};
|
||||
// Extra bits for length codes 257..285
|
||||
const LEXT = [29]u16{
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
|
||||
3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0,
|
||||
};
|
||||
// Offset base for distance codes 0..29
|
||||
const DISTS = [30]u16{
|
||||
1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
|
||||
257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577,
|
||||
};
|
||||
// Extra bits for distance codes 0..29
|
||||
const DEXT = [30]u16{
|
||||
0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
|
||||
7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
|
||||
};
|
||||
|
||||
while (true) {
|
||||
const symbol = try self.decode(lencode);
|
||||
|
||||
switch (symbol) {
|
||||
0...255 => {
|
||||
// Literal value
|
||||
const c = @truncate(u8, symbol);
|
||||
if (self.window.append(c) == 0) {
|
||||
self.state = .{ .CopyLit = c };
|
||||
return false;
|
||||
}
|
||||
},
|
||||
256 => {
|
||||
// End of block symbol
|
||||
return true;
|
||||
},
|
||||
257...285 => {
|
||||
// Length/distance pair
|
||||
const length_symbol = symbol - 257;
|
||||
const length = LENS[length_symbol] +
|
||||
@intCast(u16, try self.readBits(LEXT[length_symbol]));
|
||||
|
||||
const distance_symbol = try self.decode(distcode);
|
||||
const distance = DISTS[distance_symbol] +
|
||||
@intCast(u16, try self.readBits(DEXT[distance_symbol]));
|
||||
|
||||
if (distance > self.window.buf.len or distance > self.window.total_written)
|
||||
return error.InvalidDistance;
|
||||
|
||||
const written = self.window.copyFrom(distance, length);
|
||||
if (written != length) {
|
||||
self.state = .{
|
||||
.CopyFrom = .{
|
||||
.distance = distance,
|
||||
.length = length - @truncate(u16, written),
|
||||
},
|
||||
};
|
||||
return false;
|
||||
}
|
||||
},
|
||||
else => return error.InvalidFixedCode,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn decode(self: *Self, h: *Huffman) !u16 {
|
||||
// Using u32 instead of u16 to reduce the number of casts needed.
|
||||
var prefix: u32 = 0;
|
||||
|
||||
// Fast path, read some bits and hope they're the prefix of some code.
|
||||
// We can't read PREFIX_LUT_BITS as we don't want to read past the
|
||||
// deflate stream end, use an incremental approach instead.
|
||||
var code_len = h.min_code_len;
|
||||
if (code_len == 0)
|
||||
return error.OutOfCodes;
|
||||
while (true) {
|
||||
_ = try self.peekBits(code_len);
|
||||
// Small optimization win, use as many bits as possible in the
|
||||
// table lookup.
|
||||
prefix = self.bits & ((1 << PREFIX_LUT_BITS) - 1);
|
||||
|
||||
const lut_entry = &h.prefix_lut[prefix];
|
||||
// The code is longer than PREFIX_LUT_BITS!
|
||||
if (lut_entry.len == 0)
|
||||
break;
|
||||
// If the code lenght doesn't increase we found a match.
|
||||
if (lut_entry.len <= code_len) {
|
||||
self.discardBits(code_len);
|
||||
return lut_entry.symbol;
|
||||
}
|
||||
|
||||
code_len = lut_entry.len;
|
||||
}
|
||||
|
||||
// The sequence we've read is not a prefix of any code of length <=
|
||||
// PREFIX_LUT_BITS, keep decoding it using a slower method.
|
||||
prefix = try self.readBits(PREFIX_LUT_BITS);
|
||||
|
||||
// Speed up the decoding by starting from the first code length
|
||||
// that's not covered by the table.
|
||||
var len: usize = PREFIX_LUT_BITS + 1;
|
||||
var first: usize = h.last_code;
|
||||
var index: usize = h.last_index;
|
||||
|
||||
// Reverse the prefix so that the LSB becomes the MSB and make space
|
||||
// for the next bit.
|
||||
var code = bitReverse(u32, prefix, PREFIX_LUT_BITS + 1);
|
||||
|
||||
while (len <= MAXBITS) : (len += 1) {
|
||||
code |= try self.readBits(1);
|
||||
const count = h.count[len];
|
||||
if (code < first + count) {
|
||||
return h.symbol[index + (code - first)];
|
||||
}
|
||||
index += count;
|
||||
first += count;
|
||||
first <<= 1;
|
||||
code <<= 1;
|
||||
}
|
||||
|
||||
return error.OutOfCodes;
|
||||
}
|
||||
|
||||
fn step(self: *Self) !void {
|
||||
while (true) {
|
||||
switch (self.state) {
|
||||
.DecodeBlockHeader => {
|
||||
// The compressed stream is done.
|
||||
if (self.seen_eos) return;
|
||||
|
||||
const last = @intCast(u1, try self.readBits(1));
|
||||
const kind = @intCast(u2, try self.readBits(2));
|
||||
|
||||
self.seen_eos = last != 0;
|
||||
|
||||
// The next state depends on the block type.
|
||||
switch (kind) {
|
||||
0 => try self.stored(),
|
||||
1 => try self.fixed(),
|
||||
2 => try self.dynamic(),
|
||||
3 => return error.InvalidBlockType,
|
||||
}
|
||||
},
|
||||
.DecodeBlockData => {
|
||||
if (!try self.codes(self.hlen, self.hdist)) {
|
||||
return;
|
||||
}
|
||||
|
||||
self.state = .DecodeBlockHeader;
|
||||
},
|
||||
.Copy => |*length| {
|
||||
const N = math.min(self.window.writable(), length.*);
|
||||
|
||||
// TODO: This loop can be more efficient. On the other
|
||||
// hand uncompressed blocks are not that common so...
|
||||
var i: usize = 0;
|
||||
while (i < N) : (i += 1) {
|
||||
var tmp: [1]u8 = undefined;
|
||||
if ((try self.inner_reader.read(&tmp)) != 1) {
|
||||
// Unexpected end of stream, keep this error
|
||||
// consistent with the use of readBitsNoEof.
|
||||
return error.EndOfStream;
|
||||
}
|
||||
self.window.appendUnsafe(tmp[0]);
|
||||
}
|
||||
|
||||
if (N != length.*) {
|
||||
length.* -= N;
|
||||
return;
|
||||
}
|
||||
|
||||
self.state = .DecodeBlockHeader;
|
||||
},
|
||||
.CopyLit => |c| {
|
||||
if (self.window.append(c) == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
self.state = .DecodeBlockData;
|
||||
},
|
||||
.CopyFrom => |*info| {
|
||||
const written = self.window.copyFrom(info.distance, info.length);
|
||||
if (written != info.length) {
|
||||
info.length -= @truncate(u16, written);
|
||||
return;
|
||||
}
|
||||
|
||||
self.state = .DecodeBlockData;
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn init(source: ReaderType, window_slice: []u8) Self {
|
||||
assert(math.isPowerOfTwo(window_slice.len));
|
||||
|
||||
return Self{
|
||||
.inner_reader = source,
|
||||
.window = .{ .buf = window_slice },
|
||||
.seen_eos = false,
|
||||
.state = .DecodeBlockHeader,
|
||||
.hdist = undefined,
|
||||
.hlen = undefined,
|
||||
.bits = 0,
|
||||
.bits_left = 0,
|
||||
};
|
||||
}
|
||||
|
||||
// Implements the io.Reader interface
|
||||
pub fn read(self: *Self, buffer: []u8) Error!usize {
|
||||
if (buffer.len == 0)
|
||||
return 0;
|
||||
|
||||
// Try reading as much as possible from the window
|
||||
var read_amt: usize = self.window.read(buffer);
|
||||
while (read_amt < buffer.len) {
|
||||
// Run the state machine, we can detect the "effective" end of
|
||||
// stream condition by checking if any progress was made.
|
||||
// Why "effective"? Because even though `seen_eos` is true we
|
||||
// may still have to finish processing other decoding steps.
|
||||
try self.step();
|
||||
// No progress was made
|
||||
if (self.window.readable() == 0)
|
||||
break;
|
||||
|
||||
read_amt += self.window.read(buffer[read_amt..]);
|
||||
}
|
||||
|
||||
return read_amt;
|
||||
}
|
||||
|
||||
pub fn reader(self: *Self) Reader {
|
||||
return .{ .context = self };
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
pub fn inflateStream(reader: anytype, window_slice: []u8) InflateStream(@TypeOf(reader)) {
|
||||
return InflateStream(@TypeOf(reader)).init(reader, window_slice);
|
||||
}
|
||||
|
||||
test "lengths overflow" {
|
||||
// malformed final dynamic block, tries to write 321 code lengths (MAXCODES is 316)
|
||||
// f dy hlit hdist hclen 16 17 18 0 (18) x138 (18) x138 (18) x39 (16) x6
|
||||
// 1 10 11101 11101 0000 010 010 010 010 (11) 1111111 (11) 1111111 (11) 0011100 (01) 11
|
||||
const stream = [_]u8{ 0b11101101, 0b00011101, 0b00100100, 0b11101001, 0b11111111, 0b11111111, 0b00111001, 0b00001110 };
|
||||
try std.testing.expectError(error.InvalidLength, testInflate(stream[0..]));
|
||||
}
|
||||
|
||||
test "empty distance alphabet" {
|
||||
// dynamic block with empty distance alphabet is valid if only literals and end of data symbol are used
|
||||
// f dy hlit hdist hclen 16 17 18 0 8 7 9 6 10 5 11 4 12 3 13 2 14 1 15 (18) x128 (18) x128 (1) ( 0) (256)
|
||||
// 1 10 00000 00000 1111 000 000 010 010 000 000 000 000 000 000 000 000 000 000 000 000 000 001 000 (11) 1110101 (11) 1110101 (0) (10) (0)
|
||||
const stream = [_]u8{ 0b00000101, 0b11100000, 0b00000001, 0b00001001, 0b00000000, 0b00000000, 0b00000000, 0b00000000, 0b00010000, 0b01011100, 0b10111111, 0b00101110 };
|
||||
try testInflate(stream[0..]);
|
||||
}
|
||||
|
||||
test "distance past beginning of output stream" {
|
||||
// f fx ('A') ('B') ('C') <len=4, dist=4> (end)
|
||||
// 1 01 (01110001) (01110010) (01110011) (0000010) (00011) (0000000)
|
||||
const stream = [_]u8{ 0b01110011, 0b01110100, 0b01110010, 0b00000110, 0b01100001, 0b00000000 };
|
||||
try std.testing.expectError(error.InvalidDistance, testInflate(stream[0..]));
|
||||
}
|
||||
|
||||
test "inflateStream fuzzing" {
|
||||
// see https://github.com/ziglang/zig/issues/9842
|
||||
try std.testing.expectError(error.EndOfStream, testInflate("\x95\x90=o\xc20\x10\x86\xf30"));
|
||||
try std.testing.expectError(error.OutOfCodes, testInflate("\x950\x00\x0000000"));
|
||||
|
||||
// Huffman.construct errors
|
||||
// lencode
|
||||
try std.testing.expectError(error.InvalidTree, testInflate("\x950000"));
|
||||
try std.testing.expectError(error.InvalidTree, testInflate("\x05000"));
|
||||
// hlen
|
||||
try std.testing.expectError(error.InvalidTree, testInflate("\x05\xea\x01\t\x00\x00\x00\x01\x00\\\xbf.\t\x00"));
|
||||
// hdist
|
||||
try std.testing.expectError(error.InvalidTree, testInflate("\x05\xe0\x01A\x00\x00\x00\x00\x10\\\xbf."));
|
||||
|
||||
// Huffman.construct -> error.IncompleteSet returns that shouldn't give error.InvalidTree
|
||||
// (like the "empty distance alphabet" test but for ndist instead of nlen)
|
||||
try std.testing.expectError(error.EndOfStream, testInflate("\x05\xe0\x01\t\x00\x00\x00\x00\x10\\\xbf\xce"));
|
||||
try testInflate("\x15\xe0\x01\t\x00\x00\x00\x00\x10\\\xbf.0");
|
||||
}
|
||||
|
||||
fn testInflate(data: []const u8) !void {
|
||||
var window: [0x8000]u8 = undefined;
|
||||
const reader = std.io.fixedBufferStream(data).reader();
|
||||
var inflate = inflateStream(reader, &window);
|
||||
var inflated = try inflate.reader().readAllAlloc(std.testing.allocator, std.math.maxInt(usize));
|
||||
defer std.testing.allocator.free(inflated);
|
||||
_ = @import("deflate/decompressor.zig");
|
||||
}
|
||||
|
34
lib/std/compress/deflate/bits_utils.zig
Normal file
34
lib/std/compress/deflate/bits_utils.zig
Normal file
@ -0,0 +1,34 @@
|
||||
const math = @import("std").math;
|
||||
|
||||
// Reverse bit-by-bit a N-bit code.
|
||||
pub fn bitReverse(comptime T: type, value: T, N: usize) T {
|
||||
const r = @bitReverse(T, value);
|
||||
return r >> @intCast(math.Log2Int(T), @typeInfo(T).Int.bits - N);
|
||||
}
|
||||
|
||||
test "bitReverse" {
|
||||
const std = @import("std");
|
||||
const expect = std.testing.expect;
|
||||
|
||||
const ReverseBitsTest = struct {
|
||||
in: u16,
|
||||
bit_count: u5,
|
||||
out: u16,
|
||||
};
|
||||
|
||||
var reverse_bits_tests = [_]ReverseBitsTest{
|
||||
.{ .in = 1, .bit_count = 1, .out = 1 },
|
||||
.{ .in = 1, .bit_count = 2, .out = 2 },
|
||||
.{ .in = 1, .bit_count = 3, .out = 4 },
|
||||
.{ .in = 1, .bit_count = 4, .out = 8 },
|
||||
.{ .in = 1, .bit_count = 5, .out = 16 },
|
||||
.{ .in = 17, .bit_count = 5, .out = 17 },
|
||||
.{ .in = 257, .bit_count = 9, .out = 257 },
|
||||
.{ .in = 29, .bit_count = 5, .out = 23 },
|
||||
};
|
||||
|
||||
for (reverse_bits_tests) |h| {
|
||||
var v = bitReverse(u16, h.in, h.bit_count);
|
||||
try expect(v == h.out);
|
||||
}
|
||||
}
|
1111
lib/std/compress/deflate/compressor.zig
Normal file
1111
lib/std/compress/deflate/compressor.zig
Normal file
File diff suppressed because it is too large
Load Diff
560
lib/std/compress/deflate/compressor_test.zig
Normal file
560
lib/std/compress/deflate/compressor_test.zig
Normal file
@ -0,0 +1,560 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const expect = std.testing.expect;
|
||||
const fifo = std.fifo;
|
||||
const io = std.io;
|
||||
const math = std.math;
|
||||
const mem = std.mem;
|
||||
const testing = std.testing;
|
||||
|
||||
const ArrayList = std.ArrayList;
|
||||
|
||||
const deflate = @import("compressor.zig");
|
||||
const inflate = @import("decompressor.zig");
|
||||
|
||||
const compressor = deflate.compressor;
|
||||
const decompressor = inflate.decompressor;
|
||||
const huffman_only = deflate.huffman_only;
|
||||
|
||||
fn testSync(level: deflate.Compression, input: []const u8) !void {
|
||||
if (input.len == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
var divided_buf = fifo
|
||||
.LinearFifo(u8, fifo.LinearFifoBufferType.Dynamic)
|
||||
.init(testing.allocator);
|
||||
defer divided_buf.deinit();
|
||||
var whole_buf = std.ArrayList(u8).init(testing.allocator);
|
||||
defer whole_buf.deinit();
|
||||
|
||||
var multi_writer = io.multiWriter(.{
|
||||
divided_buf.writer(),
|
||||
whole_buf.writer(),
|
||||
}).writer();
|
||||
|
||||
var comp = try compressor(
|
||||
testing.allocator,
|
||||
multi_writer,
|
||||
.{ .level = level },
|
||||
);
|
||||
defer comp.deinit();
|
||||
|
||||
{
|
||||
var decomp = try decompressor(
|
||||
testing.allocator,
|
||||
divided_buf.reader(),
|
||||
null,
|
||||
);
|
||||
defer decomp.deinit();
|
||||
|
||||
// Write first half of the input and flush()
|
||||
var half: usize = (input.len + 1) / 2;
|
||||
var half_len: usize = half - 0;
|
||||
{
|
||||
_ = try comp.writer().writeAll(input[0..half]);
|
||||
|
||||
// Flush
|
||||
try comp.flush();
|
||||
|
||||
// Read back
|
||||
var decompressed = try testing.allocator.alloc(u8, half_len);
|
||||
defer testing.allocator.free(decompressed);
|
||||
|
||||
var read = try decomp.reader().readAll(decompressed); // read at least half
|
||||
try expect(read == half_len);
|
||||
try expect(mem.eql(u8, input[0..half], decompressed));
|
||||
}
|
||||
|
||||
// Write last half of the input and close()
|
||||
half_len = input.len - half;
|
||||
{
|
||||
_ = try comp.writer().writeAll(input[half..]);
|
||||
|
||||
// Close
|
||||
try comp.close();
|
||||
|
||||
// Read back
|
||||
var decompressed = try testing.allocator.alloc(u8, half_len);
|
||||
defer testing.allocator.free(decompressed);
|
||||
|
||||
var read = try decomp.reader().readAll(decompressed);
|
||||
try expect(read == half_len);
|
||||
try expect(mem.eql(u8, input[half..], decompressed));
|
||||
|
||||
// Extra read
|
||||
var final: [10]u8 = undefined;
|
||||
read = try decomp.reader().readAll(&final);
|
||||
try expect(read == 0); // expect ended stream to return 0 bytes
|
||||
|
||||
_ = decomp.close();
|
||||
}
|
||||
}
|
||||
|
||||
_ = try comp.writer().writeAll(input);
|
||||
try comp.close();
|
||||
|
||||
// stream should work for ordinary reader too (reading whole_buf in one go)
|
||||
var whole_buf_reader = io.fixedBufferStream(whole_buf.items).reader();
|
||||
var decomp = try decompressor(testing.allocator, whole_buf_reader, null);
|
||||
defer decomp.deinit();
|
||||
|
||||
var decompressed = try testing.allocator.alloc(u8, input.len);
|
||||
defer testing.allocator.free(decompressed);
|
||||
|
||||
_ = try decomp.reader().readAll(decompressed);
|
||||
_ = decomp.close();
|
||||
|
||||
try expect(mem.eql(u8, input, decompressed));
|
||||
}
|
||||
|
||||
fn testToFromWithLevelAndLimit(level: deflate.Compression, input: []const u8, limit: u32) !void {
|
||||
var compressed = std.ArrayList(u8).init(testing.allocator);
|
||||
defer compressed.deinit();
|
||||
|
||||
var comp = try compressor(testing.allocator, compressed.writer(), .{ .level = level });
|
||||
defer comp.deinit();
|
||||
|
||||
try comp.writer().writeAll(input);
|
||||
try comp.close();
|
||||
|
||||
if (limit > 0) {
|
||||
try expect(compressed.items.len <= limit);
|
||||
}
|
||||
|
||||
var decomp = try decompressor(
|
||||
testing.allocator,
|
||||
io.fixedBufferStream(compressed.items).reader(),
|
||||
null,
|
||||
);
|
||||
defer decomp.deinit();
|
||||
|
||||
var decompressed = try testing.allocator.alloc(u8, input.len);
|
||||
defer testing.allocator.free(decompressed);
|
||||
|
||||
var read: usize = try decomp.reader().readAll(decompressed);
|
||||
try expect(read == input.len);
|
||||
try expect(mem.eql(u8, input, decompressed));
|
||||
|
||||
try testSync(level, input);
|
||||
}
|
||||
|
||||
fn testToFromWithLimit(input: []const u8, limit: [11]u32) !void {
|
||||
try testToFromWithLevelAndLimit(.no_compression, input, limit[0]);
|
||||
try testToFromWithLevelAndLimit(.best_speed, input, limit[1]);
|
||||
try testToFromWithLevelAndLimit(.level_2, input, limit[2]);
|
||||
try testToFromWithLevelAndLimit(.level_3, input, limit[3]);
|
||||
try testToFromWithLevelAndLimit(.level_4, input, limit[4]);
|
||||
try testToFromWithLevelAndLimit(.level_5, input, limit[5]);
|
||||
try testToFromWithLevelAndLimit(.level_6, input, limit[6]);
|
||||
try testToFromWithLevelAndLimit(.level_7, input, limit[7]);
|
||||
try testToFromWithLevelAndLimit(.level_8, input, limit[8]);
|
||||
try testToFromWithLevelAndLimit(.best_compression, input, limit[9]);
|
||||
try testToFromWithLevelAndLimit(.huffman_only, input, limit[10]);
|
||||
}
|
||||
|
||||
test "deflate/inflate" {
|
||||
var limits = [_]u32{0} ** 11;
|
||||
|
||||
var test0 = [_]u8{};
|
||||
var test1 = [_]u8{0x11};
|
||||
var test2 = [_]u8{ 0x11, 0x12 };
|
||||
var test3 = [_]u8{ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11 };
|
||||
var test4 = [_]u8{ 0x11, 0x10, 0x13, 0x41, 0x21, 0x21, 0x41, 0x13, 0x87, 0x78, 0x13 };
|
||||
|
||||
try testToFromWithLimit(&test0, limits);
|
||||
try testToFromWithLimit(&test1, limits);
|
||||
try testToFromWithLimit(&test2, limits);
|
||||
try testToFromWithLimit(&test3, limits);
|
||||
try testToFromWithLimit(&test4, limits);
|
||||
|
||||
var large_data_chunk = try testing.allocator.alloc(u8, 100_000);
|
||||
defer testing.allocator.free(large_data_chunk);
|
||||
// fill with random data
|
||||
for (large_data_chunk) |_, i| {
|
||||
var mul: u8 = @truncate(u8, i);
|
||||
_ = @mulWithOverflow(u8, mul, mul, &mul);
|
||||
large_data_chunk[i] = mul;
|
||||
}
|
||||
try testToFromWithLimit(large_data_chunk, limits);
|
||||
}
|
||||
|
||||
test "very long sparse chunk" {
|
||||
// A SparseReader returns a stream consisting of 0s ending with 65,536 (1<<16) 1s.
|
||||
// This tests missing hash references in a very large input.
|
||||
const SparseReader = struct {
|
||||
l: usize, // length
|
||||
cur: usize, // current position
|
||||
|
||||
const Self = @This();
|
||||
const Error = error{};
|
||||
|
||||
pub const Reader = io.Reader(*Self, Error, read);
|
||||
|
||||
pub fn reader(self: *Self) Reader {
|
||||
return .{ .context = self };
|
||||
}
|
||||
|
||||
fn read(s: *Self, b: []u8) Error!usize {
|
||||
var n: usize = 0; // amount read
|
||||
|
||||
if (s.cur >= s.l) {
|
||||
return 0;
|
||||
}
|
||||
n = b.len;
|
||||
var cur = s.cur + n;
|
||||
if (cur > s.l) {
|
||||
n -= cur - s.l;
|
||||
cur = s.l;
|
||||
}
|
||||
for (b[0..n]) |_, i| {
|
||||
if (s.cur + i >= s.l -| (1 << 16)) {
|
||||
b[i] = 1;
|
||||
} else {
|
||||
b[i] = 0;
|
||||
}
|
||||
}
|
||||
s.cur = cur;
|
||||
return n;
|
||||
}
|
||||
};
|
||||
|
||||
var comp = try compressor(
|
||||
testing.allocator,
|
||||
io.null_writer,
|
||||
.{ .level = .best_speed },
|
||||
);
|
||||
defer comp.deinit();
|
||||
var writer = comp.writer();
|
||||
|
||||
var sparse = SparseReader{ .l = 0x23e8, .cur = 0 };
|
||||
var reader = sparse.reader();
|
||||
|
||||
var read: usize = 1;
|
||||
var written: usize = 0;
|
||||
while (read > 0) {
|
||||
var buf: [1 << 15]u8 = undefined; // 32,768 bytes buffer
|
||||
read = try reader.read(&buf);
|
||||
written += try writer.write(buf[0..read]);
|
||||
}
|
||||
try expect(written == 0x23e8);
|
||||
}
|
||||
|
||||
test "compressor reset" {
|
||||
for (std.enums.values(deflate.Compression)) |c| {
|
||||
try testWriterReset(c, null);
|
||||
try testWriterReset(c, "dict");
|
||||
try testWriterReset(c, "hello");
|
||||
}
|
||||
}
|
||||
|
||||
fn testWriterReset(level: deflate.Compression, dict: ?[]const u8) !void {
|
||||
const filler = struct {
|
||||
fn writeData(c: anytype) !void {
|
||||
const msg = "all your base are belong to us";
|
||||
try c.writer().writeAll(msg);
|
||||
try c.flush();
|
||||
|
||||
const hello = "hello world";
|
||||
var i: usize = 0;
|
||||
while (i < 1024) : (i += 1) {
|
||||
try c.writer().writeAll(hello);
|
||||
}
|
||||
|
||||
i = 0;
|
||||
while (i < 65000) : (i += 1) {
|
||||
try c.writer().writeAll("x");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
var buf1 = ArrayList(u8).init(testing.allocator);
|
||||
defer buf1.deinit();
|
||||
var buf2 = ArrayList(u8).init(testing.allocator);
|
||||
defer buf2.deinit();
|
||||
|
||||
var comp = try compressor(
|
||||
testing.allocator,
|
||||
buf1.writer(),
|
||||
.{ .level = level, .dictionary = dict },
|
||||
);
|
||||
defer comp.deinit();
|
||||
|
||||
try filler.writeData(&comp);
|
||||
try comp.close();
|
||||
|
||||
comp.reset(buf2.writer());
|
||||
try filler.writeData(&comp);
|
||||
try comp.close();
|
||||
|
||||
try expect(mem.eql(u8, buf1.items, buf2.items));
|
||||
}
|
||||
|
||||
test "decompressor dictionary" {
|
||||
const dict = "hello world"; // dictionary
|
||||
const text = "hello again world";
|
||||
|
||||
var compressed = fifo
|
||||
.LinearFifo(u8, fifo.LinearFifoBufferType.Dynamic)
|
||||
.init(testing.allocator);
|
||||
defer compressed.deinit();
|
||||
|
||||
var comp = try compressor(
|
||||
testing.allocator,
|
||||
compressed.writer(),
|
||||
.{
|
||||
.level = .level_5,
|
||||
.dictionary = null, // no dictionary
|
||||
},
|
||||
);
|
||||
defer comp.deinit();
|
||||
|
||||
// imitate a compressor with a dictionary
|
||||
try comp.writer().writeAll(dict);
|
||||
try comp.flush();
|
||||
compressed.discard(compressed.readableLength()); // empty the output
|
||||
try comp.writer().writeAll(text);
|
||||
try comp.close();
|
||||
|
||||
var decompressed = try testing.allocator.alloc(u8, text.len);
|
||||
defer testing.allocator.free(decompressed);
|
||||
|
||||
var decomp = try decompressor(
|
||||
testing.allocator,
|
||||
compressed.reader(),
|
||||
dict,
|
||||
);
|
||||
defer decomp.deinit();
|
||||
|
||||
_ = try decomp.reader().readAll(decompressed);
|
||||
try expect(mem.eql(u8, decompressed, "hello again world"));
|
||||
}
|
||||
|
||||
test "compressor dictionary" {
|
||||
const dict = "hello world";
|
||||
const text = "hello again world";
|
||||
|
||||
var compressed_nd = fifo
|
||||
.LinearFifo(u8, fifo.LinearFifoBufferType.Dynamic)
|
||||
.init(testing.allocator); // compressed with no dictionary
|
||||
defer compressed_nd.deinit();
|
||||
|
||||
var compressed_d = ArrayList(u8).init(testing.allocator); // compressed with a dictionary
|
||||
defer compressed_d.deinit();
|
||||
|
||||
// imitate a compressor with a dictionary
|
||||
var comp_nd = try compressor(
|
||||
testing.allocator,
|
||||
compressed_nd.writer(),
|
||||
.{
|
||||
.level = .level_5,
|
||||
.dictionary = null, // no dictionary
|
||||
},
|
||||
);
|
||||
defer comp_nd.deinit();
|
||||
try comp_nd.writer().writeAll(dict);
|
||||
try comp_nd.flush();
|
||||
compressed_nd.discard(compressed_nd.readableLength()); // empty the output
|
||||
try comp_nd.writer().writeAll(text);
|
||||
try comp_nd.close();
|
||||
|
||||
// use a compressor with a dictionary
|
||||
var comp_d = try compressor(
|
||||
testing.allocator,
|
||||
compressed_d.writer(),
|
||||
.{
|
||||
.level = .level_5,
|
||||
.dictionary = dict, // with a dictionary
|
||||
},
|
||||
);
|
||||
defer comp_d.deinit();
|
||||
try comp_d.writer().writeAll(text);
|
||||
try comp_d.close();
|
||||
|
||||
try expect(mem.eql(u8, compressed_nd.readableSlice(0), compressed_d.items));
|
||||
}
|
||||
|
||||
// Update the hash for best_speed only if d.index < d.maxInsertIndex
|
||||
// See https://golang.org/issue/2508
|
||||
test "Go non-regression test for 2508" {
|
||||
var comp = try compressor(
|
||||
testing.allocator,
|
||||
io.null_writer,
|
||||
.{ .level = .best_speed },
|
||||
);
|
||||
defer comp.deinit();
|
||||
|
||||
var buf = [_]u8{0} ** 1024;
|
||||
|
||||
var i: usize = 0;
|
||||
while (i < 131_072) : (i += 1) {
|
||||
try comp.writer().writeAll(&buf);
|
||||
try comp.close();
|
||||
}
|
||||
}
|
||||
|
||||
test "deflate/inflate string" {
|
||||
// Skip wasi because it does not support std.fs.openDirAbsolute()
|
||||
if (builtin.os.tag == .wasi) return error.SkipZigTest;
|
||||
|
||||
const current_dir = try std.fs.openDirAbsolute(std.fs.path.dirname(@src().file).?, .{});
|
||||
const testdata_dir = try current_dir.openDir("testdata", .{});
|
||||
|
||||
const StringTest = struct {
|
||||
filename: []const u8,
|
||||
limit: [11]u32,
|
||||
};
|
||||
|
||||
var deflate_inflate_string_tests = [_]StringTest{
|
||||
.{
|
||||
.filename = "compress-e.txt",
|
||||
.limit = [11]u32{
|
||||
100_018, // no_compression
|
||||
50_650, // best_speed
|
||||
50_960, // 2
|
||||
51_150, // 3
|
||||
50_930, // 4
|
||||
50_790, // 5
|
||||
50_790, // 6
|
||||
50_790, // 7
|
||||
50_790, // 8
|
||||
50_790, // best_compression
|
||||
43_683, // huffman_only
|
||||
},
|
||||
},
|
||||
.{
|
||||
.filename = "rfc1951.txt",
|
||||
.limit = [11]u32{
|
||||
36_954, // no_compression
|
||||
12_952, // best_speed
|
||||
12_228, // 2
|
||||
12_016, // 3
|
||||
11_466, // 4
|
||||
11_191, // 5
|
||||
11_129, // 6
|
||||
11_120, // 7
|
||||
11_112, // 8
|
||||
11_109, // best_compression
|
||||
20_273, // huffman_only
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
for (deflate_inflate_string_tests) |t| {
|
||||
const golden_file = try testdata_dir.openFile(t.filename, .{ .read = true });
|
||||
defer golden_file.close();
|
||||
var golden = try golden_file.reader().readAllAlloc(testing.allocator, math.maxInt(usize));
|
||||
defer testing.allocator.free(golden);
|
||||
|
||||
try testToFromWithLimit(golden, t.limit);
|
||||
}
|
||||
}
|
||||
|
||||
test "inflate reset" {
|
||||
const strings = [_][]const u8{
|
||||
"lorem ipsum izzle fo rizzle",
|
||||
"the quick brown fox jumped over",
|
||||
};
|
||||
|
||||
var compressed_strings = [_]ArrayList(u8){
|
||||
ArrayList(u8).init(testing.allocator),
|
||||
ArrayList(u8).init(testing.allocator),
|
||||
};
|
||||
defer compressed_strings[0].deinit();
|
||||
defer compressed_strings[1].deinit();
|
||||
|
||||
for (strings) |s, i| {
|
||||
var comp = try compressor(
|
||||
testing.allocator,
|
||||
compressed_strings[i].writer(),
|
||||
.{ .level = .level_6 },
|
||||
);
|
||||
defer comp.deinit();
|
||||
|
||||
try comp.writer().writeAll(s);
|
||||
try comp.close();
|
||||
}
|
||||
|
||||
var decomp = try decompressor(
|
||||
testing.allocator,
|
||||
io.fixedBufferStream(compressed_strings[0].items).reader(),
|
||||
null,
|
||||
);
|
||||
defer decomp.deinit();
|
||||
|
||||
var decompressed_0: []u8 = try decomp.reader()
|
||||
.readAllAlloc(testing.allocator, math.maxInt(usize));
|
||||
defer testing.allocator.free(decompressed_0);
|
||||
|
||||
try decomp.reset(
|
||||
io.fixedBufferStream(compressed_strings[1].items).reader(),
|
||||
null,
|
||||
);
|
||||
|
||||
var decompressed_1: []u8 = try decomp.reader()
|
||||
.readAllAlloc(testing.allocator, math.maxInt(usize));
|
||||
defer testing.allocator.free(decompressed_1);
|
||||
|
||||
_ = decomp.close();
|
||||
|
||||
try expect(strings[0].len == decompressed_0.len);
|
||||
try expect(strings[1].len == decompressed_1.len);
|
||||
|
||||
try expect(mem.eql(u8, strings[0], decompressed_0));
|
||||
try expect(mem.eql(u8, strings[1], decompressed_1));
|
||||
}
|
||||
|
||||
test "inflate reset dictionary" {
|
||||
const dict = "the lorem fox";
|
||||
const strings = [_][]const u8{
|
||||
"lorem ipsum izzle fo rizzle",
|
||||
"the quick brown fox jumped over",
|
||||
};
|
||||
|
||||
var compressed_strings = [_]ArrayList(u8){
|
||||
ArrayList(u8).init(testing.allocator),
|
||||
ArrayList(u8).init(testing.allocator),
|
||||
};
|
||||
defer compressed_strings[0].deinit();
|
||||
defer compressed_strings[1].deinit();
|
||||
|
||||
for (strings) |s, i| {
|
||||
var comp = try compressor(
|
||||
testing.allocator,
|
||||
compressed_strings[i].writer(),
|
||||
.{ .level = .level_6 },
|
||||
);
|
||||
defer comp.deinit();
|
||||
|
||||
try comp.writer().writeAll(s);
|
||||
try comp.close();
|
||||
}
|
||||
|
||||
var decomp = try decompressor(
|
||||
testing.allocator,
|
||||
io.fixedBufferStream(compressed_strings[0].items).reader(),
|
||||
dict,
|
||||
);
|
||||
defer decomp.deinit();
|
||||
|
||||
var decompressed_0: []u8 = try decomp.reader()
|
||||
.readAllAlloc(testing.allocator, math.maxInt(usize));
|
||||
defer testing.allocator.free(decompressed_0);
|
||||
|
||||
try decomp.reset(
|
||||
io.fixedBufferStream(compressed_strings[1].items).reader(),
|
||||
dict,
|
||||
);
|
||||
|
||||
var decompressed_1: []u8 = try decomp.reader()
|
||||
.readAllAlloc(testing.allocator, math.maxInt(usize));
|
||||
defer testing.allocator.free(decompressed_1);
|
||||
|
||||
_ = decomp.close();
|
||||
|
||||
try expect(strings[0].len == decompressed_0.len);
|
||||
try expect(strings[1].len == decompressed_1.len);
|
||||
|
||||
try expect(mem.eql(u8, strings[0], decompressed_0));
|
||||
try expect(mem.eql(u8, strings[1], decompressed_1));
|
||||
}
|
1090
lib/std/compress/deflate/decompressor.zig
Normal file
1090
lib/std/compress/deflate/decompressor.zig
Normal file
File diff suppressed because it is too large
Load Diff
28
lib/std/compress/deflate/deflate_const.zig
Normal file
28
lib/std/compress/deflate/deflate_const.zig
Normal file
@ -0,0 +1,28 @@
|
||||
// Deflate
|
||||
|
||||
// Biggest block size for uncompressed block.
|
||||
pub const max_store_block_size = 65535;
|
||||
// The special code used to mark the end of a block.
|
||||
pub const end_block_marker = 256;
|
||||
|
||||
// LZ77
|
||||
|
||||
// The smallest match length per the RFC section 3.2.5
|
||||
pub const base_match_length = 3;
|
||||
// The smallest match offset.
|
||||
pub const base_match_offset = 1;
|
||||
// The largest match length.
|
||||
pub const max_match_length = 258;
|
||||
// The largest match offset.
|
||||
pub const max_match_offset = 1 << 15;
|
||||
|
||||
// Huffman Codes
|
||||
|
||||
// The largest offset code.
|
||||
pub const offset_code_count = 30;
|
||||
// Max number of frequencies used for a Huffman Code
|
||||
// Possible lengths are codegenCodeCount (19), offset_code_count (30) and max_num_lit (286).
|
||||
// The largest of these is max_num_lit.
|
||||
pub const max_num_frequencies = max_num_lit;
|
||||
// Maximum number of literals.
|
||||
pub const max_num_lit = 286;
|
721
lib/std/compress/deflate/deflate_fast.zig
Normal file
721
lib/std/compress/deflate/deflate_fast.zig
Normal file
@ -0,0 +1,721 @@
|
||||
// This encoding algorithm, which prioritizes speed over output size, is
|
||||
// based on Snappy's LZ77-style encoder: github.com/golang/snappy
|
||||
|
||||
const std = @import("std");
|
||||
const math = std.math;
|
||||
const mem = std.mem;
|
||||
|
||||
const Allocator = std.mem.Allocator;
|
||||
|
||||
const deflate_const = @import("deflate_const.zig");
|
||||
const deflate = @import("compressor.zig");
|
||||
const token = @import("token.zig");
|
||||
|
||||
const base_match_length = deflate_const.base_match_length;
|
||||
const base_match_offset = deflate_const.base_match_offset;
|
||||
const max_match_length = deflate_const.max_match_length;
|
||||
const max_match_offset = deflate_const.max_match_offset;
|
||||
const max_store_block_size = deflate_const.max_store_block_size;
|
||||
|
||||
const table_bits = 14; // Bits used in the table.
|
||||
const table_mask = table_size - 1; // Mask for table indices. Redundant, but can eliminate bounds checks.
|
||||
const table_shift = 32 - table_bits; // Right-shift to get the table_bits most significant bits of a uint32.
|
||||
const table_size = 1 << table_bits; // Size of the table.
|
||||
|
||||
// Reset the buffer offset when reaching this.
|
||||
// Offsets are stored between blocks as i32 values.
|
||||
// Since the offset we are checking against is at the beginning
|
||||
// of the buffer, we need to subtract the current and input
|
||||
// buffer to not risk overflowing the i32.
|
||||
const buffer_reset = math.maxInt(i32) - max_store_block_size * 2;
|
||||
|
||||
fn load32(b: []u8, i: i32) u32 {
|
||||
var s = b[@intCast(usize, i) .. @intCast(usize, i) + 4];
|
||||
return @intCast(u32, s[0]) |
|
||||
@intCast(u32, s[1]) << 8 |
|
||||
@intCast(u32, s[2]) << 16 |
|
||||
@intCast(u32, s[3]) << 24;
|
||||
}
|
||||
|
||||
fn load64(b: []u8, i: i32) u64 {
|
||||
var s = b[@intCast(usize, i)..@intCast(usize, i + 8)];
|
||||
return @intCast(u64, s[0]) |
|
||||
@intCast(u64, s[1]) << 8 |
|
||||
@intCast(u64, s[2]) << 16 |
|
||||
@intCast(u64, s[3]) << 24 |
|
||||
@intCast(u64, s[4]) << 32 |
|
||||
@intCast(u64, s[5]) << 40 |
|
||||
@intCast(u64, s[6]) << 48 |
|
||||
@intCast(u64, s[7]) << 56;
|
||||
}
|
||||
|
||||
fn hash(u: u32) u32 {
|
||||
return (u *% 0x1e35a7bd) >> table_shift;
|
||||
}
|
||||
|
||||
// These constants are defined by the Snappy implementation so that its
|
||||
// assembly implementation can fast-path some 16-bytes-at-a-time copies.
|
||||
// They aren't necessary in the pure Go implementation, and may not be
|
||||
// necessary in Zig, but using the same thresholds doesn't really hurt.
|
||||
const input_margin = 16 - 1;
|
||||
const min_non_literal_block_size = 1 + 1 + input_margin;
|
||||
|
||||
const TableEntry = struct {
|
||||
val: u32, // Value at destination
|
||||
offset: i32,
|
||||
};
|
||||
|
||||
pub fn deflateFast() DeflateFast {
|
||||
return DeflateFast{
|
||||
.table = [_]TableEntry{.{ .val = 0, .offset = 0 }} ** table_size,
|
||||
.prev = undefined,
|
||||
.prev_len = 0,
|
||||
.cur = max_store_block_size,
|
||||
.allocator = undefined,
|
||||
};
|
||||
}
|
||||
|
||||
// DeflateFast maintains the table for matches,
|
||||
// and the previous byte block for cross block matching.
|
||||
pub const DeflateFast = struct {
|
||||
table: [table_size]TableEntry,
|
||||
prev: []u8, // Previous block, zero length if unknown.
|
||||
prev_len: u32, // Previous block length
|
||||
cur: i32, // Current match offset.
|
||||
allocator: Allocator,
|
||||
|
||||
const Self = @This();
|
||||
|
||||
pub fn init(self: *Self, allocator: Allocator) !void {
|
||||
self.allocator = allocator;
|
||||
self.prev = try allocator.alloc(u8, max_store_block_size);
|
||||
self.prev_len = 0;
|
||||
}
|
||||
|
||||
pub fn deinit(self: *Self) void {
|
||||
self.allocator.free(self.prev);
|
||||
self.prev_len = 0;
|
||||
}
|
||||
|
||||
// Encodes a block given in `src` and appends tokens to `dst` and returns the result.
|
||||
pub fn encode(self: *Self, dst: []token.Token, tokens_count: *u16, src: []u8) void {
|
||||
|
||||
// Ensure that self.cur doesn't wrap.
|
||||
if (self.cur >= buffer_reset) {
|
||||
self.shiftOffsets();
|
||||
}
|
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if (src.len < min_non_literal_block_size) {
|
||||
self.cur += max_store_block_size;
|
||||
self.prev_len = 0;
|
||||
emitLiteral(dst, tokens_count, src);
|
||||
return;
|
||||
}
|
||||
|
||||
// s_limit is when to stop looking for offset/length copies. The input_margin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
var s_limit = @intCast(i32, src.len - input_margin);
|
||||
|
||||
// next_emit is where in src the next emitLiteral should start from.
|
||||
var next_emit: i32 = 0;
|
||||
var s: i32 = 0;
|
||||
var cv: u32 = load32(src, s);
|
||||
var next_hash: u32 = hash(cv);
|
||||
|
||||
outer: while (true) {
|
||||
// Copied from the C++ snappy implementation:
|
||||
//
|
||||
// Heuristic match skipping: If 32 bytes are scanned with no matches
|
||||
// found, start looking only at every other byte. If 32 more bytes are
|
||||
// scanned (or skipped), look at every third byte, etc.. When a match
|
||||
// is found, immediately go back to looking at every byte. This is a
|
||||
// small loss (~5% performance, ~0.1% density) for compressible data
|
||||
// due to more bookkeeping, but for non-compressible data (such as
|
||||
// JPEG) it's a huge win since the compressor quickly "realizes" the
|
||||
// data is incompressible and doesn't bother looking for matches
|
||||
// everywhere.
|
||||
//
|
||||
// The "skip" variable keeps track of how many bytes there are since
|
||||
// the last match; dividing it by 32 (ie. right-shifting by five) gives
|
||||
// the number of bytes to move ahead for each iteration.
|
||||
var skip: i32 = 32;
|
||||
|
||||
var next_s: i32 = s;
|
||||
var candidate: TableEntry = undefined;
|
||||
while (true) {
|
||||
s = next_s;
|
||||
var bytes_between_hash_lookups = skip >> 5;
|
||||
next_s = s + bytes_between_hash_lookups;
|
||||
skip += bytes_between_hash_lookups;
|
||||
if (next_s > s_limit) {
|
||||
break :outer;
|
||||
}
|
||||
candidate = self.table[next_hash & table_mask];
|
||||
var now = load32(src, next_s);
|
||||
self.table[next_hash & table_mask] = .{ .offset = s + self.cur, .val = cv };
|
||||
next_hash = hash(now);
|
||||
|
||||
var offset = s - (candidate.offset - self.cur);
|
||||
if (offset > max_match_offset or cv != candidate.val) {
|
||||
// Out of range or not matched.
|
||||
cv = now;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[next_emit..s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
emitLiteral(dst, tokens_count, src[@intCast(usize, next_emit)..@intCast(usize, s)]);
|
||||
|
||||
// Call emitCopy, and then see if another emitCopy could be our next
|
||||
// move. Repeat until we find no match for the input immediately after
|
||||
// what was consumed by the last emitCopy call.
|
||||
//
|
||||
// If we exit this loop normally then we need to call emitLiteral next,
|
||||
// though we don't yet know how big the literal will be. We handle that
|
||||
// by proceeding to the next iteration of the main loop. We also can
|
||||
// exit this loop via goto if we get close to exhausting the input.
|
||||
while (true) {
|
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
//
|
||||
s += 4;
|
||||
var t = candidate.offset - self.cur + 4;
|
||||
var l = self.matchLen(s, t, src);
|
||||
|
||||
// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
|
||||
dst[tokens_count.*] = token.matchToken(
|
||||
@intCast(u32, l + 4 - base_match_length),
|
||||
@intCast(u32, s - t - base_match_offset),
|
||||
);
|
||||
tokens_count.* += 1;
|
||||
s += l;
|
||||
next_emit = s;
|
||||
if (s >= s_limit) {
|
||||
break :outer;
|
||||
}
|
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-1 and at s. If
|
||||
// another emitCopy is not our next move, also calculate next_hash
|
||||
// at s+1. At least on amd64 architecture, these three hash calculations
|
||||
// are faster as one load64 call (with some shifts) instead of
|
||||
// three load32 calls.
|
||||
var x = load64(src, s - 1);
|
||||
var prev_hash = hash(@truncate(u32, x));
|
||||
self.table[prev_hash & table_mask] = TableEntry{
|
||||
.offset = self.cur + s - 1,
|
||||
.val = @truncate(u32, x),
|
||||
};
|
||||
x >>= 8;
|
||||
var curr_hash = hash(@truncate(u32, x));
|
||||
candidate = self.table[curr_hash & table_mask];
|
||||
self.table[curr_hash & table_mask] = TableEntry{
|
||||
.offset = self.cur + s,
|
||||
.val = @truncate(u32, x),
|
||||
};
|
||||
|
||||
var offset = s - (candidate.offset - self.cur);
|
||||
if (offset > max_match_offset or @truncate(u32, x) != candidate.val) {
|
||||
cv = @truncate(u32, x >> 8);
|
||||
next_hash = hash(cv);
|
||||
s += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (@intCast(u32, next_emit) < src.len) {
|
||||
emitLiteral(dst, tokens_count, src[@intCast(usize, next_emit)..]);
|
||||
}
|
||||
self.cur += @intCast(i32, src.len);
|
||||
self.prev_len = @intCast(u32, src.len);
|
||||
mem.copy(u8, self.prev[0..self.prev_len], src);
|
||||
return;
|
||||
}
|
||||
|
||||
fn emitLiteral(dst: []token.Token, tokens_count: *u16, lit: []u8) void {
|
||||
for (lit) |v| {
|
||||
dst[tokens_count.*] = token.literalToken(@intCast(u32, v));
|
||||
tokens_count.* += 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// matchLen returns the match length between src[s..] and src[t..].
|
||||
// t can be negative to indicate the match is starting in self.prev.
|
||||
// We assume that src[s-4 .. s] and src[t-4 .. t] already match.
|
||||
fn matchLen(self: *Self, s: i32, t: i32, src: []u8) i32 {
|
||||
var s1 = @intCast(u32, s) + max_match_length - 4;
|
||||
if (s1 > src.len) {
|
||||
s1 = @intCast(u32, src.len);
|
||||
}
|
||||
|
||||
// If we are inside the current block
|
||||
if (t >= 0) {
|
||||
var b = src[@intCast(usize, t)..];
|
||||
var a = src[@intCast(usize, s)..@intCast(usize, s1)];
|
||||
b = b[0..a.len];
|
||||
// Extend the match to be as long as possible.
|
||||
for (a) |_, i| {
|
||||
if (a[i] != b[i]) {
|
||||
return @intCast(i32, i);
|
||||
}
|
||||
}
|
||||
return @intCast(i32, a.len);
|
||||
}
|
||||
|
||||
// We found a match in the previous block.
|
||||
var tp = @intCast(i32, self.prev_len) + t;
|
||||
if (tp < 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Extend the match to be as long as possible.
|
||||
var a = src[@intCast(usize, s)..@intCast(usize, s1)];
|
||||
var b = self.prev[@intCast(usize, tp)..@intCast(usize, self.prev_len)];
|
||||
if (b.len > a.len) {
|
||||
b = b[0..a.len];
|
||||
}
|
||||
a = a[0..b.len];
|
||||
for (b) |_, i| {
|
||||
if (a[i] != b[i]) {
|
||||
return @intCast(i32, i);
|
||||
}
|
||||
}
|
||||
|
||||
// If we reached our limit, we matched everything we are
|
||||
// allowed to in the previous block and we return.
|
||||
var n = @intCast(i32, b.len);
|
||||
if (@intCast(u32, s + n) == s1) {
|
||||
return n;
|
||||
}
|
||||
|
||||
// Continue looking for more matches in the current block.
|
||||
a = src[@intCast(usize, s + n)..@intCast(usize, s1)];
|
||||
b = src[0..a.len];
|
||||
for (a) |_, i| {
|
||||
if (a[i] != b[i]) {
|
||||
return @intCast(i32, i) + n;
|
||||
}
|
||||
}
|
||||
return @intCast(i32, a.len) + n;
|
||||
}
|
||||
|
||||
// Reset resets the encoding history.
|
||||
// This ensures that no matches are made to the previous block.
|
||||
pub fn reset(self: *Self) void {
|
||||
self.prev_len = 0;
|
||||
// Bump the offset, so all matches will fail distance check.
|
||||
// Nothing should be >= self.cur in the table.
|
||||
self.cur += max_match_offset;
|
||||
|
||||
// Protect against self.cur wraparound.
|
||||
if (self.cur >= buffer_reset) {
|
||||
self.shiftOffsets();
|
||||
}
|
||||
}
|
||||
|
||||
// shiftOffsets will shift down all match offset.
|
||||
// This is only called in rare situations to prevent integer overflow.
|
||||
//
|
||||
// See https://golang.org/issue/18636 and https://golang.org/issues/34121.
|
||||
fn shiftOffsets(self: *Self) void {
|
||||
if (self.prev_len == 0) {
|
||||
// We have no history; just clear the table.
|
||||
for (self.table) |_, i| {
|
||||
self.table[i] = TableEntry{ .val = 0, .offset = 0 };
|
||||
}
|
||||
self.cur = max_match_offset + 1;
|
||||
return;
|
||||
}
|
||||
|
||||
// Shift down everything in the table that isn't already too far away.
|
||||
for (self.table) |_, i| {
|
||||
var v = self.table[i].offset - self.cur + max_match_offset + 1;
|
||||
if (v < 0) {
|
||||
// We want to reset self.cur to max_match_offset + 1, so we need to shift
|
||||
// all table entries down by (self.cur - (max_match_offset + 1)).
|
||||
// Because we ignore matches > max_match_offset, we can cap
|
||||
// any negative offsets at 0.
|
||||
v = 0;
|
||||
}
|
||||
self.table[i].offset = v;
|
||||
}
|
||||
self.cur = max_match_offset + 1;
|
||||
}
|
||||
};
|
||||
|
||||
test "best speed match 1/3" {
    const expect = std.testing.expect;

    // Each case builds a DeflateFast whose previous-block window is `previous`
    // and checks matchLen(s, t, current): s indexes into `current`, a negative
    // t reaches back into `previous`. The expected values pin how far the match
    // extends before the first mismatching byte.
    //
    // NOTE: the original paste contained the HTML-entity mojibake `¤t`
    // in every call; it is restored to `&current` here.
    {
        // previous[2..] = {0, 1, 2} continues as current[3..] = {0, 1, 2, 3, 4, 5}:
        // wait-free run of 6 matching bytes.
        var previous = [_]u8{ 0, 0, 0, 1, 2 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 3, 4, 5, 0, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(3, -3, &current);
        try expect(got == 6);
    }
    {
        // Match stops when the bytes following the previous block diverge.
        var previous = [_]u8{ 0, 0, 0, 1, 2 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 4, 5, 0, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(3, -3, &current);
        try expect(got == 3);
    }
    {
        // Mismatch inside the previous block itself limits the match to 2.
        var previous = [_]u8{ 0, 0, 0, 1, 1 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 3, 4, 5, 0, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(3, -3, &current);
        try expect(got == 2);
    }
    {
        // Run of equal bytes at the start of `current`.
        var previous = [_]u8{ 0, 0, 0, 1, 2 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(0, -1, &current);
        try expect(got == 4);
    }
    {
        // Match starting deep in the previous block and crossing into current.
        var previous = [_]u8{ 0, 0, 0, 1, 2, 3, 4, 5, 2, 2 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(4, -7, &current);
        try expect(got == 5);
    }
    {
        // Previous block holds unrelated data: no match at all.
        var previous = [_]u8{ 9, 9, 9, 9, 9 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(0, -1, &current);
        try expect(got == 0);
    }
    {
        // t = 0 compares current[1..] against current[0..]; first bytes differ.
        var previous = [_]u8{ 9, 9, 9, 9, 9 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 9, 2, 2, 2, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(1, 0, &current);
        try expect(got == 0);
    }
}
|
||||
|
||||
test "best speed match 2/3" {
    const expect = std.testing.expect;

    // matchLen() cases around an empty (or tiny) previous block: a match that
    // would reach into missing history must terminate immediately.
    //
    // NOTE: the original paste contained the HTML-entity mojibake `¤t`
    // in every call; it is restored to `&current` here.
    {
        // t = -5 reaches into an empty previous block: no match.
        var previous = [_]u8{};
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 9, 2, 2, 2, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(1, -5, &current);
        try expect(got == 0);
    }
    {
        // t = -1 likewise reaches into the empty previous block: no match.
        var previous = [_]u8{};
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 9, 2, 2, 2, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(1, -1, &current);
        try expect(got == 0);
    }
    {
        // Match entirely inside the current block: current[1..4] vs current[0..3].
        var previous = [_]u8{};
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
        var got: i32 = e.matchLen(1, 0, &current);
        try expect(got == 3);
    }
    {
        // Exact match of the whole previous block against the current block.
        var previous = [_]u8{ 3, 4, 5 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 3, 4, 5 };
        var got: i32 = e.matchLen(0, -3, &current);
        try expect(got == 3);
    }
}
|
||||
|
||||
test "best speed match 2/2" {
    const testing = std.testing;
    const expect = testing.expect;

    // Table-driven check of DeflateFast.matchLen() over zero-filled buffers:
    // with all bytes equal, the match length is bounded only by the buffer
    // sizes and the max_match_length limit.
    const Case = struct {
        previous: u32, // size of the previous-block buffer
        current: u32, // size of the current-block buffer
        s: i32, // start position in the current block
        t: i32, // match position (negative: inside the previous block)
        expected: i32, // expected match length
    };

    const cases = [_]Case{
        .{ .previous = 1000, .current = 1000, .s = 0, .t = -1000, .expected = max_match_length - 4 },
        .{ .previous = 200, .current = 500, .s = 0, .t = -200, .expected = max_match_length - 4 },
        .{ .previous = 200, .current = 500, .s = 1, .t = 0, .expected = max_match_length - 4 },
        .{ .previous = max_match_length - 4, .current = 500, .s = 0, .t = -(max_match_length - 4), .expected = max_match_length - 4 },
        .{ .previous = 200, .current = 500, .s = 400, .t = -200, .expected = 100 },
        .{ .previous = 10, .current = 500, .s = 400, .t = 200, .expected = 100 },
    };

    for (cases) |case_| {
        var prev_buf = try testing.allocator.alloc(u8, case_.previous);
        defer testing.allocator.free(prev_buf);
        mem.set(u8, prev_buf, 0);

        var cur_buf = try testing.allocator.alloc(u8, case_.current);
        defer testing.allocator.free(cur_buf);
        mem.set(u8, cur_buf, 0);

        var enc = DeflateFast{
            .prev = prev_buf,
            .prev_len = @intCast(u32, prev_buf.len),
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        const result: i32 = enc.matchLen(case_.s, case_.t, cur_buf);
        try expect(result == case_.expected);
    }
}
|
||||
|
||||
test "best speed shift offsets" {
    const testing = std.testing;
    const expect = std.testing.expect;

    // shiftOffsets() must preserve still-valid matches and reset out-of-range
    // ones when the offset counter wraps.
    // Regression test for https://github.com/golang/go/issues/4142
    var enc = deflateFast();
    try enc.init(testing.allocator);
    defer enc.deinit();

    // Random-looking bytes that produce no matches within themselves.
    var test_data = [32]u8{
        0xf5, 0x25, 0xf2, 0x55, 0xf6, 0xc1, 0x1f, 0x0b, 0x10, 0xa1,
        0xd0, 0x77, 0x56, 0x38, 0xf1, 0x9c, 0x7f, 0x85, 0xc5, 0xbd,
        0x16, 0x28, 0xd4, 0xf9, 0x03, 0xd4, 0xc0, 0xa1, 0x1e, 0x58,
        0x5b, 0xc9,
    };

    var tokens = [_]token.Token{0} ** 32;
    var tokens_count: u16 = 0;

    // Encode twice from a clean state: the second pass should reuse matches
    // from the first block and therefore emit fewer tokens.
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    const want_first = tokens_count;
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    const want_second = tokens_count;

    try expect(want_first > want_second);

    // Move the current indicator to just before the wraparound point.
    enc.cur = buffer_reset - @intCast(i32, test_data.len);

    // Pass 1 before the wrap: must behave like the clean state.
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    try expect(tokens_count == want_first);

    // We are now exactly at the wrap boundary.
    try expect(enc.cur == buffer_reset);

    // Pass 2 must also match the clean state, even across the wrap.
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    try expect(tokens_count == want_second);

    // The wraparound actually happened.
    try expect(enc.cur < buffer_reset);

    // Push the offsets out of range, leaving stale matches at the bottom.
    enc.cur = buffer_reset;
    enc.shiftOffsets();

    // No stale matches may be picked up afterwards.
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    try expect(tokens_count == want_first);
}
|
||||
|
||||
test "best speed reset" {
    // Encoding must stay consistent across a wraparound of the table offset.
    // Regression test for https://github.com/golang/go/issues/34121
    const expect = std.testing.expect;
    const fmt = std.fmt;
    const testing = std.testing;
    const ArrayList = std.ArrayList;

    const input_size = 65536;
    var input = try testing.allocator.alloc(u8, input_size);
    defer testing.allocator.free(input);

    var i: usize = 0;
    while (i < input_size) : (i += 1) {
        _ = try fmt.bufPrint(input, "asdfasdfasdfasdf{d}{d}fghfgujyut{d}yutyu\n", .{ i, i, i });
    }
    // This behavior is specific to level 1 (best_speed).
    const level = .best_speed;
    const offset: usize = 1;

    // Reference output: a compressor started from a clean buffer.
    var want = ArrayList(u8).init(testing.allocator);
    defer want.deinit();
    var clean_comp = try deflate.compressor(
        testing.allocator,
        want.writer(),
        .{ .level = level },
    );
    defer clean_comp.deinit();

    // Write 3 times, close.
    try clean_comp.writer().writeAll(input);
    try clean_comp.writer().writeAll(input);
    try clean_comp.writer().writeAll(input);
    try clean_comp.close();

    var o = offset;
    while (o <= 256) : (o *= 2) {
        var discard = ArrayList(u8).init(testing.allocator);
        defer discard.deinit();

        var comp = try deflate.compressor(
            testing.allocator,
            discard.writer(),
            .{ .level = level },
        );
        defer comp.deinit();

        // Reset until we sit right before the wraparound;
        // each reset advances the offset by max_match_offset.
        i = 0;
        const limit = (buffer_reset - input.len - o - max_match_offset) / max_match_offset;
        while (i < limit) : (i += 1) {
            // Skip ahead until we are close to the wraparound...
            comp.reset(discard.writer());
        }
        var got = ArrayList(u8).init(testing.allocator);
        defer got.deinit();
        comp.reset(got.writer());

        // Write 3 times, close.
        try comp.writer().writeAll(input);
        try comp.writer().writeAll(input);
        try comp.writer().writeAll(input);
        try comp.close();

        // Output must be identical at the wraparound.
        try expect(mem.eql(u8, got.items, want.items));
    }
}
|
166
lib/std/compress/deflate/deflate_fast_test.zig
Normal file
166
lib/std/compress/deflate/deflate_fast_test.zig
Normal file
@ -0,0 +1,166 @@
|
||||
const std = @import("std");
|
||||
const expect = std.testing.expect;
|
||||
const io = std.io;
|
||||
const mem = std.mem;
|
||||
const testing = std.testing;
|
||||
|
||||
const ArrayList = std.ArrayList;
|
||||
|
||||
const deflate = @import("compressor.zig");
|
||||
const inflate = @import("decompressor.zig");
|
||||
const deflate_const = @import("deflate_const.zig");
|
||||
|
||||
test "best speed" {
    // Data must round-trip through deflate then inflate unchanged. The write
    // sizes sit near the thresholds in the compressor.encSpeed method
    // (0, 16, 128), as well as near `deflate_const.max_store_block_size` (65535).

    var abcabc = try testing.allocator.alloc(u8, 131_072);
    defer testing.allocator.free(abcabc);

    for (abcabc) |_, i| {
        abcabc[i] = @intCast(u8, i % 128);
    }

    var tc_01 = [_]u32{ 65536, 0 };
    var tc_02 = [_]u32{ 65536, 1 };
    var tc_03 = [_]u32{ 65536, 1, 256 };
    var tc_04 = [_]u32{ 65536, 1, 65536 };
    var tc_05 = [_]u32{ 65536, 14 };
    var tc_06 = [_]u32{ 65536, 15 };
    var tc_07 = [_]u32{ 65536, 16 };
    var tc_08 = [_]u32{ 65536, 16, 256 };
    var tc_09 = [_]u32{ 65536, 16, 65536 };
    var tc_10 = [_]u32{ 65536, 127 };
    var tc_11 = [_]u32{ 65536, 127 };
    var tc_12 = [_]u32{ 65536, 128 };
    var tc_13 = [_]u32{ 65536, 128, 256 };
    var tc_14 = [_]u32{ 65536, 128, 65536 };
    var tc_15 = [_]u32{ 65536, 129 };
    var tc_16 = [_]u32{ 65536, 65536, 256 };
    var tc_17 = [_]u32{ 65536, 65536, 65536 };
    var test_cases = [_][]u32{
        &tc_01, &tc_02, &tc_03, &tc_04, &tc_05, &tc_06, &tc_07, &tc_08, &tc_09, &tc_10,
        &tc_11, &tc_12, &tc_13, &tc_14, &tc_15, &tc_16, &tc_17,
    };

    for (test_cases) |tc| {
        // The first write size is varied independently of the rest of the case.
        var firsts = [_]u32{ 1, 65534, 65535, 65536, 65537, 131072 };
        for (firsts) |first_n| {
            tc[0] = first_n;

            var to_flush = [_]bool{ false, true };
            for (to_flush) |should_flush| {
                var compressed = ArrayList(u8).init(testing.allocator);
                defer compressed.deinit();

                var want = ArrayList(u8).init(testing.allocator);
                defer want.deinit();

                var comp = try deflate.compressor(
                    testing.allocator,
                    compressed.writer(),
                    .{ .level = .best_speed },
                );
                defer comp.deinit();

                // Feed each chunk, mirroring it into `want`, optionally
                // flushing between chunks.
                for (tc) |n| {
                    try want.appendSlice(abcabc[0..n]);
                    try comp.writer().writeAll(abcabc[0..n]);
                    if (should_flush) {
                        try comp.flush();
                    }
                }

                try comp.close();

                var decompressed = try testing.allocator.alloc(u8, want.items.len);
                defer testing.allocator.free(decompressed);

                var decomp = try inflate.decompressor(
                    testing.allocator,
                    io.fixedBufferStream(compressed.items).reader(),
                    null,
                );
                defer decomp.deinit();

                const nread = try decomp.reader().readAll(decompressed);
                _ = decomp.close();

                try expect(nread == want.items.len);
                try expect(mem.eql(u8, want.items, decompressed));
            }
        }
    }
}
||||
|
||||
test "best speed max match offset" {
    const abc = "abcdefgh";
    const xyz = "stuvwxyz";
    const input_margin = 16 - 1;

    const match_before = [_]bool{ false, true };
    for (match_before) |do_match_before| {
        const extras = [_]u32{
            0,
            input_margin - 1,
            input_margin,
            input_margin + 1,
            2 * input_margin,
        };
        for (extras) |extra| {
            var offset_adj: i32 = -5;
            while (offset_adj <= 5) : (offset_adj += 1) {
                const offset = deflate_const.max_match_offset + offset_adj;

                // Build src of the form
                //   fmt("{s}{s}{s}{s}{s}", .{abc, zeros0, xyz_maybe, abc, zeros1})
                // where:
                //   zeros0 is approximately max_match_offset zeros.
                //   xyz_maybe is either xyz or the empty string.
                //   zeros1 is between 0 and 30 zeros.
                // The distance between the two abc's is `offset`:
                // max_match_offset plus or minus a small adjustment.
                const src_len: usize = @intCast(usize, offset + abc.len + @intCast(i32, extra));
                var src = try testing.allocator.alloc(u8, src_len);
                defer testing.allocator.free(src);

                mem.copy(u8, src, abc);
                if (!do_match_before) {
                    const xyz_offset: usize = @intCast(usize, offset - xyz.len);
                    mem.copy(u8, src[xyz_offset..], xyz);
                }
                const abc_offset: usize = @intCast(usize, offset);
                mem.copy(u8, src[abc_offset..], abc);

                var compressed = ArrayList(u8).init(testing.allocator);
                defer compressed.deinit();

                var comp = try deflate.compressor(
                    testing.allocator,
                    compressed.writer(),
                    .{ .level = .best_speed },
                );
                defer comp.deinit();
                try comp.writer().writeAll(src);
                _ = try comp.close();

                var decompressed = try testing.allocator.alloc(u8, src.len);
                defer testing.allocator.free(decompressed);

                var decomp = try inflate.decompressor(
                    testing.allocator,
                    io.fixedBufferStream(compressed.items).reader(),
                    null,
                );
                defer decomp.deinit();
                const nread = try decomp.reader().readAll(decompressed);
                _ = decomp.close();

                try expect(nread == src.len);
                try expect(mem.eql(u8, decompressed, src));
            }
        }
    }
}
|
420
lib/std/compress/deflate/dict_decoder.zig
Normal file
420
lib/std/compress/deflate/dict_decoder.zig
Normal file
@ -0,0 +1,420 @@
|
||||
const std = @import("std");
|
||||
const assert = std.debug.assert;
|
||||
const mem = std.mem;
|
||||
|
||||
const Allocator = std.mem.Allocator;
|
||||
|
||||
// Implements the LZ77 sliding dictionary as used in decompression.
// LZ77 decompresses data through sequences of two forms of commands:
//
// * Literal insertions: Runs of one or more symbols are inserted into the data
// stream as is. This is accomplished through the writeByte method for a
// single symbol, or combinations of writeSlice/writeMark for multiple symbols.
// Any valid stream must start with a literal insertion if no preset dictionary
// is used.
//
// * Backward copies: Runs of one or more symbols are copied from previously
// emitted data. Backward copies come as the tuple (dist, length) where dist
// determines how far back in the stream to copy from and length determines how
// many bytes to copy. Note that it is valid for the length to be greater than
// the distance. Since LZ77 uses forward copies, that situation is used to
// perform a form of run-length encoding on repeated runs of symbols.
// The writeCopy and tryWriteCopy are used to implement this command.
//
// For performance reasons, this implementation performs little to no sanity
// checks about the arguments. As such, the invariants documented for each
// method call must be respected.
pub const DictDecoder = struct {
    const Self = @This();

    allocator: Allocator = undefined,

    hist: []u8 = undefined, // Sliding window history

    // Invariant: 0 <= rd_pos <= wr_pos <= hist.len
    wr_pos: u32 = 0, // Current output position in buffer
    rd_pos: u32 = 0, // Have emitted hist[0..rd_pos] already
    full: bool = false, // Has a full window length been written yet?

    // init initializes DictDecoder to have a sliding window dictionary of the given
    // size. If a preset dict is provided, it will initialize the dictionary with
    // the contents of dict.
    pub fn init(self: *Self, allocator: Allocator, size: u32, dict: ?[]const u8) !void {
        self.allocator = allocator;

        self.hist = try allocator.alloc(u8, size);

        self.wr_pos = 0;

        if (dict != null) {
            // Keep only the tail of the preset dictionary that fits the window
            // (`-|` saturates to 0 when the dictionary is shorter than hist).
            mem.copy(u8, self.hist, dict.?[dict.?.len -| self.hist.len..]);
            // NOTE(review): assumes dict.len <= size; a longer dict would leave
            // wr_pos > hist.len here — confirm callers guarantee this.
            self.wr_pos = @intCast(u32, dict.?.len);
        }

        if (self.wr_pos == self.hist.len) {
            self.wr_pos = 0;
            self.full = true;
        }
        self.rd_pos = self.wr_pos;
    }

    // Frees the window buffer. Safe to call only after a successful init().
    pub fn deinit(self: *Self) void {
        self.allocator.free(self.hist);
    }

    // Reports the total amount of historical data in the dictionary.
    pub fn histSize(self: *Self) u32 {
        if (self.full) {
            return @intCast(u32, self.hist.len);
        }
        return self.wr_pos;
    }

    // Reports the number of bytes that can be flushed by readFlush.
    pub fn availRead(self: *Self) u32 {
        return self.wr_pos - self.rd_pos;
    }

    // Reports the available amount of output buffer space.
    pub fn availWrite(self: *Self) u32 {
        return @intCast(u32, self.hist.len - self.wr_pos);
    }

    // Returns a slice of the available buffer to write data to.
    //
    // This invariant will be kept: s.len <= availWrite()
    pub fn writeSlice(self: *Self) []u8 {
        return self.hist[self.wr_pos..];
    }

    // Advances the writer pointer by `count` (after the caller filled the
    // slice returned by writeSlice).
    //
    // This invariant must be kept: 0 <= count <= availWrite()
    pub fn writeMark(self: *Self, count: u32) void {
        assert(0 <= count and count <= self.availWrite());
        self.wr_pos += count;
    }

    // Writes a single byte to the dictionary.
    //
    // This invariant must be kept: 0 < availWrite()
    pub fn writeByte(self: *Self, byte: u8) void {
        self.hist[self.wr_pos] = byte;
        self.wr_pos += 1;
    }

    // Copies up to src.len bytes from src into dst; returns the number of
    // bytes actually copied (bounded by dst.len).
    fn copy(dst: []u8, src: []const u8) u32 {
        if (src.len > dst.len) {
            mem.copy(u8, dst, src[0..dst.len]);
            return @intCast(u32, dst.len);
        }
        mem.copy(u8, dst, src);
        return @intCast(u32, src.len);
    }

    // Copies a string at a given (dist, length) to the output.
    // This returns the number of bytes copied and may be less than the requested
    // length if the available space in the output buffer is too small.
    //
    // This invariant must be kept: 0 < dist <= histSize()
    pub fn writeCopy(self: *Self, dist: u32, length: u32) u32 {
        assert(0 < dist and dist <= self.histSize());
        var dst_base = self.wr_pos;
        var dst_pos = dst_base;
        // src_pos may go negative when the source wraps around the window end.
        var src_pos: i32 = @intCast(i32, dst_pos) - @intCast(i32, dist);
        var end_pos = dst_pos + length;
        if (end_pos > self.hist.len) {
            // Clamp the copy to the end of the window; caller retries for the rest.
            end_pos = @intCast(u32, self.hist.len);
        }

        // Copy non-overlapping section after destination position.
        //
        // This section is non-overlapping in that the copy length for this section
        // is always less than or equal to the backwards distance. This can occur
        // if a distance refers to data that wraps-around in the buffer.
        // Thus, a backwards copy is performed here; that is, the exact bytes in
        // the source prior to the copy is placed in the destination.
        if (src_pos < 0) {
            src_pos += @intCast(i32, self.hist.len);
            dst_pos += copy(self.hist[dst_pos..end_pos], self.hist[@intCast(usize, src_pos)..]);
            src_pos = 0;
        }

        // Copy possibly overlapping section before destination position.
        //
        // This section can overlap if the copy length for this section is larger
        // than the backwards distance. This is allowed by LZ77 so that repeated
        // strings can be succinctly represented using (dist, length) pairs.
        // Thus, a forwards copy is performed here; that is, the bytes copied is
        // possibly dependent on the resulting bytes in the destination as the copy
        // progresses along. This is functionally equivalent to the following:
        //
        //    var i = 0;
        //    while(i < end_pos - dst_pos) : (i+=1) {
        //        self.hist[dst_pos+i] = self.hist[src_pos+i];
        //    }
        //    dst_pos = end_pos;
        //
        while (dst_pos < end_pos) {
            dst_pos += copy(self.hist[dst_pos..end_pos], self.hist[@intCast(usize, src_pos)..dst_pos]);
        }

        self.wr_pos = dst_pos;
        return dst_pos - dst_base;
    }

    // Tries to copy a string at a given (distance, length) to the
    // output. This specialized version is optimized for short distances.
    //
    // This method is designed to be inlined for performance reasons.
    //
    // This invariant must be kept: 0 < dist <= histSize()
    pub fn tryWriteCopy(self: *Self, dist: u32, length: u32) u32 {
        var dst_pos = self.wr_pos;
        var end_pos = dst_pos + length;
        // Bail out (returning 0) when the source would wrap around the window
        // start or the copy would run past the window end; the caller falls
        // back to writeCopy in that case.
        if (dst_pos < dist or end_pos > self.hist.len) {
            return 0;
        }
        var dst_base = dst_pos;
        var src_pos = dst_pos - dist;

        // Copy possibly overlapping section before destination position.
        while (dst_pos < end_pos) {
            dst_pos += copy(self.hist[dst_pos..end_pos], self.hist[src_pos..dst_pos]);
        }

        self.wr_pos = dst_pos;
        return dst_pos - dst_base;
    }

    // Returns a slice of the historical buffer that is ready to be
    // emitted to the user. The data returned by readFlush must be fully consumed
    // before calling any other DictDecoder methods.
    pub fn readFlush(self: *Self) []u8 {
        var to_read = self.hist[self.rd_pos..self.wr_pos];
        self.rd_pos = self.wr_pos;
        if (self.wr_pos == self.hist.len) {
            // Window is full: wrap both positions back to the start.
            self.wr_pos = 0;
            self.rd_pos = 0;
            self.full = true;
        }
        return to_read;
    }
};
|
||||
|
||||
// tests
|
||||
|
||||
test "dictionary decoder" {
    const ArrayList = std.ArrayList;
    const expect = std.testing.expect;
    const testing = std.testing;

    const abc = "ABC\n";
    const fox = "The quick brown fox jumped over the lazy dog!\n";
    const poem: []const u8 =
        \\The Road Not Taken
        \\Robert Frost
        \\
        \\Two roads diverged in a yellow wood,
        \\And sorry I could not travel both
        \\And be one traveler, long I stood
        \\And looked down one as far as I could
        \\To where it bent in the undergrowth;
        \\
        \\Then took the other, as just as fair,
        \\And having perhaps the better claim,
        \\Because it was grassy and wanted wear;
        \\Though as for that the passing there
        \\Had worn them really about the same,
        \\
        \\And both that morning equally lay
        \\In leaves no step had trodden black.
        \\Oh, I kept the first for another day!
        \\Yet knowing how way leads on to way,
        \\I doubted if I should ever come back.
        \\
        \\I shall be telling this with a sigh
        \\Somewhere ages and ages hence:
        \\Two roads diverged in a wood, and I-
        \\I took the one less traveled by,
        \\And that has made all the difference.
        \\
    ;

    const uppercase: []const u8 =
        \\THE ROAD NOT TAKEN
        \\ROBERT FROST
        \\
        \\TWO ROADS DIVERGED IN A YELLOW WOOD,
        \\AND SORRY I COULD NOT TRAVEL BOTH
        \\AND BE ONE TRAVELER, LONG I STOOD
        \\AND LOOKED DOWN ONE AS FAR AS I COULD
        \\TO WHERE IT BENT IN THE UNDERGROWTH;
        \\
        \\THEN TOOK THE OTHER, AS JUST AS FAIR,
        \\AND HAVING PERHAPS THE BETTER CLAIM,
        \\BECAUSE IT WAS GRASSY AND WANTED WEAR;
        \\THOUGH AS FOR THAT THE PASSING THERE
        \\HAD WORN THEM REALLY ABOUT THE SAME,
        \\
        \\AND BOTH THAT MORNING EQUALLY LAY
        \\IN LEAVES NO STEP HAD TRODDEN BLACK.
        \\OH, I KEPT THE FIRST FOR ANOTHER DAY!
        \\YET KNOWING HOW WAY LEADS ON TO WAY,
        \\I DOUBTED IF I SHOULD EVER COME BACK.
        \\
        \\I SHALL BE TELLING THIS WITH A SIGH
        \\SOMEWHERE AGES AND AGES HENCE:
        \\TWO ROADS DIVERGED IN A WOOD, AND I-
        \\I TOOK THE ONE LESS TRAVELED BY,
        \\AND THAT HAS MADE ALL THE DIFFERENCE.
        \\
    ;

    // One LZ77 command: either a literal insertion (dist == 0) of `length`
    // bytes, or a backward copy of `length` bytes from `dist` bytes back.
    const PoemRefs = struct {
        dist: u32, // Backward distance (0 if this is an insertion)
        length: u32, // Length of copy or insertion
    };

    // Pre-computed LZ77 encoding of `poem`, replayed through the decoder below.
    var poem_refs = [_]PoemRefs{
        .{ .dist = 0, .length = 38 },  .{ .dist = 33, .length = 3 },  .{ .dist = 0, .length = 48 },
        .{ .dist = 79, .length = 3 },  .{ .dist = 0, .length = 11 },  .{ .dist = 34, .length = 5 },
        .{ .dist = 0, .length = 6 },   .{ .dist = 23, .length = 7 },  .{ .dist = 0, .length = 8 },
        .{ .dist = 50, .length = 3 },  .{ .dist = 0, .length = 2 },   .{ .dist = 69, .length = 3 },
        .{ .dist = 34, .length = 5 },  .{ .dist = 0, .length = 4 },   .{ .dist = 97, .length = 3 },
        .{ .dist = 0, .length = 4 },   .{ .dist = 43, .length = 5 },  .{ .dist = 0, .length = 6 },
        .{ .dist = 7, .length = 4 },   .{ .dist = 88, .length = 7 },  .{ .dist = 0, .length = 12 },
        .{ .dist = 80, .length = 3 },  .{ .dist = 0, .length = 2 },   .{ .dist = 141, .length = 4 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 196, .length = 3 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 157, .length = 3 }, .{ .dist = 0, .length = 6 },   .{ .dist = 181, .length = 3 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 23, .length = 3 },  .{ .dist = 77, .length = 3 },
        .{ .dist = 28, .length = 5 },  .{ .dist = 128, .length = 3 }, .{ .dist = 110, .length = 4 },
        .{ .dist = 70, .length = 3 },  .{ .dist = 0, .length = 4 },   .{ .dist = 85, .length = 6 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 182, .length = 6 }, .{ .dist = 0, .length = 4 },
        .{ .dist = 133, .length = 3 }, .{ .dist = 0, .length = 7 },   .{ .dist = 47, .length = 5 },
        .{ .dist = 0, .length = 20 },  .{ .dist = 112, .length = 5 }, .{ .dist = 0, .length = 1 },
        .{ .dist = 58, .length = 3 },  .{ .dist = 0, .length = 8 },   .{ .dist = 59, .length = 3 },
        .{ .dist = 0, .length = 4 },   .{ .dist = 173, .length = 3 }, .{ .dist = 0, .length = 5 },
        .{ .dist = 114, .length = 3 }, .{ .dist = 0, .length = 4 },   .{ .dist = 92, .length = 5 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 71, .length = 3 },  .{ .dist = 0, .length = 2 },
        .{ .dist = 76, .length = 5 },  .{ .dist = 0, .length = 1 },   .{ .dist = 46, .length = 3 },
        .{ .dist = 96, .length = 4 },  .{ .dist = 130, .length = 4 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 360, .length = 3 }, .{ .dist = 0, .length = 3 },   .{ .dist = 178, .length = 5 },
        .{ .dist = 0, .length = 7 },   .{ .dist = 75, .length = 3 },  .{ .dist = 0, .length = 3 },
        .{ .dist = 45, .length = 6 },  .{ .dist = 0, .length = 6 },   .{ .dist = 299, .length = 6 },
        .{ .dist = 180, .length = 3 }, .{ .dist = 70, .length = 6 },  .{ .dist = 0, .length = 1 },
        .{ .dist = 48, .length = 3 },  .{ .dist = 66, .length = 4 },  .{ .dist = 0, .length = 3 },
        .{ .dist = 47, .length = 5 },  .{ .dist = 0, .length = 9 },   .{ .dist = 325, .length = 3 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 359, .length = 3 }, .{ .dist = 318, .length = 3 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 199, .length = 3 }, .{ .dist = 0, .length = 1 },
        .{ .dist = 344, .length = 3 }, .{ .dist = 0, .length = 3 },   .{ .dist = 248, .length = 3 },
        .{ .dist = 0, .length = 10 },  .{ .dist = 310, .length = 3 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 93, .length = 6 },  .{ .dist = 0, .length = 3 },   .{ .dist = 252, .length = 3 },
        .{ .dist = 157, .length = 4 }, .{ .dist = 0, .length = 2 },   .{ .dist = 273, .length = 5 },
        .{ .dist = 0, .length = 14 },  .{ .dist = 99, .length = 4 },  .{ .dist = 0, .length = 1 },
        .{ .dist = 464, .length = 4 }, .{ .dist = 0, .length = 2 },   .{ .dist = 92, .length = 4 },
        .{ .dist = 495, .length = 3 }, .{ .dist = 0, .length = 1 },   .{ .dist = 322, .length = 4 },
        .{ .dist = 16, .length = 4 },  .{ .dist = 0, .length = 3 },   .{ .dist = 402, .length = 3 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 237, .length = 4 }, .{ .dist = 0, .length = 2 },
        .{ .dist = 432, .length = 4 }, .{ .dist = 0, .length = 1 },   .{ .dist = 483, .length = 5 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 294, .length = 4 }, .{ .dist = 0, .length = 2 },
        .{ .dist = 306, .length = 3 }, .{ .dist = 113, .length = 5 }, .{ .dist = 0, .length = 1 },
        .{ .dist = 26, .length = 4 },  .{ .dist = 164, .length = 3 }, .{ .dist = 488, .length = 4 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 542, .length = 3 }, .{ .dist = 248, .length = 6 },
        .{ .dist = 0, .length = 5 },   .{ .dist = 205, .length = 3 }, .{ .dist = 0, .length = 8 },
        .{ .dist = 48, .length = 3 },  .{ .dist = 449, .length = 6 }, .{ .dist = 0, .length = 2 },
        .{ .dist = 192, .length = 3 }, .{ .dist = 328, .length = 4 }, .{ .dist = 9, .length = 5 },
        .{ .dist = 433, .length = 3 }, .{ .dist = 0, .length = 3 },   .{ .dist = 622, .length = 25 },
        .{ .dist = 615, .length = 5 }, .{ .dist = 46, .length = 5 },  .{ .dist = 0, .length = 2 },
        .{ .dist = 104, .length = 3 }, .{ .dist = 475, .length = 10 }, .{ .dist = 549, .length = 3 },
        .{ .dist = 0, .length = 4 },   .{ .dist = 597, .length = 8 }, .{ .dist = 314, .length = 3 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 473, .length = 6 }, .{ .dist = 317, .length = 5 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 400, .length = 3 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 109, .length = 3 }, .{ .dist = 151, .length = 3 }, .{ .dist = 48, .length = 4 },
        .{ .dist = 0, .length = 4 },   .{ .dist = 125, .length = 3 }, .{ .dist = 108, .length = 3 },
        .{ .dist = 0, .length = 2 },
    };

    // `got` collects everything flushed out of the decoder; `want` is built
    // directly from the plaintext for comparison at the end.
    var got_list = ArrayList(u8).init(testing.allocator);
    defer got_list.deinit();
    var got = got_list.writer();

    var want_list = ArrayList(u8).init(testing.allocator);
    defer want_list.deinit();
    var want = want_list.writer();

    // Small window (2 KiB) so the test exercises window wraparound.
    var dd = DictDecoder{};
    try dd.init(testing.allocator, 1 << 11, null);
    defer dd.deinit();

    // Helpers that drive the decoder and flush to `dst` whenever the window
    // fills up.
    const util = struct {
        // Replays a backward copy, falling back from the fast path
        // (tryWriteCopy) to the general writeCopy when needed.
        fn writeCopy(dst_dd: *DictDecoder, dst: anytype, dist: u32, length: u32) !void {
            var len = length;
            while (len > 0) {
                var n = dst_dd.tryWriteCopy(dist, len);
                if (n == 0) {
                    n = dst_dd.writeCopy(dist, len);
                }

                len -= n;
                if (dst_dd.availWrite() == 0) {
                    _ = try dst.write(dst_dd.readFlush());
                }
            }
        }
        // Inserts a literal string via the writeSlice/writeMark interface.
        fn writeString(dst_dd: *DictDecoder, dst: anytype, str: []const u8) !void {
            var string = str;
            while (string.len > 0) {
                var cnt = DictDecoder.copy(dst_dd.writeSlice(), string);
                dst_dd.writeMark(cnt);
                string = string[cnt..];
                if (dst_dd.availWrite() == 0) {
                    _ = try dst.write(dst_dd.readFlush());
                }
            }
        }
    };

    try util.writeString(&dd, got, ".");
    _ = try want.write(".");

    // Replay the pre-computed LZ77 commands for the poem.
    var str = poem;
    for (poem_refs) |ref, i| {
        _ = i;
        if (ref.dist == 0) {
            try util.writeString(&dd, got, str[0..ref.length]);
        } else {
            try util.writeCopy(&dd, got, ref.dist, ref.length);
        }
        str = str[ref.length..];
    }
    _ = try want.write(poem);

    // Copy from the very start of the history (dist == full window content).
    try util.writeCopy(&dd, got, dd.histSize(), 33);
    _ = try want.write(want_list.items[0..33]);

    // Run-length style copy: length is many times the distance.
    try util.writeString(&dd, got, abc);
    try util.writeCopy(&dd, got, abc.len, 59 * abc.len);
    _ = try want.write(abc ** 60);

    try util.writeString(&dd, got, fox);
    try util.writeCopy(&dd, got, fox.len, 9 * fox.len);
    _ = try want.write(fox ** 10);

    // Minimal distance (1): repeats the last byte.
    try util.writeString(&dd, got, ".");
    try util.writeCopy(&dd, got, 1, 9);
    _ = try want.write("." ** 10);

    try util.writeString(&dd, got, uppercase);
    try util.writeCopy(&dd, got, uppercase.len, 7 * uppercase.len);
    var i: u8 = 0;
    while (i < 8) : (i += 1) {
        _ = try want.write(uppercase);
    }

    // Another copy spanning the whole current history.
    try util.writeCopy(&dd, got, dd.histSize(), 10);
    _ = try want.write(want_list.items[want_list.items.len - dd.histSize() ..][0..10]);

    // Drain whatever is still buffered and compare the full streams.
    _ = try got.write(dd.readFlush());
    try expect(mem.eql(u8, got_list.items, want_list.items));
}
|
1722
lib/std/compress/deflate/huffman_bit_writer.zig
Normal file
1722
lib/std/compress/deflate/huffman_bit_writer.zig
Normal file
File diff suppressed because it is too large
Load Diff
432
lib/std/compress/deflate/huffman_code.zig
Normal file
432
lib/std/compress/deflate/huffman_code.zig
Normal file
@ -0,0 +1,432 @@
|
||||
const std = @import("std");
|
||||
const assert = std.debug.assert;
|
||||
const math = std.math;
|
||||
const mem = std.mem;
|
||||
const sort = std.sort;
|
||||
const testing = std.testing;
|
||||
|
||||
const Allocator = std.mem.Allocator;
|
||||
|
||||
const bu = @import("bits_utils.zig");
|
||||
const deflate_const = @import("deflate_const.zig");
|
||||
|
||||
const max_bits_limit = 16;
|
||||
|
||||
// A single symbol paired with its occurrence count, the unit the
// Huffman-tree construction operates on.
const LiteralNode = struct {
    literal: u16,
    freq: u16,
};
|
||||
|
||||
// Describes the state of the constructed tree for a given depth.
const LevelInfo = struct {
    // Our level. for better printing
    level: u32,

    // Frequency of the last node placed at this level.
    last_freq: u32,

    // Frequency of the next character that would be added to this level.
    next_char_freq: u32,

    // Frequency of the next pair (coming from the level below) to add here.
    // Only valid if the "needed" value of the next lower level is 0.
    next_pair_freq: u32,

    // How many chains remain to be generated on this level before moving
    // up to the next level.
    needed: u32,
};
|
||||
|
||||
// A Huffman code: a bit pattern together with its length in bits.
// A len of 0 marks a symbol that is not used in the encoding.
pub const HuffCode = struct {
    code: u16 = 0,
    len: u16 = 0,

    // Assigns both the bit pattern and its length in one call.
    fn set(self: *HuffCode, code: u16, length: u16) void {
        self.code = code;
        self.len = length;
    }
};
|
||||
|
||||
// Builds canonical, length-limited Huffman codes from symbol frequencies
// (RFC 1951, section 3.2.2). Ported from Go's compress/flate.
pub const HuffmanEncoder = struct {
    codes: []HuffCode,
    freq_cache: []LiteralNode = undefined,
    bit_count: [17]u32 = undefined,
    lns: []LiteralNode = undefined, // sorted by literal, stored to avoid repeated allocation in generate
    lfs: []LiteralNode = undefined, // sorted by frequency, stored to avoid repeated allocation in generate
    allocator: Allocator,

    // Releases the buffers owned by this encoder.
    pub fn deinit(self: *HuffmanEncoder) void {
        self.allocator.free(self.codes);
        self.allocator.free(self.freq_cache);
    }

    // Update this Huffman Code object to be the minimum code for the specified frequency count.
    //
    // freq     An array of frequencies, in which freq[i] gives the frequency of literal i.
    // max_bits The maximum number of bits to use for any literal.
    pub fn generate(self: *HuffmanEncoder, freq: []u16, max_bits: u32) void {
        var list = self.freq_cache[0 .. freq.len + 1];
        // Number of non-zero literals
        var count: u32 = 0;
        // Set list to be the set of all non-zero literals and their frequencies
        for (freq) |f, i| {
            if (f != 0) {
                list[count] = LiteralNode{ .literal = @intCast(u16, i), .freq = f };
                count += 1;
            } else {
                list[count] = LiteralNode{ .literal = 0x00, .freq = 0 };
                self.codes[i].len = 0;
            }
        }
        list[freq.len] = LiteralNode{ .literal = 0x00, .freq = 0 };

        list = list[0..count];
        if (count <= 2) {
            // Handle the small cases here, because they are awkward for the general case code.
            // With two or fewer literals, everything has bit length 1.
            for (list) |node, i| {
                // "list" is in order of increasing literal value.
                self.codes[node.literal].set(@intCast(u16, i), 1);
            }
            return;
        }
        self.lfs = list;
        sort.sort(LiteralNode, self.lfs, {}, byFreq);

        // Get the number of literals for each bit count
        var bit_count = self.bitCounts(list, max_bits);
        // And do the assignment
        self.assignEncodingAndSize(bit_count, list);
    }

    // Returns the total number of bits needed to encode all symbols with the
    // current code table: sum of freq[i] * codes[i].len.
    pub fn bitLength(self: *HuffmanEncoder, freq: []u16) u32 {
        var total: u32 = 0;
        for (freq) |f, i| {
            if (f != 0) {
                total += @intCast(u32, f) * @intCast(u32, self.codes[i].len);
            }
        }
        return total;
    }

    // Return the number of literals assigned to each bit size in the Huffman encoding
    //
    // This method is only called when list.len >= 3
    // The cases of 0, 1, and 2 literals are handled by special case code.
    //
    // list: An array of the literals with non-zero frequencies
    // and their associated frequencies. The array is in order of increasing
    // frequency, and has as its last element a special element with frequency
    // std.math.maxInt(i32)
    //
    // max_bits: The maximum number of bits that should be used to encode any literal.
    // Must be less than 16.
    //
    // Returns an integer array in which array[i] indicates the number of literals
    // that should be encoded in i bits.
    fn bitCounts(self: *HuffmanEncoder, list: []LiteralNode, max_bits_to_use: usize) []u32 {
        var max_bits = max_bits_to_use;
        var n = list.len;

        assert(max_bits < max_bits_limit);

        // The tree can't have greater depth than n - 1, no matter what. This
        // saves a little bit of work in some small cases
        max_bits = @minimum(max_bits, n - 1);

        // Create information about each of the levels.
        // A bogus "Level 0" whose sole purpose is so that
        // level1.prev.needed == 0. This makes level1.next_pair_freq
        // be a legitimate value that never gets chosen.
        var levels: [max_bits_limit]LevelInfo = mem.zeroes([max_bits_limit]LevelInfo);
        // leaf_counts[i] counts the number of literals at the left
        // of ancestors of the rightmost node at level i.
        // leaf_counts[i][j] is the number of literals at the left
        // of the level j ancestor.
        var leaf_counts: [max_bits_limit][max_bits_limit]u32 = mem.zeroes([max_bits_limit][max_bits_limit]u32);

        {
            var level = @as(u32, 1);
            while (level <= max_bits) : (level += 1) {
                // For every level, the first two items are the first two characters.
                // We initialize the levels as if we had already figured this out.
                levels[level] = LevelInfo{
                    .level = level,
                    .last_freq = list[1].freq,
                    .next_char_freq = list[2].freq,
                    .next_pair_freq = list[0].freq + list[1].freq,
                    .needed = 0,
                };
                leaf_counts[level][level] = 2;
                if (level == 1) {
                    levels[level].next_pair_freq = math.maxInt(i32);
                }
            }
        }

        // We need a total of 2*n - 2 items at top level and have already generated 2.
        levels[max_bits].needed = 2 * @intCast(u32, n) - 4;

        {
            var level = max_bits;
            while (true) {
                var l = &levels[level];
                if (l.next_pair_freq == math.maxInt(i32) and l.next_char_freq == math.maxInt(i32)) {
                    // We've run out of both leafs and pairs.
                    // End all calculations for this level.
                    // To make sure we never come back to this level or any lower level,
                    // set next_pair_freq impossibly large.
                    l.needed = 0;
                    levels[level + 1].next_pair_freq = math.maxInt(i32);
                    level += 1;
                    continue;
                }

                var prev_freq = l.last_freq;
                if (l.next_char_freq < l.next_pair_freq) {
                    // The next item on this row is a leaf node.
                    var next = leaf_counts[level][level] + 1;
                    l.last_freq = l.next_char_freq;
                    // Lower leaf_counts are the same of the previous node.
                    leaf_counts[level][level] = next;
                    if (next >= list.len) {
                        l.next_char_freq = maxNode().freq;
                    } else {
                        l.next_char_freq = list[next].freq;
                    }
                } else {
                    // The next item on this row is a pair from the previous row.
                    // next_pair_freq isn't valid until we generate two
                    // more values in the level below
                    l.last_freq = l.next_pair_freq;
                    // Take leaf counts from the lower level, except counts[level] remains the same.
                    mem.copy(u32, leaf_counts[level][0..level], leaf_counts[level - 1][0..level]);
                    levels[l.level - 1].needed = 2;
                }

                l.needed -= 1;
                if (l.needed == 0) {
                    // We've done everything we need to do for this level.
                    // Continue calculating one level up. Fill in next_pair_freq
                    // of that level with the sum of the two nodes we've just calculated on
                    // this level.
                    if (l.level == max_bits) {
                        // All done!
                        break;
                    }
                    levels[l.level + 1].next_pair_freq = prev_freq + l.last_freq;
                    level += 1;
                } else {
                    // If we stole from below, move down temporarily to replenish it.
                    while (levels[level - 1].needed > 0) {
                        level -= 1;
                        if (level == 0) {
                            break;
                        }
                    }
                }
            }
        }

        // Something is wrong if at the end, the top level is null or hasn't used
        // all of the leaves.
        assert(leaf_counts[max_bits][max_bits] == n);

        var bit_count = self.bit_count[0 .. max_bits + 1];
        var bits: u32 = 1;
        var counts = &leaf_counts[max_bits];
        {
            var level = max_bits;
            // NOTE: the original port also checked `if (level == 0) break;`
            // inside this body, but the loop condition already guarantees
            // level >= 1 and the body never modifies `level`, so that branch
            // was unreachable and has been removed.
            while (level > 0) : (level -= 1) {
                // counts[level] gives the number of literals requiring at least "bits"
                // bits to encode.
                bit_count[bits] = counts[level] - counts[level - 1];
                bits += 1;
            }
        }
        return bit_count;
    }

    // Look at the leaves and assign them a bit count and an encoding as specified
    // in RFC 1951 3.2.2
    fn assignEncodingAndSize(self: *HuffmanEncoder, bit_count: []u32, list_arg: []LiteralNode) void {
        var code = @as(u16, 0);
        var list = list_arg;

        for (bit_count) |bits, n| {
            code <<= 1;
            if (n == 0 or bits == 0) {
                continue;
            }
            // The literals list[list.len - bits] .. list[list.len - 1]
            // are encoded using "bits" bits, and get the values
            // code, code + 1, .... The code values are
            // assigned in literal order (not frequency order).
            var chunk = list[list.len - @intCast(u32, bits) ..];

            self.lns = chunk;
            sort.sort(LiteralNode, self.lns, {}, byLiteral);

            for (chunk) |node| {
                self.codes[node.literal] = HuffCode{
                    .code = bu.bitReverse(u16, code, @intCast(u5, n)),
                    .len = @intCast(u16, n),
                };
                code += 1;
            }
            list = list[0 .. list.len - @intCast(u32, bits)];
        }
    }
};
|
||||
|
||||
// Sentinel node whose frequency is larger than any real symbol's,
// used to terminate scans over the literal list.
fn maxNode() LiteralNode {
    return LiteralNode{ .literal = math.maxInt(u16), .freq = math.maxInt(u16) };
}
|
||||
|
||||
// Allocates a HuffmanEncoder able to hold `size` codes.
// Caller owns the result and must call deinit() to release it.
pub fn newHuffmanEncoder(allocator: Allocator, size: u32) !HuffmanEncoder {
    var codes = try allocator.alloc(HuffCode, size);
    // BUGFIX: free `codes` if the second allocation fails; the original
    // leaked it on that error path.
    errdefer allocator.free(codes);
    return HuffmanEncoder{
        .codes = codes,
        // Allocate a reusable buffer with the longest possible frequency table.
        // (deflate_const.max_num_frequencies).
        .freq_cache = try allocator.alloc(LiteralNode, deflate_const.max_num_frequencies + 1),
        .allocator = allocator,
    };
}
|
||||
|
||||
// Generates the HuffmanEncoder corresponding to the fixed literal/length
// table of RFC 1951, section 3.2.6. Caller must deinit() the result.
pub fn generateFixedLiteralEncoding(allocator: Allocator) !HuffmanEncoder {
    var h = try newHuffmanEncoder(allocator, deflate_const.max_num_frequencies);
    var ch: u16 = 0;
    while (ch < deflate_const.max_num_frequencies) : (ch += 1) {
        const size: u16 = switch (ch) {
            0...143 => 8, // codes 00110000 .. 10111111
            144...255 => 9, // codes 110010000 .. 111111111
            256...279 => 7, // codes 0000000 .. 0010111
            else => 8, // codes 11000000 .. 11000111
        };
        const bits: u16 = switch (ch) {
            0...143 => ch + 48,
            144...255 => ch + 400 - 144,
            256...279 => ch - 256,
            else => ch + 192 - 280,
        };
        h.codes[ch] = HuffCode{ .code = bu.bitReverse(u16, bits, @intCast(u5, size)), .len = size };
    }
    return h;
}
|
||||
|
||||
// Generates the HuffmanEncoder for the 30 fixed 5-bit distance codes of
// RFC 1951, section 3.2.6. Caller must deinit() the result.
pub fn generateFixedOffsetEncoding(allocator: Allocator) !HuffmanEncoder {
    var h = try newHuffmanEncoder(allocator, 30);
    var ch: usize = 0;
    while (ch < h.codes.len) : (ch += 1) {
        h.codes[ch] = HuffCode{ .code = bu.bitReverse(u16, @intCast(u16, ch), 5), .len = 5 };
    }
    return h;
}
|
||||
|
||||
// Sort predicate: orders nodes by ascending literal value.
fn byLiteral(context: void, a: LiteralNode, b: LiteralNode) bool {
    _ = context;
    return a.literal < b.literal;
}
|
||||
|
||||
// Sort predicate: orders nodes by ascending frequency, breaking ties by
// ascending literal value.
fn byFreq(context: void, a: LiteralNode, b: LiteralNode) bool {
    _ = context;
    if (a.freq == b.freq) return a.literal < b.literal;
    return a.freq < b.freq;
}
|
||||
|
||||
test "generate a Huffman code from an array of frequencies" {
    var freqs = [19]u16{
        8, // 0
        1, // 1
        1, // 2
        2, // 3
        5, // 4
        10, // 5
        9, // 6
        1, // 7
        0, // 8
        0, // 9
        0, // 10
        0, // 11
        0, // 12
        0, // 13
        0, // 14
        0, // 15
        1, // 16
        3, // 17
        5, // 18
    };

    var enc = try newHuffmanEncoder(testing.allocator, freqs.len);
    defer enc.deinit();
    enc.generate(freqs[0..], 7);

    try testing.expect(enc.bitLength(freqs[0..]) == 141);

    // Expected code length per symbol; zero-frequency symbols get length 0.
    const expected_lens = [19]u16{ 3, 6, 6, 5, 3, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 5, 3 };
    for (expected_lens) |want_len, i| {
        try testing.expect(enc.codes[i].len == want_len);
    }

    // Spot-check the assigned code values as {symbol, code} pairs.
    const expected_codes = [_][2]u16{
        [2]u16{ 5, 0x00 },
        [2]u16{ 6, 0x02 },
        [2]u16{ 0, 0x01 },
        [2]u16{ 4, 0x05 },
        [2]u16{ 18, 0x03 },
        [2]u16{ 3, 0x07 },
        [2]u16{ 17, 0x17 },
        [2]u16{ 1, 0x0f },
        [2]u16{ 2, 0x2f },
        [2]u16{ 7, 0x1f },
        [2]u16{ 16, 0x3f },
    };
    for (expected_codes) |pair| {
        try testing.expect(enc.codes[pair[0]].code == pair[1]);
    }
}
|
||||
|
||||
// Smoke test: building the fixed literal table must succeed and not leak.
test "generate a Huffman code for the fixed litteral table specific to Deflate" {
    var enc = try generateFixedLiteralEncoding(testing.allocator);
    defer enc.deinit();
}
|
||||
|
||||
// Smoke test: building the fixed distance table must succeed and not leak.
test "generate a Huffman code for the 30 possible relative offsets (LZ77 distances) of Deflate" {
    var enc = try generateFixedOffsetEncoding(testing.allocator);
    defer enc.deinit();
}
|
15
lib/std/compress/deflate/mem_utils.zig
Normal file
15
lib/std/compress/deflate/mem_utils.zig
Normal file
@ -0,0 +1,15 @@
|
||||
const std = @import("std");
|
||||
const math = std.math;
|
||||
const mem = std.mem;
|
||||
|
||||
// Copies elements from a source `src` slice into a destination `dst` slice.
// The copy never returns an error but might not be complete if the destination is too small.
// Returns the number of elements copied, which will be the minimum of `src.len` and `dst.len`.
pub fn copy(dst: []u8, src: []const u8) usize {
    const n = math.min(dst.len, src.len);
    mem.copy(u8, dst[0..n], src[0..n]);
    return n;
}
|
1
lib/std/compress/deflate/testdata/compress-e.txt
vendored
Normal file
1
lib/std/compress/deflate/testdata/compress-e.txt
vendored
Normal file
File diff suppressed because one or more lines are too long
29
lib/std/compress/deflate/testdata/compress-gettysburg.txt
vendored
Normal file
29
lib/std/compress/deflate/testdata/compress-gettysburg.txt
vendored
Normal file
@ -0,0 +1,29 @@
|
||||
Four score and seven years ago our fathers brought forth on
|
||||
this continent, a new nation, conceived in Liberty, and dedicated
|
||||
to the proposition that all men are created equal.
|
||||
Now we are engaged in a great Civil War, testing whether that
|
||||
nation, or any nation so conceived and so dedicated, can long
|
||||
endure.
|
||||
We are met on a great battle-field of that war.
|
||||
We have come to dedicate a portion of that field, as a final
|
||||
resting place for those who here gave their lives that that
|
||||
nation might live. It is altogether fitting and proper that
|
||||
we should do this.
|
||||
But, in a larger sense, we can not dedicate - we can not
|
||||
consecrate - we can not hallow - this ground.
|
||||
The brave men, living and dead, who struggled here, have
|
||||
consecrated it, far above our poor power to add or detract.
|
||||
The world will little note, nor long remember what we say here,
|
||||
but it can never forget what they did here.
|
||||
It is for us the living, rather, to be dedicated here to the
|
||||
unfinished work which they who fought here have thus far so
|
||||
nobly advanced. It is rather for us to be here dedicated to
|
||||
the great task remaining before us - that from these honored
|
||||
dead we take increased devotion to that cause for which they
|
||||
gave the last full measure of devotion -
|
||||
that we here highly resolve that these dead shall not have
|
||||
died in vain - that this nation, under God, shall have a new
|
||||
birth of freedom - and that government of the people, by the
|
||||
people, for the people, shall not perish from this earth.
|
||||
|
||||
Abraham Lincoln, November 19, 1863, Gettysburg, Pennsylvania
|
1
lib/std/compress/deflate/testdata/compress-pi.txt
vendored
Normal file
1
lib/std/compress/deflate/testdata/compress-pi.txt
vendored
Normal file
File diff suppressed because one or more lines are too long
BIN
lib/std/compress/deflate/testdata/huffman-null-max.dyn.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-null-max.dyn.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-null-max.dyn.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-null-max.dyn.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-null-max.golden
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-null-max.golden
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-null-max.input
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-null-max.input
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-null-max.wb.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-null-max.wb.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-null-max.wb.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-null-max.wb.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-pi.dyn.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-pi.dyn.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-pi.dyn.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-pi.dyn.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-pi.golden
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-pi.golden
vendored
Normal file
Binary file not shown.
1
lib/std/compress/deflate/testdata/huffman-pi.input
vendored
Normal file
1
lib/std/compress/deflate/testdata/huffman-pi.input
vendored
Normal file
@ -0,0 +1 @@
|
||||
3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117450284102701938521105559644622948954930381964428810975665933446128475648233786783165271201909145648566923460348610454326648213393607260249141273724587006606315588174881520920962829254091715364367892590360011330530548820466521384146951941511609433057270365759591953092186117381932611793105118548074462379962749567351885752724891227938183011949129833673362440656643086021394946395224737190702179860943702770539217176293176752384674818467669405132000568127145263560827785771342757789609173637178721468440901224953430146549585371050792279689258923542019956112129021960864034418159813629774771309960518707211349999998372978049951059731732816096318595024459455346908302642522308253344685035261931188171010003137838752886587533208381420617177669147303598253490428755468731159562863882353787593751957781857780532171226806613001927876611195909216420198938095257201065485863278865936153381827968230301952035301852968995773622599413891249721775283479131515574857242454150695950829533116861727855889075098381754637464939319255060400927701671139009848824012858361603563707660104710181942955596198946767837449448255379774726847104047534646208046684259069491293313677028989152104752162056966024058038150193511253382430035587640247496473263914199272604269922796782354781636009341721641219924586315030286182974555706749838505494588586926995690927210797509302955321165344987202755960236480665499119881834797753566369807426542527862551818417574672890977772793800081647060016145249192173217214772350141441973568548161361157352552133475741849468438523323907394143334547762416862518983569485562099219222184272550254256887671790494601653466804988627232791786085784383827967976681454100953883786360950680064225125205117392984896084128488626945604241965285022210661186306744278622039194945047123713786960956364371917287467764657573962413890865832645995813390478027590
099465764078951269468398352595709825822620522489407726719478268482601476990902640136394437455305068203496252451749399651431429809190659250937221696461515709858387410597885959772975498930161753928468138268683868942774155991855925245953959431049972524680845987273644695848653836736222626099124608051243884390451244136549762780797715691435997700129616089441694868555848406353422072225828488648158456028506016842739452267467678895252138522549954666727823986456596116354886230577456498035593634568174324112515076069479451096596094025228879710893145669136867228748940560101503308617928680920874760917824938589009714909675985261365549781893129784821682998948722658804857564014270477555132379641451523746234364542858444795265867821051141354735739523113427166102135969536231442952484937187110145765403590279934403742007310578539062198387447808478489683321445713868751943506430218453191048481005370614680674919278191197939952061419663428754440643745123718192179998391015919561814675142691239748940907186494231961567945208095146550225231603881930142093762137855956638937787083039069792077346722182562599661501421503068038447734549202605414665925201497442850732518666002132434088190710486331734649651453905796268561005508106658796998163574736384052571459102897064140110971206280439039759515677157700420337869936007230558763176359421873125147120532928191826186125867321579198414848829164470609575270695722091756711672291098169091528017350671274858322287183520935396572512108357915136988209144421006751033467110314126711136990865851639831501970165151168517143765761835155650884909989859982387345528331635507647918535893226185489632132933089857064204675259070915481416549859461637180
|
BIN
lib/std/compress/deflate/testdata/huffman-pi.wb.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-pi.wb.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-pi.wb.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-pi.wb.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.dyn.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.dyn.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.dyn.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.dyn.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.golden
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.golden
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.input
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.input
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.wb.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.wb.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.wb.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-1k.wb.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.dyn.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.dyn.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.dyn.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.dyn.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.golden
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.golden
vendored
Normal file
Binary file not shown.
4
lib/std/compress/deflate/testdata/huffman-rand-limit.input
vendored
Normal file
4
lib/std/compress/deflate/testdata/huffman-rand-limit.input
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
ř‹–vH
|
||||
…”%€ŻÂţŤč ë†É·ĹŢę}‹ç>Úß˙lsŢĚçmŤIGH°čžň1YŢ4´[ĺŕ 0Â<30>[|]o#©
|
||||
Ľ-#ľŮíul™ßýpfćîٱžn<C5BE>YŐÔ€Y<E282AC>w‰C8ÉŻ02š F=gn×ržN!OĆŕÔ{ŤĄö›kÜ*“w(ý´bÚ ç«kQC9/ ’lu>ô5ýC.÷¤uÚę›
|
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.wb.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.wb.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.wb.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-limit.wb.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-max.golden
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-max.golden
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-rand-max.input
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-rand-max.input
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-shifts.dyn.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-shifts.dyn.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-shifts.dyn.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-shifts.dyn.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-shifts.golden
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-shifts.golden
vendored
Normal file
Binary file not shown.
2
lib/std/compress/deflate/testdata/huffman-shifts.input
vendored
Normal file
2
lib/std/compress/deflate/testdata/huffman-shifts.input
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
10101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010
1010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010
|
||||
23232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323
2323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323
|
BIN
lib/std/compress/deflate/testdata/huffman-shifts.wb.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-shifts.wb.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-shifts.wb.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-shifts.wb.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.dyn.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.dyn.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.dyn.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.dyn.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.golden
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.golden
vendored
Normal file
Binary file not shown.
14
lib/std/compress/deflate/testdata/huffman-text-shift.input
vendored
Normal file
14
lib/std/compress/deflate/testdata/huffman-text-shift.input
vendored
Normal file
@ -0,0 +1,14 @@
|
||||
//Copyright2009ThGoAuthor.Allrightrrvd.
|
||||
//UofthiourccodigovrndbyBSD-tyl
|
||||
//licnthtcnbfoundinthLICENSEfil.
|
||||
|
||||
pckgmin
|
||||
|
||||
import"o"
|
||||
|
||||
funcmin(){
|
||||
vrb=mk([]byt,65535)
|
||||
f,_:=o.Crt("huffmn-null-mx.in")
|
||||
f.Writ(b)
|
||||
}
|
||||
ABCDEFGHIJKLMNOPQRSTUVXxyz!"#¤%&/?"
|
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.wb.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.wb.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.wb.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text-shift.wb.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-text.dyn.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text.dyn.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-text.dyn.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text.dyn.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-text.golden
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text.golden
vendored
Normal file
Binary file not shown.
14
lib/std/compress/deflate/testdata/huffman-text.input
vendored
Normal file
14
lib/std/compress/deflate/testdata/huffman-text.input
vendored
Normal file
@ -0,0 +1,14 @@
|
||||
// zig v0.10.0
|
||||
// create a file filled with 0x00
|
||||
const std = @import("std");
|
||||
|
||||
pub fn main() !void {
|
||||
var b = [1]u8{0} ** 65535;
|
||||
const f = try std.fs.cwd().createFile(
|
||||
"huffman-null-max.in",
|
||||
.{ .read = true },
|
||||
);
|
||||
defer f.close();
|
||||
|
||||
_ = try f.writeAll(b[0..]);
|
||||
}
|
BIN
lib/std/compress/deflate/testdata/huffman-text.wb.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text.wb.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-text.wb.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-text.wb.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-zero.dyn.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-zero.dyn.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-zero.dyn.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-zero.dyn.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-zero.golden
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-zero.golden
vendored
Normal file
Binary file not shown.
1
lib/std/compress/deflate/testdata/huffman-zero.input
vendored
Normal file
1
lib/std/compress/deflate/testdata/huffman-zero.input
vendored
Normal file
@ -0,0 +1 @@
|
||||
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
|
BIN
lib/std/compress/deflate/testdata/huffman-zero.wb.expect
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-zero.wb.expect
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/huffman-zero.wb.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/huffman-zero.wb.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/null-long-match.dyn.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/null-long-match.dyn.expect-noinput
vendored
Normal file
Binary file not shown.
BIN
lib/std/compress/deflate/testdata/null-long-match.wb.expect-noinput
vendored
Normal file
BIN
lib/std/compress/deflate/testdata/null-long-match.wb.expect-noinput
vendored
Normal file
Binary file not shown.
955
lib/std/compress/deflate/testdata/rfc1951.txt
vendored
Normal file
955
lib/std/compress/deflate/testdata/rfc1951.txt
vendored
Normal file
@ -0,0 +1,955 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Network Working Group P. Deutsch
|
||||
Request for Comments: 1951 Aladdin Enterprises
|
||||
Category: Informational May 1996
|
||||
|
||||
|
||||
DEFLATE Compressed Data Format Specification version 1.3
|
||||
|
||||
Status of This Memo
|
||||
|
||||
This memo provides information for the Internet community. This memo
|
||||
does not specify an Internet standard of any kind. Distribution of
|
||||
this memo is unlimited.
|
||||
|
||||
IESG Note:
|
||||
|
||||
The IESG takes no position on the validity of any Intellectual
|
||||
Property Rights statements contained in this document.
|
||||
|
||||
Notices
|
||||
|
||||
Copyright (c) 1996 L. Peter Deutsch
|
||||
|
||||
Permission is granted to copy and distribute this document for any
|
||||
purpose and without charge, including translations into other
|
||||
languages and incorporation into compilations, provided that the
|
||||
copyright notice and this notice are preserved, and that any
|
||||
substantive changes or deletions from the original are clearly
|
||||
marked.
|
||||
|
||||
A pointer to the latest version of this and related documentation in
|
||||
HTML format can be found at the URL
|
||||
<ftp://ftp.uu.net/graphics/png/documents/zlib/zdoc-index.html>.
|
||||
|
||||
Abstract
|
||||
|
||||
This specification defines a lossless compressed data format that
|
||||
compresses data using a combination of the LZ77 algorithm and Huffman
|
||||
coding, with efficiency comparable to the best currently available
|
||||
general-purpose compression methods. The data can be produced or
|
||||
consumed, even for an arbitrarily long sequentially presented input
|
||||
data stream, using only an a priori bounded amount of intermediate
|
||||
storage. The format can be implemented readily in a manner not
|
||||
covered by patents.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 1]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
Table of Contents
|
||||
|
||||
1. Introduction ................................................... 2
|
||||
1.1. Purpose ................................................... 2
|
||||
1.2. Intended audience ......................................... 3
|
||||
1.3. Scope ..................................................... 3
|
||||
1.4. Compliance ................................................ 3
|
||||
1.5. Definitions of terms and conventions used ................ 3
|
||||
1.6. Changes from previous versions ............................ 4
|
||||
2. Compressed representation overview ............................. 4
|
||||
3. Detailed specification ......................................... 5
|
||||
3.1. Overall conventions ....................................... 5
|
||||
3.1.1. Packing into bytes .................................. 5
|
||||
3.2. Compressed block format ................................... 6
|
||||
3.2.1. Synopsis of prefix and Huffman coding ............... 6
|
||||
3.2.2. Use of Huffman coding in the "deflate" format ....... 7
|
||||
3.2.3. Details of block format ............................. 9
|
||||
3.2.4. Non-compressed blocks (BTYPE=00) ................... 11
|
||||
3.2.5. Compressed blocks (length and distance codes) ...... 11
|
||||
3.2.6. Compression with fixed Huffman codes (BTYPE=01) .... 12
|
||||
3.2.7. Compression with dynamic Huffman codes (BTYPE=10) .. 13
|
||||
3.3. Compliance ............................................... 14
|
||||
4. Compression algorithm details ................................. 14
|
||||
5. References .................................................... 16
|
||||
6. Security Considerations ....................................... 16
|
||||
7. Source code ................................................... 16
|
||||
8. Acknowledgements .............................................. 16
|
||||
9. Author's Address .............................................. 17
|
||||
|
||||
1. Introduction
|
||||
|
||||
1.1. Purpose
|
||||
|
||||
The purpose of this specification is to define a lossless
|
||||
compressed data format that:
|
||||
* Is independent of CPU type, operating system, file system,
|
||||
and character set, and hence can be used for interchange;
|
||||
* Can be produced or consumed, even for an arbitrarily long
|
||||
sequentially presented input data stream, using only an a
|
||||
priori bounded amount of intermediate storage, and hence
|
||||
can be used in data communications or similar structures
|
||||
such as Unix filters;
|
||||
* Compresses data with efficiency comparable to the best
|
||||
currently available general-purpose compression methods,
|
||||
and in particular considerably better than the "compress"
|
||||
program;
|
||||
* Can be implemented readily in a manner not covered by
|
||||
patents, and hence can be practiced freely;
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 2]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
* Is compatible with the file format produced by the current
|
||||
widely used gzip utility, in that conforming decompressors
|
||||
will be able to read data produced by the existing gzip
|
||||
compressor.
|
||||
|
||||
The data format defined by this specification does not attempt to:
|
||||
|
||||
* Allow random access to compressed data;
|
||||
* Compress specialized data (e.g., raster graphics) as well
|
||||
as the best currently available specialized algorithms.
|
||||
|
||||
A simple counting argument shows that no lossless compression
|
||||
algorithm can compress every possible input data set. For the
|
||||
format defined here, the worst case expansion is 5 bytes per 32K-
|
||||
byte block, i.e., a size increase of 0.015% for large data sets.
|
||||
English text usually compresses by a factor of 2.5 to 3;
|
||||
executable files usually compress somewhat less; graphical data
|
||||
such as raster images may compress much more.
|
||||
|
||||
1.2. Intended audience
|
||||
|
||||
This specification is intended for use by implementors of software
|
||||
to compress data into "deflate" format and/or decompress data from
|
||||
"deflate" format.
|
||||
|
||||
The text of the specification assumes a basic background in
|
||||
programming at the level of bits and other primitive data
|
||||
representations. Familiarity with the technique of Huffman coding
|
||||
is helpful but not required.
|
||||
|
||||
1.3. Scope
|
||||
|
||||
The specification specifies a method for representing a sequence
|
||||
of bytes as a (usually shorter) sequence of bits, and a method for
|
||||
packing the latter bit sequence into bytes.
|
||||
|
||||
1.4. Compliance
|
||||
|
||||
Unless otherwise indicated below, a compliant decompressor must be
|
||||
able to accept and decompress any data set that conforms to all
|
||||
the specifications presented here; a compliant compressor must
|
||||
produce data sets that conform to all the specifications presented
|
||||
here.
|
||||
|
||||
1.5. Definitions of terms and conventions used
|
||||
|
||||
Byte: 8 bits stored or transmitted as a unit (same as an octet).
|
||||
For this specification, a byte is exactly 8 bits, even on machines
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 3]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
which store a character on a number of bits different from eight.
|
||||
See below, for the numbering of bits within a byte.
|
||||
|
||||
String: a sequence of arbitrary bytes.
|
||||
|
||||
1.6. Changes from previous versions
|
||||
|
||||
There have been no technical changes to the deflate format since
|
||||
version 1.1 of this specification. In version 1.2, some
|
||||
terminology was changed. Version 1.3 is a conversion of the
|
||||
specification to RFC style.
|
||||
|
||||
2. Compressed representation overview
|
||||
|
||||
A compressed data set consists of a series of blocks, corresponding
|
||||
to successive blocks of input data. The block sizes are arbitrary,
|
||||
except that non-compressible blocks are limited to 65,535 bytes.
|
||||
|
||||
Each block is compressed using a combination of the LZ77 algorithm
|
||||
and Huffman coding. The Huffman trees for each block are independent
|
||||
of those for previous or subsequent blocks; the LZ77 algorithm may
|
||||
use a reference to a duplicated string occurring in a previous block,
|
||||
up to 32K input bytes before.
|
||||
|
||||
Each block consists of two parts: a pair of Huffman code trees that
|
||||
describe the representation of the compressed data part, and a
|
||||
compressed data part. (The Huffman trees themselves are compressed
|
||||
using Huffman encoding.) The compressed data consists of a series of
|
||||
elements of two types: literal bytes (of strings that have not been
|
||||
detected as duplicated within the previous 32K input bytes), and
|
||||
pointers to duplicated strings, where a pointer is represented as a
|
||||
pair <length, backward distance>. The representation used in the
|
||||
"deflate" format limits distances to 32K bytes and lengths to 258
|
||||
bytes, but does not limit the size of a block, except for
|
||||
uncompressible blocks, which are limited as noted above.
|
||||
|
||||
Each type of value (literals, distances, and lengths) in the
|
||||
compressed data is represented using a Huffman code, using one code
|
||||
tree for literals and lengths and a separate code tree for distances.
|
||||
The code trees for each block appear in a compact form just before
|
||||
the compressed data for that block.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 4]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
3. Detailed specification
|
||||
|
||||
3.1. Overall conventions In the diagrams below, a box like this:
|
||||
|
||||
+---+
|
||||
| | <-- the vertical bars might be missing
|
||||
+---+
|
||||
|
||||
represents one byte; a box like this:
|
||||
|
||||
+==============+
|
||||
| |
|
||||
+==============+
|
||||
|
||||
represents a variable number of bytes.
|
||||
|
||||
Bytes stored within a computer do not have a "bit order", since
|
||||
they are always treated as a unit. However, a byte considered as
|
||||
an integer between 0 and 255 does have a most- and least-
|
||||
significant bit, and since we write numbers with the most-
|
||||
significant digit on the left, we also write bytes with the most-
|
||||
significant bit on the left. In the diagrams below, we number the
|
||||
bits of a byte so that bit 0 is the least-significant bit, i.e.,
|
||||
the bits are numbered:
|
||||
|
||||
+--------+
|
||||
|76543210|
|
||||
+--------+
|
||||
|
||||
Within a computer, a number may occupy multiple bytes. All
|
||||
multi-byte numbers in the format described here are stored with
|
||||
the least-significant byte first (at the lower memory address).
|
||||
For example, the decimal number 520 is stored as:
|
||||
|
||||
0 1
|
||||
+--------+--------+
|
||||
|00001000|00000010|
|
||||
+--------+--------+
|
||||
^ ^
|
||||
| |
|
||||
| + more significant byte = 2 x 256
|
||||
+ less significant byte = 8
|
||||
|
||||
3.1.1. Packing into bytes
|
||||
|
||||
This document does not address the issue of the order in which
|
||||
bits of a byte are transmitted on a bit-sequential medium,
|
||||
since the final data format described here is byte- rather than
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 5]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
bit-oriented. However, we describe the compressed block format
|
||||
in below, as a sequence of data elements of various bit
|
||||
lengths, not a sequence of bytes. We must therefore specify
|
||||
how to pack these data elements into bytes to form the final
|
||||
compressed byte sequence:
|
||||
|
||||
* Data elements are packed into bytes in order of
|
||||
increasing bit number within the byte, i.e., starting
|
||||
with the least-significant bit of the byte.
|
||||
* Data elements other than Huffman codes are packed
|
||||
starting with the least-significant bit of the data
|
||||
element.
|
||||
* Huffman codes are packed starting with the most-
|
||||
significant bit of the code.
|
||||
|
||||
In other words, if one were to print out the compressed data as
|
||||
a sequence of bytes, starting with the first byte at the
|
||||
*right* margin and proceeding to the *left*, with the most-
|
||||
significant bit of each byte on the left as usual, one would be
|
||||
able to parse the result from right to left, with fixed-width
|
||||
elements in the correct MSB-to-LSB order and Huffman codes in
|
||||
bit-reversed order (i.e., with the first bit of the code in the
|
||||
relative LSB position).
|
||||
|
||||
3.2. Compressed block format
|
||||
|
||||
3.2.1. Synopsis of prefix and Huffman coding
|
||||
|
||||
Prefix coding represents symbols from an a priori known
|
||||
alphabet by bit sequences (codes), one code for each symbol, in
|
||||
a manner such that different symbols may be represented by bit
|
||||
sequences of different lengths, but a parser can always parse
|
||||
an encoded string unambiguously symbol-by-symbol.
|
||||
|
||||
We define a prefix code in terms of a binary tree in which the
|
||||
two edges descending from each non-leaf node are labeled 0 and
|
||||
1 and in which the leaf nodes correspond one-for-one with (are
|
||||
labeled with) the symbols of the alphabet; then the code for a
|
||||
symbol is the sequence of 0's and 1's on the edges leading from
|
||||
the root to the leaf labeled with that symbol. For example:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 6]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
/\ Symbol Code
|
||||
0 1 ------ ----
|
||||
/ \ A 00
|
||||
/\ B B 1
|
||||
0 1 C 011
|
||||
/ \ D 010
|
||||
A /\
|
||||
0 1
|
||||
/ \
|
||||
D C
|
||||
|
||||
A parser can decode the next symbol from an encoded input
|
||||
stream by walking down the tree from the root, at each step
|
||||
choosing the edge corresponding to the next input bit.
|
||||
|
||||
Given an alphabet with known symbol frequencies, the Huffman
|
||||
algorithm allows the construction of an optimal prefix code
|
||||
(one which represents strings with those symbol frequencies
|
||||
using the fewest bits of any possible prefix codes for that
|
||||
alphabet). Such a code is called a Huffman code. (See
|
||||
reference [1] in Chapter 5, references for additional
|
||||
information on Huffman codes.)
|
||||
|
||||
Note that in the "deflate" format, the Huffman codes for the
|
||||
various alphabets must not exceed certain maximum code lengths.
|
||||
This constraint complicates the algorithm for computing code
|
||||
lengths from symbol frequencies. Again, see Chapter 5,
|
||||
references for details.
|
||||
|
||||
3.2.2. Use of Huffman coding in the "deflate" format
|
||||
|
||||
The Huffman codes used for each alphabet in the "deflate"
|
||||
format have two additional rules:
|
||||
|
||||
* All codes of a given bit length have lexicographically
|
||||
consecutive values, in the same order as the symbols
|
||||
they represent;
|
||||
|
||||
* Shorter codes lexicographically precede longer codes.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 7]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
We could recode the example above to follow this rule as
|
||||
follows, assuming that the order of the alphabet is ABCD:
|
||||
|
||||
Symbol Code
|
||||
------ ----
|
||||
A 10
|
||||
B 0
|
||||
C 110
|
||||
D 111
|
||||
|
||||
I.e., 0 precedes 10 which precedes 11x, and 110 and 111 are
|
||||
lexicographically consecutive.
|
||||
|
||||
Given this rule, we can define the Huffman code for an alphabet
|
||||
just by giving the bit lengths of the codes for each symbol of
|
||||
the alphabet in order; this is sufficient to determine the
|
||||
actual codes. In our example, the code is completely defined
|
||||
by the sequence of bit lengths (2, 1, 3, 3). The following
|
||||
algorithm generates the codes as integers, intended to be read
|
||||
from most- to least-significant bit. The code lengths are
|
||||
initially in tree[I].Len; the codes are produced in
|
||||
tree[I].Code.
|
||||
|
||||
1) Count the number of codes for each code length. Let
|
||||
bl_count[N] be the number of codes of length N, N >= 1.
|
||||
|
||||
2) Find the numerical value of the smallest code for each
|
||||
code length:
|
||||
|
||||
code = 0;
|
||||
bl_count[0] = 0;
|
||||
for (bits = 1; bits <= MAX_BITS; bits++) {
|
||||
code = (code + bl_count[bits-1]) << 1;
|
||||
next_code[bits] = code;
|
||||
}
|
||||
|
||||
3) Assign numerical values to all codes, using consecutive
|
||||
values for all codes of the same length with the base
|
||||
values determined at step 2. Codes that are never used
|
||||
(which have a bit length of zero) must not be assigned a
|
||||
value.
|
||||
|
||||
for (n = 0; n <= max_code; n++) {
|
||||
len = tree[n].Len;
|
||||
if (len != 0) {
|
||||
tree[n].Code = next_code[len];
|
||||
next_code[len]++;
|
||||
}
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 8]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
}
|
||||
|
||||
Example:
|
||||
|
||||
Consider the alphabet ABCDEFGH, with bit lengths (3, 3, 3, 3,
|
||||
3, 2, 4, 4). After step 1, we have:
|
||||
|
||||
N bl_count[N]
|
||||
- -----------
|
||||
2 1
|
||||
3 5
|
||||
4 2
|
||||
|
||||
Step 2 computes the following next_code values:
|
||||
|
||||
N next_code[N]
|
||||
- ------------
|
||||
1 0
|
||||
2 0
|
||||
3 2
|
||||
4 14
|
||||
|
||||
Step 3 produces the following code values:
|
||||
|
||||
Symbol Length Code
|
||||
------ ------ ----
|
||||
A 3 010
|
||||
B 3 011
|
||||
C 3 100
|
||||
D 3 101
|
||||
E 3 110
|
||||
F 2 00
|
||||
G 4 1110
|
||||
H 4 1111
|
||||
|
||||
3.2.3. Details of block format
|
||||
|
||||
Each block of compressed data begins with 3 header bits
|
||||
containing the following data:
|
||||
|
||||
first bit BFINAL
|
||||
next 2 bits BTYPE
|
||||
|
||||
Note that the header bits do not necessarily begin on a byte
|
||||
boundary, since a block does not necessarily occupy an integral
|
||||
number of bytes.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 9]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
BFINAL is set if and only if this is the last block of the data
|
||||
set.
|
||||
|
||||
BTYPE specifies how the data are compressed, as follows:
|
||||
|
||||
00 - no compression
|
||||
01 - compressed with fixed Huffman codes
|
||||
10 - compressed with dynamic Huffman codes
|
||||
11 - reserved (error)
|
||||
|
||||
The only difference between the two compressed cases is how the
|
||||
Huffman codes for the literal/length and distance alphabets are
|
||||
defined.
|
||||
|
||||
In all cases, the decoding algorithm for the actual data is as
|
||||
follows:
|
||||
|
||||
do
|
||||
read block header from input stream.
|
||||
if stored with no compression
|
||||
skip any remaining bits in current partially
|
||||
processed byte
|
||||
read LEN and NLEN (see next section)
|
||||
copy LEN bytes of data to output
|
||||
otherwise
|
||||
if compressed with dynamic Huffman codes
|
||||
read representation of code trees (see
|
||||
subsection below)
|
||||
loop (until end of block code recognized)
|
||||
decode literal/length value from input stream
|
||||
if value < 256
|
||||
copy value (literal byte) to output stream
|
||||
otherwise
|
||||
if value = end of block (256)
|
||||
break from loop
|
||||
otherwise (value = 257..285)
|
||||
decode distance from input stream
|
||||
|
||||
move backwards distance bytes in the output
|
||||
stream, and copy length bytes from this
|
||||
position to the output stream.
|
||||
end loop
|
||||
while not last block
|
||||
|
||||
Note that a duplicated string reference may refer to a string
|
||||
in a previous block; i.e., the backward distance may cross one
|
||||
or more block boundaries. However a distance cannot refer past
|
||||
the beginning of the output stream. (An application using a
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 10]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
preset dictionary might discard part of the output stream; a
|
||||
distance can refer to that part of the output stream anyway)
|
||||
Note also that the referenced string may overlap the current
|
||||
position; for example, if the last 2 bytes decoded have values
|
||||
X and Y, a string reference with <length = 5, distance = 2>
|
||||
adds X,Y,X,Y,X to the output stream.
|
||||
|
||||
We now specify each compression method in turn.
|
||||
|
||||
3.2.4. Non-compressed blocks (BTYPE=00)
|
||||
|
||||
Any bits of input up to the next byte boundary are ignored.
|
||||
The rest of the block consists of the following information:
|
||||
|
||||
0 1 2 3 4...
|
||||
+---+---+---+---+================================+
|
||||
| LEN | NLEN |... LEN bytes of literal data...|
|
||||
+---+---+---+---+================================+
|
||||
|
||||
LEN is the number of data bytes in the block. NLEN is the
|
||||
one's complement of LEN.
|
||||
|
||||
3.2.5. Compressed blocks (length and distance codes)
|
||||
|
||||
As noted above, encoded data blocks in the "deflate" format
|
||||
consist of sequences of symbols drawn from three conceptually
|
||||
distinct alphabets: either literal bytes, from the alphabet of
|
||||
byte values (0..255), or <length, backward distance> pairs,
|
||||
where the length is drawn from (3..258) and the distance is
|
||||
drawn from (1..32,768). In fact, the literal and length
|
||||
alphabets are merged into a single alphabet (0..285), where
|
||||
values 0..255 represent literal bytes, the value 256 indicates
|
||||
end-of-block, and values 257..285 represent length codes
|
||||
(possibly in conjunction with extra bits following the symbol
|
||||
code) as follows:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 11]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
Extra Extra Extra
|
||||
Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
|
||||
---- ---- ------ ---- ---- ------- ---- ---- -------
|
||||
257 0 3 267 1 15,16 277 4 67-82
|
||||
258 0 4 268 1 17,18 278 4 83-98
|
||||
259 0 5 269 2 19-22 279 4 99-114
|
||||
260 0 6 270 2 23-26 280 4 115-130
|
||||
261 0 7 271 2 27-30 281 5 131-162
|
||||
262 0 8 272 2 31-34 282 5 163-194
|
||||
263 0 9 273 3 35-42 283 5 195-226
|
||||
264 0 10 274 3 43-50 284 5 227-257
|
||||
265 1 11,12 275 3 51-58 285 0 258
|
||||
266 1 13,14 276 3 59-66
|
||||
|
||||
The extra bits should be interpreted as a machine integer
|
||||
stored with the most-significant bit first, e.g., bits 1110
|
||||
represent the value 14.
|
||||
|
||||
Extra Extra Extra
|
||||
Code Bits Dist Code Bits Dist Code Bits Distance
|
||||
---- ---- ---- ---- ---- ------ ---- ---- --------
|
||||
0 0 1 10 4 33-48 20 9 1025-1536
|
||||
1 0 2 11 4 49-64 21 9 1537-2048
|
||||
2 0 3 12 5 65-96 22 10 2049-3072
|
||||
3 0 4 13 5 97-128 23 10 3073-4096
|
||||
4 1 5,6 14 6 129-192 24 11 4097-6144
|
||||
5 1 7,8 15 6 193-256 25 11 6145-8192
|
||||
6 2 9-12 16 7 257-384 26 12 8193-12288
|
||||
7 2 13-16 17 7 385-512 27 12 12289-16384
|
||||
8 3 17-24 18 8 513-768 28 13 16385-24576
|
||||
9 3 25-32 19 8 769-1024 29 13 24577-32768
|
||||
|
||||
3.2.6. Compression with fixed Huffman codes (BTYPE=01)
|
||||
|
||||
The Huffman codes for the two alphabets are fixed, and are not
|
||||
represented explicitly in the data. The Huffman code lengths
|
||||
for the literal/length alphabet are:
|
||||
|
||||
Lit Value Bits Codes
|
||||
--------- ---- -----
|
||||
0 - 143 8 00110000 through
|
||||
10111111
|
||||
144 - 255 9 110010000 through
|
||||
111111111
|
||||
256 - 279 7 0000000 through
|
||||
0010111
|
||||
280 - 287 8 11000000 through
|
||||
11000111
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 12]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
The code lengths are sufficient to generate the actual codes,
|
||||
as described above; we show the codes in the table for added
|
||||
clarity. Literal/length values 286-287 will never actually
|
||||
occur in the compressed data, but participate in the code
|
||||
construction.
|
||||
|
||||
Distance codes 0-31 are represented by (fixed-length) 5-bit
|
||||
codes, with possible additional bits as shown in the table
|
||||
shown in Paragraph 3.2.5, above. Note that distance codes 30-
|
||||
31 will never actually occur in the compressed data.
|
||||
|
||||
3.2.7. Compression with dynamic Huffman codes (BTYPE=10)
|
||||
|
||||
The Huffman codes for the two alphabets appear in the block
|
||||
immediately after the header bits and before the actual
|
||||
compressed data, first the literal/length code and then the
|
||||
distance code. Each code is defined by a sequence of code
|
||||
lengths, as discussed in Paragraph 3.2.2, above. For even
|
||||
greater compactness, the code length sequences themselves are
|
||||
compressed using a Huffman code. The alphabet for code lengths
|
||||
is as follows:
|
||||
|
||||
0 - 15: Represent code lengths of 0 - 15
|
||||
16: Copy the previous code length 3 - 6 times.
|
||||
The next 2 bits indicate repeat length
|
||||
(0 = 3, ... , 3 = 6)
|
||||
Example: Codes 8, 16 (+2 bits 11),
|
||||
16 (+2 bits 10) will expand to
|
||||
12 code lengths of 8 (1 + 6 + 5)
|
||||
17: Repeat a code length of 0 for 3 - 10 times.
|
||||
(3 bits of length)
|
||||
18: Repeat a code length of 0 for 11 - 138 times
|
||||
(7 bits of length)
|
||||
|
||||
A code length of 0 indicates that the corresponding symbol in
|
||||
the literal/length or distance alphabet will not occur in the
|
||||
block, and should not participate in the Huffman code
|
||||
construction algorithm given earlier. If only one distance
|
||||
code is used, it is encoded using one bit, not zero bits; in
|
||||
this case there is a single code length of one, with one unused
|
||||
code. One distance code of zero bits means that there are no
|
||||
distance codes used at all (the data is all literals).
|
||||
|
||||
We can now define the format of the block:
|
||||
|
||||
5 Bits: HLIT, # of Literal/Length codes - 257 (257 - 286)
|
||||
5 Bits: HDIST, # of Distance codes - 1 (1 - 32)
|
||||
4 Bits: HCLEN, # of Code Length codes - 4 (4 - 19)
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 13]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
(HCLEN + 4) x 3 bits: code lengths for the code length
|
||||
alphabet given just above, in the order: 16, 17, 18,
|
||||
0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
|
||||
|
||||
These code lengths are interpreted as 3-bit integers
|
||||
(0-7); as above, a code length of 0 means the
|
||||
corresponding symbol (literal/length or distance code
|
||||
length) is not used.
|
||||
|
||||
HLIT + 257 code lengths for the literal/length alphabet,
|
||||
encoded using the code length Huffman code
|
||||
|
||||
HDIST + 1 code lengths for the distance alphabet,
|
||||
encoded using the code length Huffman code
|
||||
|
||||
The actual compressed data of the block,
|
||||
encoded using the literal/length and distance Huffman
|
||||
codes
|
||||
|
||||
The literal/length symbol 256 (end of data),
|
||||
encoded using the literal/length Huffman code
|
||||
|
||||
The code length repeat codes can cross from HLIT + 257 to the
|
||||
HDIST + 1 code lengths. In other words, all code lengths form
|
||||
a single sequence of HLIT + HDIST + 258 values.
|
||||
|
||||
3.3. Compliance
|
||||
|
||||
A compressor may limit further the ranges of values specified in
|
||||
the previous section and still be compliant; for example, it may
|
||||
limit the range of backward pointers to some value smaller than
|
||||
32K. Similarly, a compressor may limit the size of blocks so that
|
||||
a compressible block fits in memory.
|
||||
|
||||
A compliant decompressor must accept the full range of possible
|
||||
values defined in the previous section, and must accept blocks of
|
||||
arbitrary size.
|
||||
|
||||
4. Compression algorithm details
|
||||
|
||||
While it is the intent of this document to define the "deflate"
|
||||
compressed data format without reference to any particular
|
||||
compression algorithm, the format is related to the compressed
|
||||
formats produced by LZ77 (Lempel-Ziv 1977, see reference [2] below);
|
||||
since many variations of LZ77 are patented, it is strongly
|
||||
recommended that the implementor of a compressor follow the general
|
||||
algorithm presented here, which is known not to be patented per se.
|
||||
The material in this section is not part of the definition of the
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 14]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
specification per se, and a compressor need not follow it in order to
|
||||
be compliant.
|
||||
|
||||
The compressor terminates a block when it determines that starting a
|
||||
new block with fresh trees would be useful, or when the block size
|
||||
fills up the compressor's block buffer.
|
||||
|
||||
The compressor uses a chained hash table to find duplicated strings,
|
||||
using a hash function that operates on 3-byte sequences. At any
|
||||
given point during compression, let XYZ be the next 3 input bytes to
|
||||
be examined (not necessarily all different, of course). First, the
|
||||
compressor examines the hash chain for XYZ. If the chain is empty,
|
||||
the compressor simply writes out X as a literal byte and advances one
|
||||
byte in the input. If the hash chain is not empty, indicating that
|
||||
the sequence XYZ (or, if we are unlucky, some other 3 bytes with the
|
||||
same hash function value) has occurred recently, the compressor
|
||||
compares all strings on the XYZ hash chain with the actual input data
|
||||
sequence starting at the current point, and selects the longest
|
||||
match.
|
||||
|
||||
The compressor searches the hash chains starting with the most recent
|
||||
strings, to favor small distances and thus take advantage of the
|
||||
Huffman encoding. The hash chains are singly linked. There are no
|
||||
deletions from the hash chains; the algorithm simply discards matches
|
||||
that are too old. To avoid a worst-case situation, very long hash
|
||||
chains are arbitrarily truncated at a certain length, determined by a
|
||||
run-time parameter.
|
||||
|
||||
To improve overall compression, the compressor optionally defers the
|
||||
selection of matches ("lazy matching"): after a match of length N has
|
||||
been found, the compressor searches for a longer match starting at
|
||||
the next input byte. If it finds a longer match, it truncates the
|
||||
previous match to a length of one (thus producing a single literal
|
||||
byte) and then emits the longer match. Otherwise, it emits the
|
||||
original match, and, as described above, advances N bytes before
|
||||
continuing.
|
||||
|
||||
Run-time parameters also control this "lazy match" procedure. If
|
||||
compression ratio is most important, the compressor attempts a
|
||||
complete second search regardless of the length of the first match.
|
||||
In the normal case, if the current match is "long enough", the
|
||||
compressor reduces the search for a longer match, thus speeding up
|
||||
the process. If speed is most important, the compressor inserts new
|
||||
strings in the hash table only when no match was found, or when the
|
||||
match is not "too long". This degrades the compression ratio but
|
||||
saves time since there are both fewer insertions and fewer searches.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 15]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
5. References
|
||||
|
||||
[1] Huffman, D. A., "A Method for the Construction of Minimum
|
||||
Redundancy Codes", Proceedings of the Institute of Radio
|
||||
Engineers, September 1952, Volume 40, Number 9, pp. 1098-1101.
|
||||
|
||||
[2] Ziv J., Lempel A., "A Universal Algorithm for Sequential Data
|
||||
Compression", IEEE Transactions on Information Theory, Vol. 23,
|
||||
No. 3, pp. 337-343.
|
||||
|
||||
[3] Gailly, J.-L., and Adler, M., ZLIB documentation and sources,
|
||||
available in ftp://ftp.uu.net/pub/archiving/zip/doc/
|
||||
|
||||
[4] Gailly, J.-L., and Adler, M., GZIP documentation and sources,
|
||||
available as gzip-*.tar in ftp://prep.ai.mit.edu/pub/gnu/
|
||||
|
||||
[5] Schwartz, E. S., and Kallick, B. "Generating a canonical prefix
|
||||
encoding." Comm. ACM, 7,3 (Mar. 1964), pp. 166-169.
|
||||
|
||||
[6] Hirschberg and Lelewer, "Efficient decoding of prefix codes,"
|
||||
Comm. ACM, 33,4, April 1990, pp. 449-459.
|
||||
|
||||
6. Security Considerations
|
||||
|
||||
Any data compression method involves the reduction of redundancy in
|
||||
the data. Consequently, any corruption of the data is likely to have
|
||||
severe effects and be difficult to correct. Uncompressed text, on
|
||||
the other hand, will probably still be readable despite the presence
|
||||
of some corrupted bytes.
|
||||
|
||||
It is recommended that systems using this data format provide some
|
||||
means of validating the integrity of the compressed data. See
|
||||
reference [3], for example.
|
||||
|
||||
7. Source code
|
||||
|
||||
Source code for a C language implementation of a "deflate" compliant
|
||||
compressor and decompressor is available within the zlib package at
|
||||
ftp://ftp.uu.net/pub/archiving/zip/zlib/.
|
||||
|
||||
8. Acknowledgements
|
||||
|
||||
Trademarks cited in this document are the property of their
|
||||
respective owners.
|
||||
|
||||
Phil Katz designed the deflate format. Jean-Loup Gailly and Mark
|
||||
Adler wrote the related software described in this specification.
|
||||
Glenn Randers-Pehrson converted this document to RFC and HTML format.
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 16]
|
||||
|
||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
||||
|
||||
|
||||
9. Author's Address
|
||||
|
||||
L. Peter Deutsch
|
||||
Aladdin Enterprises
|
||||
203 Santa Margarita Ave.
|
||||
Menlo Park, CA 94025
|
||||
|
||||
Phone: (415) 322-0103 (AM only)
|
||||
FAX: (415) 322-1734
|
||||
EMail: <ghost@aladdin.com>
|
||||
|
||||
Questions about the technical content of this specification can be
|
||||
sent by email to:
|
||||
|
||||
Jean-Loup Gailly <gzip@prep.ai.mit.edu> and
|
||||
Mark Adler <madler@alumni.caltech.edu>
|
||||
|
||||
Editorial comments on this specification can be sent by email to:
|
||||
|
||||
L. Peter Deutsch <ghost@aladdin.com> and
|
||||
Glenn Randers-Pehrson <randeg@alumni.rpi.edu>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Deutsch Informational [Page 17]
|
||||
|
104
lib/std/compress/deflate/token.zig
Normal file
104
lib/std/compress/deflate/token.zig
Normal file
@ -0,0 +1,104 @@
|
||||
// 2 bits: type, can be 0 (literal), 1 (EOF), 2 (Match) or 3 (Unused).
|
||||
// 8 bits: xlength (length - MIN_MATCH_LENGTH).
|
||||
// 22 bits: xoffset (offset - MIN_OFFSET_SIZE), or literal.
|
||||
const length_shift = 22;
|
||||
const offset_mask = (1 << length_shift) - 1; // 4_194_303
|
||||
const literal_type = 0 << 30; // 0
|
||||
pub const match_type = 1 << 30; // 1_073_741_824
|
||||
|
||||
// The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
|
||||
// is length_codes[length - MIN_MATCH_LENGTH]
|
||||
var length_codes = [_]u32{
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
|
||||
9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
|
||||
13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
|
||||
15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
|
||||
17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
|
||||
18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
|
||||
19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
|
||||
21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
|
||||
22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
|
||||
22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
|
||||
23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
||||
25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
||||
25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
||||
25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
|
||||
26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
|
||||
26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
|
||||
26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
|
||||
27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
||||
27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
||||
27, 27, 27, 27, 27, 28,
|
||||
};
|
||||
|
||||
var offset_codes = [_]u32{
|
||||
0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
||||
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
||||
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
||||
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
};
|
||||
|
||||
pub const Token = u32;
|
||||
|
||||
// Convert a literal into a literal token.
|
||||
pub fn literalToken(lit: u32) Token {
|
||||
return literal_type + lit;
|
||||
}
|
||||
|
||||
// Convert a < xlength, xoffset > pair into a match token.
|
||||
pub fn matchToken(xlength: u32, xoffset: u32) Token {
|
||||
return match_type + (xlength << length_shift) + xoffset;
|
||||
}
|
||||
|
||||
// Returns the literal of a literal token
|
||||
pub fn literal(t: Token) u32 {
|
||||
return @intCast(u32, t - literal_type);
|
||||
}
|
||||
|
||||
// Returns the extra offset of a match token
|
||||
pub fn offset(t: Token) u32 {
|
||||
return @intCast(u32, t) & offset_mask;
|
||||
}
|
||||
|
||||
pub fn length(t: Token) u32 {
|
||||
return @intCast(u32, (t - match_type) >> length_shift);
|
||||
}
|
||||
|
||||
pub fn lengthCode(len: u32) u32 {
|
||||
return length_codes[len];
|
||||
}
|
||||
|
||||
// Returns the offset code corresponding to a specific offset
|
||||
pub fn offsetCode(off: u32) u32 {
|
||||
if (off < @intCast(u32, offset_codes.len)) {
|
||||
return offset_codes[off];
|
||||
}
|
||||
if (off >> 7 < @intCast(u32, offset_codes.len)) {
|
||||
return offset_codes[off >> 7] + 14;
|
||||
}
|
||||
return offset_codes[off >> 14] + 28;
|
||||
}
|
||||
|
||||
test {
|
||||
const std = @import("std");
|
||||
const expect = std.testing.expect;
|
||||
try expect(matchToken(555, 555) == 3_401_581_099);
|
||||
}
|
@ -20,15 +20,14 @@ pub fn GzipStream(comptime ReaderType: type) type {
|
||||
const Self = @This();
|
||||
|
||||
pub const Error = ReaderType.Error ||
|
||||
deflate.InflateStream(ReaderType).Error ||
|
||||
deflate.Decompressor(ReaderType).Error ||
|
||||
error{ CorruptedData, WrongChecksum };
|
||||
pub const Reader = io.Reader(*Self, Error, read);
|
||||
|
||||
allocator: mem.Allocator,
|
||||
inflater: deflate.InflateStream(ReaderType),
|
||||
inflater: deflate.Decompressor(ReaderType),
|
||||
in_reader: ReaderType,
|
||||
hasher: std.hash.Crc32,
|
||||
window_slice: []u8,
|
||||
read_amt: usize,
|
||||
|
||||
info: struct {
|
||||
@ -93,16 +92,11 @@ pub fn GzipStream(comptime ReaderType: type) type {
|
||||
_ = try source.readIntLittle(u16);
|
||||
}
|
||||
|
||||
// The RFC doesn't say anything about the DEFLATE window size to be
|
||||
// used, default to 32K.
|
||||
var window_slice = try allocator.alloc(u8, 32 * 1024);
|
||||
|
||||
return Self{
|
||||
.allocator = allocator,
|
||||
.inflater = deflate.inflateStream(source, window_slice),
|
||||
.inflater = try deflate.decompressor(allocator, source, null),
|
||||
.in_reader = source,
|
||||
.hasher = std.hash.Crc32.init(),
|
||||
.window_slice = window_slice,
|
||||
.info = .{
|
||||
.filename = filename,
|
||||
.comment = comment,
|
||||
@ -113,7 +107,7 @@ pub fn GzipStream(comptime ReaderType: type) type {
|
||||
}
|
||||
|
||||
pub fn deinit(self: *Self) void {
|
||||
self.allocator.free(self.window_slice);
|
||||
self.inflater.deinit();
|
||||
if (self.info.filename) |filename|
|
||||
self.allocator.free(filename);
|
||||
if (self.info.comment) |comment|
|
||||
|
@ -13,15 +13,14 @@ pub fn ZlibStream(comptime ReaderType: type) type {
|
||||
const Self = @This();
|
||||
|
||||
pub const Error = ReaderType.Error ||
|
||||
deflate.InflateStream(ReaderType).Error ||
|
||||
deflate.Decompressor(ReaderType).Error ||
|
||||
error{ WrongChecksum, Unsupported };
|
||||
pub const Reader = io.Reader(*Self, Error, read);
|
||||
|
||||
allocator: mem.Allocator,
|
||||
inflater: deflate.InflateStream(ReaderType),
|
||||
inflater: deflate.Decompressor(ReaderType),
|
||||
in_reader: ReaderType,
|
||||
hasher: std.hash.Adler32,
|
||||
window_slice: []u8,
|
||||
|
||||
fn init(allocator: mem.Allocator, source: ReaderType) !Self {
|
||||
// Zlib header format is specified in RFC1950
|
||||
@ -38,28 +37,25 @@ pub fn ZlibStream(comptime ReaderType: type) type {
|
||||
|
||||
// The CM field must be 8 to indicate the use of DEFLATE
|
||||
if (CM != 8) return error.InvalidCompression;
|
||||
// CINFO is the base-2 logarithm of the window size, minus 8.
|
||||
// CINFO is the base-2 logarithm of the LZ77 window size, minus 8.
|
||||
// Values above 7 are unspecified and therefore rejected.
|
||||
if (CINFO > 7) return error.InvalidWindowSize;
|
||||
const window_size: u16 = @as(u16, 1) << (CINFO + 8);
|
||||
|
||||
const dictionary = null;
|
||||
// TODO: Support this case
|
||||
if (FDICT != 0)
|
||||
return error.Unsupported;
|
||||
|
||||
var window_slice = try allocator.alloc(u8, window_size);
|
||||
|
||||
return Self{
|
||||
.allocator = allocator,
|
||||
.inflater = deflate.inflateStream(source, window_slice),
|
||||
.inflater = try deflate.decompressor(allocator, source, dictionary),
|
||||
.in_reader = source,
|
||||
.hasher = std.hash.Adler32.init(),
|
||||
.window_slice = window_slice,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn deinit(self: *Self) void {
|
||||
self.allocator.free(self.window_slice);
|
||||
self.inflater.deinit();
|
||||
}
|
||||
|
||||
// Implements the io.Reader interface
|
||||
|
Loading…
Reference in New Issue
Block a user