Add ASTC compression and decompression with Arm astcenc.

Co-authored-by: Gordon A Macpherson <gordon.a.macpherson@gmail.com>
Co-authored-by: Rémi Verschelde <rverschelde@gmail.com>
Author: K. S. Ernest (iFire) Lee, 2022-12-20 10:54:01 -08:00
Committed by: Rémi Verschelde
parent 14fdd28de9
commit 696346f4cc
44 changed files with 29247 additions and 6 deletions

COPYRIGHT.txt (changed)

@@ -141,6 +141,11 @@ Comment: AMD FidelityFX Super Resolution
 Copyright: 2021, Advanced Micro Devices, Inc.
 License: Expat
+
+Files: ./thirdparty/astcenc/
+Comment: Arm ASTC Encoder
+Copyright: 2011-2023, Arm Limited
+License: Apache-2.0
 
 Files: ./thirdparty/basis_universal/
 Comment: Basis Universal
 Copyright: 2022, Binomial LLC.

modules/astcenc/SCsub (new file, 55 lines)

@@ -0,0 +1,55 @@
#!/usr/bin/env python
Import("env")
Import("env_modules")
env_astcenc = env_modules.Clone()
# Thirdparty source files
thirdparty_obj = []
thirdparty_dir = "#thirdparty/astcenc/"
thirdparty_sources = [
"astcenc_averages_and_directions.cpp",
"astcenc_block_sizes.cpp",
"astcenc_color_quantize.cpp",
"astcenc_color_unquantize.cpp",
"astcenc_compress_symbolic.cpp",
"astcenc_compute_variance.cpp",
"astcenc_decompress_symbolic.cpp",
"astcenc_diagnostic_trace.cpp",
"astcenc_entry.cpp",
"astcenc_find_best_partitioning.cpp",
"astcenc_ideal_endpoints_and_weights.cpp",
"astcenc_image.cpp",
"astcenc_integer_sequence.cpp",
"astcenc_mathlib.cpp",
"astcenc_mathlib_softfloat.cpp",
"astcenc_partition_tables.cpp",
"astcenc_percentile_tables.cpp",
"astcenc_pick_best_endpoint_format.cpp",
"astcenc_platform_isa_detection.cpp",
"astcenc_quantization.cpp",
"astcenc_symbolic_physical.cpp",
"astcenc_weight_align.cpp",
"astcenc_weight_quant_xfer_tables.cpp",
]
thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
env_astcenc.Prepend(CPPPATH=[thirdparty_dir])
env_thirdparty = env_astcenc.Clone()
env_thirdparty.disable_warnings()
env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources)
env.modules_sources += thirdparty_obj
# Godot source files
module_obj = []
env_astcenc.add_source_files(module_obj, "*.cpp")
env.modules_sources += module_obj
# Needed to force rebuilding the module files when the thirdparty library is updated.
env.Depends(module_obj, thirdparty_obj)

modules/astcenc/config.py (new file, 6 lines)

@@ -0,0 +1,6 @@
def can_build(env, platform):
    return env.editor_build


def configure(env):
    pass

modules/astcenc/image_compress_astcenc.cpp (new file, 251 lines)

@@ -0,0 +1,251 @@
/**************************************************************************/
/* image_compress_astcenc.cpp */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#include "image_compress_astcenc.h"
#include "core/os/os.h"
#include "core/string/print_string.h"
#include <astcenc.h>
void _compress_astc(Image *r_img, float p_lossy_quality, Image::ASTCFormat p_format) {
uint64_t start_time = OS::get_singleton()->get_ticks_msec();
// TODO: See how to handle lossy quality.
Image::Format img_format = r_img->get_format();
if (img_format >= Image::FORMAT_DXT1) {
return; // Do not compress, already compressed.
}
bool is_hdr = false;
if ((img_format >= Image::FORMAT_RH) && (img_format <= Image::FORMAT_RGBE9995)) {
is_hdr = true;
r_img->convert(Image::FORMAT_RGBAF);
} else {
r_img->convert(Image::FORMAT_RGBA8);
}
// Determine encoder output format from our enum.
Image::Format target_format = Image::FORMAT_RGBA8;
astcenc_profile profile = ASTCENC_PRF_LDR;
unsigned int block_x = 4;
unsigned int block_y = 4;
if (p_format == Image::ASTCFormat::ASTC_FORMAT_4x4) {
if (is_hdr) {
target_format = Image::FORMAT_ASTC_4x4_HDR;
profile = ASTCENC_PRF_HDR;
} else {
target_format = Image::FORMAT_ASTC_4x4;
}
} else if (p_format == Image::ASTCFormat::ASTC_FORMAT_8x8) {
if (is_hdr) {
target_format = Image::FORMAT_ASTC_8x8_HDR;
profile = ASTCENC_PRF_HDR;
} else {
target_format = Image::FORMAT_ASTC_8x8;
}
block_x = 8;
block_y = 8;
}
// Compress image data and (if required) mipmaps.
const bool mipmaps = r_img->has_mipmaps();
int width = r_img->get_width();
int height = r_img->get_height();
print_verbose(vformat("astcenc: Encoding image size %dx%d to format %s%s.", width, height, Image::get_format_name(target_format), mipmaps ? ", with mipmaps" : ""));
// Initialize astcenc.
astcenc_config config;
config.block_x = block_x;
config.block_y = block_y;
config.profile = profile;
const float quality = ASTCENC_PRE_MEDIUM;
// Use a block depth of 1: these are 2D images, and e.g. 8x8x8 is not a valid ASTC block size.
astcenc_error status = astcenc_config_init(profile, block_x, block_y, 1, quality, 0, &config);
ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
vformat("astcenc: Configuration initialization failed: %s.", astcenc_get_error_string(status)));
// Context allocation.
astcenc_context *context;
const unsigned int thread_count = OS::get_singleton()->get_processor_count();
status = astcenc_context_alloc(&config, thread_count, &context);
ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
vformat("astcenc: Context allocation failed: %s.", astcenc_get_error_string(status)));
// Compress image.
Vector<uint8_t> image_data = r_img->get_data();
uint8_t *slices = image_data.ptrw();
astcenc_image image;
image.dim_x = width;
image.dim_y = height;
image.dim_z = 1;
image.data_type = ASTCENC_TYPE_U8;
if (is_hdr) {
image.data_type = ASTCENC_TYPE_F32;
}
image.data = reinterpret_cast<void **>(&slices);
// Compute the number of ASTC blocks in each dimension.
unsigned int block_count_x = (width + block_x - 1) / block_x;
unsigned int block_count_y = (height + block_y - 1) / block_y;
size_t comp_len = block_count_x * block_count_y * 16;
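// For example, a 300x200 image with 4x4 blocks needs ceil(300 / 4) * ceil(200 / 4)
// = 75 * 50 = 3750 blocks, i.e. 60000 bytes at 16 bytes per ASTC block.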
Vector<uint8_t> compressed_data;
compressed_data.resize(comp_len);
compressed_data.fill(0);
const astcenc_swizzle swizzle = {
ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
};
status = astcenc_compress_image(context, &image, &swizzle, compressed_data.ptrw(), comp_len, 0);
ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
vformat("astcenc: ASTC image compression failed: %s.", astcenc_get_error_string(status)));
// Replace original image with compressed one.
r_img->set_data(width, height, mipmaps, target_format, compressed_data);
print_verbose(vformat("astcenc: Encoding took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
}
void _decompress_astc(Image *r_img) {
uint64_t start_time = OS::get_singleton()->get_ticks_msec();
// Determine decompression parameters from image format.
Image::Format img_format = r_img->get_format();
bool is_hdr = false;
unsigned int block_x = 0;
unsigned int block_y = 0;
if (img_format == Image::FORMAT_ASTC_4x4) {
block_x = 4;
block_y = 4;
is_hdr = false;
} else if (img_format == Image::FORMAT_ASTC_4x4_HDR) {
block_x = 4;
block_y = 4;
is_hdr = true;
} else if (img_format == Image::FORMAT_ASTC_8x8) {
block_x = 8;
block_y = 8;
is_hdr = false;
} else if (img_format == Image::FORMAT_ASTC_8x8_HDR) {
block_x = 8;
block_y = 8;
is_hdr = true;
} else {
ERR_FAIL_MSG("astcenc: Cannot decompress Image with a non-ASTC format.");
}
// Initialize astcenc.
astcenc_profile profile = ASTCENC_PRF_LDR;
if (is_hdr) {
profile = ASTCENC_PRF_HDR;
}
astcenc_config config;
const float quality = ASTCENC_PRE_MEDIUM;
astcenc_error status = astcenc_config_init(profile, block_x, block_y, 1, quality, 0, &config);
ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
vformat("astcenc: Configuration initialization failed: %s.", astcenc_get_error_string(status)));
// Context allocation.
astcenc_context *context = nullptr;
const unsigned int thread_count = OS::get_singleton()->get_processor_count();
status = astcenc_context_alloc(&config, thread_count, &context);
ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
vformat("astcenc: Context allocation failed: %s.", astcenc_get_error_string(status)));
// Decompress image.
const bool mipmaps = r_img->has_mipmaps();
int width = r_img->get_width();
int height = r_img->get_height();
astcenc_image image;
image.dim_x = width;
image.dim_y = height;
image.dim_z = 1;
image.data_type = ASTCENC_TYPE_U8;
Image::Format target_format = Image::FORMAT_RGBA8;
if (is_hdr) {
target_format = Image::FORMAT_RGBAF;
image.data_type = ASTCENC_TYPE_F32;
}
Vector<uint8_t> image_data = r_img->get_data();
Vector<uint8_t> new_image_data;
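// Size the output for the base level only: 4 bytes per texel for RGBA8, 16 for RGBAF.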
new_image_data.resize(Image::get_image_data_size(width, height, target_format, false));
new_image_data.fill(0);
uint8_t *slices = new_image_data.ptrw();
image.data = reinterpret_cast<void **>(&slices);
const astcenc_swizzle swizzle = {
ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
};
status = astcenc_decompress_image(context, image_data.ptr(), image_data.size(), &image, &swizzle, 0);
ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
vformat("astcenc: ASTC decompression failed: %s.", astcenc_get_error_string(status)));
ERR_FAIL_COND_MSG(image.dim_z > 1,
"astcenc: ASTC decompression failed because this is a 3D texture, which is not supported.");
// Replace original image with the decompressed one.
Image::Format image_format = Image::FORMAT_RGBA8;
if (image.data_type == ASTCENC_TYPE_F32) {
image_format = Image::FORMAT_RGBAF;
} else if (image.data_type == ASTCENC_TYPE_U8) {
image_format = Image::FORMAT_RGBA8;
} else if (image.data_type == ASTCENC_TYPE_F16) {
image_format = Image::FORMAT_RGBAH;
} else {
ERR_FAIL_MSG("astcenc: ASTC decompression failed with an unknown format.");
}
r_img->set_data(image.dim_x, image.dim_y, mipmaps, image_format, new_image_data);
print_verbose(vformat("astcenc: Decompression took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
}
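For reference, here is a minimal single-threaded sketch of the same astcenc API calls outside the engine, using only the interface vendored by this commit (see thirdparty/astcenc/astcenc.h below). The helper name and the tightly packed RGBA8 input are assumptions for illustration, not part of the commit:

#include <cstdint>
#include <vector>

#include <astcenc.h>

// Hypothetical helper: compress a tightly packed RGBA8 buffer to ASTC 4x4.
bool compress_rgba8_astc_4x4(const uint8_t *rgba, unsigned int w, unsigned int h, std::vector<uint8_t> &out) {
	astcenc_config config;
	if (astcenc_config_init(ASTCENC_PRF_LDR, 4, 4, 1, ASTCENC_PRE_MEDIUM, 0, &config) != ASTCENC_SUCCESS) {
		return false;
	}
	astcenc_context *context = nullptr;
	if (astcenc_context_alloc(&config, 1, &context) != ASTCENC_SUCCESS) {
		return false;
	}
	// astcenc takes an array of 2D slices; a 2D image has a single slice.
	void *slice = const_cast<uint8_t *>(rgba);
	astcenc_image image = { w, h, 1, ASTCENC_TYPE_U8, &slice };
	// One 16-byte block per 4x4 tile, rounding up at the image edges.
	const size_t out_len = size_t((w + 3) / 4) * ((h + 3) / 4) * 16;
	out.resize(out_len);
	const astcenc_swizzle swizzle = { ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A };
	astcenc_error status = astcenc_compress_image(context, &image, &swizzle, out.data(), out_len, 0);
	astcenc_context_free(context);
	return status == ASTCENC_SUCCESS;
}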

modules/astcenc/image_compress_astcenc.h (new file, 39 lines)

@@ -0,0 +1,39 @@
/**************************************************************************/
/* image_compress_astcenc.h */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#ifndef IMAGE_COMPRESS_ASTCENC_H
#define IMAGE_COMPRESS_ASTCENC_H
#include "core/io/image.h"
void _compress_astc(Image *r_img, float p_lossy_quality, Image::ASTCFormat p_format);
void _decompress_astc(Image *r_img);
#endif // IMAGE_COMPRESS_ASTCENC_H

modules/astcenc/register_types.cpp (new file, 48 lines)

@@ -0,0 +1,48 @@
/**************************************************************************/
/* register_types.cpp */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#include "register_types.h"
#include "image_compress_astcenc.h"
void initialize_astcenc_module(ModuleInitializationLevel p_level) {
if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
return;
}
Image::_image_compress_astc_func = _compress_astc;
Image::_image_decompress_astc = _decompress_astc;
}
void uninitialize_astcenc_module(ModuleInitializationLevel p_level) {
if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
return;
}
}
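The module hooks into core purely through these static function pointers on Image, so core never links against the module directly. Below is a hedged sketch of the calling-side pattern; compress_astc_if_available is a hypothetical helper, and the real Image internals are not part of this diff:

#include "core/io/image.h"

// Hypothetical helper: call through the registered pointer only when the
// astcenc module was compiled in and has registered itself.
static void compress_astc_if_available(Image *img, float lossy_quality, Image::ASTCFormat format) {
	if (Image::_image_compress_astc_func) {
		Image::_image_compress_astc_func(img, lossy_quality, format);
	}
}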

modules/astcenc/register_types.h (new file, 39 lines)

@@ -0,0 +1,39 @@
/**************************************************************************/
/* register_types.h */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#ifndef ASTCENC_REGISTER_TYPES_H
#define ASTCENC_REGISTER_TYPES_H
#include "modules/register_module_types.h"
void initialize_astcenc_module(ModuleInitializationLevel p_level);
void uninitialize_astcenc_module(ModuleInitializationLevel p_level);
#endif // ASTCENC_REGISTER_TYPES_H

modules/etcpak/image_compress_etcpak.cpp (changed)

@@ -33,8 +33,8 @@
 #include "core/os/os.h"
 #include "core/string/print_string.h"
-#include "thirdparty/etcpak/ProcessDxtc.hpp"
-#include "thirdparty/etcpak/ProcessRGB.hpp"
+#include <ProcessDxtc.hpp>
+#include <ProcessRGB.hpp>
 EtcpakType _determine_etc_type(Image::UsedChannels p_channels) {
 switch (p_channels) {
@@ -130,7 +130,7 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua
 } else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5) {
 target_format = Image::FORMAT_DXT5;
 } else {
-ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format.");
+ERR_FAIL_MSG("Invalid or unsupported etcpak compression format, not ETC or DXT.");
 }
 // Compress image data and (if required) mipmaps.
@@ -171,7 +171,7 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua
 const uint8_t *src_read = r_img->get_data().ptr();
-print_verbose(vformat("ETCPAK: Encoding image size %dx%d to format %s.", width, height, Image::get_format_name(target_format)));
+print_verbose(vformat("etcpak: Encoding image size %dx%d to format %s%s.", width, height, Image::get_format_name(target_format), mipmaps ? ", with mipmaps" : ""));
 int dest_size = Image::get_image_data_size(width, height, target_format, mipmaps);
 Vector<uint8_t> dest_data;
@@ -232,12 +232,12 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua
 } else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5 || p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5_RA_AS_RG) {
 CompressDxt5(src_mip_read, dest_mip_write, blocks, mip_w);
 } else {
-ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format.");
+ERR_FAIL_MSG("etcpak: Invalid or unsupported compression format.");
 }
 }
 // Replace original image with compressed one.
 r_img->set_data(width, height, mipmaps, target_format, dest_data);
-print_verbose(vformat("ETCPAK encode took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
+print_verbose(vformat("etcpak: Encoding took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
 }

thirdparty/README.md (vendored, changed)

@@ -17,6 +17,18 @@ Files extracted from upstream source:
 - `license.txt`
+
+## astcenc
+
+- Upstream: https://github.com/ARM-software/astc-encoder
+- Version: 4.3.0 (ec83dda79fcefe07f69cdae7ed980d169bf2c4d4, 2023)
+- License: Apache 2.0
+
+Files extracted from upstream source:
+
+- `astcenc_*` and `astcenc.h` files from `Source`
+- `LICENSE.txt`
+
 ## basis_universal
 
 - Upstream: https://github.com/BinomialLLC/basis_universal
thirdparty/astcenc/LICENSE.txt (new file, vendored, 175 lines)

@@ -0,0 +1,175 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

thirdparty/astcenc/astcenc.h (new file, vendored, 815 lines)

@@ -0,0 +1,815 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief The core astcenc codec library interface.
*
* This interface is the entry point to the core astcenc codec. It aims to be easy to use for
* non-experts, but also to allow experts to have fine control over the compressor heuristics if
* needed. The core codec only handles compression and decompression, transferring all inputs and
* outputs via memory buffers. To catch obvious input/output buffer sizing issues, which can cause
* security and stability problems, all transfer buffers are explicitly sized.
*
* While the aim is that we keep this interface mostly stable, it should be viewed as a mutable
* interface tied to a specific source version. We are not trying to maintain backwards
* compatibility across codec versions.
*
* The API state management is based around an explicit context object, which is the context for all
* allocated memory resources needed to compress and decompress a single image. A context can be
* used to sequentially compress multiple images using the same configuration, allowing setup
* overheads to be amortized over multiple images, which is particularly important when images are
* small.
*
* Multi-threading can be used two ways.
*
* * An application wishing to process multiple images in parallel can allocate multiple
* contexts and assign each context to a thread.
* An application wishing to process a single image using multiple threads can configure
* contexts for multi-threaded use, and invoke astcenc_compress/decompress() once per thread
* for faster processing. The caller is responsible for creating the worker threads, and
* synchronizing between images.
*
* Threading
* =========
*
* In pseudo-code, the usage for manual user threading looks like this:
*
* // Configure the compressor run
* astcenc_config my_config;
* astcenc_config_init(..., &my_config);
*
* // Power users can tweak <my_config> settings here ...
*
* // Allocate working state given config and thread_count
* astcenc_context* my_context;
* astcenc_context_alloc(&my_config, thread_count, &my_context);
*
* // Compress each image using these config settings
* foreach image:
* // For each thread in the thread pool
* for i in range(0, thread_count):
* astcenc_compress_image(my_context, &my_input, my_output, i);
*
* astcenc_compress_reset(my_context);
*
* // Clean up
* astcenc_context_free(my_context);
*
* Images
* ======
*
* The codec supports compressing single images, which can be either 2D images or volumetric 3D
* images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
* texture arrays, or sliced 3D textures.
*
* Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
* half-float, or 32-bit float, as indicated by the data_type field.
*
* Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
*
* Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
* within an image slice is always tightly packed without padding. Addressing looks like this:
*
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 ] // Red
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1] // Green
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 2] // Blue
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 3] // Alpha
*
* Common compressor usage
* =======================
*
* One of the most important things for coding image quality is to align the input data component
* count with the ASTC color endpoint mode. This avoids wasting bits encoding components you don't
* actually need in the endpoint colors.
*
* | Input data | Encoding swizzle | Sampling swizzle |
* | ------------ | ---------------- | ---------------- |
* | 1 component | RRR1 | .[rgb] |
* | 2 components | RRRG | .[rgb]a |
* | 3 components | RGB1 | .rgb |
* | 4 components | RGBA | .rgba |
*
* The 1 and 2 component modes recommend sampling from "g" to recover the luminance value as this
* provides best compatibility with other texture formats where the green component may be stored at
* higher precision than the others, such as RGB565. For ASTC any of the RGB components can be used;
* the luminance endpoint component will be returned for all three.
*
* When using the normal map compression mode ASTC will store normals as a two component X+Y map.
* Input images must contain unit-length normalized normals and should be passed in using a two
* component swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers
* prefer to use GGGR for compatibility with BC5n, which will work just as well. The Z component can be
* recovered programmatically in shader code, using knowledge that the vector is unit length and
* that Z must be positive for a tangent-space normal map.
*
* Decompress-only usage
* =====================
*
* For some use cases it is useful to have a cut-down context and/or library which supports
* decompression but not compression.
*
* A context can be made decompress-only using the ASTCENC_FLG_DECOMPRESS_ONLY flag when the context
* is allocated. These contexts have lower dynamic memory footprint than a full context.
*
* The entire library can be made decompress-only by building the files with the define
* ASTCENC_DECOMPRESS_ONLY set. In this build the context will be smaller, and the library will
* exclude the functionality which is only needed for compression. This reduces the binary size by
* ~180KB. For these builds contexts must be created with the ASTCENC_FLG_DECOMPRESS_ONLY flag.
*
* Note that context structures returned by a library built as decompress-only are incompatible with
* a library built with compression included, and vice versa, as they have different sizes and
* memory layout.
*
* Self-decompress-only usage
* ==========================
*
* ASTC is a complex format with a large search space. The parts of this search space that are
* searched is determined by heuristics that are, in part, tied to the quality level used when
* creating the context.
*
* A normal context is capable of decompressing any ASTC texture, including those generated by other
* compressors with unknown heuristics. This is the most flexible implementation, but forces the
* data tables used by the codec to include entries that are not needed during compression. This
* can slow down context creation by a significant amount, especially for the faster compression
* modes where few data table entries are actually used. To optimize this use case the context can
* be created with the ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will
* only be asked to decompress images that it compressed itself, allowing the data tables to
* exclude entries that are not needed by the current compression configuration. This reduces the
* size of the context data tables in memory and improves context creation performance. Note that,
* as of the 3.6 release, this flag no longer affects compression performance.
*
* Using this flag while attempting to decompress a valid image which was created by another
* compressor, or even another astcenc compressor version or configuration, may result in blocks
* returning as solid magenta or NaN value error blocks.
*/
#ifndef ASTCENC_INCLUDED
#define ASTCENC_INCLUDED
#include <cstddef>
#include <cstdint>
#if defined(ASTCENC_DYNAMIC_LIBRARY)
#if defined(_MSC_VER)
#define ASTCENC_PUBLIC extern "C" __declspec(dllexport)
#else
#define ASTCENC_PUBLIC extern "C" __attribute__ ((visibility ("default")))
#endif
#else
#define ASTCENC_PUBLIC
#endif
/* ============================================================================
Data declarations
============================================================================ */
/**
* @brief An opaque structure; see astcenc_internal.h for definition.
*/
struct astcenc_context;
/**
* @brief A codec API error code.
*/
enum astcenc_error {
/** @brief The call was successful. */
ASTCENC_SUCCESS = 0,
/** @brief The call failed due to low memory, or undersized I/O buffers. */
ASTCENC_ERR_OUT_OF_MEM,
/** @brief The call failed due to the build using fast math. */
ASTCENC_ERR_BAD_CPU_FLOAT,
/** @brief The call failed due to the build using an unsupported ISA. */
ASTCENC_ERR_BAD_CPU_ISA,
/** @brief The call failed due to an out-of-spec parameter. */
ASTCENC_ERR_BAD_PARAM,
/** @brief The call failed due to an out-of-spec block size. */
ASTCENC_ERR_BAD_BLOCK_SIZE,
/** @brief The call failed due to an out-of-spec color profile. */
ASTCENC_ERR_BAD_PROFILE,
/** @brief The call failed due to an out-of-spec quality value. */
ASTCENC_ERR_BAD_QUALITY,
/** @brief The call failed due to an out-of-spec component swizzle. */
ASTCENC_ERR_BAD_SWIZZLE,
/** @brief The call failed due to an out-of-spec flag set. */
ASTCENC_ERR_BAD_FLAGS,
/** @brief The call failed due to the context not supporting the operation. */
ASTCENC_ERR_BAD_CONTEXT,
/** @brief The call failed due to unimplemented functionality. */
ASTCENC_ERR_NOT_IMPLEMENTED,
#if defined(ASTCENC_DIAGNOSTICS)
/** @brief The call failed due to an issue with diagnostic tracing. */
ASTCENC_ERR_DTRACE_FAILURE,
#endif
};
/**
* @brief A codec color profile.
*/
enum astcenc_profile {
/** @brief The LDR sRGB color profile. */
ASTCENC_PRF_LDR_SRGB = 0,
/** @brief The LDR linear color profile. */
ASTCENC_PRF_LDR,
/** @brief The HDR RGB with LDR alpha color profile. */
ASTCENC_PRF_HDR_RGB_LDR_A,
/** @brief The HDR RGBA color profile. */
ASTCENC_PRF_HDR
};
/** @brief The fastest, lowest quality, search preset. */
static const float ASTCENC_PRE_FASTEST = 0.0f;
/** @brief The fast search preset. */
static const float ASTCENC_PRE_FAST = 10.0f;
/** @brief The medium quality search preset. */
static const float ASTCENC_PRE_MEDIUM = 60.0f;
/** @brief The thorough quality search preset. */
static const float ASTCENC_PRE_THOROUGH = 98.0f;
/** @brief The very thorough quality search preset. */
static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
/** @brief The exhaustive, highest quality, search preset. */
static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
/**
* @brief A codec component swizzle selector.
*/
enum astcenc_swz
{
/** @brief Select the red component. */
ASTCENC_SWZ_R = 0,
/** @brief Select the green component. */
ASTCENC_SWZ_G = 1,
/** @brief Select the blue component. */
ASTCENC_SWZ_B = 2,
/** @brief Select the alpha component. */
ASTCENC_SWZ_A = 3,
/** @brief Use a constant zero component. */
ASTCENC_SWZ_0 = 4,
/** @brief Use a constant one component. */
ASTCENC_SWZ_1 = 5,
/** @brief Use a reconstructed normal vector Z component. */
ASTCENC_SWZ_Z = 6
};
/**
* @brief A texel component swizzle.
*/
struct astcenc_swizzle
{
/** @brief The red component selector. */
astcenc_swz r;
/** @brief The green component selector. */
astcenc_swz g;
/** @brief The blue component selector. */
astcenc_swz b;
/** @brief The alpha component selector. */
astcenc_swz a;
};
/**
* @brief A texel component data format.
*/
enum astcenc_type
{
/** @brief Unorm 8-bit data per component. */
ASTCENC_TYPE_U8 = 0,
/** @brief 16-bit float per component. */
ASTCENC_TYPE_F16 = 1,
/** @brief 32-bit float per component. */
ASTCENC_TYPE_F32 = 2
};
/**
* @brief Enable normal map compression.
*
* Input data will be treated as a two component normal map, storing X and Y, and the codec will
* optimize for angular error rather than simple linear PSNR. In this mode the input swizzle should
* be e.g. rrrg (the default ordering for ASTC normals on the command line) or gggr (the ordering
* used by BC5n).
*/
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
/**
* @brief Enable alpha weighting.
*
* The input alpha value is used for transparency, so errors in the RGB components are weighted by
* the transparency level. This allows the codec to more accurately encode the alpha value in areas
* where the color value is less significant.
*/
static const unsigned int ASTCENC_FLG_USE_ALPHA_WEIGHT = 1 << 2;
/**
* @brief Enable perceptual error metrics.
*
* This mode enables perceptual compression mode, which will optimize for perceptual error rather
* than best PSNR. Only some input modes support perceptual error metrics.
*/
static const unsigned int ASTCENC_FLG_USE_PERCEPTUAL = 1 << 3;
/**
* @brief Create a decompression-only context.
*
* This mode disables support for compression. This enables context allocation to skip some
* transient buffer allocation, resulting in lower memory usage.
*/
static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY = 1 << 4;
/**
* @brief Create a self-decompression context.
*
* This mode configures the compressor so that it is only guaranteed to be able to decompress images
* that were actually created using the current context. This is the common case for compression use
* cases, and setting this flag enables additional optimizations, but does mean that the context
* cannot reliably decompress arbitrary ASTC images.
*/
static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
/**
* @brief Enable RGBM map compression.
*
* Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
* format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
* compression function, this flag is only used to control the use of RGBM-specific heuristics and
* error metrics.
*
* IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
* M values can round to zero due to quantization and result in black or white pixels. It is highly
* recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
* 16 or 32). Applying this threshold reduces the number of very dark colors that can be
* represented, but is still higher precision than 8-bit LDR.
*
* When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
* factor used during reconstruction. This defaults to 5 when in RGBM mode.
*
* It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
* scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
* matching the default scale factor.
*/
static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6;
/**
* @brief The bit mask of all valid flags.
*/
static const unsigned int ASTCENC_ALL_FLAGS =
ASTCENC_FLG_MAP_NORMAL |
ASTCENC_FLG_MAP_RGBM |
ASTCENC_FLG_USE_ALPHA_WEIGHT |
ASTCENC_FLG_USE_PERCEPTUAL |
ASTCENC_FLG_DECOMPRESS_ONLY |
ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
/**
* @brief The config structure.
*
* This structure will initially be populated by a call to astcenc_config_init, but power users may
* modify it before calling astcenc_context_alloc. See astcenccli_toplevel_help.cpp for full user
* documentation of the power-user settings.
*
* Note for any settings which are associated with a specific color component, the value in the
* config applies to the component that exists after any compression data swizzle is applied.
*/
struct astcenc_config
{
/** @brief The color profile. */
astcenc_profile profile;
/** @brief The set of set flags. */
unsigned int flags;
/** @brief The ASTC block size X dimension. */
unsigned int block_x;
/** @brief The ASTC block size Y dimension. */
unsigned int block_y;
/** @brief The ASTC block size Z dimension. */
unsigned int block_z;
/** @brief The red component weight scale for error weighting (-cw). */
float cw_r_weight;
/** @brief The green component weight scale for error weighting (-cw). */
float cw_g_weight;
/** @brief The blue component weight scale for error weighting (-cw). */
float cw_b_weight;
/** @brief The alpha component weight scale for error weighting (-cw). */
float cw_a_weight;
/**
* @brief The radius for any alpha-weight scaling (-a).
*
* It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
* will be sampled using linear texture filtering to minimize color bleed out of transparent
* texels that are adjacent to non-transparent texels.
*/
unsigned int a_scale_radius;
/** @brief The RGBM scale factor for the shared multiplier (-rgbm). */
float rgbm_m_scale;
/**
* @brief The maximum number of partitions searched (-partitioncountlimit).
*
* Valid values are between 1 and 4.
*/
unsigned int tune_partition_count_limit;
/**
* @brief The maximum number of partitions searched (-2partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_2partition_index_limit;
/**
* @brief The maximum number of partitions searched (-3partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_3partition_index_limit;
/**
* @brief The maximum number of partitions searched (-4partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_4partition_index_limit;
/**
* @brief The maximum centile for block modes searched (-blockmodelimit).
*
* Valid values are between 1 and 100.
*/
unsigned int tune_block_mode_limit;
/**
* @brief The maximum iterative refinements applied (-refinementlimit).
*
* Valid values are between 1 and N; there is no technical upper limit
* but little benefit is expected after N=4.
*/
unsigned int tune_refinement_limit;
/**
* @brief The number of trial candidates per mode search (-candidatelimit).
*
* Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES (default 4).
*/
unsigned int tune_candidate_limit;
/**
* @brief The number of trial partitionings per search (-2partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_2partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search (-3partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_3partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search (-4partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_4partitioning_candidate_limit;
/**
* @brief The dB threshold for stopping block search (-dblimit).
*
* This option is ineffective for HDR textures.
*/
float tune_db_limit;
/**
* @brief The amount of MSE overshoot needed to early-out trials.
*
* The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
* the high probability block modes. This can short-cut compression for simple blocks.
*
* The second early-out is for refinement trials, where we can exit refinement once quality is
* reached.
*/
float tune_mse_overshoot;
/**
* @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
*
* This option is further scaled for normal maps, so it skips less often.
*/
float tune_2_partition_early_out_limit_factor;
/**
* @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
*
* This option is further scaled for normal maps, so it skips less often.
*/
float tune_3_partition_early_out_limit_factor;
/**
* @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
*
* This option is ineffective for normal maps.
*/
float tune_2_plane_early_out_limit_correlation;
#if defined(ASTCENC_DIAGNOSTICS)
/**
* @brief The path to save the diagnostic trace data to.
*
* This option is not part of the public API, and requires special builds
* of the library.
*/
const char* trace_file_path;
#endif
};
/**
* @brief An uncompressed 2D or 3D image.
*
* 3D images are passed in as an array of 2D slices. Each slice has identical
* size and color format.
*/
struct astcenc_image
{
/** @brief The X dimension of the image, in texels. */
unsigned int dim_x;
/** @brief The Y dimension of the image, in texels. */
unsigned int dim_y;
/** @brief The Z dimension of the image, in texels. */
unsigned int dim_z;
/** @brief The data type per component. */
astcenc_type data_type;
/** @brief The array of 2D slices, of length @c dim_z. */
void** data;
};
/**
* @brief A block encoding metadata query result.
*
* If the block is an error block or a constant color block, all fields other than
* the profile, block dimensions, and error/constant indicator will be zero.
*/
struct astcenc_block_info
{
/** @brief The block encoding color profile. */
astcenc_profile profile;
/** @brief The number of texels in the X dimension. */
unsigned int block_x;
/** @brief The number of texels in the Y dimension. */
unsigned int block_y;
/** @brief The number of texels in the Z dimension. */
unsigned int block_z;
/** @brief The number of texels in the block. */
unsigned int texel_count;
/** @brief True if this block is an error block. */
bool is_error_block;
/** @brief True if this block is a constant color block. */
bool is_constant_block;
/** @brief True if this block is an HDR block. */
bool is_hdr_block;
/** @brief True if this block uses two weight planes. */
bool is_dual_plane_block;
/** @brief The number of partitions if not constant color. */
unsigned int partition_count;
/** @brief The partition index if 2 - 4 partitions used. */
unsigned int partition_index;
/** @brief The component index of the second plane if dual plane. */
unsigned int dual_plane_component;
/** @brief The color endpoint encoding mode for each partition. */
unsigned int color_endpoint_modes[4];
/** @brief The number of color endpoint quantization levels. */
unsigned int color_level_count;
/** @brief The number of weight quantization levels. */
unsigned int weight_level_count;
/** @brief The number of weights in the X dimension. */
unsigned int weight_x;
/** @brief The number of weights in the Y dimension. */
unsigned int weight_y;
/** @brief The number of weights in the Z dimension. */
unsigned int weight_z;
/** @brief The unpacked color endpoints for each partition. */
float color_endpoints[4][2][4];
/** @brief The per-texel interpolation weights for the block. */
float weight_values_plane1[216];
/** @brief The per-texel interpolation weights for the block. */
float weight_values_plane2[216];
/** @brief The per-texel partition assignments for the block. */
uint8_t partition_assignment[216];
};
/**
* Populate a codec config based on default settings.
*
* Power users can edit the returned config struct to fine tune before allocating the context.
*
* @param profile Color profile.
* @param block_x ASTC block size X dimension.
* @param block_y ASTC block size Y dimension.
* @param block_z ASTC block size Z dimension.
* @param quality Search quality preset / effort level. Either an
* @c ASTCENC_PRE_* value, or an effort level between 0
* and 100. Performance is not linear between 0 and 100.
* @param flags A valid set of @c ASTCENC_FLG_* flag bits.
* @param[out] config Output config struct to populate.
*
* @return @c ASTCENC_SUCCESS on success, or an error if the inputs are invalid
* either individually, or in combination.
*/
ASTCENC_PUBLIC astcenc_error astcenc_config_init(
astcenc_profile profile,
unsigned int block_x,
unsigned int block_y,
unsigned int block_z,
float quality,
unsigned int flags,
astcenc_config* config);
/**
* @brief Allocate a new codec context based on a config.
*
* This function allocates all of the memory resources and threads needed by the codec. This can be
* slow, so it is recommended that contexts are reused to serially compress or decompress multiple
* images to amortize setup cost.
*
* Contexts can be allocated to support only decompression using the @c ASTCENC_FLG_DECOMPRESS_ONLY
* flag when creating the configuration. The compression functions will fail if invoked. For a
* decompress-only library build the @c ASTCENC_FLG_DECOMPRESS_ONLY flag must be set when creating
* any context.
*
* @param[in] config Codec config.
* @param thread_count Thread count to configure for.
* @param[out] context Location to store an opaque context pointer.
*
* @return @c ASTCENC_SUCCESS on success, or an error if context creation failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_context_alloc(
const astcenc_config* config,
unsigned int thread_count,
astcenc_context** context);
/**
* @brief Compress an image.
*
* A single context can only compress or decompress a single image at a time.
*
* For a context configured for multi-threading, any set of the N threads can call this function.
* Work will be dynamically scheduled across the threads available. Each thread must have a unique
* @c thread_index.
*
* @param context Codec context.
* @param[in,out] image An input image, in 2D slices.
* @param swizzle Compression data swizzle, applied before compression.
* @param[out] data_out Pointer to output data array.
* @param data_len Length of the output data array.
* @param thread_index Thread index [0..N-1] of calling thread.
*
* @return @c ASTCENC_SUCCESS on success, or an error if compression failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
astcenc_context* context,
astcenc_image* image,
const astcenc_swizzle* swizzle,
uint8_t* data_out,
size_t data_len,
unsigned int thread_index);
/**
* @brief Reset the codec state for a new compression.
*
* The caller is responsible for synchronizing threads in the worker thread pool. This function must
* only be called when all threads have exited the @c astcenc_compress_image() function for image N,
* but before any thread enters it for image N + 1.
*
* Calling this is not required (but won't hurt), if the context is created for single threaded use.
*
* @param context Codec context.
*
* @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
astcenc_context* context);
/**
* @brief Decompress an image.
*
* @param context Codec context.
* @param[in] data Pointer to compressed data.
* @param data_len Length of the compressed data, in bytes.
* @param[in,out] image_out Output image.
* @param swizzle Decompression data swizzle, applied after decompression.
* @param thread_index Thread index [0..N-1] of calling thread.
*
* @return @c ASTCENC_SUCCESS on success, or an error if decompression failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_decompress_image(
astcenc_context* context,
const uint8_t* data,
size_t data_len,
astcenc_image* image_out,
const astcenc_swizzle* swizzle,
unsigned int thread_index);
/**
* @brief Reset the codec state for a new decompression.
*
* The caller is responsible for synchronizing threads in the worker thread pool. This function must
* only be called when all threads have exited the @c astcenc_decompress_image() function for image
* N, but before any thread enters it for image N + 1.
*
* Calling this is not required (but is harmless) if the context was created for single-threaded use.
*
* @param context Codec context.
*
* @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_decompress_reset(
astcenc_context* context);
/**
* @brief Free the codec context.
*
* @param context The codec context.
*/
ASTCENC_PUBLIC void astcenc_context_free(
astcenc_context* context);
/**
* @brief Provide a high level summary of a block's encoding.
*
* This feature is primarily useful for codec developers but may be useful for developers building
* advanced content packaging pipelines.
*
* @param context Codec context.
* @param data One block of compressed ASTC data.
* @param info The output info structure to populate.
*
* @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
*         function reports success even if the block itself was an error block, as the decode
*         was still handled correctly.
*/
ASTCENC_PUBLIC astcenc_error astcenc_get_block_info(
astcenc_context* context,
const uint8_t data[16],
astcenc_block_info* info);
/**
* @brief Get a printable string for specific status code.
*
* @param status The status value.
*
* @return A human readable nul-terminated string.
*/
ASTCENC_PUBLIC const char* astcenc_get_error_string(
astcenc_error status);
#endif
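
Taken together, the functions above form a simple lifecycle: initialize a config, allocate a context, compress or decompress (resetting the context between images), then free it. The sketch below shows that flow for one 2D LDR image; the 4x4 block size, the ASTCENC_PRE_MEDIUM quality preset, the single-threaded setup, and the helper name are illustrative assumptions, not part of the API above.

#include <cstdint>
#include <vector>
#include "astcenc.h"

// Sketch: compress one 2D LDR image with a single thread.
static bool compress_one_image(
	astcenc_image& image,
	const astcenc_swizzle& swizzle,
	std::vector<uint8_t>& out
) {
	astcenc_config config;
	if (astcenc_config_init(ASTCENC_PRF_LDR, 4, 4, 1,
	                        ASTCENC_PRE_MEDIUM, 0, &config) != ASTCENC_SUCCESS)
	{
		return false;
	}
	astcenc_context* context = nullptr;
	if (astcenc_context_alloc(&config, 1, &context) != ASTCENC_SUCCESS)
	{
		return false;
	}
	// Every ASTC block is 16 bytes; one block per 4x4 texel tile here.
	size_t blocks_x = (image.dim_x + 3) / 4;
	size_t blocks_y = (image.dim_y + 3) / 4;
	out.resize(blocks_x * blocks_y * 16);
	astcenc_error status = astcenc_compress_image(
	    context, &image, &swizzle, out.data(), out.size(), 0);
	// Reset is only needed when reusing the context for another image,
	// but it is harmless here.
	astcenc_compress_reset(context);
	astcenc_context_free(context);
	return status == ASTCENC_SUCCESS;
}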


@ -0,0 +1,995 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for finding dominant direction of a set of colors.
*/
#if !defined(ASTCENC_DECOMPRESS_ONLY)
#include "astcenc_internal.h"
#include <cassert>
/**
* @brief Compute the average RGB color of each partition.
*
* The algorithm here uses a vectorized sequential scan and per-partition
* color accumulators, using select() to mask texel lanes in other partitions.
*
* We only accumulate sums for N-1 partitions during the scan; the value for
* the last partition can be computed given that we know the block-wide average
* already.
*
* Because of this we could reduce the loop iteration count so it "just" spans
* the max texel index needed for the N-1 partitions, which could need fewer
* iterations than the full block texel count. However, this makes the loop
* count erratic and causes more branch mispredictions so is a net loss.
*
* @param pi The partitioning to use.
* @param blk The block data to process.
* @param[out] averages The output averages. Unused partition indices will
* not be initialized, and lane<3> will be zero.
*/
static void compute_partition_averages_rgb(
const partition_info& pi,
const image_block& blk,
vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
unsigned int partition_count = pi.partition_count;
unsigned int texel_count = blk.texel_count;
promise(texel_count > 0);
// For 1 partition just use the precomputed mean
if (partition_count == 1)
{
averages[0] = blk.data_mean.swz<0, 1, 2>();
}
// For 2 partitions scan results for partition 0, compute partition 1
else if (partition_count == 2)
{
vfloatacc pp_avg_rgb[3] {};
vint lane_id = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgb[0], data_r, p0_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgb[1], data_g, p0_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgb[2], data_b, p0_mask);
}
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
hadd_s(pp_avg_rgb[1]),
hadd_s(pp_avg_rgb[2]));
vfloat4 p1_total = block_total - p0_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
}
// For 3 partitions scan results for partition 0/1, compute partition 2
else if (partition_count == 3)
{
vfloatacc pp_avg_rgb[2][3] {};
vint lane_id = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
}
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
hadd_s(pp_avg_rgb[0][1]),
hadd_s(pp_avg_rgb[0][2]));
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
hadd_s(pp_avg_rgb[1][1]),
hadd_s(pp_avg_rgb[1][2]));
vfloat4 p2_total = block_total - p0_total - p1_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
}
else
{
// For 4 partitions scan results for partition 0/1/2, compute partition 3
vfloatacc pp_avg_rgb[3][3] {};
vint lane_id = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vmask p2_mask = lane_mask & (texel_partition == vint(2));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
}
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
hadd_s(pp_avg_rgb[0][1]),
hadd_s(pp_avg_rgb[0][2]));
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
hadd_s(pp_avg_rgb[1][1]),
hadd_s(pp_avg_rgb[1][2]));
vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
hadd_s(pp_avg_rgb[2][1]),
hadd_s(pp_avg_rgb[2][2]));
vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
}
}
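/**
 * For reference, the vectorized scan above reduces to the scalar form below
 * for a single channel. This is an illustrative sketch only (the helper name
 * and parameters are not part of the codec): sums are accumulated for the
 * first N-1 partitions, and the last mean is derived from the block total.
 */
static void partition_means_scalar_sketch(
	const uint8_t* partition_of_texel,
	const unsigned int* partition_texel_count,
	unsigned int partition_count,
	unsigned int texel_count,
	float block_mean,
	const float* data,
	float means[BLOCK_MAX_PARTITIONS]
) {
	float sums[BLOCK_MAX_PARTITIONS] {};
	float partial = 0.0f;
	for (unsigned int i = 0; i < texel_count; i++)
	{
		unsigned int p = partition_of_texel[i];
		if (p != partition_count - 1)
		{
			sums[p] += data[i];
			partial += data[i];
		}
	}
	for (unsigned int p = 0; p < partition_count - 1; p++)
	{
		means[p] = sums[p] / static_cast<float>(partition_texel_count[p]);
	}
	float block_total = block_mean * static_cast<float>(texel_count);
	means[partition_count - 1] = (block_total - partial)
	    / static_cast<float>(partition_texel_count[partition_count - 1]);
}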
/**
* @brief Compute the average RGBA color of each partition.
*
* The algorithm here uses a vectorized sequential scan and per-partition
* color accumulators, using select() to mask texel lanes in other partitions.
*
* We only accumulate sums for N-1 partitions during the scan; the value for
* the last partition can be computed given that we know the block-wide average
* already.
*
* Because of this we could reduce the loop iteration count so it "just" spans
* the max texel index needed for the N-1 partitions, which could need fewer
* iterations than the full block texel count. However, this makes the loop
* count erratic and causes more branch mispredictions so is a net loss.
*
* @param pi The partitioning to use.
* @param blk The block data to process.
* @param[out] averages The output averages. Unused partition indices will
* not be initialized.
*/
static void compute_partition_averages_rgba(
const partition_info& pi,
const image_block& blk,
vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
unsigned int partition_count = pi.partition_count;
unsigned int texel_count = blk.texel_count;
promise(texel_count > 0);
// For 1 partition just use the precomputed mean
if (partition_count == 1)
{
averages[0] = blk.data_mean;
}
// For 2 partitions scan results for partition 0, compute partition 1
else if (partition_count == 2)
{
vfloat4 pp_avg_rgba[4] {};
vint lane_id = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgba[0], data_r, p0_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgba[1], data_g, p0_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgba[2], data_b, p0_mask);
vfloat data_a = loada(blk.data_a + i);
haccumulate(pp_avg_rgba[3], data_a, p0_mask);
}
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
hadd_s(pp_avg_rgba[1]),
hadd_s(pp_avg_rgba[2]),
hadd_s(pp_avg_rgba[3]));
vfloat4 p1_total = block_total - p0_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
}
// For 3 partitions scan results for partition 0/1, compute partition 2
else if (partition_count == 3)
{
vfloat4 pp_avg_rgba[2][4] {};
vint lane_id = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
vfloat data_a = loada(blk.data_a + i);
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
}
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
hadd_s(pp_avg_rgba[0][1]),
hadd_s(pp_avg_rgba[0][2]),
hadd_s(pp_avg_rgba[0][3]));
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
hadd_s(pp_avg_rgba[1][1]),
hadd_s(pp_avg_rgba[1][2]),
hadd_s(pp_avg_rgba[1][3]));
vfloat4 p2_total = block_total - p0_total - p1_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
}
else
{
// For 4 partitions scan results for partition 0/1/2, compute partition 3
vfloat4 pp_avg_rgba[3][4] {};
vint lane_id = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vmask p2_mask = lane_mask & (texel_partition == vint(2));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
vfloat data_a = loada(blk.data_a + i);
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
}
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
hadd_s(pp_avg_rgba[0][1]),
hadd_s(pp_avg_rgba[0][2]),
hadd_s(pp_avg_rgba[0][3]));
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
hadd_s(pp_avg_rgba[1][1]),
hadd_s(pp_avg_rgba[1][2]),
hadd_s(pp_avg_rgba[1][3]));
vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
hadd_s(pp_avg_rgba[2][1]),
hadd_s(pp_avg_rgba[2][2]),
hadd_s(pp_avg_rgba[2][3]));
vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_4_comp(
const partition_info& pi,
const image_block& blk,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
int partition_count = pi.partition_count;
promise(partition_count > 0);
// Pre-compute partition_averages
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
compute_partition_averages_rgba(pi, blk, partition_averages);
for (int partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
unsigned int texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
vfloat4 average = partition_averages[partition];
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
vfloat4 sum_zp = vfloat4::zero();
vfloat4 sum_wp = vfloat4::zero();
for (unsigned int i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = blk.texel(iwt);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
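// Accumulate the offset into one candidate sum per component sign; the
// longest of these sums is then used as a cheap estimate of the dominant
// direction, standing in for a full principal component computation.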
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
sum_zp += select(zero, texel_datum, tdm2);
vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
sum_wp += select(zero, texel_datum, tdm3);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
vfloat4 prod_wp = dot(sum_wp, sum_wp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
best_sum = select(best_sum, prod_yp, mask);
mask = prod_zp > best_sum;
best_vector = select(best_vector, sum_zp, mask);
best_sum = select(best_sum, prod_zp, mask);
mask = prod_wp > best_sum;
best_vector = select(best_vector, sum_wp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_3_comp(
const partition_info& pi,
const image_block& blk,
unsigned int omitted_component,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
// Pre-compute partition_averages
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
compute_partition_averages_rgba(pi, blk, partition_averages);
const float* data_vr = blk.data_r;
const float* data_vg = blk.data_g;
const float* data_vb = blk.data_b;
// TODO: Data-driven permute would be useful to avoid this ...
if (omitted_component == 0)
{
partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
data_vr = blk.data_g;
data_vg = blk.data_b;
data_vb = blk.data_a;
}
else if (omitted_component == 1)
{
partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
data_vg = blk.data_b;
data_vb = blk.data_a;
}
else if (omitted_component == 2)
{
partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
data_vb = blk.data_a;
}
else
{
partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
}
unsigned int partition_count = pi.partition_count;
promise(partition_count > 0);
for (unsigned int partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
unsigned int texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
vfloat4 average = partition_averages[partition];
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
vfloat4 sum_zp = vfloat4::zero();
for (unsigned int i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = vfloat3(data_vr[iwt],
data_vg[iwt],
data_vb[iwt]);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
sum_zp += select(zero, texel_datum, tdm2);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
best_sum = select(best_sum, prod_yp, mask);
mask = prod_zp > best_sum;
best_vector = select(best_vector, sum_zp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_3_comp_rgb(
const partition_info& pi,
const image_block& blk,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
unsigned int partition_count = pi.partition_count;
promise(partition_count > 0);
// Pre-compute partition_averages
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
compute_partition_averages_rgb(pi, blk, partition_averages);
for (unsigned int partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
unsigned int texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
vfloat4 average = partition_averages[partition];
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
vfloat4 sum_zp = vfloat4::zero();
for (unsigned int i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = blk.texel3(iwt);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
sum_zp += select(zero, texel_datum, tdm2);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
best_sum = select(best_sum, prod_yp, mask);
mask = prod_zp > best_sum;
best_vector = select(best_vector, sum_zp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_2_comp(
const partition_info& pt,
const image_block& blk,
unsigned int component1,
unsigned int component2,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
vfloat4 average;
const float* data_vr = nullptr;
const float* data_vg = nullptr;
if (component1 == 0 && component2 == 1)
{
average = blk.data_mean.swz<0, 1>();
data_vr = blk.data_r;
data_vg = blk.data_g;
}
else if (component1 == 0 && component2 == 2)
{
average = blk.data_mean.swz<0, 2>();
data_vr = blk.data_r;
data_vg = blk.data_b;
}
else // (component1 == 1 && component2 == 2)
{
assert(component1 == 1 && component2 == 2);
average = blk.data_mean.swz<1, 2>();
data_vr = blk.data_g;
data_vg = blk.data_b;
}
unsigned int partition_count = pt.partition_count;
promise(partition_count > 0);
for (unsigned int partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pt.texels_of_partition[partition];
unsigned int texel_count = pt.partition_texel_count[partition];
promise(texel_count > 0);
// Only compute a partition mean if more than one partition
if (partition_count > 1)
{
average = vfloat4::zero();
for (unsigned int i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
average += vfloat2(data_vr[iwt], data_vg[iwt]);
}
average = average / static_cast<float>(texel_count);
}
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
for (unsigned int i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
void compute_error_squared_rgba(
const partition_info& pi,
const image_block& blk,
const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
float uncor_lengths[BLOCK_MAX_PARTITIONS],
float samec_lengths[BLOCK_MAX_PARTITIONS],
float& uncor_error,
float& samec_error
) {
unsigned int partition_count = pi.partition_count;
promise(partition_count > 0);
vfloatacc uncor_errorsumv = vfloatacc::zero();
vfloatacc samec_errorsumv = vfloatacc::zero();
for (unsigned int partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
float uncor_loparam = 1e10f;
float uncor_hiparam = -1e10f;
float samec_loparam = 1e10f;
float samec_hiparam = -1e10f;
processed_line4 l_uncor = uncor_plines[partition];
processed_line4 l_samec = samec_plines[partition];
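// 'uncor' is the free-fitting ideal line for this partition; 'samec' is
// constrained to pass through the origin (its amod is zero, asserted
// below), modeling same-chroma endpoints that differ only by a scale.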
unsigned int texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
// Vectorize some useful scalar inputs
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
vfloat l_samec_bs0(l_samec.bs.lane<0>());
vfloat l_samec_bs1(l_samec.bs.lane<1>());
vfloat l_samec_bs2(l_samec.bs.lane<2>());
vfloat l_samec_bs3(l_samec.bs.lane<3>());
assert(all(l_samec.amod == vfloat4(0.0f)));
vfloat uncor_loparamv(1e10f);
vfloat uncor_hiparamv(-1e10f);
vfloat samec_loparamv(1e10f);
vfloat samec_hiparamv(-1e10f);
vfloat ew_r(blk.channel_weight.lane<0>());
vfloat ew_g(blk.channel_weight.lane<1>());
vfloat ew_b(blk.channel_weight.lane<2>());
vfloat ew_a(blk.channel_weight.lane<3>());
// This implementation over-shoots, but this is safe as we initialize the texel_indexes
// array to extend the last value. This means min/max are not impacted, but we need to mask
// out the dummy values when we compute the line weighting.
vint lane_ids = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint(texel_count);
vint texel_idxs(texel_indexes + i);
vfloat data_r = gatherf(blk.data_r, texel_idxs);
vfloat data_g = gatherf(blk.data_g, texel_idxs);
vfloat data_b = gatherf(blk.data_b, texel_idxs);
vfloat data_a = gatherf(blk.data_a, texel_idxs);
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2)
+ (data_a * l_uncor_bs3);
uncor_loparamv = min(uncor_param, uncor_loparamv);
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+ (uncor_param * l_uncor_bs0);
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+ (uncor_param * l_uncor_bs1);
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+ (uncor_param * l_uncor_bs2);
vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
+ (uncor_param * l_uncor_bs3);
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+ (ew_g * uncor_dist1 * uncor_dist1)
+ (ew_b * uncor_dist2 * uncor_dist2)
+ (ew_a * uncor_dist3 * uncor_dist3);
haccumulate(uncor_errorsumv, uncor_err, mask);
// Process samechroma data
vfloat samec_param = (data_r * l_samec_bs0)
+ (data_g * l_samec_bs1)
+ (data_b * l_samec_bs2)
+ (data_a * l_samec_bs3);
samec_loparamv = min(samec_param, samec_loparamv);
samec_hiparamv = max(samec_param, samec_hiparamv);
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+ (ew_g * samec_dist1 * samec_dist1)
+ (ew_b * samec_dist2 * samec_dist2)
+ (ew_a * samec_dist3 * samec_dist3);
haccumulate(samec_errorsumv, samec_err, mask);
lane_ids += vint(ASTCENC_SIMD_WIDTH);
}
uncor_loparam = hmin_s(uncor_loparamv);
uncor_hiparam = hmax_s(uncor_hiparamv);
samec_loparam = hmin_s(samec_loparamv);
samec_hiparam = hmax_s(samec_hiparamv);
float uncor_linelen = uncor_hiparam - uncor_loparam;
float samec_linelen = samec_hiparam - samec_loparam;
// Turn very small numbers and NaNs into a small number
uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
samec_lengths[partition] = astc::max(samec_linelen, 1e-7f);
}
uncor_error = hadd_s(uncor_errorsumv);
samec_error = hadd_s(samec_errorsumv);
}
/* See header for documentation. */
void compute_error_squared_rgb(
const partition_info& pi,
const image_block& blk,
partition_lines3 plines[BLOCK_MAX_PARTITIONS],
float& uncor_error,
float& samec_error
) {
unsigned int partition_count = pi.partition_count;
promise(partition_count > 0);
vfloatacc uncor_errorsumv = vfloatacc::zero();
vfloatacc samec_errorsumv = vfloatacc::zero();
for (unsigned int partition = 0; partition < partition_count; partition++)
{
partition_lines3& pl = plines[partition];
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
unsigned int texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
float uncor_loparam = 1e10f;
float uncor_hiparam = -1e10f;
float samec_loparam = 1e10f;
float samec_hiparam = -1e10f;
processed_line3 l_uncor = pl.uncor_pline;
processed_line3 l_samec = pl.samec_pline;
// This implementation is an example vectorization of this function.
// It works - the codec is 2-4% faster than the scalar equivalent - but
// the benefit is limited by the use of gathers and register pressure.
// Vectorize some useful scalar inputs
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
vfloat l_samec_bs0(l_samec.bs.lane<0>());
vfloat l_samec_bs1(l_samec.bs.lane<1>());
vfloat l_samec_bs2(l_samec.bs.lane<2>());
assert(all(l_samec.amod == vfloat4(0.0f)));
vfloat uncor_loparamv(1e10f);
vfloat uncor_hiparamv(-1e10f);
vfloat samec_loparamv(1e10f);
vfloat samec_hiparamv(-1e10f);
vfloat ew_r(blk.channel_weight.lane<0>());
vfloat ew_g(blk.channel_weight.lane<1>());
vfloat ew_b(blk.channel_weight.lane<2>());
// This implementation over-shoots, but this is safe as we initialize the weights array
// to extend the last value. This means min/max are not impacted, but we need to mask
// out the dummy values when we compute the line weighting.
vint lane_ids = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint(texel_count);
vint texel_idxs(texel_indexes + i);
vfloat data_r = gatherf(blk.data_r, texel_idxs);
vfloat data_g = gatherf(blk.data_g, texel_idxs);
vfloat data_b = gatherf(blk.data_b, texel_idxs);
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2);
uncor_loparamv = min(uncor_param, uncor_loparamv);
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+ (uncor_param * l_uncor_bs0);
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+ (uncor_param * l_uncor_bs1);
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+ (uncor_param * l_uncor_bs2);
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+ (ew_g * uncor_dist1 * uncor_dist1)
+ (ew_b * uncor_dist2 * uncor_dist2);
haccumulate(uncor_errorsumv, uncor_err, mask);
// Process samechroma data
vfloat samec_param = (data_r * l_samec_bs0)
+ (data_g * l_samec_bs1)
+ (data_b * l_samec_bs2);
samec_loparamv = min(samec_param, samec_loparamv);
samec_hiparamv = max(samec_param, samec_hiparamv);
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+ (ew_g * samec_dist1 * samec_dist1)
+ (ew_b * samec_dist2 * samec_dist2);
haccumulate(samec_errorsumv, samec_err, mask);
lane_ids += vint(ASTCENC_SIMD_WIDTH);
}
uncor_loparam = hmin_s(uncor_loparamv);
uncor_hiparam = hmax_s(uncor_hiparamv);
samec_loparam = hmin_s(samec_loparamv);
samec_hiparam = hmax_s(samec_hiparamv);
float uncor_linelen = uncor_hiparam - uncor_loparam;
float samec_linelen = samec_hiparam - samec_loparam;
// Turn very small numbers and NaNs into a small number
pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f);
pl.samec_line_len = astc::max(samec_linelen, 1e-7f);
}
uncor_error = hadd_s(uncor_errorsumv);
samec_error = hadd_s(samec_errorsumv);
}
#endif

1184
thirdparty/astcenc/astcenc_block_sizes.cpp vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -0,0 +1,941 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#include <utility>
/**
* @brief Functions for color unquantization.
*/
#include "astcenc_internal.h"
/**
* @brief Un-blue-contract a color.
*
* This function reverses any applied blue contraction.
*
* @param input The input color that has been blue-contracted.
*
* @return The uncontracted color.
*/
static ASTCENC_SIMD_INLINE vint4 uncontract_color(
vint4 input
) {
vmask4 mask(true, true, false, false);
vint4 bc0 = asr<1>(input + input.lane<2>());
return select(input, bc0, mask);
}
/**
* @brief Unpack an LDR RGBA color that uses delta encoding.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color deltas.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgba_delta_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
// Apply bit transfer
bit_transfer_signed(input1, input0);
// Apply blue-uncontraction if needed
int rgb_sum = hadd_rgb_s(input1);
input1 = input1 + input0;
if (rgb_sum < 0)
{
input0 = uncontract_color(input0);
input1 = uncontract_color(input1);
std::swap(input0, input1);
}
output0 = clamp(0, 255, input0);
output1 = clamp(0, 255, input1);
}
/**
* @brief Unpack an LDR RGB color that uses delta encoding.
*
* Output alpha set to 255.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color deltas.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_delta_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
rgba_delta_unpack(input0, input1, output0, output1);
output0.set_lane<3>(255);
output1.set_lane<3>(255);
}
/**
* @brief Unpack an LDR RGBA color that uses direct encoding.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgba_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
// Apply blue-uncontraction if needed
if (hadd_rgb_s(input0) > hadd_rgb_s(input1))
{
input0 = uncontract_color(input0);
input1 = uncontract_color(input1);
std::swap(input0, input1);
}
output0 = input0;
output1 = input1;
}
/**
* @brief Unpack an LDR RGB color that uses direct encoding.
*
* Output alpha set to 255.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
rgba_unpack(input0, input1, output0, output1);
output0.set_lane<3>(255);
output1.set_lane<3>(255);
}
/**
* @brief Unpack an LDR RGBA color that uses scaled encoding.
*
* Note only the RGB channels use the scaled encoding, alpha uses direct.
*
* @param input0 The packed endpoint 0 color.
* @param alpha1 The packed endpoint 1 alpha value.
* @param scale The packed quantized scale.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_scale_alpha_unpack(
vint4 input0,
uint8_t alpha1,
uint8_t scale,
vint4& output0,
vint4& output1
) {
output1 = input0;
output1.set_lane<3>(alpha1);
output0 = asr<8>(input0 * scale);
output0.set_lane<3>(input0.lane<3>());
}
/**
* @brief Unpack an LDR RGB color that uses scaled encoding.
*
* Output alpha is 255.
*
* @param input0 The packed endpoint 0 color.
* @param scale The packed scale.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_scale_unpack(
vint4 input0,
int scale,
vint4& output0,
vint4& output1
) {
output1 = input0;
output1.set_lane<3>(255);
output0 = asr<8>(input0 * scale);
output0.set_lane<3>(255);
}
/**
* @brief Unpack an LDR L color that uses direct encoding.
*
* Output alpha is 255.
*
* @param input The packed endpoints.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_unpack(
const uint8_t input[2],
vint4& output0,
vint4& output1
) {
int lum0 = input[0];
int lum1 = input[1];
output0 = vint4(lum0, lum0, lum0, 255);
output1 = vint4(lum1, lum1, lum1, 255);
}
/**
* @brief Unpack an LDR L color that uses delta encoding.
*
* Output alpha is 255.
*
* @param input The packed endpoints (L0, L1).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_delta_unpack(
const uint8_t input[2],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int l0 = (v0 >> 2) | (v1 & 0xC0);
int l1 = l0 + (v1 & 0x3F);
l1 = astc::min(l1, 255);
output0 = vint4(l0, l0, l0, 255);
output1 = vint4(l1, l1, l1, 255);
}
/**
* @brief Unpack an LDR LA color that uses direct encoding.
*
* @param input The packed endpoints (L0, L1, A0, A1).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_alpha_unpack(
const uint8_t input[4],
vint4& output0,
vint4& output1
) {
int lum0 = input[0];
int lum1 = input[1];
int alpha0 = input[2];
int alpha1 = input[3];
output0 = vint4(lum0, lum0, lum0, alpha0);
output1 = vint4(lum1, lum1, lum1, alpha1);
}
/**
* @brief Unpack an LDR LA color that uses delta encoding.
*
* @param input The packed endpoints (L0, L1, A0, A1).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_alpha_delta_unpack(
const uint8_t input[4],
vint4& output0,
vint4& output1
) {
int lum0 = input[0];
int lum1 = input[1];
int alpha0 = input[2];
int alpha1 = input[3];
lum0 |= (lum1 & 0x80) << 1;
alpha0 |= (alpha1 & 0x80) << 1;
lum1 &= 0x7F;
alpha1 &= 0x7F;
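// Sign-extend the 7-bit delta values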
if (lum1 & 0x40)
{
lum1 -= 0x80;
}
if (alpha1 & 0x40)
{
alpha1 -= 0x80;
}
lum0 >>= 1;
lum1 >>= 1;
alpha0 >>= 1;
alpha1 >>= 1;
lum1 += lum0;
alpha1 += alpha0;
lum1 = astc::clamp(lum1, 0, 255);
alpha1 = astc::clamp(alpha1, 0, 255);
output0 = vint4(lum0, lum0, lum0, alpha0);
output1 = vint4(lum1, lum1, lum1, alpha1);
}
/**
* @brief Unpack an HDR RGB + offset encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgbo_unpack(
const uint8_t input[4],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int v2 = input[2];
int v3 = input[3];
int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
int majcomp;
int mode;
if ((modeval & 0xC) != 0xC)
{
majcomp = modeval >> 2;
mode = modeval & 3;
}
else if (modeval != 0xF)
{
majcomp = modeval & 3;
mode = 4;
}
else
{
majcomp = 0;
mode = 5;
}
int red = v0 & 0x3F;
int green = v1 & 0x1F;
int blue = v2 & 0x1F;
int scale = v3 & 0x1F;
int bit0 = (v1 >> 6) & 1;
int bit1 = (v1 >> 5) & 1;
int bit2 = (v2 >> 6) & 1;
int bit3 = (v2 >> 5) & 1;
int bit4 = (v3 >> 7) & 1;
int bit5 = (v3 >> 6) & 1;
int bit6 = (v3 >> 5) & 1;
int ohcomp = 1 << mode;
if (ohcomp & 0x30)
green |= bit0 << 6;
if (ohcomp & 0x3A)
green |= bit1 << 5;
if (ohcomp & 0x30)
blue |= bit2 << 6;
if (ohcomp & 0x3A)
blue |= bit3 << 5;
if (ohcomp & 0x3D)
scale |= bit6 << 5;
if (ohcomp & 0x2D)
scale |= bit5 << 6;
if (ohcomp & 0x04)
scale |= bit4 << 7;
if (ohcomp & 0x3B)
red |= bit4 << 6;
if (ohcomp & 0x04)
red |= bit3 << 6;
if (ohcomp & 0x10)
red |= bit5 << 7;
if (ohcomp & 0x0F)
red |= bit2 << 7;
if (ohcomp & 0x05)
red |= bit1 << 8;
if (ohcomp & 0x0A)
red |= bit0 << 8;
if (ohcomp & 0x05)
red |= bit0 << 9;
if (ohcomp & 0x02)
red |= bit6 << 9;
if (ohcomp & 0x01)
red |= bit3 << 10;
if (ohcomp & 0x02)
red |= bit5 << 10;
// expand to 12 bits.
static const int shamts[6] { 1, 1, 2, 3, 4, 5 };
int shamt = shamts[mode];
red <<= shamt;
green <<= shamt;
blue <<= shamt;
scale <<= shamt;
// on modes 0 to 4, the values stored for "green" and "blue" are differentials,
// not absolute values.
if (mode != 5)
{
green = red - green;
blue = red - blue;
}
// switch around components.
int temp;
switch (majcomp)
{
case 1:
temp = red;
red = green;
green = temp;
break;
case 2:
temp = red;
red = blue;
blue = temp;
break;
default:
break;
}
int red0 = red - scale;
int green0 = green - scale;
int blue0 = blue - scale;
// clamp to [0,0xFFF].
if (red < 0)
red = 0;
if (green < 0)
green = 0;
if (blue < 0)
blue = 0;
if (red0 < 0)
red0 = 0;
if (green0 < 0)
green0 = 0;
if (blue0 < 0)
blue0 = 0;
output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
output1 = vint4(red << 4, green << 4, blue << 4, 0x7800);
}
/**
* @brief Unpack an HDR RGB direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgb_unpack(
const uint8_t input[6],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int v2 = input[2];
int v3 = input[3];
int v4 = input[4];
int v5 = input[5];
// extract all the fixed-placement bitfields
int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1);
if (majcomp == 3)
{
output0 = vint4(v0 << 8, v2 << 8, (v4 & 0x7F) << 9, 0x7800);
output1 = vint4(v1 << 8, v3 << 8, (v5 & 0x7F) << 9, 0x7800);
return;
}
int a = v0 | ((v1 & 0x40) << 2);
int b0 = v2 & 0x3f;
int b1 = v3 & 0x3f;
int c = v1 & 0x3f;
int d0 = v4 & 0x7f;
int d1 = v5 & 0x7f;
// get hold of the number of bits in 'd0' and 'd1'
static const int dbits_tab[8] { 7, 6, 7, 6, 5, 6, 5, 6 };
int dbits = dbits_tab[modeval];
// extract six variable-placement bits
int bit0 = (v2 >> 6) & 1;
int bit1 = (v3 >> 6) & 1;
int bit2 = (v4 >> 6) & 1;
int bit3 = (v5 >> 6) & 1;
int bit4 = (v4 >> 5) & 1;
int bit5 = (v5 >> 5) & 1;
// and prepend the variable-placement bits depending on mode.
int ohmod = 1 << modeval; // one-hot-mode
if (ohmod & 0xA4)
a |= bit0 << 9;
if (ohmod & 0x8)
a |= bit2 << 9;
if (ohmod & 0x50)
a |= bit4 << 9;
if (ohmod & 0x50)
a |= bit5 << 10;
if (ohmod & 0xA0)
a |= bit1 << 10;
if (ohmod & 0xC0)
a |= bit2 << 11;
if (ohmod & 0x4)
c |= bit1 << 6;
if (ohmod & 0xE8)
c |= bit3 << 6;
if (ohmod & 0x20)
c |= bit2 << 7;
if (ohmod & 0x5B)
{
b0 |= bit0 << 6;
b1 |= bit1 << 6;
}
if (ohmod & 0x12)
{
b0 |= bit2 << 7;
b1 |= bit3 << 7;
}
if (ohmod & 0xAF)
{
d0 |= bit4 << 5;
d1 |= bit5 << 5;
}
if (ohmod & 0x5)
{
d0 |= bit2 << 6;
d1 |= bit3 << 6;
}
// sign-extend 'd0' and 'd1'
// note: this code assumes that signed right-shift actually sign-fills, not zero-fills.
int32_t d0x = d0;
int32_t d1x = d1;
int sx_shamt = 32 - dbits;
d0x <<= sx_shamt;
d0x >>= sx_shamt;
d1x <<= sx_shamt;
d1x >>= sx_shamt;
d0 = d0x;
d1 = d1x;
// expand all values to 12 bits, with left-shift as needed.
int val_shamt = (modeval >> 1) ^ 3;
a <<= val_shamt;
b0 <<= val_shamt;
b1 <<= val_shamt;
c <<= val_shamt;
d0 <<= val_shamt;
d1 <<= val_shamt;
// then compute the actual color values.
int red1 = a;
int green1 = a - b0;
int blue1 = a - b1;
int red0 = a - c;
int green0 = a - b0 - c - d0;
int blue0 = a - b1 - c - d1;
// clamp the color components to [0,2^12 - 1]
red0 = astc::clamp(red0, 0, 4095);
green0 = astc::clamp(green0, 0, 4095);
blue0 = astc::clamp(blue0, 0, 4095);
red1 = astc::clamp(red1, 0, 4095);
green1 = astc::clamp(green1, 0, 4095);
blue1 = astc::clamp(blue1, 0, 4095);
// switch around the color components
int temp0, temp1;
switch (majcomp)
{
case 1: // switch around red and green
temp0 = red0;
temp1 = red1;
red0 = green0;
red1 = green1;
green0 = temp0;
green1 = temp1;
break;
case 2: // switch around red and blue
temp0 = red0;
temp1 = red1;
red0 = blue0;
red1 = blue1;
blue0 = temp0;
blue1 = temp1;
break;
case 0: // no switch
break;
}
output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
output1 = vint4(red1 << 4, green1 << 4, blue1 << 4, 0x7800);
}
/**
* @brief Unpack an HDR RGB + LDR A direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgb_ldr_alpha_unpack(
const uint8_t input[8],
vint4& output0,
vint4& output1
) {
hdr_rgb_unpack(input, output0, output1);
int v6 = input[6];
int v7 = input[7];
output0.set_lane<3>(v6);
output1.set_lane<3>(v7);
}
/**
* @brief Unpack an HDR L (small range) direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_luminance_small_range_unpack(
const uint8_t input[2],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int y0, y1;
if (v0 & 0x80)
{
y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
y1 = (v1 & 0x1F) << 2;
}
else
{
y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
y1 = (v1 & 0xF) << 1;
}
y1 += y0;
if (y1 > 0xFFF)
{
y1 = 0xFFF;
}
output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
}
/**
* @brief Unpack an HDR L (large range) direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_luminance_large_range_unpack(
const uint8_t input[2],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int y0, y1;
if (v1 >= v0)
{
y0 = v0 << 4;
y1 = v1 << 4;
}
else
{
y0 = (v1 << 4) + 8;
y1 = (v0 << 4) - 8;
}
output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
}
/**
* @brief Unpack an HDR A direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_alpha_unpack(
const uint8_t input[2],
int& output0,
int& output1
) {
int v6 = input[0];
int v7 = input[1];
int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
v6 &= 0x7F;
v7 &= 0x7F;
if (selector == 3)
{
output0 = v6 << 5;
output1 = v7 << 5;
}
else
{
v6 |= (v7 << (selector + 1)) & 0x780;
v7 &= (0x3f >> selector);
v7 ^= 32 >> selector;
v7 -= 32 >> selector;
v6 <<= (4 - selector);
v7 <<= (4 - selector);
v7 += v6;
if (v7 < 0)
{
v7 = 0;
}
else if (v7 > 0xFFF)
{
v7 = 0xFFF;
}
output0 = v6;
output1 = v7;
}
output0 <<= 4;
output1 <<= 4;
}
/**
* @brief Unpack an HDR RGBA direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgb_hdr_alpha_unpack(
const uint8_t input[8],
vint4& output0,
vint4& output1
) {
hdr_rgb_unpack(input, output0, output1);
int alpha0, alpha1;
hdr_alpha_unpack(input + 6, alpha0, alpha1);
output0.set_lane<3>(alpha0);
output1.set_lane<3>(alpha1);
}
/* See header for documentation. */
void unpack_color_endpoints(
astcenc_profile decode_mode,
int format,
const uint8_t* input,
bool& rgb_hdr,
bool& alpha_hdr,
vint4& output0,
vint4& output1
) {
// Assume no NaNs and LDR endpoints unless set later
rgb_hdr = false;
alpha_hdr = false;
bool alpha_hdr_default = false;
switch (format)
{
case FMT_LUMINANCE:
luminance_unpack(input, output0, output1);
break;
case FMT_LUMINANCE_DELTA:
luminance_delta_unpack(input, output0, output1);
break;
case FMT_HDR_LUMINANCE_SMALL_RANGE:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_luminance_small_range_unpack(input, output0, output1);
break;
case FMT_HDR_LUMINANCE_LARGE_RANGE:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_luminance_large_range_unpack(input, output0, output1);
break;
case FMT_LUMINANCE_ALPHA:
luminance_alpha_unpack(input, output0, output1);
break;
case FMT_LUMINANCE_ALPHA_DELTA:
luminance_alpha_delta_unpack(input, output0, output1);
break;
case FMT_RGB_SCALE:
{
vint4 input0q(input[0], input[1], input[2], 0);
uint8_t scale = input[3];
rgb_scale_unpack(input0q, scale, output0, output1);
}
break;
case FMT_RGB_SCALE_ALPHA:
{
vint4 input0q(input[0], input[1], input[2], input[4]);
uint8_t alpha1q = input[5];
uint8_t scaleq = input[3];
rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1);
}
break;
case FMT_HDR_RGB_SCALE:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_rgbo_unpack(input, output0, output1);
break;
case FMT_RGB:
{
vint4 input0q(input[0], input[2], input[4], 0);
vint4 input1q(input[1], input[3], input[5], 0);
rgb_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_RGB_DELTA:
{
vint4 input0q(input[0], input[2], input[4], 0);
vint4 input1q(input[1], input[3], input[5], 0);
rgb_delta_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_HDR_RGB:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_rgb_unpack(input, output0, output1);
break;
case FMT_RGBA:
{
vint4 input0q(input[0], input[2], input[4], input[6]);
vint4 input1q(input[1], input[3], input[5], input[7]);
rgba_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_RGBA_DELTA:
{
vint4 input0q(input[0], input[2], input[4], input[6]);
vint4 input1q(input[1], input[3], input[5], input[7]);
rgba_delta_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_HDR_RGB_LDR_ALPHA:
rgb_hdr = true;
hdr_rgb_ldr_alpha_unpack(input, output0, output1);
break;
case FMT_HDR_RGBA:
rgb_hdr = true;
alpha_hdr = true;
hdr_rgb_hdr_alpha_unpack(input, output0, output1);
break;
}
// Assign a correct default alpha
if (alpha_hdr_default)
{
if (decode_mode == ASTCENC_PRF_HDR)
{
output0.set_lane<3>(0x7800);
output1.set_lane<3>(0x7800);
alpha_hdr = true;
}
else
{
output0.set_lane<3>(0x00FF);
output1.set_lane<3>(0x00FF);
alpha_hdr = false;
}
}
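// Scale 8-bit LDR endpoints up to 16 bits; multiplying by 257 replicates
// the byte into both halves (0xFF * 257 == 0xFFFF). HDR lanes are already
// in a 16-bit logical range, so they keep a unit scale.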
vint4 ldr_scale(257);
vint4 hdr_scale(1);
vint4 output_scale = ldr_scale;
// An LDR profile image
if ((decode_mode == ASTCENC_PRF_LDR) ||
(decode_mode == ASTCENC_PRF_LDR_SRGB))
{
// Also matches HDR alpha, as we cannot have HDR alpha without HDR RGB
if (rgb_hdr == true)
{
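// HDR endpoint modes are invalid in an LDR profile; decode them as the
// opaque magenta error color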
output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
output_scale = hdr_scale;
rgb_hdr = false;
alpha_hdr = false;
}
}
// An HDR profile image
else
{
vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
output_scale = select(ldr_scale, hdr_scale, hdr_lanes);
}
output0 = output0 * output_scale;
output1 = output1 * output_scale;
}

File diff suppressed because it is too large

@ -0,0 +1,472 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Functions to calculate variance per component in a NxN footprint.
*
* We need N to be parametric, so the routine below uses summed area tables in order to execute in
* O(1) time independent of how big N is.
*
* The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
* perform a binary reduction, and then distributes the results. This method means that there is no
* serial dependency between a given element and the next one, and also significantly improves
* numerical stability allowing us to use floats rather than doubles.
*/
#include "astcenc_internal.h"
#include <cassert>
/**
* @brief Generate a prefix-sum array using the Brent-Kung algorithm.
*
* This will take an input array of the form:
* v0, v1, v2, ...
* ... and modify in-place to turn it into a prefix-sum array of the form:
* v0, v0+v1, v0+v1+v2, ...
*
* @param d The array to prefix-sum.
* @param items The number of items in the array.
* @param stride The item spacing in the array; i.e. dense arrays should use 1.
*/
static void brent_kung_prefix_sum(
vfloat4* d,
size_t items,
int stride
) {
if (items < 2)
return;
size_t lc_stride = 2;
size_t log2_stride = 1;
// The reduction-tree loop
do {
size_t step = lc_stride >> 1;
size_t start = lc_stride - 1;
size_t iters = items >> log2_stride;
vfloat4 *da = d + (start * stride);
ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
size_t ofs_stride = stride << log2_stride;
while (iters)
{
*da = *da + da[ofs];
da += ofs_stride;
iters--;
}
log2_stride += 1;
lc_stride <<= 1;
} while (lc_stride <= items);
// The expansion-tree loop
do {
log2_stride -= 1;
lc_stride >>= 1;
size_t step = lc_stride >> 1;
size_t start = step + lc_stride - 1;
size_t iters = (items - step) >> log2_stride;
vfloat4 *da = d + (start * stride);
ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
size_t ofs_stride = stride << log2_stride;
while (iters)
{
*da = *da + da[ofs];
da += ofs_stride;
iters--;
}
} while (lc_stride > 2);
}
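/**
 * For reference, a plain serial prefix sum computes the same in-place
 * transform in one pass (illustrative sketch only; not used by the codec).
 * The Brent-Kung tree above is preferred because it shortens the
 * floating-point dependency chains, improving speed and numerical stability.
 */
static void serial_prefix_sum_sketch(
	vfloat4* d,
	size_t items,
	int stride
) {
	for (size_t i = 1; i < items; i++)
	{
		d[i * stride] = d[i * stride] + d[(i - 1) * stride];
	}
}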
/* See header for documentation. */
void compute_pixel_region_variance(
astcenc_contexti& ctx,
const pixel_region_args& arg
) {
// Unpack the memory structure into local variables
const astcenc_image* img = arg.img;
astcenc_swizzle swz = arg.swz;
bool have_z = arg.have_z;
int size_x = arg.size_x;
int size_y = arg.size_y;
int size_z = arg.size_z;
int offset_x = arg.offset_x;
int offset_y = arg.offset_y;
int offset_z = arg.offset_z;
int alpha_kernel_radius = arg.alpha_kernel_radius;
float* input_alpha_averages = ctx.input_alpha_averages;
vfloat4* work_memory = arg.work_memory;
// Compute memory sizes and dimensions that we need
int kernel_radius = alpha_kernel_radius;
int kerneldim = 2 * kernel_radius + 1;
int kernel_radius_xy = kernel_radius;
int kernel_radius_z = have_z ? kernel_radius : 0;
int padsize_x = size_x + kerneldim;
int padsize_y = size_y + kerneldim;
int padsize_z = size_z + (have_z ? kerneldim : 0);
int sizeprod = padsize_x * padsize_y * padsize_z;
int zd_start = have_z ? 1 : 0;
vfloat4 *varbuf1 = work_memory;
vfloat4 *varbuf2 = work_memory + sizeprod;
// Scaling factors to apply to Y and Z for accesses into the work buffers
int yst = padsize_x;
int zst = padsize_x * padsize_y;
// Scaling factors to apply to Y and Z for accesses into result buffers
int ydt = img->dim_x;
int zdt = img->dim_x * img->dim_y;
// Macros to act as accessor functions for the work-memory
#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
// Load N and N^2 values into the work buffers
if (img->data_type == ASTCENC_TYPE_U8)
{
// Swizzle data structure 4 = ZERO, 5 = ONE
uint8_t data[6];
data[ASTCENC_SWZ_0] = 0;
data[ASTCENC_SWZ_1] = 255;
for (int z = zd_start; z < padsize_z; z++)
{
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
for (int y = 1; y < padsize_y; y++)
{
int y_src = (y - 1) + offset_y - kernel_radius_xy;
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
for (int x = 1; x < padsize_x; x++)
{
int x_src = (x - 1) + offset_x - kernel_radius_xy;
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )];
data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
uint8_t r = data[swz.r];
uint8_t g = data[swz.g];
uint8_t b = data[swz.b];
uint8_t a = data[swz.a];
vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
g * (1.0f / 255.0f),
b * (1.0f / 255.0f),
a * (1.0f / 255.0f));
VARBUF1(z, y, x) = d;
VARBUF2(z, y, x) = d * d;
}
}
}
}
else if (img->data_type == ASTCENC_TYPE_F16)
{
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
uint16_t data[6];
data[ASTCENC_SWZ_0] = 0;
data[ASTCENC_SWZ_1] = 0x3C00;
for (int z = zd_start; z < padsize_z; z++)
{
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
for (int y = 1; y < padsize_y; y++)
{
int y_src = (y - 1) + offset_y - kernel_radius_xy;
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
for (int x = 1; x < padsize_x; x++)
{
int x_src = (x - 1) + offset_x - kernel_radius_xy;
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )];
data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
vfloat4 d = float16_to_float(di);
VARBUF1(z, y, x) = d;
VARBUF2(z, y, x) = d * d;
}
}
}
}
else // if (img->data_type == ASTCENC_TYPE_F32)
{
assert(img->data_type == ASTCENC_TYPE_F32);
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
float data[6];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
for (int z = zd_start; z < padsize_z; z++)
{
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
float* data32 = static_cast<float*>(img->data[z_src]);
for (int y = 1; y < padsize_y; y++)
{
int y_src = (y - 1) + offset_y - kernel_radius_xy;
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
for (int x = 1; x < padsize_x; x++)
{
int x_src = (x - 1) + offset_x - kernel_radius_xy;
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )];
data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
float r = data[swz.r];
float g = data[swz.g];
float b = data[swz.b];
float a = data[swz.a];
vfloat4 d(r, g, b, a);
VARBUF1(z, y, x) = d;
VARBUF2(z, y, x) = d * d;
}
}
}
}
// Pad with an extra layer of 0s; this forms the edge of the SAT tables
vfloat4 vbz = vfloat4::zero();
for (int z = 0; z < padsize_z; z++)
{
for (int y = 0; y < padsize_y; y++)
{
VARBUF1(z, y, 0) = vbz;
VARBUF2(z, y, 0) = vbz;
}
for (int x = 0; x < padsize_x; x++)
{
VARBUF1(z, 0, x) = vbz;
VARBUF2(z, 0, x) = vbz;
}
}
if (have_z)
{
for (int y = 0; y < padsize_y; y++)
{
for (int x = 0; x < padsize_x; x++)
{
VARBUF1(0, y, x) = vbz;
VARBUF2(0, y, x) = vbz;
}
}
}
// Generate summed-area tables for N and N^2; this is done in-place, using
// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
for (int z = zd_start; z < padsize_z; z++)
{
for (int y = 1; y < padsize_y; y++)
{
brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
}
}
for (int z = zd_start; z < padsize_z; z++)
{
for (int x = 1; x < padsize_x; x++)
{
brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
}
}
if (have_z)
{
for (int y = 1; y < padsize_y; y++)
{
for (int x = 1; x < padsize_x; x++)
{
brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
}
}
}
// Compute a few constants used in the variance-calculation.
float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
float alpha_rsamples;
if (have_z)
{
alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
}
else
{
alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
}
// Use the summed-area tables to compute variance for each neighborhood
if (have_z)
{
for (int z = 0; z < size_z; z++)
{
int z_src = z + kernel_radius_z;
int z_dst = z + offset_z;
int z_low = z_src - alpha_kernel_radius;
int z_high = z_src + alpha_kernel_radius + 1;
for (int y = 0; y < size_y; y++)
{
int y_src = y + kernel_radius_xy;
int y_dst = y + offset_y;
int y_low = y_src - alpha_kernel_radius;
int y_high = y_src + alpha_kernel_radius + 1;
for (int x = 0; x < size_x; x++)
{
int x_src = x + kernel_radius_xy;
int x_dst = x + offset_x;
int x_low = x_src - alpha_kernel_radius;
int x_high = x_src + alpha_kernel_radius + 1;
// Summed-area table lookups for alpha average
float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
- VARBUF1(z_high, y_low, x_high).lane<3>()
- VARBUF1(z_high, y_high, x_low).lane<3>()
+ VARBUF1(z_high, y_high, x_high).lane<3>()) -
( VARBUF1(z_low, y_low, x_low).lane<3>()
- VARBUF1(z_low, y_low, x_high).lane<3>()
- VARBUF1(z_low, y_high, x_low).lane<3>()
+ VARBUF1(z_low, y_high, x_high).lane<3>());
int out_index = z_dst * zdt + y_dst * ydt + x_dst;
input_alpha_averages[out_index] = (vasum * alpha_rsamples);
}
}
}
}
else
{
for (int y = 0; y < size_y; y++)
{
int y_src = y + kernel_radius_xy;
int y_dst = y + offset_y;
int y_low = y_src - alpha_kernel_radius;
int y_high = y_src + alpha_kernel_radius + 1;
for (int x = 0; x < size_x; x++)
{
int x_src = x + kernel_radius_xy;
int x_dst = x + offset_x;
int x_low = x_src - alpha_kernel_radius;
int x_high = x_src + alpha_kernel_radius + 1;
// Summed-area table lookups for alpha average
float vasum = VARBUF1(0, y_low, x_low).lane<3>()
- VARBUF1(0, y_low, x_high).lane<3>()
- VARBUF1(0, y_high, x_low).lane<3>()
+ VARBUF1(0, y_high, x_high).lane<3>();
int out_index = y_dst * ydt + x_dst;
input_alpha_averages[out_index] = (vasum * alpha_rsamples);
}
}
}
}
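/* Editorial note, not part of astcenc: the lookups above use the standard
* summed-area table identity. For a table S holding inclusive prefix sums,
* the sum over the box (x_low, x_high] x (y_low, y_high] is recovered by
* inclusion-exclusion:
*
*     sum =   S(y_high, x_high) - S(y_high, x_low)
*           - S(y_low,  x_high) + S(y_low,  x_low)
*
* The 3D path applies the same identity a second time along the Z axis.
*/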
/* See header for documentation. */
unsigned int init_compute_averages(
const astcenc_image& img,
unsigned int alpha_kernel_radius,
const astcenc_swizzle& swz,
avg_args& ag
) {
unsigned int size_x = img.dim_x;
unsigned int size_y = img.dim_y;
unsigned int size_z = img.dim_z;
// Compute maximum block size and from that the working memory buffer size
unsigned int kernel_radius = alpha_kernel_radius;
unsigned int kerneldim = 2 * kernel_radius + 1;
bool have_z = (size_z > 1);
unsigned int max_blk_size_xy = have_z ? 16 : 32;
unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);
unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);
// Perform block-wise averages calculations across the image
// Initialize fields which are not populated until later
ag.arg.size_x = 0;
ag.arg.size_y = 0;
ag.arg.size_z = 0;
ag.arg.offset_x = 0;
ag.arg.offset_y = 0;
ag.arg.offset_z = 0;
ag.arg.work_memory = nullptr;
ag.arg.img = &img;
ag.arg.swz = swz;
ag.arg.have_z = have_z;
ag.arg.alpha_kernel_radius = alpha_kernel_radius;
ag.img_size_x = size_x;
ag.img_size_y = size_y;
ag.img_size_z = size_z;
ag.blk_size_xy = max_blk_size_xy;
ag.blk_size_z = max_blk_size_z;
ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
// The parallel task count
unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
return z_tasks * y_tasks;
}
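/* Editorial example, not part of astcenc: for a 256x256 2D image the block
* size is 32x32, so the function above returns
*
*     z_tasks = 1
*     y_tasks = (256 + 32 - 1) / 32 = 8
*
* i.e. 8 parallel tasks, each averaging one 32-texel-high row of blocks.
*/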
#endif

View File

@ -0,0 +1,623 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions to decompress a symbolic block.
*/
#include "astcenc_internal.h"
#include <stdio.h>
#include <assert.h>
/**
* @brief Compute the integer linear interpolation of two color endpoints.
*
* @param decode_mode The ASTC profile (linear or sRGB)
* @param color0 The endpoint0 color.
* @param color1 The endpoint1 color.
* @param weights The interpolation weights (between 0 and 64).
*
* @return The interpolated color.
*/
static vint4 lerp_color_int(
astcenc_profile decode_mode,
vint4 color0,
vint4 color1,
vint4 weights
) {
vint4 weight1 = weights;
vint4 weight0 = vint4(64) - weight1;
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
color0 = asr<8>(color0);
color1 = asr<8>(color1);
}
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
color = asr<6>(color);
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
color = color * vint4(257);
}
return color;
}
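/* Editorial example, not part of astcenc: with the 6-bit weight range the
* interpolation above is (c0 * (64 - w) + c1 * w + 32) >> 6, so for
* c0 = 0, c1 = 255 and the mid-point weight w = 32:
*
*     (0 * 32 + 255 * 32 + 32) >> 6 = 8192 >> 6 = 128
*
* For sRGB the inputs are first reduced to their top 8 bits and the result
* is multiplied by 257 (0x101), which replicates the byte into a 16-bit
* value (e.g. 0xAB * 257 = 0xABAB).
*/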
/**
* @brief Convert integer color value into a float value for the decoder.
*
* @param data The integer color value post-interpolation.
* @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16).
*
* @return The float color value.
*/
static inline vfloat4 decode_texel(
vint4 data,
vmask4 lns_mask
) {
vint4 color_lns = vint4::zero();
vint4 color_unorm = vint4::zero();
if (any(lns_mask))
{
color_lns = lns_to_sf16(data);
}
if (!all(lns_mask))
{
color_unorm = unorm16_to_sf16(data);
}
// Pick components and then convert to FP16
vint4 datai = select(color_unorm, color_lns, lns_mask);
return float16_to_float(datai);
}
/* See header for documentation. */
void unpack_weights(
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const decimation_info& di,
bool is_dual_plane,
int weights_plane1[BLOCK_MAX_TEXELS],
int weights_plane2[BLOCK_MAX_TEXELS]
) {
// Safe to overshoot as all arrays are allocated to full size
if (!is_dual_plane)
{
// Build full 64-entry weight lookup table
vint4 tab0(reinterpret_cast<const int*>(scb.weights + 0));
vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16));
vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32));
vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48));
vint tab0p, tab1p, tab2p, tab3p;
vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint summed_value(8);
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
{
vint texel_weights(di.texel_weights_tr[j] + i);
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
}
store(lsr<4>(summed_value), weights_plane1 + i);
}
}
else
{
// Build a 32-entry weight lookup table per plane
// Plane 1
vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights + 0));
vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16));
vint tab0_plane1p, tab1_plane1p;
vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
// Plane 2
vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32));
vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48));
vint tab0_plane2p, tab1_plane2p;
vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint sum_plane1(8);
vint sum_plane2(8);
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
{
vint texel_weights(di.texel_weights_tr[j] + i);
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
}
store(lsr<4>(sum_plane1), weights_plane1 + i);
store(lsr<4>(sum_plane2), weights_plane2 + i);
}
}
}
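/* Editorial note, not part of astcenc: per texel t, the SIMD loops above
* evaluate the scalar expression
*
*     w(t) = (8 + sum_j tab[di.texel_weights_tr[j][t]]
*                     * di.texel_weight_contribs_int_tr[j][t]) >> 4
*
* where tab is the lookup table built from the stored block weights
* (scb.weights): a rounded fixed-point interpolation of the decimated
* weight grid back up to the full texel grid.
*/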
/**
* @brief Return an FP32 NaN value for use in error colors.
*
* This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
*
* @return The float color value.
*/
static float error_color_nan()
{
if32 v;
v.u = 0xFFFFE000U;
return v.f;
}
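/* Editorial note, not part of astcenc: truncating this FP32 payload to FP16
* keeps the sign, the all-ones exponent, and the top 10 mantissa bits, which
* are also all ones, so the value converts to exactly 0xFFFF as stated.
*/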
/* See header for documentation. */
void decompress_symbolic_block(
astcenc_profile decode_mode,
const block_size_descriptor& bsd,
int xpos,
int ypos,
int zpos,
const symbolic_compressed_block& scb,
image_block& blk
) {
blk.xpos = xpos;
blk.ypos = ypos;
blk.zpos = zpos;
blk.data_min = vfloat4::zero();
blk.data_mean = vfloat4::zero();
blk.data_max = vfloat4::zero();
blk.grayscale = false;
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
for (unsigned int i = 0; i < bsd.texel_count; i++)
{
blk.data_r[i] = error_color_nan();
blk.data_g[i] = error_color_nan();
blk.data_b[i] = error_color_nan();
blk.data_a[i] = error_color_nan();
blk.rgb_lns[i] = 0;
blk.alpha_lns[i] = 0;
}
return;
}
if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
(scb.block_type == SYM_BTYPE_CONST_U16))
{
vfloat4 color;
uint8_t use_lns = 0;
// UNORM16 constant color block
if (scb.block_type == SYM_BTYPE_CONST_U16)
{
vint4 colori(scb.constant_color);
// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
colori = asr<8>(colori) * 257;
}
vint4 colorf16 = unorm16_to_sf16(colori);
color = float16_to_float(colorf16);
}
// FLOAT16 constant color block
else
{
switch (decode_mode)
{
case ASTCENC_PRF_LDR_SRGB:
case ASTCENC_PRF_LDR:
color = vfloat4(error_color_nan());
break;
case ASTCENC_PRF_HDR_RGB_LDR_A:
case ASTCENC_PRF_HDR:
// Constant-color block; unpack from FP16 to FP32.
color = float16_to_float(vint4(scb.constant_color));
use_lns = 1;
break;
}
}
for (unsigned int i = 0; i < bsd.texel_count; i++)
{
blk.data_r[i] = color.lane<0>();
blk.data_g[i] = color.lane<1>();
blk.data_b[i] = color.lane<2>();
blk.data_a[i] = color.lane<3>();
blk.rgb_lns[i] = use_lns;
blk.alpha_lns[i] = use_lns;
}
return;
}
// Get the appropriate partition-table entry
int partition_count = scb.partition_count;
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
// Get the appropriate block descriptors
const auto& bm = bsd.get_block_mode(scb.block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
int plane2_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
// Now that we have endpoint colors and weights, we can unpack texel colors
int plane2_component = scb.plane2_component;
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
for (int i = 0; i < partition_count; i++)
{
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(decode_mode,
scb.color_formats[i],
scb.color_values[i],
rgb_lns, a_lns,
ep0, ep1);
vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
int texel_count = pi.partition_texel_count[i];
for (int j = 0; j < texel_count; j++)
{
int tix = pi.texels_of_partition[i][j];
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
vfloat4 colorf = decode_texel(color, lns_mask);
blk.data_r[tix] = colorf.lane<0>();
blk.data_g[tix] = colorf.lane<1>();
blk.data_b[tix] = colorf.lane<2>();
blk.data_a[tix] = colorf.lane<3>();
}
}
}
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/* See header for documentation. */
float compute_symbolic_block_difference_2plane(
const astcenc_config& config,
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const image_block& blk
) {
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
return ERROR_CALC_DEFAULT;
}
assert(scb.block_mode >= 0);
assert(scb.partition_count == 1);
assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
// Get the appropriate block descriptor
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
int plane2_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
vfloat4 summa = vfloat4::zero();
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(config.profile,
scb.color_formats[0],
scb.color_values[0],
rgb_lns, a_lns,
ep0, ep1);
// Unpack and compute error for each texel in the partition
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i++)
{
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
vfloat4 color = int_to_float(colori);
vfloat4 oldColor = blk.texel(i);
// Compare error using a perceptual decode metric for RGBM textures
if (config.flags & ASTCENC_FLG_MAP_RGBM)
{
// Fail encodings that result in zero weight M pixels. Note that this can cause
// "interesting" artifacts if we reject all useful encodings - we typically get max
// brightness encodings instead which look just as bad. We recommend users apply a
// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
// getting small M values post-quantization, but we can't prove it would never
// happen, especially at low bit rates ...
if (color.lane<3>() == 0.0f)
{
return -ERROR_CALC_DEFAULT;
}
// Compute error based on decoded RGBM color
color = vfloat4(
color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
1.0f
);
oldColor = vfloat4(
oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
1.0f
);
}
vfloat4 error = oldColor - color;
error = min(abs(error), 1e15f);
error = error * error;
summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
}
return summa.lane<0>();
}
/* See header for documentation. */
float compute_symbolic_block_difference_1plane(
const astcenc_config& config,
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const image_block& blk
) {
assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
return ERROR_CALC_DEFAULT;
}
assert(scb.block_mode >= 0);
// Get the appropriate partition-table entry
unsigned int partition_count = scb.partition_count;
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
// Get the appropriate block descriptor
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
vfloat4 summa = vfloat4::zero();
for (unsigned int i = 0; i < partition_count; i++)
{
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(config.profile,
scb.color_formats[i],
scb.color_values[i],
rgb_lns, a_lns,
ep0, ep1);
// Unpack and compute error for each texel in the partition
unsigned int texel_count = pi.partition_texel_count[i];
for (unsigned int j = 0; j < texel_count; j++)
{
unsigned int tix = pi.texels_of_partition[i][j];
vint4 colori = lerp_color_int(config.profile, ep0, ep1,
vint4(plane1_weights[tix]));
vfloat4 color = int_to_float(colori);
vfloat4 oldColor = blk.texel(tix);
// Compare error using a perceptual decode metric for RGBM textures
if (config.flags & ASTCENC_FLG_MAP_RGBM)
{
// Fail encodings that result in zero weight M pixels. Note that this can cause
// "interesting" artifacts if we reject all useful encodings - we typically get max
// brightness encodings instead which look just as bad. We recommend users apply a
// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
// getting small M values post-quantization, but we can't prove it would never
// happen, especially at low bit rates ...
if (color.lane<3>() == 0.0f)
{
return -ERROR_CALC_DEFAULT;
}
// Compute error based on decoded RGBM color
color = vfloat4(
color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
1.0f
);
oldColor = vfloat4(
oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
1.0f
);
}
vfloat4 error = oldColor - color;
error = min(abs(error), 1e15f);
error = error * error;
summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
}
}
return summa.lane<0>();
}
/* See header for documentation. */
float compute_symbolic_block_difference_1plane_1partition(
const astcenc_config& config,
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const image_block& blk
) {
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
return ERROR_CALC_DEFAULT;
}
assert(scb.block_mode >= 0);
assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
// Get the appropriate block descriptor
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(config.profile,
scb.color_formats[0],
scb.color_values[0],
rgb_lns, a_lns,
ep0, ep1);
// Pre-shift sRGB so things round correctly
if (config.profile == ASTCENC_PRF_LDR_SRGB)
{
ep0 = asr<8>(ep0);
ep1 = asr<8>(ep1);
}
// Unpack and compute error for each texel in the partition
vfloatacc summav = vfloatacc::zero();
vint lane_id = vint::lane_id();
vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
// Compute EP1 contribution
vint weight1 = vint::loada(plane1_weights + i);
vint ep1_r = vint(ep1.lane<0>()) * weight1;
vint ep1_g = vint(ep1.lane<1>()) * weight1;
vint ep1_b = vint(ep1.lane<2>()) * weight1;
vint ep1_a = vint(ep1.lane<3>()) * weight1;
// Compute EP0 contribution
vint weight0 = vint(64) - weight1;
vint ep0_r = vint(ep0.lane<0>()) * weight0;
vint ep0_g = vint(ep0.lane<1>()) * weight0;
vint ep0_b = vint(ep0.lane<2>()) * weight0;
vint ep0_a = vint(ep0.lane<3>()) * weight0;
// Shift so things round correctly
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
// Compute color diff
vfloat color_r = int_to_float(colori_r);
vfloat color_g = int_to_float(colori_g);
vfloat color_b = int_to_float(colori_b);
vfloat color_a = int_to_float(colori_a);
vfloat color_orig_r = loada(blk.data_r + i);
vfloat color_orig_g = loada(blk.data_g + i);
vfloat color_orig_b = loada(blk.data_b + i);
vfloat color_orig_a = loada(blk.data_a + i);
vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
// Compute squared error metric
color_error_r = color_error_r * color_error_r;
color_error_g = color_error_g * color_error_g;
color_error_b = color_error_b * color_error_b;
color_error_a = color_error_a * color_error_a;
vfloat metric = color_error_r * blk.channel_weight.lane<0>()
+ color_error_g * blk.channel_weight.lane<1>()
+ color_error_b * blk.channel_weight.lane<2>()
+ color_error_a * blk.channel_weight.lane<3>();
// Mask off bad lanes
vmask mask = lane_id < vint(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
haccumulate(summav, metric, mask);
}
return hadd_s(summav);
}
#endif

View File

@ -0,0 +1,230 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for the library entrypoint.
*/
#if defined(ASTCENC_DIAGNOSTICS)
#include <cassert>
#include <cstdarg>
#include <cstdio>
#include <string>
#include "astcenc_diagnostic_trace.h"
/** @brief The global trace logger. */
static TraceLog* g_TraceLog = nullptr;
/** @brief The JSON indentation level. */
static const size_t g_trace_indent = 2;
TraceLog::TraceLog(
const char* file_name):
m_file(file_name, std::ofstream::out | std::ofstream::binary)
{
assert(!g_TraceLog);
g_TraceLog = this;
m_root = new TraceNode("root");
}
/* See header for documentation. */
TraceNode* TraceLog::get_current_leaf()
{
if (m_stack.size())
{
return m_stack.back();
}
return nullptr;
}
/* See header for documentation. */
size_t TraceLog::get_depth()
{
return m_stack.size();
}
/* See header for documentation. */
TraceLog::~TraceLog()
{
assert(g_TraceLog == this);
delete m_root;
g_TraceLog = nullptr;
}
/* See header for documentation. */
TraceNode::TraceNode(
const char* format,
...
) {
// Format the name string
constexpr size_t bufsz = 256;
char buffer[bufsz];
va_list args;
va_start (args, format);
vsnprintf (buffer, bufsz, format, args);
va_end (args);
// Guarantee there is a nul terminator
buffer[bufsz - 1] = 0;
// Generate the node
TraceNode* parent = g_TraceLog->get_current_leaf();
size_t depth = g_TraceLog->get_depth();
g_TraceLog->m_stack.push_back(this);
bool comma = parent && parent->m_attrib_count;
auto& out = g_TraceLog->m_file;
if (parent)
{
parent->m_attrib_count++;
}
if (comma)
{
out << ',';
}
if (depth)
{
out << '\n';
}
size_t out_indent = (depth * 2) * g_trace_indent;
size_t in_indent = (depth * 2 + 1) * g_trace_indent;
std::string out_indents("");
if (out_indent)
{
out_indents = std::string(out_indent, ' ');
}
std::string in_indents(in_indent, ' ');
out << out_indents << "[ \"node\", \"" << buffer << "\",\n";
out << in_indents << "[";
}
/* See header for documentation. */
void TraceNode::add_attrib(
std::string type,
std::string key,
std::string value
) {
(void)type;
size_t depth = g_TraceLog->get_depth();
size_t indent = (depth * 2) * g_trace_indent;
auto& out = g_TraceLog->m_file;
bool comma = m_attrib_count;
m_attrib_count++;
if (comma)
{
out << ',';
}
out << '\n';
out << std::string(indent, ' ') << "[ "
<< "\"" << key << "\", "
<< value << " ]";
}
/* See header for documentation. */
TraceNode::~TraceNode()
{
g_TraceLog->m_stack.pop_back();
auto& out = g_TraceLog->m_file;
size_t depth = g_TraceLog->get_depth();
size_t out_indent = (depth * 2) * g_trace_indent;
size_t in_indent = (depth * 2 + 1) * g_trace_indent;
std::string out_indents("");
if (out_indent)
{
out_indents = std::string(out_indent, ' ');
}
std::string in_indents(in_indent, ' ');
if (m_attrib_count)
{
out << "\n" << in_indents;
}
out << "]\n";
out << out_indents << "]";
}
/* See header for documentation. */
void trace_add_data(
const char* key,
const char* format,
...
) {
constexpr size_t bufsz = 256;
char buffer[bufsz];
va_list args;
va_start (args, format);
vsnprintf (buffer, bufsz, format, args);
va_end (args);
// Guarantee there is a nul terminator
buffer[bufsz - 1] = 0;
std::string value = "\"" + std::string(buffer) + "\"";
TraceNode* node = g_TraceLog->get_current_leaf();
node->add_attrib("str", key, value);
}
/* See header for documentation. */
void trace_add_data(
const char* key,
float value
) {
char buffer[256];
sprintf(buffer, "%.20g", (double)value);
TraceNode* node = g_TraceLog->get_current_leaf();
node->add_attrib("float", key, buffer);
}
/* See header for documentation. */
void trace_add_data(
const char* key,
int value
) {
TraceNode* node = g_TraceLog->get_current_leaf();
node->add_attrib("int", key, std::to_string(value));
}
/* See header for documentation. */
void trace_add_data(
const char* key,
unsigned int value
) {
TraceNode* node = g_TraceLog->get_current_leaf();
node->add_attrib("int", key, std::to_string(value));
}
#endif

View File

@ -0,0 +1,219 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief This module provides a set of diagnostic tracing utilities.
*
* Overview
* ========
*
* The built-in diagnostic trace tool generates a hierarchical JSON tree structure. The tree
* hierarchy contains three levels:
*
* - block
* - pass
* - candidate
*
* One block node exists for each compressed block in the image. One pass node exists for each major
* pass (N partitions, M planes, O components) applied to a block. One candidate node exists for each
* encoding candidate trialed for a pass.
*
* Each node carries not only the child hierarchy but also a number of attributes which explain the
* For example, the block node contains the block coordinates in the image, the pass explains the
* pass configuration, and the candidate will explain the candidate encoding such as weight
* decimation, refinement error, etc.
*
* Trace Nodes are designed as scope-managed C++ objects with stack-like push/pop behavior.
* Constructing a trace node on the stack will automatically add it to the current node as a child,
* and then make it the current node. Destroying the current node will pop the stack and set the
* parent to the current node. This provides a robust mechanism for ensuring reliable nesting in the
* tree structure.
*
* A set of utility macros are provided to add attribute annotations to the current trace node.
*
* Usage
* =====
*
* Create Trace Nodes on the stack using the @c TRACE_NODE() macro. This will compile-out completely
* in builds with diagnostics disabled.
*
* Add annotations to the current trace node using the @c trace_add_data() macro. This will
* similarly compile out completely in builds with diagnostics disabled.
*
* If you need to add additional code to support diagnostics-only behavior wrap
* it in preprocessor guards:
*
* #if defined(ASTCENC_DIAGNOSTICS)
* #endif
*/
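/*
* Editorial sketch, not part of astcenc: a hypothetical trace of one
* candidate, using only the macros declared below (the variable names are
* illustrative), would look like:
*
*     TRACE_NODE(node, "candidate: %u", candidate_index);
*     trace_add_data("weight_decimation", decimation_mode);
*     trace_add_data("refinement_error", refinement_error);
*
* The node closes automatically when it goes out of scope, and the writer
* emits nested JSON arrays of the form [ "node", "<name>", [ ...attribs ] ].
*/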
#ifndef ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
#define ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
#if defined(ASTCENC_DIAGNOSTICS)
#include <iostream>
#include <fstream>
#include <vector>
/**
* @brief Class representing a single node in the trace hierarchy.
*/
class TraceNode
{
public:
/**
* @brief Construct a new node.
*
* Constructing a node will push to the top of the stack, automatically making it a child of
* the current node, and then setting it to become the current node.
*
* @param format The format template for the node name.
* @param ... The format parameters.
*/
TraceNode(const char* format, ...);
/**
* @brief Add an attribute to this node.
*
* Note that no quoting is applied to the @c value, so if quoting is needed it must be done by
* the caller.
*
* @param type The type of the attribute.
* @param key The key of the attribute.
* @param value The value of the attribute.
*/
void add_attrib(std::string type, std::string key, std::string value);
/**
* @brief Destroy this node.
*
* Destroying a node will pop it from the top of the stack, making its parent the current node.
* It is invalid behavior to destroy a node that is not the current node; usage must conform to
* stack push-pop semantics.
*/
~TraceNode();
/**
* @brief The number of attributes and child nodes in this node.
*/
unsigned int m_attrib_count { 0 };
};
/**
* @brief Class representing the trace log file being written.
*/
class TraceLog
{
public:
/**
* @brief Create a new trace log.
*
* The trace log is global; there can be only one at a time.
*
* @param file_name The name of the file to write.
*/
TraceLog(const char* file_name);
/**
* @brief Destroy the trace log.
*
* Trace logs MUST be cleanly destroyed to ensure the file gets written.
*/
~TraceLog();
/**
* @brief Get the current child node.
*
* @return The current leaf node.
*/
TraceNode* get_current_leaf();
/**
* @brief Get the stack depth of the current child node.
*
* @return The current leaf node stack depth.
*/
size_t get_depth();
/**
* @brief The file stream to write to.
*/
std::ofstream m_file;
/**
* @brief The stack of nodes (newest at the back).
*/
std::vector<TraceNode*> m_stack;
private:
/**
* @brief The root node in the JSON file.
*/
TraceNode* m_root;
};
/**
* @brief Utility macro to create a trace node on the stack.
*
* @param name The variable name to use.
* @param ... The name template and format parameters.
*/
#define TRACE_NODE(name, ...) TraceNode name(__VA_ARGS__);
/**
* @brief Add a string annotation to the current node.
*
* @param key The name of the attribute.
* @param format The format template for the attribute value.
* @param ... The format parameters.
*/
void trace_add_data(const char* key, const char* format, ...);
/**
* @brief Add a float annotation to the current node.
*
* @param key The name of the attribute.
* @param value The value of the attribute.
*/
void trace_add_data(const char* key, float value);
/**
* @brief Add an integer annotation to the current node.
*
* @param key The name of the attribute.
* @param value The value of the attribute.
*/
void trace_add_data(const char* key, int value);
/**
* @brief Add an unsigned integer annotation to the current node.
*
* @param key The name of the attribute.
* @param value The value of the attribute.
*/
void trace_add_data(const char* key, unsigned int value);
#else
#define TRACE_NODE(name, ...)
#define trace_add_data(...)
#endif
#endif

1427
thirdparty/astcenc/astcenc_entry.cpp vendored Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,780 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Functions for finding best partition for a block.
*
* The partition search operates in two stages. The first pass uses kmeans clustering to group
* texels into an ideal partitioning for the requested partition count, and then compares that
* against the 1024 partitionings generated by the ASTC partition hash function. The generated
* partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
* clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
* partitionings that actually generate fewer than the requested partition count, but only the top
* N candidates are actually put through a more detailed search. N is determined by the compressor
* quality preset.
*
* For the detailed search, each candidate is checked against two possible encoding methods:
*
* - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
* - The best partitioning assuming same chroma colors (RGB + scale endpoints).
*
* This is implemented by computing the mean color and dominant direction for each
* partition. This defines two lines, both of which go through the mean color value.
*
* - One line has a direction defined by the dominant direction; this is used to assess the error
* from using an uncorrelated color representation.
* - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
* (RGB + scale) color representation.
*
* The best candidate is selected by computing the squared-errors that result from using these
* lines for endpoint selection.
*/
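/* Editorial sketch, not part of astcenc: given a processed line with fields
* amod = a - b * dot(a, b) and bs = b (a unit direction), the squared error
* of a color against that line is the squared distance to its projection:
*
*     vfloat4 proj = pl.amod + pl.bs * dot(color, pl.bs);
*     vfloat4 diff = color - proj;
*     float err = dot_s(diff * diff, blk.channel_weight);
*
* Summing this over the texels of a partition yields the uncorrelated and
* same-chroma errors accumulated by the search below.
*/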
#include <limits>
#include "astcenc_internal.h"
/**
* @brief Pick some initial kmeans cluster centers.
*
* @param blk The image block color data to compress.
* @param texel_count The number of texels in the block.
* @param partition_count The number of partitions in the block.
* @param[out] cluster_centers The initial partition cluster center colors.
*/
static void kmeans_init(
const image_block& blk,
unsigned int texel_count,
unsigned int partition_count,
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
) {
promise(texel_count > 0);
promise(partition_count > 0);
unsigned int clusters_selected = 0;
float distances[BLOCK_MAX_TEXELS];
// Pick a random sample as first cluster center; 145897 from random.org
unsigned int sample = 145897 % texel_count;
vfloat4 center_color = blk.texel(sample);
cluster_centers[clusters_selected] = center_color;
clusters_selected++;
// Compute the distance to the first cluster center
float distance_sum = 0.0f;
for (unsigned int i = 0; i < texel_count; i++)
{
vfloat4 color = blk.texel(i);
vfloat4 diff = color - center_color;
float distance = dot_s(diff * diff, blk.channel_weight);
distance_sum += distance;
distances[i] = distance;
}
// More numbers from random.org for weighted-random center selection
const float cluster_cutoffs[9] {
0.626220f, 0.932770f, 0.275454f,
0.318558f, 0.240113f, 0.009190f,
0.347661f, 0.731960f, 0.156391f
};
unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
// Pick the remaining samples as needed
while (true)
{
// Pick the next center in a weighted-random fashion.
float summa = 0.0f;
float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
for (sample = 0; sample < texel_count; sample++)
{
summa += distances[sample];
if (summa >= distance_cutoff)
{
break;
}
}
// Clamp to a valid range and store the selected cluster center
sample = astc::min(sample, texel_count - 1);
center_color = blk.texel(sample);
cluster_centers[clusters_selected++] = center_color;
if (clusters_selected >= partition_count)
{
break;
}
// Compute the distance to the new cluster center, keep the min dist
distance_sum = 0.0f;
for (unsigned int i = 0; i < texel_count; i++)
{
vfloat4 color = blk.texel(i);
vfloat4 diff = color - center_color;
float distance = dot_s(diff * diff, blk.channel_weight);
distance = astc::min(distance, distances[i]);
distance_sum += distance;
distances[i] = distance;
}
}
}
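/* Editorial note, not part of astcenc: the weighted-random selection above
* mirrors k-means++ seeding. Each new center is drawn with probability
* proportional to a texel's squared-error distance from its nearest existing
* center, with the fixed cutoff table standing in for random draws so the
* result is deterministic.
*/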
/**
* @brief Assign texels to clusters, based on a set of chosen center points.
*
* @param blk The image block color data to compress.
* @param texel_count The number of texels in the block.
* @param partition_count The number of partitions in the block.
* @param cluster_centers The partition cluster center colors.
* @param[out] partition_of_texel The partition assigned for each texel.
*/
static void kmeans_assign(
const image_block& blk,
unsigned int texel_count,
unsigned int partition_count,
const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
) {
promise(texel_count > 0);
promise(partition_count > 0);
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
// Find the best partition for every texel
for (unsigned int i = 0; i < texel_count; i++)
{
float best_distance = std::numeric_limits<float>::max();
unsigned int best_partition = 0;
vfloat4 color = blk.texel(i);
for (unsigned int j = 0; j < partition_count; j++)
{
vfloat4 diff = color - cluster_centers[j];
float distance = dot_s(diff * diff, blk.channel_weight);
if (distance < best_distance)
{
best_distance = distance;
best_partition = j;
}
}
partition_of_texel[i] = static_cast<uint8_t>(best_partition);
partition_texel_count[best_partition]++;
}
// It is possible to get a situation where a partition ends up without any texels. In this case,
// assign texel N to partition N. This is silly, but ensures that every partition retains at
// least one texel. Reassigning a texel in this manner may cause another partition to go empty,
// so if we actually did a reassignment, run the whole loop over again.
bool problem_case;
do
{
problem_case = false;
for (unsigned int i = 0; i < partition_count; i++)
{
if (partition_texel_count[i] == 0)
{
partition_texel_count[partition_of_texel[i]]--;
partition_texel_count[i]++;
partition_of_texel[i] = static_cast<uint8_t>(i);
problem_case = true;
}
}
} while (problem_case);
}
/**
* @brief Compute new cluster centers based on their center of gravity.
*
* @param blk The image block color data to compress.
* @param texel_count The number of texels in the block.
* @param partition_count The number of partitions in the block.
* @param[out] cluster_centers The new cluster center colors.
* @param partition_of_texel The partition assigned for each texel.
*/
static void kmeans_update(
const image_block& blk,
unsigned int texel_count,
unsigned int partition_count,
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
) {
promise(texel_count > 0);
promise(partition_count > 0);
vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
vfloat4::zero(),
vfloat4::zero(),
vfloat4::zero(),
vfloat4::zero()
};
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
// Find the center-of-gravity in each cluster
for (unsigned int i = 0; i < texel_count; i++)
{
uint8_t partition = partition_of_texel[i];
color_sum[partition] += blk.texel(i);
partition_texel_count[partition]++;
}
// Set the center of gravity to be the new cluster center
for (unsigned int i = 0; i < partition_count; i++)
{
float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
cluster_centers[i] = color_sum[i] * scale;
}
}
/**
* @brief Compute bit-mismatch for partitioning in 2-partition mode.
*
* @param a The texel assignment bitvector for the block.
* @param b The texel assignment bitvector for the partition table.
*
* @return The number of bit mismatches.
*/
static inline unsigned int partition_mismatch2(
const uint64_t a[2],
const uint64_t b[2]
) {
int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
return astc::min(v1, v2);
}
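/* Editorial example, not part of astcenc: partition labels are only defined
* up to relabeling, which is why both pairings are tried. For
*
*     a = { 0b1100, 0b0011 }  and  b = { 0b0011, 0b1100 }
*
* the direct pairing scores popcount(0b1111) * 2 = 8 mismatches, while the
* swapped pairing scores 0, so the two partitionings are recognized as
* identical.
*/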
/**
* @brief Compute bit-mismatch for partitioning in 3-partition mode.
*
* @param a The texel assignment bitvector for the block.
* @param b The texel assignment bitvector for the partition table.
*
* @return The number of bit mismatches.
*/
static inline unsigned int partition_mismatch3(
const uint64_t a[3],
const uint64_t b[3]
) {
int p00 = popcount(a[0] ^ b[0]);
int p01 = popcount(a[0] ^ b[1]);
int p02 = popcount(a[0] ^ b[2]);
int p10 = popcount(a[1] ^ b[0]);
int p11 = popcount(a[1] ^ b[1]);
int p12 = popcount(a[1] ^ b[2]);
int p20 = popcount(a[2] ^ b[0]);
int p21 = popcount(a[2] ^ b[1]);
int p22 = popcount(a[2] ^ b[2]);
int s0 = p11 + p22;
int s1 = p12 + p21;
int v0 = astc::min(s0, s1) + p00;
int s2 = p10 + p22;
int s3 = p12 + p20;
int v1 = astc::min(s2, s3) + p01;
int s4 = p10 + p21;
int s5 = p11 + p20;
int v2 = astc::min(s4, s5) + p02;
return astc::min(v0, v1, v2);
}
/**
* @brief Compute bit-mismatch for partitioning in 4-partition mode.
*
* @param a The texel assignment bitvector for the block.
* @param b The texel assignment bitvector for the partition table.
*
* @return The number of bit mismatches.
*/
static inline unsigned int partition_mismatch4(
const uint64_t a[4],
const uint64_t b[4]
) {
int p00 = popcount(a[0] ^ b[0]);
int p01 = popcount(a[0] ^ b[1]);
int p02 = popcount(a[0] ^ b[2]);
int p03 = popcount(a[0] ^ b[3]);
int p10 = popcount(a[1] ^ b[0]);
int p11 = popcount(a[1] ^ b[1]);
int p12 = popcount(a[1] ^ b[2]);
int p13 = popcount(a[1] ^ b[3]);
int p20 = popcount(a[2] ^ b[0]);
int p21 = popcount(a[2] ^ b[1]);
int p22 = popcount(a[2] ^ b[2]);
int p23 = popcount(a[2] ^ b[3]);
int p30 = popcount(a[3] ^ b[0]);
int p31 = popcount(a[3] ^ b[1]);
int p32 = popcount(a[3] ^ b[2]);
int p33 = popcount(a[3] ^ b[3]);
int mx23 = astc::min(p22 + p33, p23 + p32);
int mx13 = astc::min(p21 + p33, p23 + p31);
int mx12 = astc::min(p21 + p32, p22 + p31);
int mx03 = astc::min(p20 + p33, p23 + p30);
int mx02 = astc::min(p20 + p32, p22 + p30);
int mx01 = astc::min(p21 + p30, p20 + p31);
int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
return astc::min(v0, v1, v2, v3);
}
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
/**
* @brief Count the partition table mismatches vs the data clustering.
*
* @param bsd The block size information.
* @param partition_count The number of partitions in the block.
* @param bitmaps The block texel partition assignment patterns.
* @param[out] mismatch_counts The array storing per partitioning mismatch counts.
*/
static void count_partition_mismatch_bits(
const block_size_descriptor& bsd,
unsigned int partition_count,
const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
) {
unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
promise(active_count > 0);
if (partition_count == 2)
{
for (unsigned int i = 0; i < active_count; i++)
{
mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
}
}
else if (partition_count == 3)
{
for (unsigned int i = 0; i < active_count; i++)
{
mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
}
}
else
{
for (unsigned int i = 0; i < active_count; i++)
{
mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
}
}
}
/**
* @brief Use counting sort on the mismatch array to sort partition candidates.
*
* @param partitioning_count The number of packed partitionings.
* @param mismatch_count Partitioning mismatch counts, in index order.
* @param[out] partition_ordering Partition index values, in mismatch order.
*
* @return The number of active partitions in this selection.
*/
static unsigned int get_partition_ordering_by_mismatch_bits(
unsigned int partitioning_count,
const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
) {
promise(partitioning_count > 0);
unsigned int mscount[256] { 0 };
// Create the histogram of mismatch counts
for (unsigned int i = 0; i < partitioning_count; i++)
{
mscount[mismatch_count[i]]++;
}
unsigned int active_count = partitioning_count - mscount[255];
// Create a running sum from the histogram array
// Cells store the sum of the previous values only, i.e. an exclusive prefix sum
unsigned int summa = 0;
for (unsigned int i = 0; i < 256; i++)
{
unsigned int cnt = mscount[i];
mscount[i] = summa;
summa += cnt;
}
// Use the running sum as the index, incrementing after read to allow
// sequential entries with the same count
for (unsigned int i = 0; i < partitioning_count; i++)
{
unsigned int idx = mscount[mismatch_count[i]]++;
partition_ordering[idx] = i;
}
return active_count;
}
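/* Editorial example, not part of astcenc: with mismatch counts { 1, 0, 1 }
* the histogram is mscount[0] = 1, mscount[1] = 2, which becomes the
* exclusive prefix sum mscount[0] = 0, mscount[1] = 1. Scattering each index
* to mscount[count]++ then yields partition_ordering = { 1, 0, 2 }: the
* zero-mismatch partitioning first, with ties kept in index order.
*/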
/**
* @brief Use k-means clustering to compute a partition ordering for a block.
*
* @param bsd The block size information.
* @param blk The image block color data to compress.
* @param partition_count The desired number of partitions in the block.
* @param[out] partition_ordering The list of recommended partition indices, in priority order.
*
* @return The number of active partitionings in this selection.
*/
static unsigned int compute_kmeans_partition_ordering(
const block_size_descriptor& bsd,
const image_block& blk,
unsigned int partition_count,
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
) {
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
uint8_t texel_partitions[BLOCK_MAX_TEXELS];
// Use three passes of k-means clustering to partition the block data
for (unsigned int i = 0; i < 3; i++)
{
if (i == 0)
{
kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
}
else
{
kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
}
kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
}
// Construct the block bitmaps of texel assignments to each partition
uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
promise(texels_to_process > 0);
for (unsigned int i = 0; i < texels_to_process; i++)
{
unsigned int idx = bsd.kmeans_texels[i];
bitmaps[texel_partitions[idx]] |= 1ULL << i;
}
// Count the mismatch between the block and the format's partition tables
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS];
count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
// Sort the partitions based on the number of mismatched bits
return get_partition_ordering_by_mismatch_bits(
bsd.partitioning_count_selected[partition_count - 1],
mismatch_counts, partition_ordering);
}
/**
* @brief Insert a partitioning into an order list of results, sorted by error.
*
* @param max_values The max number of entries in the best result arrays.
* @param this_error The error of the new entry.
* @param this_partition The partition ID of the new entry.
* @param[out] best_errors The array of best error values.
* @param[out] best_partitions The array of best partition values.
*/
static void insert_result(
unsigned int max_values,
float this_error,
unsigned int this_partition,
float* best_errors,
unsigned int* best_partitions)
{
promise(max_values > 0);
// Don't bother searching if the current worst error beats the new error
if (this_error >= best_errors[max_values - 1])
{
return;
}
// Else insert into the list in error-order
for (unsigned int i = 0; i < max_values; i++)
{
// Existing result is better - move on ...
if (this_error > best_errors[i])
{
continue;
}
// Move existing results down one
for (unsigned int j = max_values - 1; j > i; j--)
{
best_errors[j] = best_errors[j - 1];
best_partitions[j] = best_partitions[j - 1];
}
// Insert new result
best_errors[i] = this_error;
best_partitions[i] = this_partition;
break;
}
}
/* See header for documentation. */
unsigned int find_best_partition_candidates(
const block_size_descriptor& bsd,
const image_block& blk,
unsigned int partition_count,
unsigned int partition_search_limit,
unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
unsigned int requested_candidates
) {
// Constant used to estimate quantization error for a given partitioning; the optimal value for
// this depends on bitrate. These values have been determined empirically.
unsigned int texels_per_block = bsd.texel_count;
float weight_imprecision_estim = 0.055f;
if (texels_per_block <= 20)
{
weight_imprecision_estim = 0.03f;
}
else if (texels_per_block <= 31)
{
weight_imprecision_estim = 0.04f;
}
else if (texels_per_block <= 41)
{
weight_imprecision_estim = 0.05f;
}
promise(partition_count > 0);
promise(partition_search_limit > 0);
weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
partition_search_limit = astc::min(partition_search_limit, sequence_len);
requested_candidates = astc::min(partition_search_limit, requested_candidates);
bool uses_alpha = !blk.is_constant_channel(3);
// Partitioning errors assuming uncorrelated-chrominance endpoints
float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
// Partitioning errors assuming same-chrominance endpoints
float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
for (unsigned int i = 0; i < requested_candidates; i++)
{
uncor_best_errors[i] = ERROR_CALC_DEFAULT;
samec_best_errors[i] = ERROR_CALC_DEFAULT;
}
if (uses_alpha)
{
for (unsigned int i = 0; i < partition_search_limit; i++)
{
unsigned int partition = partition_sequence[i];
const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
// Compute weighting to give to each component in each partition
partition_metrics pms[BLOCK_MAX_PARTITIONS];
compute_avgs_and_dirs_4_comp(pi, blk, pms);
line4 uncor_lines[BLOCK_MAX_PARTITIONS];
line4 samec_lines[BLOCK_MAX_PARTITIONS];
processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
float uncor_line_lens[BLOCK_MAX_PARTITIONS];
float samec_line_lens[BLOCK_MAX_PARTITIONS];
for (unsigned int j = 0; j < partition_count; j++)
{
partition_metrics& pm = pms[j];
uncor_lines[j].a = pm.avg;
uncor_lines[j].b = normalize_safe(pm.dir, unit4());
uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
uncor_plines[j].bs = uncor_lines[j].b;
samec_lines[j].a = vfloat4::zero();
samec_lines[j].b = normalize_safe(pm.avg, unit4());
samec_plines[j].amod = vfloat4::zero();
samec_plines[j].bs = samec_lines[j].b;
}
float uncor_error = 0.0f;
float samec_error = 0.0f;
compute_error_squared_rgba(pi,
blk,
uncor_plines,
samec_plines,
uncor_line_lens,
samec_line_lens,
uncor_error,
samec_error);
// Compute an estimate of error introduced by weight quantization imprecision.
// This error is computed as follows, for each partition
// 1: compute the principal-axis vector (full length) in error-space
// 2: convert the principal-axis vector to regular RGB-space
// 3: scale the vector by a constant that estimates average quantization error
// 4: for each texel, square the vector, then do a dot-product with the texel's
// error weight; sum up the results across all texels.
// 4(optimized): square the vector once, then do a dot-product with the average
// texel error, then multiply by the number of texels.
for (unsigned int j = 0; j < partition_count; j++)
{
float tpp = static_cast<float>(pi.partition_texel_count[j]);
vfloat4 error_weights(tpp * weight_imprecision_estim);
vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j];
vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j];
uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
samec_error += dot_s(samec_vector * samec_vector, error_weights);
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
else
{
for (unsigned int i = 0; i < partition_search_limit; i++)
{
unsigned int partition = partition_sequence[i];
const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
// Compute weighting to give to each component in each partition
partition_metrics pms[BLOCK_MAX_PARTITIONS];
compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
partition_lines3 plines[BLOCK_MAX_PARTITIONS];
for (unsigned int j = 0; j < partition_count; j++)
{
partition_metrics& pm = pms[j];
partition_lines3& pl = plines[j];
pl.uncor_line.a = pm.avg;
pl.uncor_line.b = normalize_safe(pm.dir, unit3());
pl.samec_line.a = vfloat4::zero();
pl.samec_line.b = normalize_safe(pm.avg, unit3());
pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
pl.uncor_pline.bs = pl.uncor_line.b;
pl.samec_pline.amod = vfloat4::zero();
pl.samec_pline.bs = pl.samec_line.b;
}
float uncor_error = 0.0f;
float samec_error = 0.0f;
compute_error_squared_rgb(pi,
blk,
plines,
uncor_error,
samec_error);
// Compute an estimate of error introduced by weight quantization imprecision.
// This error is computed as follows, for each partition
// 1: compute the principal-axis vector (full length) in error-space
// 2: convert the principal-axis vector to regular RGB-space
// 3: scale the vector by a constant that estimates average quantization error
// 4: for each texel, square the vector, then do a dot-product with the texel's
// error weight; sum up the results across all texels.
// 4(optimized): square the vector once, then do a dot-product with the average
// texel error, then multiply by the number of texels.
for (unsigned int j = 0; j < partition_count; j++)
{
partition_lines3& pl = plines[j];
float tpp = static_cast<float>(pi.partition_texel_count[j]);
vfloat4 error_weights(tpp * weight_imprecision_estim);
vfloat4 uncor_vector = pl.uncor_line.b * pl.uncor_line_len;
vfloat4 samec_vector = pl.samec_line.b * pl.samec_line_len;
uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
samec_error += dot3_s(samec_vector * samec_vector, error_weights);
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
bool best_is_uncor = uncor_best_partitions[0] > samec_best_partitions[0];
unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
for (unsigned int i = 0; i < requested_candidates; i++)
{
if (best_is_uncor)
{
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
}
else
{
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
}
}
uint64_t bitmasks[1024/64] { 0 };
unsigned int emitted = 0;
// Deduplicate the first "requested" entries
for (unsigned int i = 0; i < requested_candidates * 2; i++)
{
unsigned int partition = interleave[i];
unsigned int word = partition / 64;
unsigned int bit = partition % 64;
bool written = bitmasks[word] & (1ull << bit);
if (!written)
{
best_partitions[emitted] = partition;
bitmasks[word] |= 1ull << bit;
emitted++;
if (emitted == requested_candidates)
{
break;
}
}
}
return emitted;
}
#endif
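// A standalone sketch (not part of upstream astcenc) distilling the
// deduplication step above: interleave two ranked candidate lists and emit
// the first N unique entries, tracking what has been seen with one bit per
// possible index (indices are assumed to be below 1024, as above).
#include <cstdint>

static unsigned int dedupe_interleaved_sketch(
	const unsigned int* interleaved, // 2 * requested entries
	unsigned int requested,
	unsigned int* out
) {
	uint64_t seen[1024 / 64] { 0 };
	unsigned int emitted = 0;
	for (unsigned int i = 0; i < requested * 2; i++)
	{
		unsigned int v = interleaved[i];
		uint64_t bit = 1ull << (v % 64);
		if (!(seen[v / 64] & bit))
		{
			seen[v / 64] |= bit;
			out[emitted++] = v;
			if (emitted == requested)
			{
				break;
			}
		}
	}
	return emitted;
}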

File diff suppressed because it is too large

558
thirdparty/astcenc/astcenc_image.cpp vendored Normal file

@ -0,0 +1,558 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for creating in-memory ASTC image structures.
*/
#include <cassert>
#include <cstring>
#include "astcenc_internal.h"
/**
* @brief Loader pipeline function type for data fetch from memory.
*/
using pixel_loader = vfloat4(*)(const void*, int);
/**
* @brief Loader pipeline function type for swizzling data in a vector.
*/
using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
/**
* @brief Loader pipeline function type for converting data in a vector to LNS.
*/
using pixel_converter = vfloat4(*)(vfloat4, vmask4);
/**
* @brief Load an 8-bit UNORM texel from a data array.
*
* @param data The data pointer.
* @param base_offset The index offset to the start of the pixel.
*/
static vfloat4 load_texel_u8(
const void* data,
int base_offset
) {
const uint8_t* data8 = static_cast<const uint8_t*>(data);
return int_to_float(vint4(data8 + base_offset)) / 255.0f;
}
/**
* @brief Load a 16-bit fp16 texel from a data array.
*
* @param data The data pointer.
* @param base_offset The index offset to the start of the pixel.
*/
static vfloat4 load_texel_f16(
const void* data,
int base_offset
) {
const uint16_t* data16 = static_cast<const uint16_t*>(data);
int r = data16[base_offset ];
int g = data16[base_offset + 1];
int b = data16[base_offset + 2];
int a = data16[base_offset + 3];
return float16_to_float(vint4(r, g, b, a));
}
/**
* @brief Load a 32-bit float texel from a data array.
*
* @param data The data pointer.
* @param base_offset The index offset to the start of the pixel.
*/
static vfloat4 load_texel_f32(
const void* data,
int base_offset
) {
const float* data32 = static_cast<const float*>(data);
return vfloat4(data32 + base_offset);
}
/**
* @brief Dummy no-op swizzle function.
*
* @param data The source RGBA vector to swizzle.
* @param swz The swizzle to use.
*/
static vfloat4 swz_texel_skip(
vfloat4 data,
const astcenc_swizzle& swz
) {
(void)swz;
return data;
}
/**
* @brief Swizzle a texel into a new arrangement.
*
* @param data The source RGBA vector to swizzle.
* @param swz The swizzle to use.
*/
static vfloat4 swz_texel(
vfloat4 data,
const astcenc_swizzle& swz
) {
alignas(16) float datas[6];
storea(data, datas);
datas[ASTCENC_SWZ_0] = 0.0f;
datas[ASTCENC_SWZ_1] = 1.0f;
return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
}
/**
* @brief Encode a texel that is entirely LDR linear.
*
* @param data The RGBA data to encode.
* @param lns_mask The mask for the HDR channels that need LNS encoding.
*/
static vfloat4 encode_texel_unorm(
vfloat4 data,
vmask4 lns_mask
) {
(void)lns_mask;
return data * 65535.0f;
}
/**
* @brief Encode a texel that includes at least some HDR LNS texels.
*
* @param data The RGBA data to encode.
* @param lns_mask The mask for the HDR channels that need LNS encoding.
*/
static vfloat4 encode_texel_lns(
vfloat4 data,
vmask4 lns_mask
) {
vfloat4 datav_unorm = data * 65535.0f;
vfloat4 datav_lns = float_to_lns(data);
return select(datav_unorm, datav_lns, lns_mask);
}
/* See header for documentation. */
void load_image_block(
astcenc_profile decode_mode,
const astcenc_image& img,
image_block& blk,
const block_size_descriptor& bsd,
unsigned int xpos,
unsigned int ypos,
unsigned int zpos,
const astcenc_swizzle& swz
) {
unsigned int xsize = img.dim_x;
unsigned int ysize = img.dim_y;
unsigned int zsize = img.dim_z;
blk.xpos = xpos;
blk.ypos = ypos;
blk.zpos = zpos;
// True if any non-identity swizzle
bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
(swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
int idx = 0;
vfloat4 data_min(1e38f);
vfloat4 data_mean(0.0f);
vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
vfloat4 data_max(-1e38f);
vmask4 grayscalev(true);
// This works because we impose the same choice everywhere during encode
uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
vmask4 lns_mask = use_lns != vint4::zero();
// Set up the function pointers for loading pipeline as needed
pixel_loader loader = load_texel_u8;
if (img.data_type == ASTCENC_TYPE_F16)
{
loader = load_texel_f16;
}
else if (img.data_type == ASTCENC_TYPE_F32)
{
loader = load_texel_f32;
}
pixel_swizzler swizzler = swz_texel_skip;
if (needs_swz)
{
swizzler = swz_texel;
}
pixel_converter converter = encode_texel_unorm;
if (any(lns_mask))
{
converter = encode_texel_lns;
}
for (unsigned int z = 0; z < bsd.zdim; z++)
{
unsigned int zi = astc::min(zpos + z, zsize - 1);
void* plane = img.data[zi];
for (unsigned int y = 0; y < bsd.ydim; y++)
{
unsigned int yi = astc::min(ypos + y, ysize - 1);
for (unsigned int x = 0; x < bsd.xdim; x++)
{
unsigned int xi = astc::min(xpos + x, xsize - 1);
vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
datav = swizzler(datav, swz);
datav = converter(datav, lns_mask);
// Compute block metadata
data_min = min(data_min, datav);
data_mean += datav * data_mean_scale;
data_max = max(data_max, datav);
grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
blk.data_r[idx] = datav.lane<0>();
blk.data_g[idx] = datav.lane<1>();
blk.data_b[idx] = datav.lane<2>();
blk.data_a[idx] = datav.lane<3>();
blk.rgb_lns[idx] = rgb_lns;
blk.alpha_lns[idx] = a_lns;
idx++;
}
}
}
// Reverse the encoding so we store the origin texel in the original format
vfloat4 data_enc = blk.texel(0);
vfloat4 data_enc_unorm = data_enc / 65535.0f;
vfloat4 data_enc_lns = vfloat4::zero();
if (rgb_lns || a_lns)
{
data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
}
blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
// Store block metadata
blk.data_min = data_min;
blk.data_mean = data_mean;
blk.data_max = data_max;
blk.grayscale = all(grayscalev);
}
/* See header for documentation. */
void load_image_block_fast_ldr(
astcenc_profile decode_mode,
const astcenc_image& img,
image_block& blk,
const block_size_descriptor& bsd,
unsigned int xpos,
unsigned int ypos,
unsigned int zpos,
const astcenc_swizzle& swz
) {
(void)swz;
(void)decode_mode;
unsigned int xsize = img.dim_x;
unsigned int ysize = img.dim_y;
blk.xpos = xpos;
blk.ypos = ypos;
blk.zpos = zpos;
vfloat4 data_min(1e38f);
vfloat4 data_mean = vfloat4::zero();
vfloat4 data_max(-1e38f);
vmask4 grayscalev(true);
int idx = 0;
const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
{
unsigned int yi = astc::min(y, ysize - 1);
for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
{
unsigned int xi = astc::min(x, xsize - 1);
vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));
vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
// Compute block metadata
data_min = min(data_min, datav);
data_mean += datav;
data_max = max(data_max, datav);
grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
blk.data_r[idx] = datav.lane<0>();
blk.data_g[idx] = datav.lane<1>();
blk.data_b[idx] = datav.lane<2>();
blk.data_a[idx] = datav.lane<3>();
idx++;
}
}
// Reverse the encoding so we store the origin texel in the original format
blk.origin_texel = blk.texel(0) / 65535.0f;
// Store block metadata
blk.rgb_lns[0] = 0;
blk.alpha_lns[0] = 0;
blk.data_min = data_min;
blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
blk.data_max = data_max;
blk.grayscale = all(grayscalev);
}
/* See header for documentation. */
void store_image_block(
astcenc_image& img,
const image_block& blk,
const block_size_descriptor& bsd,
unsigned int xpos,
unsigned int ypos,
unsigned int zpos,
const astcenc_swizzle& swz
) {
unsigned int x_size = img.dim_x;
unsigned int x_start = xpos;
unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
unsigned int x_count = x_end - x_start;
unsigned int x_nudge = bsd.xdim - x_count;
unsigned int y_size = img.dim_y;
unsigned int y_start = ypos;
unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
unsigned int y_count = y_end - y_start;
unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
unsigned int z_size = img.dim_z;
unsigned int z_start = zpos;
unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
// True if any non-identity swizzle
bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
(swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
// True if any swizzle uses Z reconstruct
bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
(swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
int idx = 0;
if (img.data_type == ASTCENC_TYPE_U8)
{
for (unsigned int z = z_start; z < z_end; z++)
{
// Fetch the image plane
uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
for (unsigned int y = y_start; y < y_end; y++)
{
uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
{
unsigned int max_texels = ASTCENC_SIMD_WIDTH;
unsigned int used_texels = astc::min(x_count - x, max_texels);
// Unaligned load as rows are not always SIMD_WIDTH long
vfloat data_r(blk.data_r + idx);
vfloat data_g(blk.data_g + idx);
vfloat data_b(blk.data_b + idx);
vfloat data_a(blk.data_a + idx);
vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
if (needs_swz)
{
vint swizzle_table[7];
swizzle_table[ASTCENC_SWZ_0] = vint(0);
swizzle_table[ASTCENC_SWZ_1] = vint(255);
swizzle_table[ASTCENC_SWZ_R] = data_ri;
swizzle_table[ASTCENC_SWZ_G] = data_gi;
swizzle_table[ASTCENC_SWZ_B] = data_bi;
swizzle_table[ASTCENC_SWZ_A] = data_ai;
if (needs_z)
{
vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
data_z = max(data_z, 0.0f);
data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
}
data_ri = swizzle_table[swz.r];
data_gi = swizzle_table[swz.g];
data_bi = swizzle_table[swz.b];
data_ai = swizzle_table[swz.a];
}
// Errors are NaN encoded - convert to magenta error color
// Branch is OK here - it is almost never true so predicts well
vmask nan_mask = data_r != data_r;
if (any(nan_mask))
{
data_ri = select(data_ri, vint(0xFF), nan_mask);
data_gi = select(data_gi, vint(0x00), nan_mask);
data_bi = select(data_bi, vint(0xFF), nan_mask);
data_ai = select(data_ai, vint(0xFF), nan_mask);
}
vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
vmask store_mask = vint::lane_id() < vint(used_texels);
store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask);
data8_row += ASTCENC_SIMD_WIDTH * 4;
idx += used_texels;
}
idx += x_nudge;
}
idx += y_nudge;
}
}
else if (img.data_type == ASTCENC_TYPE_F16)
{
for (unsigned int z = z_start; z < z_end; z++)
{
// Fetch the image plane
uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
for (unsigned int y = y_start; y < y_end; y++)
{
uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x++)
{
vint4 color;
// NaNs are handled inline - no need to special case
if (needs_swz)
{
float data[7];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
data[ASTCENC_SWZ_R] = blk.data_r[idx];
data[ASTCENC_SWZ_G] = blk.data_g[idx];
data[ASTCENC_SWZ_B] = blk.data_b[idx];
data[ASTCENC_SWZ_A] = blk.data_a[idx];
if (needs_z)
{
float xN = (data[0] * 2.0f) - 1.0f;
float yN = (data[3] * 2.0f) - 1.0f;
float zN = 1.0f - xN * xN - yN * yN;
if (zN < 0.0f)
{
zN = 0.0f;
}
data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
}
vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
color = float_to_float16(colorf);
}
else
{
vfloat4 colorf = blk.texel(idx);
color = float_to_float16(colorf);
}
// TODO: Vectorize with store N shorts?
data16_row[0] = static_cast<uint16_t>(color.lane<0>());
data16_row[1] = static_cast<uint16_t>(color.lane<1>());
data16_row[2] = static_cast<uint16_t>(color.lane<2>());
data16_row[3] = static_cast<uint16_t>(color.lane<3>());
data16_row += 4;
idx++;
}
idx += x_nudge;
}
idx += y_nudge;
}
}
else // if (img.data_type == ASTCENC_TYPE_F32)
{
assert(img.data_type == ASTCENC_TYPE_F32);
for (unsigned int z = z_start; z < z_end; z++)
{
// Fetch the image plane
float* data32 = static_cast<float*>(img.data[z]);
for (unsigned int y = y_start; y < y_end; y++)
{
float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x++)
{
vfloat4 color = blk.texel(idx);
// NaNs are handled inline - no need to special case
if (needs_swz)
{
float data[7];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
data[ASTCENC_SWZ_R] = color.lane<0>();
data[ASTCENC_SWZ_G] = color.lane<1>();
data[ASTCENC_SWZ_B] = color.lane<2>();
data[ASTCENC_SWZ_A] = color.lane<3>();
if (needs_z)
{
float xN = (data[0] * 2.0f) - 1.0f;
float yN = (data[3] * 2.0f) - 1.0f;
float zN = 1.0f - xN * xN - yN * yN;
if (zN < 0.0f)
{
zN = 0.0f;
}
data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
}
color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
}
store(color, data32_row);
data32_row += 4;
idx++;
}
idx += x_nudge;
}
idx += y_nudge;
}
}
}
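// A worked sketch (not upstream code) of the ASTCENC_SWZ_Z reconstruction
// used in the swizzle paths above: X comes from the R channel and Y from
// the A channel, both remapped from [0, 1] to [-1, 1], and Z is rebuilt
// from the unit-normal constraint x^2 + y^2 + z^2 = 1 before being mapped
// back to the [0, 1] storage range. For example r = 0.5, a = 1.0 gives
// x = 0, y = 1, z = 0, and a stored value of 0.5.
static float reconstruct_normal_z_sketch(float r_unorm, float a_unorm)
{
	float x = (r_unorm * 2.0f) - 1.0f;
	float y = (a_unorm * 2.0f) - 1.0f;
	float z2 = astc::max(1.0f - (x * x) - (y * y), 0.0f);
	return (astc::sqrt(z2) * 0.5f) + 0.5f;
}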

739
thirdparty/astcenc/astcenc_integer_sequence.cpp vendored Normal file

@ -0,0 +1,739 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for encoding/decoding Bounded Integer Sequence Encoding.
*/
#include "astcenc_internal.h"
#include <array>
/** @brief Unpacked quint triplets <low,middle,high> for each packed value */
// TODO: Bitpack these into a uint16_t?
static const uint8_t quints_of_integer[128][3] {
{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
{0, 1, 0}, {1, 1, 0}, {2, 1, 0}, {3, 1, 0},
{4, 1, 0}, {1, 4, 0}, {4, 4, 1}, {4, 4, 4},
{0, 2, 0}, {1, 2, 0}, {2, 2, 0}, {3, 2, 0},
{4, 2, 0}, {2, 4, 0}, {4, 4, 2}, {4, 4, 4},
{0, 3, 0}, {1, 3, 0}, {2, 3, 0}, {3, 3, 0},
{4, 3, 0}, {3, 4, 0}, {4, 4, 3}, {4, 4, 4},
{0, 0, 1}, {1, 0, 1}, {2, 0, 1}, {3, 0, 1},
{4, 0, 1}, {0, 4, 1}, {4, 0, 4}, {0, 4, 4},
{0, 1, 1}, {1, 1, 1}, {2, 1, 1}, {3, 1, 1},
{4, 1, 1}, {1, 4, 1}, {4, 1, 4}, {1, 4, 4},
{0, 2, 1}, {1, 2, 1}, {2, 2, 1}, {3, 2, 1},
{4, 2, 1}, {2, 4, 1}, {4, 2, 4}, {2, 4, 4},
{0, 3, 1}, {1, 3, 1}, {2, 3, 1}, {3, 3, 1},
{4, 3, 1}, {3, 4, 1}, {4, 3, 4}, {3, 4, 4},
{0, 0, 2}, {1, 0, 2}, {2, 0, 2}, {3, 0, 2},
{4, 0, 2}, {0, 4, 2}, {2, 0, 4}, {3, 0, 4},
{0, 1, 2}, {1, 1, 2}, {2, 1, 2}, {3, 1, 2},
{4, 1, 2}, {1, 4, 2}, {2, 1, 4}, {3, 1, 4},
{0, 2, 2}, {1, 2, 2}, {2, 2, 2}, {3, 2, 2},
{4, 2, 2}, {2, 4, 2}, {2, 2, 4}, {3, 2, 4},
{0, 3, 2}, {1, 3, 2}, {2, 3, 2}, {3, 3, 2},
{4, 3, 2}, {3, 4, 2}, {2, 3, 4}, {3, 3, 4},
{0, 0, 3}, {1, 0, 3}, {2, 0, 3}, {3, 0, 3},
{4, 0, 3}, {0, 4, 3}, {0, 0, 4}, {1, 0, 4},
{0, 1, 3}, {1, 1, 3}, {2, 1, 3}, {3, 1, 3},
{4, 1, 3}, {1, 4, 3}, {0, 1, 4}, {1, 1, 4},
{0, 2, 3}, {1, 2, 3}, {2, 2, 3}, {3, 2, 3},
{4, 2, 3}, {2, 4, 3}, {0, 2, 4}, {1, 2, 4},
{0, 3, 3}, {1, 3, 3}, {2, 3, 3}, {3, 3, 3},
{4, 3, 3}, {3, 4, 3}, {0, 3, 4}, {1, 3, 4}
};
/** @brief Packed quint values for each unpacked value, indexed [hi][mid][lo]. */
static const uint8_t integer_of_quints[5][5][5] {
{
{0, 1, 2, 3, 4},
{8, 9, 10, 11, 12},
{16, 17, 18, 19, 20},
{24, 25, 26, 27, 28},
{5, 13, 21, 29, 6}
},
{
{32, 33, 34, 35, 36},
{40, 41, 42, 43, 44},
{48, 49, 50, 51, 52},
{56, 57, 58, 59, 60},
{37, 45, 53, 61, 14}
},
{
{64, 65, 66, 67, 68},
{72, 73, 74, 75, 76},
{80, 81, 82, 83, 84},
{88, 89, 90, 91, 92},
{69, 77, 85, 93, 22}
},
{
{96, 97, 98, 99, 100},
{104, 105, 106, 107, 108},
{112, 113, 114, 115, 116},
{120, 121, 122, 123, 124},
{101, 109, 117, 125, 30}
},
{
{102, 103, 70, 71, 38},
{110, 111, 78, 79, 46},
{118, 119, 86, 87, 54},
{126, 127, 94, 95, 62},
{39, 47, 55, 63, 31}
}
};
/** @brief Unpacked trit quintuplets <low,...,high> for each packed value */
// TODO: Bitpack these into a uint16_t?
static const uint8_t trits_of_integer[256][5] {
{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
{0, 2, 0, 0, 0}, {1, 2, 0, 0, 0}, {2, 2, 0, 0, 0}, {2, 0, 2, 0, 0},
{0, 2, 2, 0, 0}, {1, 2, 2, 0, 0}, {2, 2, 2, 0, 0}, {2, 0, 2, 0, 0},
{0, 0, 1, 0, 0}, {1, 0, 1, 0, 0}, {2, 0, 1, 0, 0}, {0, 1, 2, 0, 0},
{0, 1, 1, 0, 0}, {1, 1, 1, 0, 0}, {2, 1, 1, 0, 0}, {1, 1, 2, 0, 0},
{0, 2, 1, 0, 0}, {1, 2, 1, 0, 0}, {2, 2, 1, 0, 0}, {2, 1, 2, 0, 0},
{0, 0, 0, 2, 2}, {1, 0, 0, 2, 2}, {2, 0, 0, 2, 2}, {0, 0, 2, 2, 2},
{0, 0, 0, 1, 0}, {1, 0, 0, 1, 0}, {2, 0, 0, 1, 0}, {0, 0, 2, 1, 0},
{0, 1, 0, 1, 0}, {1, 1, 0, 1, 0}, {2, 1, 0, 1, 0}, {1, 0, 2, 1, 0},
{0, 2, 0, 1, 0}, {1, 2, 0, 1, 0}, {2, 2, 0, 1, 0}, {2, 0, 2, 1, 0},
{0, 2, 2, 1, 0}, {1, 2, 2, 1, 0}, {2, 2, 2, 1, 0}, {2, 0, 2, 1, 0},
{0, 0, 1, 1, 0}, {1, 0, 1, 1, 0}, {2, 0, 1, 1, 0}, {0, 1, 2, 1, 0},
{0, 1, 1, 1, 0}, {1, 1, 1, 1, 0}, {2, 1, 1, 1, 0}, {1, 1, 2, 1, 0},
{0, 2, 1, 1, 0}, {1, 2, 1, 1, 0}, {2, 2, 1, 1, 0}, {2, 1, 2, 1, 0},
{0, 1, 0, 2, 2}, {1, 1, 0, 2, 2}, {2, 1, 0, 2, 2}, {1, 0, 2, 2, 2},
{0, 0, 0, 2, 0}, {1, 0, 0, 2, 0}, {2, 0, 0, 2, 0}, {0, 0, 2, 2, 0},
{0, 1, 0, 2, 0}, {1, 1, 0, 2, 0}, {2, 1, 0, 2, 0}, {1, 0, 2, 2, 0},
{0, 2, 0, 2, 0}, {1, 2, 0, 2, 0}, {2, 2, 0, 2, 0}, {2, 0, 2, 2, 0},
{0, 2, 2, 2, 0}, {1, 2, 2, 2, 0}, {2, 2, 2, 2, 0}, {2, 0, 2, 2, 0},
{0, 0, 1, 2, 0}, {1, 0, 1, 2, 0}, {2, 0, 1, 2, 0}, {0, 1, 2, 2, 0},
{0, 1, 1, 2, 0}, {1, 1, 1, 2, 0}, {2, 1, 1, 2, 0}, {1, 1, 2, 2, 0},
{0, 2, 1, 2, 0}, {1, 2, 1, 2, 0}, {2, 2, 1, 2, 0}, {2, 1, 2, 2, 0},
{0, 2, 0, 2, 2}, {1, 2, 0, 2, 2}, {2, 2, 0, 2, 2}, {2, 0, 2, 2, 2},
{0, 0, 0, 0, 2}, {1, 0, 0, 0, 2}, {2, 0, 0, 0, 2}, {0, 0, 2, 0, 2},
{0, 1, 0, 0, 2}, {1, 1, 0, 0, 2}, {2, 1, 0, 0, 2}, {1, 0, 2, 0, 2},
{0, 2, 0, 0, 2}, {1, 2, 0, 0, 2}, {2, 2, 0, 0, 2}, {2, 0, 2, 0, 2},
{0, 2, 2, 0, 2}, {1, 2, 2, 0, 2}, {2, 2, 2, 0, 2}, {2, 0, 2, 0, 2},
{0, 0, 1, 0, 2}, {1, 0, 1, 0, 2}, {2, 0, 1, 0, 2}, {0, 1, 2, 0, 2},
{0, 1, 1, 0, 2}, {1, 1, 1, 0, 2}, {2, 1, 1, 0, 2}, {1, 1, 2, 0, 2},
{0, 2, 1, 0, 2}, {1, 2, 1, 0, 2}, {2, 2, 1, 0, 2}, {2, 1, 2, 0, 2},
{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 0, 2, 2, 2},
{0, 0, 0, 0, 1}, {1, 0, 0, 0, 1}, {2, 0, 0, 0, 1}, {0, 0, 2, 0, 1},
{0, 1, 0, 0, 1}, {1, 1, 0, 0, 1}, {2, 1, 0, 0, 1}, {1, 0, 2, 0, 1},
{0, 2, 0, 0, 1}, {1, 2, 0, 0, 1}, {2, 2, 0, 0, 1}, {2, 0, 2, 0, 1},
{0, 2, 2, 0, 1}, {1, 2, 2, 0, 1}, {2, 2, 2, 0, 1}, {2, 0, 2, 0, 1},
{0, 0, 1, 0, 1}, {1, 0, 1, 0, 1}, {2, 0, 1, 0, 1}, {0, 1, 2, 0, 1},
{0, 1, 1, 0, 1}, {1, 1, 1, 0, 1}, {2, 1, 1, 0, 1}, {1, 1, 2, 0, 1},
{0, 2, 1, 0, 1}, {1, 2, 1, 0, 1}, {2, 2, 1, 0, 1}, {2, 1, 2, 0, 1},
{0, 0, 1, 2, 2}, {1, 0, 1, 2, 2}, {2, 0, 1, 2, 2}, {0, 1, 2, 2, 2},
{0, 0, 0, 1, 1}, {1, 0, 0, 1, 1}, {2, 0, 0, 1, 1}, {0, 0, 2, 1, 1},
{0, 1, 0, 1, 1}, {1, 1, 0, 1, 1}, {2, 1, 0, 1, 1}, {1, 0, 2, 1, 1},
{0, 2, 0, 1, 1}, {1, 2, 0, 1, 1}, {2, 2, 0, 1, 1}, {2, 0, 2, 1, 1},
{0, 2, 2, 1, 1}, {1, 2, 2, 1, 1}, {2, 2, 2, 1, 1}, {2, 0, 2, 1, 1},
{0, 0, 1, 1, 1}, {1, 0, 1, 1, 1}, {2, 0, 1, 1, 1}, {0, 1, 2, 1, 1},
{0, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 1, 1}, {1, 1, 2, 1, 1},
{0, 2, 1, 1, 1}, {1, 2, 1, 1, 1}, {2, 2, 1, 1, 1}, {2, 1, 2, 1, 1},
{0, 1, 1, 2, 2}, {1, 1, 1, 2, 2}, {2, 1, 1, 2, 2}, {1, 1, 2, 2, 2},
{0, 0, 0, 2, 1}, {1, 0, 0, 2, 1}, {2, 0, 0, 2, 1}, {0, 0, 2, 2, 1},
{0, 1, 0, 2, 1}, {1, 1, 0, 2, 1}, {2, 1, 0, 2, 1}, {1, 0, 2, 2, 1},
{0, 2, 0, 2, 1}, {1, 2, 0, 2, 1}, {2, 2, 0, 2, 1}, {2, 0, 2, 2, 1},
{0, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {2, 2, 2, 2, 1}, {2, 0, 2, 2, 1},
{0, 0, 1, 2, 1}, {1, 0, 1, 2, 1}, {2, 0, 1, 2, 1}, {0, 1, 2, 2, 1},
{0, 1, 1, 2, 1}, {1, 1, 1, 2, 1}, {2, 1, 1, 2, 1}, {1, 1, 2, 2, 1},
{0, 2, 1, 2, 1}, {1, 2, 1, 2, 1}, {2, 2, 1, 2, 1}, {2, 1, 2, 2, 1},
{0, 2, 1, 2, 2}, {1, 2, 1, 2, 2}, {2, 2, 1, 2, 2}, {2, 1, 2, 2, 2},
{0, 0, 0, 1, 2}, {1, 0, 0, 1, 2}, {2, 0, 0, 1, 2}, {0, 0, 2, 1, 2},
{0, 1, 0, 1, 2}, {1, 1, 0, 1, 2}, {2, 1, 0, 1, 2}, {1, 0, 2, 1, 2},
{0, 2, 0, 1, 2}, {1, 2, 0, 1, 2}, {2, 2, 0, 1, 2}, {2, 0, 2, 1, 2},
{0, 2, 2, 1, 2}, {1, 2, 2, 1, 2}, {2, 2, 2, 1, 2}, {2, 0, 2, 1, 2},
{0, 0, 1, 1, 2}, {1, 0, 1, 1, 2}, {2, 0, 1, 1, 2}, {0, 1, 2, 1, 2},
{0, 1, 1, 1, 2}, {1, 1, 1, 1, 2}, {2, 1, 1, 1, 2}, {1, 1, 2, 1, 2},
{0, 2, 1, 1, 2}, {1, 2, 1, 1, 2}, {2, 2, 1, 1, 2}, {2, 1, 2, 1, 2},
{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 1, 2, 2, 2}
};
/** @brief Packed trit values for each unpacked value, indexed [hi][][][][lo]. */
static const uint8_t integer_of_trits[3][3][3][3][3] {
{
{
{
{0, 1, 2},
{4, 5, 6},
{8, 9, 10}
},
{
{16, 17, 18},
{20, 21, 22},
{24, 25, 26}
},
{
{3, 7, 15},
{19, 23, 27},
{12, 13, 14}
}
},
{
{
{32, 33, 34},
{36, 37, 38},
{40, 41, 42}
},
{
{48, 49, 50},
{52, 53, 54},
{56, 57, 58}
},
{
{35, 39, 47},
{51, 55, 59},
{44, 45, 46}
}
},
{
{
{64, 65, 66},
{68, 69, 70},
{72, 73, 74}
},
{
{80, 81, 82},
{84, 85, 86},
{88, 89, 90}
},
{
{67, 71, 79},
{83, 87, 91},
{76, 77, 78}
}
}
},
{
{
{
{128, 129, 130},
{132, 133, 134},
{136, 137, 138}
},
{
{144, 145, 146},
{148, 149, 150},
{152, 153, 154}
},
{
{131, 135, 143},
{147, 151, 155},
{140, 141, 142}
}
},
{
{
{160, 161, 162},
{164, 165, 166},
{168, 169, 170}
},
{
{176, 177, 178},
{180, 181, 182},
{184, 185, 186}
},
{
{163, 167, 175},
{179, 183, 187},
{172, 173, 174}
}
},
{
{
{192, 193, 194},
{196, 197, 198},
{200, 201, 202}
},
{
{208, 209, 210},
{212, 213, 214},
{216, 217, 218}
},
{
{195, 199, 207},
{211, 215, 219},
{204, 205, 206}
}
}
},
{
{
{
{96, 97, 98},
{100, 101, 102},
{104, 105, 106}
},
{
{112, 113, 114},
{116, 117, 118},
{120, 121, 122}
},
{
{99, 103, 111},
{115, 119, 123},
{108, 109, 110}
}
},
{
{
{224, 225, 226},
{228, 229, 230},
{232, 233, 234}
},
{
{240, 241, 242},
{244, 245, 246},
{248, 249, 250}
},
{
{227, 231, 239},
{243, 247, 251},
{236, 237, 238}
}
},
{
{
{28, 29, 30},
{60, 61, 62},
{92, 93, 94}
},
{
{156, 157, 158},
{188, 189, 190},
{220, 221, 222}
},
{
{31, 63, 127},
{159, 191, 255},
{252, 253, 254}
}
}
}
};
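// A self-check sketch (not upstream code) of the invariant the table pairs
// above must satisfy: packing any tuple of unpacked digits and unpacking it
// again returns the original digits. The reverse direction does not hold
// for every packed value, as several packed encodings can decode to the
// same digits.
#include <cassert>

static void check_tq_tables_sketch()
{
	for (int hi = 0; hi < 5; hi++)
		for (int mid = 0; mid < 5; mid++)
			for (int lo = 0; lo < 5; lo++)
			{
				const uint8_t* q = quints_of_integer[integer_of_quints[hi][mid][lo]];
				assert(q[0] == lo && q[1] == mid && q[2] == hi);
			}

	for (int t4 = 0; t4 < 3; t4++)
		for (int t3 = 0; t3 < 3; t3++)
			for (int t2 = 0; t2 < 3; t2++)
				for (int t1 = 0; t1 < 3; t1++)
					for (int t0 = 0; t0 < 3; t0++)
					{
						const uint8_t* t = trits_of_integer[integer_of_trits[t4][t3][t2][t1][t0]];
						assert(t[0] == t0 && t[1] == t1 && t[2] == t2 && t[3] == t3 && t[4] == t4);
					}
}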
/**
* @brief The number of bits, trits, and quints needed for a quant level.
*/
struct btq_count
{
/** @brief The number of bits. */
uint8_t bits:6;
/** @brief The number of trits. */
uint8_t trits:1;
/** @brief The number of quints. */
uint8_t quints:1;
};
/**
* @brief The table of bits, trits, and quints needed for a quant encode.
*/
static const std::array<btq_count, 21> btq_counts {{
{ 1, 0, 0 }, // QUANT_2
{ 0, 1, 0 }, // QUANT_3
{ 2, 0, 0 }, // QUANT_4
{ 0, 0, 1 }, // QUANT_5
{ 1, 1, 0 }, // QUANT_6
{ 3, 0, 0 }, // QUANT_8
{ 1, 0, 1 }, // QUANT_10
{ 2, 1, 0 }, // QUANT_12
{ 4, 0, 0 }, // QUANT_16
{ 2, 0, 1 }, // QUANT_20
{ 3, 1, 0 }, // QUANT_24
{ 5, 0, 0 }, // QUANT_32
{ 3, 0, 1 }, // QUANT_40
{ 4, 1, 0 }, // QUANT_48
{ 6, 0, 0 }, // QUANT_64
{ 4, 0, 1 }, // QUANT_80
{ 5, 1, 0 }, // QUANT_96
{ 7, 0, 0 }, // QUANT_128
{ 5, 0, 1 }, // QUANT_160
{ 6, 1, 0 }, // QUANT_192
{ 8, 0, 0 } // QUANT_256
}};
/**
* @brief The sequence scale, round, and divisors needed to compute sizing.
*
* The length of a quantized sequence in bits is:
* (scale * <sequence_len> + round) / divisor
*/
struct ise_size
{
/** @brief The scaling parameter. */
uint8_t scale:6;
/** @brief The divisor parameter. */
uint8_t divisor:2;
};
/**
* @brief The table of scale, round, and divisors needed for quant sizing.
*/
static const std::array<ise_size, 21> ise_sizes {{
{ 1, 0 }, // QUANT_2
{ 8, 2 }, // QUANT_3
{ 2, 0 }, // QUANT_4
{ 7, 1 }, // QUANT_5
{ 13, 2 }, // QUANT_6
{ 3, 0 }, // QUANT_8
{ 10, 1 }, // QUANT_10
{ 18, 2 }, // QUANT_12
{ 4, 0 }, // QUANT_16
{ 13, 1 }, // QUANT_20
{ 23, 2 }, // QUANT_24
{ 5, 0 }, // QUANT_32
{ 16, 1 }, // QUANT_40
{ 28, 2 }, // QUANT_48
{ 6, 0 }, // QUANT_64
{ 19, 1 }, // QUANT_80
{ 33, 2 }, // QUANT_96
{ 7, 0 }, // QUANT_128
{ 22, 1 }, // QUANT_160
{ 38, 2 }, // QUANT_192
{ 8, 0 } // QUANT_256
}};
/* See header for documentation. */
unsigned int get_ise_sequence_bitcount(
unsigned int character_count,
quant_method quant_level
) {
// Cope with out-of-bounds values - input might be invalid
if (static_cast<size_t>(quant_level) >= ise_sizes.size())
{
// Arbitrarily large number that's more than an ASTC block can hold
return 1024;
}
auto& entry = ise_sizes[quant_level];
unsigned int divisor = (entry.divisor << 1) + 1;
return (entry.scale * character_count + divisor - 1) / divisor;
}
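// A usage sketch (not upstream code) showing the sizing formula in action:
// a trit carries log2(3) ~= 1.58 bits and a quint log2(5) ~= 2.32 bits, so
// BISE packs 5 trits into 8 bits and 3 quints into 7 bits.
#include <cassert>

static void ise_bitcount_sketch()
{
	assert(get_ise_sequence_bitcount(5, QUANT_3) == 8);   // 5 trits -> 8 bits
	assert(get_ise_sequence_bitcount(3, QUANT_5) == 7);   // 3 quints -> 7 bits
	assert(get_ise_sequence_bitcount(4, QUANT_16) == 16); // plain 4-bit values
}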
/**
* @brief Write up to 8 bits at an arbitrary bit offset.
*
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
* span two separate bytes in memory.
*
* @param value The value to write.
* @param bitcount The number of bits to write, starting from LSB.
* @param bitoffset The bit offset to store at, between 0 and 7.
* @param[in,out] ptr The data pointer to write to.
*/
static inline void write_bits(
unsigned int value,
unsigned int bitcount,
unsigned int bitoffset,
uint8_t ptr[2]
) {
unsigned int mask = (1 << bitcount) - 1;
value &= mask;
ptr += bitoffset >> 3;
bitoffset &= 7;
value <<= bitoffset;
mask <<= bitoffset;
mask = ~mask;
ptr[0] &= mask;
ptr[0] |= value;
ptr[1] &= mask >> 8;
ptr[1] |= value >> 8;
}
/**
* @brief Read up to 8 bits at an arbitrary bit offset.
*
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
* span two separate bytes in memory.
*
* @param bitcount The number of bits to read.
* @param bitoffset The bit offset to read from, between 0 and 7.
* @param ptr The data pointer to read from.
*
* @return The read value.
*/
static inline unsigned int read_bits(
unsigned int bitcount,
unsigned int bitoffset,
const uint8_t* ptr
) {
unsigned int mask = (1 << bitcount) - 1;
ptr += bitoffset >> 3;
bitoffset &= 7;
unsigned int value = ptr[0] | (ptr[1] << 8);
value >>= bitoffset;
value &= mask;
return value;
}
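// A round-trip sketch (not upstream code) for the two helpers above: write
// a 5-bit value at a bit offset that straddles a byte boundary, then read
// it back. Note write_bits() always touches ptr[0] and ptr[1], so the
// buffer needs a byte of slack after the last written bit.
static void bits_roundtrip_sketch()
{
	uint8_t buffer[4] { 0 };
	write_bits(0x15, 5, 6, buffer);           // 0b10101 spanning bytes 0 and 1
	unsigned int v = read_bits(5, 6, buffer); // v == 0x15
	(void)v;
}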
/* See header for documentation. */
void encode_ise(
quant_method quant_level,
unsigned int character_count,
const uint8_t* input_data,
uint8_t* output_data,
unsigned int bit_offset
) {
promise(character_count > 0);
unsigned int bits = btq_counts[quant_level].bits;
unsigned int trits = btq_counts[quant_level].trits;
unsigned int quints = btq_counts[quant_level].quints;
unsigned int mask = (1 << bits) - 1;
// Write out trits and bits
if (trits)
{
unsigned int i = 0;
unsigned int full_trit_blocks = character_count / 5;
for (unsigned int j = 0; j < full_trit_blocks; j++)
{
unsigned int i4 = input_data[i + 4] >> bits;
unsigned int i3 = input_data[i + 3] >> bits;
unsigned int i2 = input_data[i + 2] >> bits;
unsigned int i1 = input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
// The max size of a trit bit count is 6, so we can always safely
// pack a single MX value with the following 1 or 2 T bits.
uint8_t pack;
// Element 0 + T0 + T1
pack = (input_data[i++] & mask) | (((T >> 0) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 1 + T2 + T3
pack = (input_data[i++] & mask) | (((T >> 2) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 2 + T4
pack = (input_data[i++] & mask) | (((T >> 4) & 0x1) << bits);
write_bits(pack, bits + 1, bit_offset, output_data);
bit_offset += bits + 1;
// Element 3 + T5 + T6
pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 4 + T7
pack = (input_data[i++] & mask) | (((T >> 7) & 0x1) << bits);
write_bits(pack, bits + 1, bit_offset, output_data);
bit_offset += bits + 1;
}
// Loop tail for a partial block
if (i != character_count)
{
// i4 cannot be present - we know the block is partial
// i0 must be present - we know the block isn't empty
unsigned int i4 = 0;
unsigned int i3 = i + 3 >= character_count ? 0 : input_data[i + 3] >> bits;
unsigned int i2 = i + 2 >= character_count ? 0 : input_data[i + 2] >> bits;
unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
for (unsigned int j = 0; i < character_count; i++, j++)
{
// Truncated table as this iteration is always partial
static const uint8_t tbits[4] { 2, 2, 1, 2 };
static const uint8_t tshift[4] { 0, 2, 4, 5 };
uint8_t pack = (input_data[i] & mask) |
(((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
write_bits(pack, bits + tbits[j], bit_offset, output_data);
bit_offset += bits + tbits[j];
}
}
}
// Write out quints and bits
else if (quints)
{
unsigned int i = 0;
unsigned int full_quint_blocks = character_count / 3;
for (unsigned int j = 0; j < full_quint_blocks; j++)
{
unsigned int i2 = input_data[i + 2] >> bits;
unsigned int i1 = input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_quints[i2][i1][i0];
// The max size of a quint bit count is 5, so we can always safely
// pack a single M value with the following 2 or 3 T bits.
uint8_t pack;
// Element 0
pack = (input_data[i++] & mask) | (((T >> 0) & 0x7) << bits);
write_bits(pack, bits + 3, bit_offset, output_data);
bit_offset += bits + 3;
// Element 1
pack = (input_data[i++] & mask) | (((T >> 3) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 2
pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
}
// Loop tail for a partial block
if (i != character_count)
{
// i2 cannot be present - we know the block is partial
// i0 must be present - we know the block isn't empty
unsigned int i2 = 0;
unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_quints[i2][i1][i0];
for (unsigned int j = 0; i < character_count; i++, j++)
{
// Truncated table as this iteration is always partial
static const uint8_t tbits[2] { 3, 2 };
static const uint8_t tshift[2] { 0, 3 };
uint8_t pack = (input_data[i] & mask) |
(((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
write_bits(pack, bits + tbits[j], bit_offset, output_data);
bit_offset += bits + tbits[j];
}
}
}
// Write out just bits
else
{
for (unsigned int i = 0; i < character_count; i++)
{
write_bits(input_data[i], bits, bit_offset, output_data);
bit_offset += bits;
}
}
}
/* See header for documentation. */
void decode_ise(
quant_method quant_level,
unsigned int character_count,
const uint8_t* input_data,
uint8_t* output_data,
unsigned int bit_offset
) {
promise(character_count > 0);
// Note: due to how the trit/quint-block unpacking is done in this function, we may write more
// temporary results than the number of outputs. The maximum actual number of results is 64,
// but we keep 4 additional entries of padding.
uint8_t results[68];
uint8_t tq_blocks[22] { 0 }; // Trit-blocks or quint-blocks, must be zeroed
unsigned int bits = btq_counts[quant_level].bits;
unsigned int trits = btq_counts[quant_level].trits;
unsigned int quints = btq_counts[quant_level].quints;
unsigned int lcounter = 0;
unsigned int hcounter = 0;
// Collect bits for each element, as well as bits for any trit-blocks and quint-blocks.
for (unsigned int i = 0; i < character_count; i++)
{
results[i] = static_cast<uint8_t>(read_bits(bits, bit_offset, input_data));
bit_offset += bits;
if (trits)
{
static const uint8_t bits_to_read[5] { 2, 2, 1, 2, 1 };
static const uint8_t block_shift[5] { 0, 2, 4, 5, 7 };
static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 };
static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 };
unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
bit_offset += bits_to_read[lcounter];
tq_blocks[hcounter] |= tdata << block_shift[lcounter];
hcounter += hcounter_incr[lcounter];
lcounter = next_lcounter[lcounter];
}
if (quints)
{
static const uint8_t bits_to_read[3] { 3, 2, 2 };
static const uint8_t block_shift[3] { 0, 3, 5 };
static const uint8_t next_lcounter[3] { 1, 2, 0 };
static const uint8_t hcounter_incr[3] { 0, 0, 1 };
unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
bit_offset += bits_to_read[lcounter];
tq_blocks[hcounter] |= tdata << block_shift[lcounter];
hcounter += hcounter_incr[lcounter];
lcounter = next_lcounter[lcounter];
}
}
// Unpack trit-blocks or quint-blocks as needed
if (trits)
{
unsigned int trit_blocks = (character_count + 4) / 5;
promise(trit_blocks > 0);
for (unsigned int i = 0; i < trit_blocks; i++)
{
const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
results[5 * i ] |= tritptr[0] << bits;
results[5 * i + 1] |= tritptr[1] << bits;
results[5 * i + 2] |= tritptr[2] << bits;
results[5 * i + 3] |= tritptr[3] << bits;
results[5 * i + 4] |= tritptr[4] << bits;
}
}
if (quints)
{
unsigned int quint_blocks = (character_count + 2) / 3;
promise(quint_blocks > 0);
for (unsigned int i = 0; i < quint_blocks; i++)
{
const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
results[3 * i ] |= quintptr[0] << bits;
results[3 * i + 1] |= quintptr[1] << bits;
results[3 * i + 2] |= quintptr[2] << bits;
}
}
for (unsigned int i = 0; i < character_count; i++)
{
output_data[i] = results[i];
}
}
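// An end-to-end sketch (not upstream code): quantized values survive the
// trip through encode_ise() and decode_ise(). QUANT_5 digits lie in the
// range [0, 4] and ten of them pack into 24 bits.
#include <cassert>

static void ise_roundtrip_sketch()
{
	const uint8_t input[10] { 0, 1, 2, 3, 4, 4, 3, 2, 1, 0 };
	uint8_t packed[16] { 0 };
	uint8_t output[10] { 0 };

	encode_ise(QUANT_5, 10, input, packed, 0);
	decode_ise(QUANT_5, 10, packed, output, 0);

	for (unsigned int i = 0; i < 10; i++)
	{
		assert(input[i] == output[i]);
	}
}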

2196
thirdparty/astcenc/astcenc_internal.h vendored Normal file

File diff suppressed because it is too large

273
thirdparty/astcenc/astcenc_internal_entry.h vendored Normal file

@ -0,0 +1,273 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions and data declarations for the outer context.
*
* The outer context includes thread-pool management, which is slower to
* compile due to increased use of C++ stdlib. The inner context used in the
* majority of the codec library does not include this.
*/
#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
#define ASTCENC_INTERNAL_ENTRY_INCLUDED
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include "astcenc_internal.h"
/* ============================================================================
Parallel execution control
============================================================================ */
/**
* @brief A simple counter-based manager for parallel task execution.
*
* The task processing execution consists of:
*
* * A single-threaded init stage.
* * A multi-threaded processing stage.
* * A condition variable so threads can wait for processing completion.
*
* The init stage will be executed by the first thread to arrive in the critical section, there is
* no main thread in the thread pool.
*
* The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
* basis. Threads may therefore each execute a different number of tasks, depending on their
* processing complexity. The task queue and the task tickets are just counters; the caller must map
* these integers to an actual processing partition in a specific problem domain.
*
* The exit wait condition is needed to ensure processing has finished before a worker thread can
* progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
* because there are no new tasks to assign to it while other worker threads are still processing.
* Calling @c wait() will ensure that all other workers have finished before the thread can proceed.
*
* The basic usage model:
*
* // --------- From single-threaded code ---------
*
* // Reset the tracker state
* manager->reset()
*
* // --------- From multi-threaded code ---------
*
* // Run the stage init; only first thread actually runs the lambda
* manager->init(<lambda>)
*
* do
* {
* // Request a task assignment
* uint task_count;
* uint base_index = manager->get_task_assignment(<granule>, task_count);
*
* // Process any tasks we were given (task_count <= granule size)
* if (task_count)
* {
* // Run the user task processing code for N tasks here
* ...
*
* // Flag these tasks as complete
* manager->complete_task_assignment(task_count);
* }
* } while (task_count);
*
* // Wait for all threads to complete tasks before progressing
* manager->wait()
*
* // Run the stage term; only first thread actually runs the lambda
* manager->term(<lambda>)
*/
class ParallelManager
{
private:
/** @brief Lock used for critical section and condition synchronization. */
std::mutex m_lock;
/** @brief True if the stage init() step has been executed. */
bool m_init_done;
/** @brief True if the stage term() step has been executed. */
bool m_term_done;
/** @brief Condition variable for tracking stage processing completion. */
std::condition_variable m_complete;
/** @brief Number of tasks started, but not necessarily finished. */
std::atomic<unsigned int> m_start_count;
/** @brief Number of tasks finished. */
unsigned int m_done_count;
/** @brief Number of tasks that need to be processed. */
unsigned int m_task_count;
public:
/** @brief Create a new ParallelManager. */
ParallelManager()
{
reset();
}
/**
* @brief Reset the tracker for a new processing batch.
*
* This must be called from single-threaded code before starting the multi-threaded processing
* operations.
*/
void reset()
{
m_init_done = false;
m_term_done = false;
m_start_count = 0;
m_done_count = 0;
m_task_count = 0;
}
/**
* @brief Trigger the pipeline stage init step.
*
* This can be called from multi-threaded code. The first thread to hit this will process the
* initialization. Other threads will block and wait for it to complete.
*
* @param init_func Callable which executes the stage initialization. It must return the
* total number of tasks in the stage.
*/
void init(std::function<unsigned int(void)> init_func)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_init_done)
{
m_task_count = init_func();
m_init_done = true;
}
}
/**
* @brief Trigger the pipeline stage init step.
*
* This can be called from multi-threaded code. The first thread to hit this will process the
* initialization. Other threads will block and wait for it to complete.
*
* @param task_count Total number of tasks needing processing.
*/
void init(unsigned int task_count)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_init_done)
{
m_task_count = task_count;
m_init_done = true;
}
}
/**
* @brief Request a task assignment.
*
* Assign up to @c granule tasks to the caller for processing.
*
* @param granule Maximum number of tasks that can be assigned.
* @param[out] count Actual number of tasks assigned, or zero if no tasks were assigned.
*
* @return Task index of the first assigned task; assigned tasks increment from this.
*/
unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
{
unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
if (base >= m_task_count)
{
count = 0;
return 0;
}
count = astc::min(m_task_count - base, granule);
return base;
}
/**
* @brief Complete a task assignment.
*
* Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this
* completes the processing of the stage.
*
* @param count The number of completed tasks.
*/
void complete_task_assignment(unsigned int count)
{
// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
// update here and the wait() for other threads
std::unique_lock<std::mutex> lck(m_lock);
this->m_done_count += count;
if (m_done_count == m_task_count)
{
lck.unlock();
m_complete.notify_all();
}
}
/**
* @brief Wait for stage processing to complete.
*/
void wait()
{
std::unique_lock<std::mutex> lck(m_lock);
m_complete.wait(lck, [this]{ return m_done_count == m_task_count; });
}
/**
* @brief Trigger the pipeline stage term step.
*
* This can be called from multi-threaded code. The first thread to hit this will process the
* work pool termination. Caller must have called @c wait() prior to calling this function to
* ensure that processing is complete.
*
* @param term_func Callable which executes the stage termination.
*/
void term(std::function<void(void)> term_func)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_term_done)
{
term_func();
m_term_done = true;
}
}
};
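// A minimal usage sketch (not upstream code) of the worker loop each pool
// thread would run against the class above; process_block() stands in for
// a hypothetical per-task payload.
static void parallel_manager_worker_sketch(ParallelManager& manager)
{
	// First thread to arrive runs the init lambda; the rest reuse its result
	manager.init([]() -> unsigned int {
		return 256; // hypothetical total task count for this stage
	});

	while (true)
	{
		unsigned int count;
		unsigned int base = manager.get_task_assignment(16, count);
		if (!count)
		{
			break;
		}

		for (unsigned int i = base; i < base + count; i++)
		{
			// process_block(i); // hypothetical per-task work
		}

		manager.complete_task_assignment(count);
	}

	// Block until every worker has drained its assignments
	manager.wait();
}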
/**
* @brief The astcenc compression context.
*/
struct astcenc_context
{
/** @brief The context internal state. */
astcenc_contexti context;
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/** @brief The parallel manager for averages computation. */
ParallelManager manage_avg;
/** @brief The parallel manager for compression. */
ParallelManager manage_compress;
#endif
/** @brief The parallel manager for decompression. */
ParallelManager manage_decompress;
};
#endif

48
thirdparty/astcenc/astcenc_mathlib.cpp vendored Normal file

@ -0,0 +1,48 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#include "astcenc_mathlib.h"
/**
* @brief 64-bit rotate left.
*
* @param val The value to rotate.
* @param count The rotation, in bits.
*/
static inline uint64_t rotl(uint64_t val, int count)
{
return (val << count) | (val >> (64 - count));
}
/* See header for documentation. */
void astc::rand_init(uint64_t state[2])
{
state[0] = 0xfaf9e171cea1ec6bULL;
state[1] = 0xf1b318cc06af5d71ULL;
}
/* See header for documentation. */
uint64_t astc::rand(uint64_t state[2])
{
uint64_t s0 = state[0];
uint64_t s1 = state[1];
uint64_t res = s0 + s1;
s1 ^= s0;
state[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16);
state[1] = rotl(s1, 37);
return res;
}
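// A small usage sketch (not upstream code) for the PRNG above. rand_init()
// deliberately seeds every generator identically, so repeated runs yield
// the same sequence; the high bits of xoroshiro128+ are the strongest, so
// prefer them when reducing to a small range.
static uint64_t rand_usage_sketch()
{
	uint64_t state[2];
	astc::rand_init(state);
	return astc::rand(state) >> 32;
}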

478
thirdparty/astcenc/astcenc_mathlib.h vendored Normal file

@ -0,0 +1,478 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/*
* This module implements a variety of mathematical data types and library
* functions used by the codec.
*/
#ifndef ASTC_MATHLIB_H_INCLUDED
#define ASTC_MATHLIB_H_INCLUDED
#include <cassert>
#include <cstdint>
#include <cmath>
#ifndef ASTCENC_POPCNT
#if defined(__POPCNT__)
#define ASTCENC_POPCNT 1
#else
#define ASTCENC_POPCNT 0
#endif
#endif
#ifndef ASTCENC_F16C
#if defined(__F16C__)
#define ASTCENC_F16C 1
#else
#define ASTCENC_F16C 0
#endif
#endif
#ifndef ASTCENC_SSE
#if defined(__SSE4_2__)
#define ASTCENC_SSE 42
#elif defined(__SSE4_1__)
#define ASTCENC_SSE 41
#elif defined(__SSE3__)
#define ASTCENC_SSE 30
#elif defined(__SSE2__)
#define ASTCENC_SSE 20
#else
#define ASTCENC_SSE 0
#endif
#endif
#ifndef ASTCENC_AVX
#if defined(__AVX2__)
#define ASTCENC_AVX 2
#elif defined(__AVX__)
#define ASTCENC_AVX 1
#else
#define ASTCENC_AVX 0
#endif
#endif
#ifndef ASTCENC_NEON
#if defined(__aarch64__)
#define ASTCENC_NEON 1
#else
#define ASTCENC_NEON 0
#endif
#endif
#if ASTCENC_AVX
#define ASTCENC_VECALIGN 32
#else
#define ASTCENC_VECALIGN 16
#endif
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
#include <immintrin.h>
#endif
/* ============================================================================
Fast math library; note that many of the higher-order functions in this set
use approximations which are less accurate, but faster, than <cmath> standard
library equivalents.
Note: Many of these are not necessarily faster than simple C versions when
used on a single scalar value, but are included for testing purposes as most
have an option based on SSE intrinsics and therefore provide an obvious route
to future vectorization.
============================================================================ */
// Union for manipulation of float bit patterns
typedef union
{
uint32_t u;
int32_t s;
float f;
} if32;
// These are namespaced to avoid colliding with C standard library functions.
namespace astc
{
static const float PI = 3.14159265358979323846f;
static const float PI_OVER_TWO = 1.57079632679489661923f;
/**
* @brief SP float absolute value.
*
* @param v The value to make absolute.
*
* @return The absolute value.
*/
static inline float fabs(float v)
{
return std::fabs(v);
}
/**
* @brief Test if a float value is a nan.
*
* @param v The value to test.
*
* @return Zero if not a NaN, non-zero otherwise.
*/
static inline bool isnan(float v)
{
return v != v;
}
/**
* @brief Return the minimum of two values.
*
* For floats, NaNs are turned into @c q.
*
* @param p The first value to compare.
* @param q The second value to compare.
*
* @return The smallest value.
*/
template<typename T>
static inline T min(T p, T q)
{
return p < q ? p : q;
}
/**
* @brief Return the minimum of three values.
*
* For floats, NaNs are turned into @c r.
*
* @param p The first value to compare.
* @param q The second value to compare.
* @param r The third value to compare.
*
* @return The smallest value.
*/
template<typename T>
static inline T min(T p, T q, T r)
{
return min(min(p, q), r);
}
/**
* @brief Return the minimum of four values.
*
* For floats, NaNs are turned into @c s.
*
* @param p The first value to compare.
* @param q The second value to compare.
* @param r The third value to compare.
* @param s The fourth value to compare.
*
* @return The smallest value.
*/
template<typename T>
static inline T min(T p, T q, T r, T s)
{
return min(min(p, q), min(r, s));
}
/**
* @brief Return the maximum of two values.
*
* For floats, NaNs are turned into @c q.
*
* @param p The first value to compare.
* @param q The second value to compare.
*
* @return The largest value.
*/
template<typename T>
static inline T max(T p, T q)
{
return p > q ? p : q;
}
/**
* @brief Return the maximum of three values.
*
* For floats, NaNs are turned into @c r.
*
* @param p The first value to compare.
* @param q The second value to compare.
* @param r The third value to compare.
*
* @return The largest value.
*/
template<typename T>
static inline T max(T p, T q, T r)
{
return max(max(p, q), r);
}
/**
* @brief Return the maximum of four values.
*
* For floats, NaNs are turned into @c s.
*
* @param p The first value to compare.
* @param q The second value to compare.
* @param r The third value to compare.
* @param s The fourth value to compare.
*
* @return The largest value.
*/
template<typename T>
static inline T max(T p, T q, T r, T s)
{
return max(max(p, q), max(r, s));
}
/**
* @brief Clamp a value between @c mn and @c mx.
*
* For floats, NaNs are turned into @c mn.
*
* @param v The value to clamp.
* @param mn The min value (inclusive).
* @param mx The max value (inclusive).
*
* @return The clamped value.
*/
template<typename T>
inline T clamp(T v, T mn, T mx)
{
// Do not reorder; correct NaN handling relies on the fact that comparison
// with NaN returns false and will fall-through to the "min" value.
if (v > mx) return mx;
if (v > mn) return v;
return mn;
}
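/**
 * A short behavioral note (not upstream code) on the ordering above: both
 * comparisons against a NaN input are false, so clamp() falls through to
 * @c mn. This is what gives clamp1f() and clamp255f() below their
 * NaN-flushing behavior:
 *
 *     clamp(0.5f, 0.0f, 1.0f) == 0.5f
 *     clamp(7.0f, 0.0f, 1.0f) == 1.0f
 *     clamp(0.0f / 0.0f, 0.0f, 1.0f) == 0.0f  // NaN flushed to mn
 */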
/**
* @brief Clamp a float value between 0.0f and 1.0f.
*
* NaNs are turned into 0.0f.
*
* @param v The value to clamp.
*
* @return The clamped value.
*/
static inline float clamp1f(float v)
{
return astc::clamp(v, 0.0f, 1.0f);
}
/**
* @brief Clamp a float value between 0.0f and 255.0f.
*
* NaNs are turned into 0.0f.
*
* @param v The value to clamp.
*
* @return The clamped value.
*/
static inline float clamp255f(float v)
{
return astc::clamp(v, 0.0f, 255.0f);
}
/**
* @brief SP float round-down.
*
* @param v The value to round.
*
* @return The rounded value.
*/
static inline float flt_rd(float v)
{
return std::floor(v);
}
/**
* @brief SP float round-to-nearest and convert to integer.
*
* @param v The value to round.
*
* @return The rounded value.
*/
static inline int flt2int_rtn(float v)
{
return static_cast<int>(v + 0.5f);
}
/**
* @brief SP float round down and convert to integer.
*
* @param v The value to round.
*
* @return The rounded value.
*/
static inline int flt2int_rd(float v)
{
return static_cast<int>(v);
}
/**
* @brief SP float bit-interpreted as an integer.
*
* @param v The value to bitcast.
*
* @return The converted value.
*/
static inline int float_as_int(float v)
{
union { int a; float b; } u;
u.b = v;
return u.a;
}
/**
* @brief Integer bit-interpreted as an SP float.
*
* @param v The value to bitcast.
*
* @return The converted value.
*/
static inline float int_as_float(int v)
{
union { int a; float b; } u;
u.a = v;
return u.b;
}
/**
* @brief Fast approximation of 1.0 / sqrt(val).
*
* @param v The input value.
*
* @return The approximated result.
*/
static inline float rsqrt(float v)
{
return 1.0f / std::sqrt(v);
}
/**
* @brief Fast approximation of sqrt(val).
*
* @param v The input value.
*
* @return The approximated result.
*/
static inline float sqrt(float v)
{
return std::sqrt(v);
}
/**
* @brief Extract mantissa and exponent of a float value.
*
* @param v The input value.
* @param[out] expo The output exponent.
*
* @return The mantissa.
*/
static inline float frexp(float v, int* expo)
{
if32 p;
p.f = v;
*expo = ((p.u >> 23) & 0xFF) - 126;
p.u = (p.u & 0x807fffff) | 0x3f000000;
return p.f;
}
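/**
 * A worked example (not upstream code) for the bit-level frexp() above:
 * 6.0f is stored as 1.5 * 2^2, i.e. a biased exponent field of 129. The
 * code returns 129 - 126 = 3 as the exponent and overwrites the exponent
 * field with 126 (the 0x3f000000 pattern), yielding a mantissa of 0.75,
 * so 6.0 == 0.75 * 2^3, matching std::frexp() conventions.
 */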
/**
* @brief Initialize the seed structure for a random number generator.
*
* Important note: For the purposes of ASTC we want sets of random numbers to
* use the codec, but we want the same seed value across instances and threads
* to ensure that image output is stable across compressor runs and across
* platforms. Every PRNG created by this call will therefore return the same
* sequence of values ...
*
* @param state The state structure to initialize.
*/
void rand_init(uint64_t state[2]);
/**
* @brief Return the next random number from the generator.
*
* This RNG is an implementation of the "xoroshiro128+ 1.0" PRNG, based on the
* public-domain implementation given by David Blackman & Sebastiano Vigna at
* http://vigna.di.unimi.it/xorshift/xoroshiro128plus.c
*
* @param state The state structure to use/update.
*/
uint64_t rand(uint64_t state[2]);
}
/* ============================================================================
Softfloat library with fp32 and fp16 conversion functionality.
============================================================================ */
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
/* narrowing float->float conversions */
uint16_t float_to_sf16(float val);
float sf16_to_float(uint16_t val);
#endif
/*********************************
Vector library
*********************************/
#include "astcenc_vecmathlib.h"
/*********************************
Declaration of line types
*********************************/
// parametric line, 2D: The line is given by line = a + b * t.
struct line2
{
vfloat4 a;
vfloat4 b;
};
// parametric line, 3D
struct line3
{
vfloat4 a;
vfloat4 b;
};
// parametric line, 4D
struct line4
{
vfloat4 a;
vfloat4 b;
};
// pre-processed lines used for distance computation; amod is the anchor
// with its direction component removed (a - b * dot(a, b)) and bs is the
// line direction
struct processed_line2
{
vfloat4 amod;
vfloat4 bs;
};
struct processed_line3
{
vfloat4 amod;
vfloat4 bs;
};
struct processed_line4
{
vfloat4 amod;
vfloat4 bs;
};
#endif

411
thirdparty/astcenc/astcenc_mathlib_softfloat.cpp vendored Normal file

@ -0,0 +1,411 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Soft-float library for IEEE-754.
*/
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
#include "astcenc_mathlib.h"
/* sized soft-float types. These are mapped to the sized integer
types of C99, instead of C's floating-point types; this is because
the library needs to maintain exact, bit-level control on all
operations on these data types. */
typedef uint16_t sf16;
typedef uint32_t sf32;
/******************************************
helper functions and their lookup tables
******************************************/
/* count leading zeros functions. Only used when the input is nonzero. */
#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
#elif defined(__arm__) && defined(__ARMCC_VERSION)
#elif defined(__arm__) && defined(__GNUC__)
#else
/* table used for the slow default versions. */
static const uint8_t clz_table[256] =
{
8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
#endif
/* 32-bit count-leading-zeros function: use the assembly instruction whenever possible. */
static uint32_t clz32(uint32_t inp)
{
#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
uint32_t bsr;
__asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));
return 31 - bsr;
#else
#if defined(__arm__) && defined(__ARMCC_VERSION)
return __clz(inp); /* armcc builtin */
#else
#if defined(__arm__) && defined(__GNUC__)
uint32_t lz;
__asm__("clz %0, %1": "=r"(lz):"r"(inp));
return lz;
#else
/* slow default version */
uint32_t summa = 24;
if (inp >= UINT32_C(0x10000))
{
inp >>= 16;
summa -= 16;
}
if (inp >= UINT32_C(0x100))
{
inp >>= 8;
summa -= 8;
}
return summa + clz_table[inp];
#endif
#endif
#endif
}
/* the five rounding modes that IEEE-754r defines */
typedef enum
{
SF_UP = 0, /* round towards positive infinity */
SF_DOWN = 1, /* round towards negative infinity */
SF_TOZERO = 2, /* round towards zero */
SF_NEARESTEVEN = 3, /* round toward nearest value; if mid-between, round to even value */
SF_NEARESTAWAY = 4 /* round toward nearest value; if mid-between, round away from zero */
} roundmode;
static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
{
uint32_t vl1 = UINT32_C(1) << shamt;
uint32_t inp2 = inp + (vl1 >> 1); /* added 0.5 ULP */
uint32_t msk = (inp | UINT32_C(1)) & vl1; /* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */
msk--; /* negative if even, nonnegative if odd. */
inp2 -= (msk >> 31); /* subtract epsilon before shift if even. */
inp2 >>= shamt;
return inp2;
}
static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
{
uint32_t vl1 = (UINT32_C(1) << shamt) >> 1;
inp += vl1;
inp >>= shamt;
return inp;
}
static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
{
uint32_t vl1 = UINT32_C(1) << shamt;
inp += vl1;
inp--;
inp >>= shamt;
return inp;
}
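/*
   Worked examples for the rounding shifts above (editor's illustrative
   note): rtne_shift32(6, 2) evaluates 6/4 = 1.5 and rounds to the even
   neighbour, returning 2, while rtne_shift32(2, 2) evaluates 2/4 = 0.5 and
   also rounds to the even neighbour, returning 0. By contrast,
   rtna_shift32(2, 2) rounds the same 0.5 away from zero and returns 1, and
   rtup_shift32(2, 2) rounds it up and also returns 1.
*/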
/* convert from FP16 to FP32. */
static sf32 sf16_to_sf32(sf16 inp)
{
uint32_t inpx = inp;
/*
This table contains, for every FP16 sign/exponent value combination,
the difference between the input FP16 value and the value obtained
by shifting the correct FP32 result right by 13 bits.
This table allows us to handle every case except denormals and NaN
with just 1 table lookup, 2 shifts and 1 add.
*/
#define WITH_MSB(a) (UINT32_C(a) | (1u << 31))
static const uint32_t tbl[64] =
{
WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
};
uint32_t res = tbl[inpx >> 10];
res += inpx;
/* Normal cases: MSB of 'res' not set. */
if ((res & WITH_MSB(0)) == 0)
{
return res << 13;
}
/* Infinity and Zero: 10 LSB of 'res' not set. */
if ((res & 0x3FF) == 0)
{
return res << 13;
}
/* NaN: the exponent field of 'inp' is non-zero. */
if ((inpx & 0x7C00) != 0)
{
/* All NaNs are quietened. */
return (res << 13) | 0x400000;
}
/* Denormal cases */
uint32_t sign = (inpx & 0x8000) << 16;
uint32_t mskval = inpx & 0x7FFF;
uint32_t leadingzeroes = clz32(mskval);
mskval <<= leadingzeroes;
return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
}
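/*
   Worked example (editor's illustrative note): for inp = 0x3C00, the FP16
   encoding of 1.0, the table index is 0x3C00 >> 10 = 15, giving
   res = 0x1C000 + 0x3C00 = 0x1FC00. The MSB of res is clear, so the first
   return fires, yielding 0x1FC00 << 13 = 0x3F800000, the FP32 encoding
   of 1.0.
*/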
/* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
{
/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
static const uint8_t tab[512] {
0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
};
/* Many of the cases below use a case-dependent magic constant, so we look up the magic
constant before actually performing the switch. This table allows us to group cases,
thereby minimizing code size. */
static const uint32_t tabx[60] {
UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
};
uint32_t p;
uint32_t idx = rmode + tab[inp >> 23];
uint32_t vlx = tabx[idx];
switch (idx)
{
/*
Positive number which may be Infinity or NaN.
We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
(If we don't do this quietening, then a NaN that is distinguished only by having
its low-order bits set would be turned into an INF.)
*/
case 50:
case 51:
case 52:
case 53:
case 54:
case 55:
case 56:
case 57:
case 58:
case 59:
/*
the input value is 0x7F800000 or 0xFF800000 if it is INF.
By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
For NaNs, however, this operation will keep bit 23 with the value 1.
We can then extract bit 23, and logical-OR bit 9 of the result with this
bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
of the mantissa is set.)
*/
p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
/*
Positive, exponent = 0, round-mode == UP; need to check whether the number actually is 0.
If it is, then return 0, else return 1 (the smallest representable nonzero number).
*/
case 0:
/*
-inp will set the MSB if the input number is nonzero.
Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
*/
return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
/*
Negative, exponent = 0, round-mode == DOWN; need to check whether the number is
actually 0. If it is, return 0x8000 (FP16 -0.0),
else return the smallest representable negative number (0x8001). */
case 6:
/*
in this case 'vlx' is 0x80000000. By subtracting the input value from it,
we obtain a value that is 0 if the input value is in fact zero and has
the MSB set if it isn't. We then right-shift the value by 31 places to
get a value that is 0 if the input is -0.0 and 1 otherwise.
*/
return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
/*
for all other cases involving underflow/overflow, we don't need to
do actual tests; we just return 'vlx'.
*/
case 1:
case 2:
case 3:
case 4:
case 5:
case 7:
case 8:
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
case 16:
case 17:
case 18:
case 19:
case 40:
case 41:
case 42:
case 43:
case 44:
case 45:
case 46:
case 47:
case 48:
case 49:
return static_cast<sf16>(vlx);
/*
For normal numbers, 'vlx' is the difference between the FP32 value of a number and the
FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
baked into 'vlx': for rounding away from zero, the constant is 2^13 - 1, causing roundoff away
from zero. For round-to-nearest-away, the constant is 2^12, causing roundoff away from zero.
For round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
/* normal number, all rounding modes except round-to-nearest-even: */
case 30:
case 31:
case 32:
case 34:
case 35:
case 36:
case 37:
case 39:
return static_cast<sf16>((inp + vlx) >> 13);
/* normal number, round-to-nearest-even. */
case 33:
case 38:
p = inp + vlx;
p += (inp >> 13) & 1;
return static_cast<sf16>(p >> 13);
/*
the various denormal cases. These are not expected to be common, so their performance is a bit
less important. For each of these cases, we need to extract an exponent and a mantissa
(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
sign of the resulting denormal number.
*/
case 21:
case 22:
case 25:
case 27:
/* denormal, round towards zero. */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
case 20:
case 26:
/* denormal, round away from zero. */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
case 24:
case 29:
/* denormal, round to nearest-away */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
case 23:
case 28:
/* denormal, round to nearest-even. */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
}
return 0;
}
/* convert from soft-float to native-float */
float sf16_to_float(uint16_t p)
{
if32 i;
i.u = sf16_to_sf32(p);
return i.f;
}
/* convert from native-float to soft-float */
uint16_t float_to_sf16(float p)
{
if32 i;
i.f = p;
return sf32_to_sf16(i.u, SF_NEARESTEVEN);
}
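/*
   A minimal round-trip sketch for the two entry points above (editor's
   illustrative note):

       uint16_t h = float_to_sf16(0.5f); // 0x3800
       float f = sf16_to_float(h);       // 0.5f again

   0.5f is exactly representable in FP16, so the round trip is lossless;
   inputs that are not representable are rounded by the SF_NEARESTEVEN mode
   baked into float_to_sf16().
*/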
#endif

View File

@ -0,0 +1,481 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for generating partition tables on demand.
*/
#include "astcenc_internal.h"
/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */
#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64)
/**
* @brief Generate a canonical representation of a partition pattern.
*
 * The returned value stores two bits per texel, for up to 6x6x6 texels, where the two bits store
 * the remapped partition index of that texel. Remapping ensures that we only match on the
 * partition pattern, independent of the partition order generated by the hash.
*
* @param texel_count The number of texels in the block.
* @param partition_of_texel The partition assignments, in hash order.
* @param[out] bit_pattern The output bit pattern representation.
*/
static void generate_canonical_partitioning(
unsigned int texel_count,
const uint8_t* partition_of_texel,
uint64_t bit_pattern[BIT_PATTERN_WORDS]
) {
// Clear the pattern
for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++)
{
bit_pattern[i] = 0;
}
// Store a mapping to reorder the raw partitions so that the partitions are ordered such
// that the lowest texel index in partition N is smaller than the lowest texel index in
// partition N + 1.
int mapped_index[BLOCK_MAX_PARTITIONS];
int map_weight_count = 0;
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
{
mapped_index[i] = -1;
}
for (unsigned int i = 0; i < texel_count; i++)
{
int index = partition_of_texel[i];
if (mapped_index[index] < 0)
{
mapped_index[index] = map_weight_count++;
}
uint64_t xlat_index = mapped_index[index];
bit_pattern[i >> 5] |= xlat_index << (2 * (i & 0x1F));
}
}
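/*
   Editor's illustrative note: the remapping makes partition labels
   order-independent. For a 4-texel block, partition_of_texel = {1, 1, 0, 0}
   remaps label 1 -> 0 and label 0 -> 1, producing the same canonical bit
   pattern as {0, 0, 1, 1}; the duplicate can then be rejected by
   compare_canonical_partitionings().
*/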
/**
* @brief Compare two canonical patterns to see if they are the same.
*
* @param part1 The first canonical bit pattern to check.
* @param part2 The second canonical bit pattern to check.
*
* @return @c true if the patterns are the same, @c false otherwise.
*/
static bool compare_canonical_partitionings(
const uint64_t part1[BIT_PATTERN_WORDS],
const uint64_t part2[BIT_PATTERN_WORDS]
) {
return (part1[0] == part2[0])
#if BIT_PATTERN_WORDS > 1
&& (part1[1] == part2[1])
#endif
#if BIT_PATTERN_WORDS > 2
&& (part1[2] == part2[2])
#endif
#if BIT_PATTERN_WORDS > 3
&& (part1[3] == part2[3])
#endif
#if BIT_PATTERN_WORDS > 4
&& (part1[4] == part2[4])
#endif
#if BIT_PATTERN_WORDS > 5
&& (part1[5] == part2[5])
#endif
#if BIT_PATTERN_WORDS > 6
&& (part1[6] == part2[6])
#endif
;
}
/**
* @brief Hash function used for procedural partition assignment.
*
* @param inp The hash seed.
*
* @return The hashed value.
*/
static uint32_t hash52(
uint32_t inp
) {
inp ^= inp >> 15;
// (2^4 + 1) * (2^7 + 1) * (2^17 - 1)
inp *= 0xEEDE0891;
inp ^= inp >> 5;
inp += inp << 16;
inp ^= inp >> 7;
inp ^= inp >> 3;
inp ^= inp << 6;
inp ^= inp >> 17;
return inp;
}
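/*
   Editor's note: hash52() is the bit-mixing hash defined by the ASTC
   specification for procedural partition generation. Both the compressor
   and every conforming decoder must derive identical partition layouts from
   the stored seed, so the constants here cannot be changed without breaking
   bitstream compatibility.
*/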
/**
* @brief Select texel assignment for a single coordinate.
*
* @param seed The seed - the partition index from the block.
* @param x The texel X coordinate in the block.
* @param y The texel Y coordinate in the block.
* @param z The texel Z coordinate in the block.
* @param partition_count The total partition count of this encoding.
* @param small_block @c true if the block has fewer than 32 texels.
*
* @return The assigned partition index for this texel.
*/
static uint8_t select_partition(
int seed,
int x,
int y,
int z,
int partition_count,
bool small_block
) {
// For small blocks bias the coordinates to get better distribution
if (small_block)
{
x <<= 1;
y <<= 1;
z <<= 1;
}
seed += (partition_count - 1) * 1024;
uint32_t rnum = hash52(seed);
uint8_t seed1 = rnum & 0xF;
uint8_t seed2 = (rnum >> 4) & 0xF;
uint8_t seed3 = (rnum >> 8) & 0xF;
uint8_t seed4 = (rnum >> 12) & 0xF;
uint8_t seed5 = (rnum >> 16) & 0xF;
uint8_t seed6 = (rnum >> 20) & 0xF;
uint8_t seed7 = (rnum >> 24) & 0xF;
uint8_t seed8 = (rnum >> 28) & 0xF;
uint8_t seed9 = (rnum >> 18) & 0xF;
uint8_t seed10 = (rnum >> 22) & 0xF;
uint8_t seed11 = (rnum >> 26) & 0xF;
uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
// Squaring all the seeds in order to bias their distribution towards lower values.
seed1 *= seed1;
seed2 *= seed2;
seed3 *= seed3;
seed4 *= seed4;
seed5 *= seed5;
seed6 *= seed6;
seed7 *= seed7;
seed8 *= seed8;
seed9 *= seed9;
seed10 *= seed10;
seed11 *= seed11;
seed12 *= seed12;
int sh1, sh2;
if (seed & 1)
{
sh1 = (seed & 2 ? 4 : 5);
sh2 = (partition_count == 3 ? 6 : 5);
}
else
{
sh1 = (partition_count == 3 ? 6 : 5);
sh2 = (seed & 2 ? 4 : 5);
}
int sh3 = (seed & 0x10) ? sh1 : sh2;
seed1 >>= sh1;
seed2 >>= sh2;
seed3 >>= sh1;
seed4 >>= sh2;
seed5 >>= sh1;
seed6 >>= sh2;
seed7 >>= sh1;
seed8 >>= sh2;
seed9 >>= sh3;
seed10 >>= sh3;
seed11 >>= sh3;
seed12 >>= sh3;
int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
// Apply the saw
a &= 0x3F;
b &= 0x3F;
c &= 0x3F;
d &= 0x3F;
// Remove some of the components if we are to output < 4 partitions.
if (partition_count <= 3)
{
d = 0;
}
if (partition_count <= 2)
{
c = 0;
}
if (partition_count <= 1)
{
b = 0;
}
uint8_t partition;
if (a >= b && a >= c && a >= d)
{
partition = 0;
}
else if (b >= c && b >= d)
{
partition = 1;
}
else if (c >= d)
{
partition = 2;
}
else
{
partition = 3;
}
return partition;
}
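/*
   Editor's note: the small_block coordinate doubling above follows the ASTC
   specification, which samples the partition function at 2x coordinate
   scale for small blocks so that they see a comparable pattern frequency to
   larger blocks.
*/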
/**
* @brief Generate a single partition info structure.
*
* @param[out] bsd The block size information.
* @param partition_count The partition count of this partitioning.
* @param partition_index The partition index / seed of this partitioning.
* @param partition_remap_index The remapped partition index of this partitioning.
* @param[out] pi The partition info structure to populate.
*
* @return True if this is a useful partition index, False if we can skip it.
*/
static bool generate_one_partition_info_entry(
block_size_descriptor& bsd,
unsigned int partition_count,
unsigned int partition_index,
unsigned int partition_remap_index,
partition_info& pi
) {
int texels_per_block = bsd.texel_count;
bool small_block = texels_per_block < 32;
uint8_t *partition_of_texel = pi.partition_of_texel;
// Assign texels to partitions
int texel_idx = 0;
int counts[BLOCK_MAX_PARTITIONS] { 0 };
for (unsigned int z = 0; z < bsd.zdim; z++)
{
for (unsigned int y = 0; y < bsd.ydim; y++)
{
for (unsigned int x = 0; x < bsd.xdim; x++)
{
uint8_t part = select_partition(partition_index, x, y, z, partition_count, small_block);
pi.texels_of_partition[part][counts[part]++] = static_cast<uint8_t>(texel_idx++);
*partition_of_texel++ = part;
}
}
}
// Fill loop tail so we can overfetch later
for (unsigned int i = 0; i < partition_count; i++)
{
int ptex_count = counts[i];
int ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
for (int j = ptex_count; j < ptex_count_simd; j++)
{
pi.texels_of_partition[i][j] = pi.texels_of_partition[i][ptex_count - 1];
}
}
// Populate the actual procedural partition count
if (counts[0] == 0)
{
pi.partition_count = 0;
}
else if (counts[1] == 0)
{
pi.partition_count = 1;
}
else if (counts[2] == 0)
{
pi.partition_count = 2;
}
else if (counts[3] == 0)
{
pi.partition_count = 3;
}
else
{
pi.partition_count = 4;
}
// Populate the partition index
pi.partition_index = static_cast<uint16_t>(partition_index);
// Populate the coverage bitmaps for 2/3/4 partitions
uint64_t* bitmaps { nullptr };
if (partition_count == 2)
{
bitmaps = bsd.coverage_bitmaps_2[partition_remap_index];
}
else if (partition_count == 3)
{
bitmaps = bsd.coverage_bitmaps_3[partition_remap_index];
}
else if (partition_count == 4)
{
bitmaps = bsd.coverage_bitmaps_4[partition_remap_index];
}
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
{
pi.partition_texel_count[i] = static_cast<uint8_t>(counts[i]);
}
// Valid partitionings have texels in all of the requested partitions
bool valid = pi.partition_count == partition_count;
if (bitmaps)
{
// Populate the partition coverage bitmap
for (unsigned int i = 0; i < partition_count; i++)
{
bitmaps[i] = 0ULL;
}
unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
for (unsigned int i = 0; i < texels_to_process; i++)
{
unsigned int idx = bsd.kmeans_texels[i];
bitmaps[pi.partition_of_texel[idx]] |= 1ULL << i;
}
}
return valid;
}
static void build_partition_table_for_one_partition_count(
block_size_descriptor& bsd,
bool can_omit_partitionings,
unsigned int partition_count_cutoff,
unsigned int partition_count,
partition_info* ptab,
uint64_t* canonical_patterns
) {
unsigned int next_index = 0;
bsd.partitioning_count_selected[partition_count - 1] = 0;
bsd.partitioning_count_all[partition_count - 1] = 0;
// Skip tables larger than config max partition count if we can omit modes
if (can_omit_partitionings && (partition_count > partition_count_cutoff))
{
return;
}
// Iterate through twice
// - Pass 0: Keep selected partitionings
// - Pass 1: Keep non-selected partitionings (skip if in omit mode)
unsigned int max_iter = can_omit_partitionings ? 1 : 2;
// Tracker for things we built in the first iteration
uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 };
for (unsigned int x = 0; x < max_iter; x++)
{
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++)
{
// Don't include things we built in the first pass
if ((x == 1) && build[i])
{
continue;
}
bool keep_useful = generate_one_partition_info_entry(bsd, partition_count, i, next_index, ptab[next_index]);
if ((x == 0) && !keep_useful)
{
continue;
}
generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * BIT_PATTERN_WORDS);
bool keep_canonical = true;
for (unsigned int j = 0; j < next_index; j++)
{
bool match = compare_canonical_partitionings(canonical_patterns + next_index * BIT_PATTERN_WORDS, canonical_patterns + j * BIT_PATTERN_WORDS);
if (match)
{
keep_canonical = false;
break;
}
}
if (keep_useful && keep_canonical)
{
if (x == 0)
{
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
bsd.partitioning_count_selected[partition_count - 1]++;
bsd.partitioning_count_all[partition_count - 1]++;
build[i] = 1;
next_index++;
}
}
else
{
if (x == 1)
{
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
bsd.partitioning_count_all[partition_count - 1]++;
next_index++;
}
}
}
}
}
/* See header for documentation. */
void init_partition_tables(
block_size_descriptor& bsd,
bool can_omit_partitionings,
unsigned int partition_count_cutoff
) {
partition_info* par_tab2 = bsd.partitionings;
partition_info* par_tab3 = par_tab2 + BLOCK_MAX_PARTITIONINGS;
partition_info* par_tab4 = par_tab3 + BLOCK_MAX_PARTITIONINGS;
partition_info* par_tab1 = par_tab4 + BLOCK_MAX_PARTITIONINGS;
generate_one_partition_info_entry(bsd, 1, 0, 0, *par_tab1);
bsd.partitioning_count_selected[0] = 1;
bsd.partitioning_count_all[0] = 1;
uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS];
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 2, par_tab2, canonical_patterns);
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 3, par_tab3, canonical_patterns);
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 4, par_tab4, canonical_patterns);
delete[] canonical_patterns;
}
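/*
   A hedged usage sketch (editor's note; the real call sites live elsewhere
   in the library, and the argument values here are illustrative):

       block_size_descriptor bsd;
       // ... populate bsd dimensions and kmeans_texels first ...
       init_partition_tables(bsd, true, 4);

   With can_omit_partitionings == true, tables above the cutoff are skipped
   and only the selected (useful, non-duplicate) partitionings are kept,
   trading table completeness for faster context creation.
*/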

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,166 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Platform-specific function implementations.
*
* This module contains functions for querying the host extended ISA support.
*/
// Include before the defines below to pick up any auto-setup based on compiler
// built-in config, if not being set explicitly by the build system
#include "astcenc_internal.h"
#if (ASTCENC_SSE > 0) || (ASTCENC_AVX > 0) || \
(ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
/** Set to true once detect_cpu_isa() has populated the flags below. */
static bool g_init { false };
/** Does this CPU support SSE 4.1? Valid only once g_init is true. */
static bool g_cpu_has_sse41 { false };
/** Does this CPU support AVX2? Valid only once g_init is true. */
static bool g_cpu_has_avx2 { false };
/** Does this CPU support POPCNT? Valid only once g_init is true. */
static bool g_cpu_has_popcnt { false };
/** Does this CPU support F16C? Valid only once g_init is true. */
static bool g_cpu_has_f16c { false };
/* ============================================================================
Platform code for Visual Studio
============================================================================ */
#if !defined(__clang__) && defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <intrin.h>
/**
* @brief Detect platform CPU ISA support and update global trackers.
*/
static void detect_cpu_isa()
{
int data[4];
__cpuid(data, 0);
int num_id = data[0];
if (num_id >= 1)
{
__cpuidex(data, 1, 0);
// SSE41 = Bank 1, ECX, bit 19
g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
// POPCNT = Bank 1, ECX, bit 23
g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
// F16C = Bank 1, ECX, bit 29
g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
}
if (num_id >= 7)
{
__cpuidex(data, 7, 0);
// AVX2 = Bank 7, EBX, bit 5
g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
}
// Ensure state bits are updated before init flag is updated
MemoryBarrier();
g_init = true;
}
/* ============================================================================
Platform code for GCC and Clang
============================================================================ */
#else
#include <cpuid.h>
/**
* @brief Detect platform CPU ISA support and update global trackers.
*/
static void detect_cpu_isa()
{
unsigned int data[4];
if (__get_cpuid_count(1, 0, &data[0], &data[1], &data[2], &data[3]))
{
// SSE41 = Bank 1, ECX, bit 19
g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
// POPCNT = Bank 1, ECX, bit 23
g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
// F16C = Bank 1, ECX, bit 29
g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
}
g_cpu_has_avx2 = false;
if (__get_cpuid_count(7, 0, &data[0], &data[1], &data[2], &data[3]))
{
// AVX2 = Bank 7, EBX, bit 5
g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
}
// Ensure state bits are updated before init flag is updated
__sync_synchronize();
g_init = true;
}
#endif
/* See header for documentation. */
bool cpu_supports_popcnt()
{
if (!g_init)
{
detect_cpu_isa();
}
return g_cpu_has_popcnt;
}
/* See header for documentation. */
bool cpu_supports_f16c()
{
if (!g_init)
{
detect_cpu_isa();
}
return g_cpu_has_f16c;
}
/* See header for documentation. */
bool cpu_supports_sse41()
{
if (!g_init)
{
detect_cpu_isa();
}
return g_cpu_has_sse41;
}
/* See header for documentation. */
bool cpu_supports_avx2()
{
if (!g_init)
{
detect_cpu_isa();
}
return g_cpu_has_avx2;
}
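/*
   A minimal dispatch sketch using the queries above (editor's illustrative
   note; the library's real dispatch happens in its entry points):

       if (cpu_supports_avx2() && cpu_supports_popcnt() && cpu_supports_f16c())
       {
           // Select an AVX2-compiled code path.
       }
       else if (cpu_supports_sse41())
       {
           // Fall back to an SSE4.1 path.
       }

   Each query lazily runs detect_cpu_isa() on first use, so no explicit
   initialization call is required.
*/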
#endif

View File

@ -0,0 +1,904 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions and data tables for numeric quantization.
*/
#include "astcenc_internal.h"
#if !defined(ASTCENC_DECOMPRESS_ONLY)
// Starts from QUANT_6
// Not scrambled
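// Editor's illustrative note: row 0 below corresponds to QUANT_6, whose
// representable 8-bit endpoint values are {0, 51, 102, 153, 204, 255}.
// Indexing a row with a raw 8-bit value snaps it to the nearest
// representable value, e.g. color_unquant_to_uquant_tables[0][100] == 102.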
const uint8_t color_unquant_to_uquant_tables[17][256] {
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 73, 73, 73, 73, 73, 73, 73, 73, 73,
73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 109, 109, 109, 109,
109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
146, 146, 146, 146, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 219, 219, 219, 219, 219, 219, 219,
219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 56, 56, 56, 56, 56,
56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56,
56, 56, 56, 56, 56, 56, 56, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 171, 171, 171,
171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171,
171, 171, 171, 171, 171, 171, 171, 171, 171, 199, 199, 199, 199, 199, 199, 199,
199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
199, 199, 199, 199, 199, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
227, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46,
46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 69, 69, 69, 69, 69, 69,
69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
69, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92,
92, 92, 92, 92, 92, 92, 92, 92, 92, 116, 116, 116, 116, 116, 116, 116,
116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139,
139, 139, 139, 139, 139, 139, 139, 163, 163, 163, 163, 163, 163, 163, 163, 163,
163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 186,
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
186, 186, 186, 186, 186, 186, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209,
209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 232, 232, 232,
232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
232, 232, 232, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 34, 34, 34, 34,
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 68, 68, 68, 68,
68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 85, 85, 85,
85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119,
119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221,
221, 221, 221, 221, 221, 221, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238,
238, 238, 238, 238, 238, 238, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{
0, 0, 0, 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
27, 27, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 67, 67, 67,
67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 94, 94, 94, 94, 94, 94, 94, 94,
94, 94, 94, 94, 94, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
107, 107, 107, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 148, 148, 148,
148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 161, 161, 161, 161, 161,
161, 161, 161, 161, 161, 161, 161, 161, 175, 175, 175, 175, 175, 175, 175, 175,
175, 175, 175, 175, 175, 175, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188,
188, 188, 188, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201,
215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 228, 228,
228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 242, 242, 242, 242, 242,
242, 242, 242, 242, 242, 242, 242, 242, 242, 255, 255, 255, 255, 255, 255, 255
},
{
0, 0, 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33,
33, 33, 33, 33, 33, 33, 33, 44, 44, 44, 44, 44, 44, 44, 44, 44,
44, 44, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 66, 66, 66,
66, 66, 66, 66, 66, 66, 66, 66, 77, 77, 77, 77, 77, 77, 77, 77,
77, 77, 77, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99, 99, 110, 110, 110, 110, 110, 110, 110,
110, 110, 110, 110, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 145, 145, 145, 145,
145, 145, 145, 145, 145, 145, 145, 156, 156, 156, 156, 156, 156, 156, 156, 156,
156, 156, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 178, 178, 178,
178, 178, 178, 178, 178, 178, 178, 178, 189, 189, 189, 189, 189, 189, 189, 189,
189, 189, 189, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 211, 211,
211, 211, 211, 211, 211, 211, 211, 211, 211, 222, 222, 222, 222, 222, 222, 222,
222, 222, 222, 222, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 244,
244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 255, 255, 255, 255, 255, 255
},
{
0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16,
16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, 33, 33, 33,
33, 33, 33, 33, 33, 33, 41, 41, 41, 41, 41, 41, 41, 41, 49, 49,
49, 49, 49, 49, 49, 49, 57, 57, 57, 57, 57, 57, 57, 57, 66, 66,
66, 66, 66, 66, 66, 66, 66, 74, 74, 74, 74, 74, 74, 74, 74, 82,
82, 82, 82, 82, 82, 82, 82, 90, 90, 90, 90, 90, 90, 90, 90, 99,
99, 99, 99, 99, 99, 99, 99, 99, 107, 107, 107, 107, 107, 107, 107, 107,
115, 115, 115, 115, 115, 115, 115, 115, 123, 123, 123, 123, 123, 123, 123, 123,
132, 132, 132, 132, 132, 132, 132, 132, 140, 140, 140, 140, 140, 140, 140, 140,
148, 148, 148, 148, 148, 148, 148, 148, 156, 156, 156, 156, 156, 156, 156, 156,
156, 165, 165, 165, 165, 165, 165, 165, 165, 173, 173, 173, 173, 173, 173, 173,
173, 181, 181, 181, 181, 181, 181, 181, 181, 189, 189, 189, 189, 189, 189, 189,
189, 189, 198, 198, 198, 198, 198, 198, 198, 198, 206, 206, 206, 206, 206, 206,
206, 206, 214, 214, 214, 214, 214, 214, 214, 214, 222, 222, 222, 222, 222, 222,
222, 222, 222, 231, 231, 231, 231, 231, 231, 231, 231, 239, 239, 239, 239, 239,
239, 239, 239, 247, 247, 247, 247, 247, 247, 247, 247, 255, 255, 255, 255, 255
},
{
0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 13, 13, 13, 13, 13, 13,
13, 19, 19, 19, 19, 19, 19, 26, 26, 26, 26, 26, 26, 26, 32, 32,
32, 32, 32, 32, 39, 39, 39, 39, 39, 39, 39, 45, 45, 45, 45, 45,
45, 52, 52, 52, 52, 52, 52, 52, 58, 58, 58, 58, 58, 58, 65, 65,
65, 65, 65, 65, 65, 71, 71, 71, 71, 71, 71, 78, 78, 78, 78, 78,
78, 78, 84, 84, 84, 84, 84, 84, 91, 91, 91, 91, 91, 91, 91, 97,
97, 97, 97, 97, 97, 104, 104, 104, 104, 104, 104, 104, 110, 110, 110, 110,
110, 110, 117, 117, 117, 117, 117, 117, 117, 123, 123, 123, 123, 123, 123, 123,
132, 132, 132, 132, 132, 132, 132, 138, 138, 138, 138, 138, 138, 138, 145, 145,
145, 145, 145, 145, 151, 151, 151, 151, 151, 151, 151, 158, 158, 158, 158, 158,
158, 164, 164, 164, 164, 164, 164, 164, 171, 171, 171, 171, 171, 171, 177, 177,
177, 177, 177, 177, 177, 184, 184, 184, 184, 184, 184, 190, 190, 190, 190, 190,
190, 190, 197, 197, 197, 197, 197, 197, 203, 203, 203, 203, 203, 203, 203, 210,
210, 210, 210, 210, 210, 216, 216, 216, 216, 216, 216, 216, 223, 223, 223, 223,
223, 223, 229, 229, 229, 229, 229, 229, 229, 236, 236, 236, 236, 236, 236, 242,
242, 242, 242, 242, 242, 242, 249, 249, 249, 249, 249, 249, 255, 255, 255, 255
},
{
0, 0, 0, 5, 5, 5, 5, 5, 5, 11, 11, 11, 11, 11, 16, 16,
16, 16, 16, 21, 21, 21, 21, 21, 21, 27, 27, 27, 27, 27, 32, 32,
32, 32, 32, 32, 38, 38, 38, 38, 38, 43, 43, 43, 43, 43, 48, 48,
48, 48, 48, 48, 54, 54, 54, 54, 54, 59, 59, 59, 59, 59, 59, 65,
65, 65, 65, 65, 70, 70, 70, 70, 70, 70, 76, 76, 76, 76, 76, 81,
81, 81, 81, 81, 86, 86, 86, 86, 86, 86, 92, 92, 92, 92, 92, 97,
97, 97, 97, 97, 97, 103, 103, 103, 103, 103, 108, 108, 108, 108, 108, 113,
113, 113, 113, 113, 113, 119, 119, 119, 119, 119, 124, 124, 124, 124, 124, 124,
131, 131, 131, 131, 131, 131, 136, 136, 136, 136, 136, 142, 142, 142, 142, 142,
142, 147, 147, 147, 147, 147, 152, 152, 152, 152, 152, 158, 158, 158, 158, 158,
158, 163, 163, 163, 163, 163, 169, 169, 169, 169, 169, 169, 174, 174, 174, 174,
174, 179, 179, 179, 179, 179, 185, 185, 185, 185, 185, 185, 190, 190, 190, 190,
190, 196, 196, 196, 196, 196, 196, 201, 201, 201, 201, 201, 207, 207, 207, 207,
207, 207, 212, 212, 212, 212, 212, 217, 217, 217, 217, 217, 223, 223, 223, 223,
223, 223, 228, 228, 228, 228, 228, 234, 234, 234, 234, 234, 234, 239, 239, 239,
239, 239, 244, 244, 244, 244, 244, 250, 250, 250, 250, 250, 250, 255, 255, 255
},
{
0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16,
16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28, 32,
32, 32, 32, 36, 36, 36, 36, 40, 40, 40, 40, 44, 44, 44, 44, 48,
48, 48, 48, 52, 52, 52, 52, 56, 56, 56, 56, 60, 60, 60, 60, 65,
65, 65, 65, 65, 69, 69, 69, 69, 73, 73, 73, 73, 77, 77, 77, 77,
81, 81, 81, 81, 85, 85, 85, 85, 89, 89, 89, 89, 93, 93, 93, 93,
97, 97, 97, 97, 101, 101, 101, 101, 105, 105, 105, 105, 109, 109, 109, 109,
113, 113, 113, 113, 117, 117, 117, 117, 121, 121, 121, 121, 125, 125, 125, 125,
130, 130, 130, 130, 134, 134, 134, 134, 138, 138, 138, 138, 142, 142, 142, 142,
146, 146, 146, 146, 150, 150, 150, 150, 154, 154, 154, 154, 158, 158, 158, 158,
162, 162, 162, 162, 166, 166, 166, 166, 170, 170, 170, 170, 174, 174, 174, 174,
178, 178, 178, 178, 182, 182, 182, 182, 186, 186, 186, 186, 190, 190, 190, 190,
190, 195, 195, 195, 195, 199, 199, 199, 199, 203, 203, 203, 203, 207, 207, 207,
207, 211, 211, 211, 211, 215, 215, 215, 215, 219, 219, 219, 219, 223, 223, 223,
223, 227, 227, 227, 227, 231, 231, 231, 231, 235, 235, 235, 235, 239, 239, 239,
239, 243, 243, 243, 243, 247, 247, 247, 247, 251, 251, 251, 251, 255, 255, 255
},
{
0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 9, 9, 13, 13, 13, 16,
16, 16, 19, 19, 19, 22, 22, 22, 25, 25, 25, 25, 29, 29, 29, 32,
32, 32, 35, 35, 35, 38, 38, 38, 38, 42, 42, 42, 45, 45, 45, 48,
48, 48, 51, 51, 51, 54, 54, 54, 54, 58, 58, 58, 61, 61, 61, 64,
64, 64, 67, 67, 67, 67, 71, 71, 71, 74, 74, 74, 77, 77, 77, 80,
80, 80, 83, 83, 83, 83, 87, 87, 87, 90, 90, 90, 93, 93, 93, 96,
96, 96, 96, 100, 100, 100, 103, 103, 103, 106, 106, 106, 109, 109, 109, 112,
112, 112, 112, 116, 116, 116, 119, 119, 119, 122, 122, 122, 125, 125, 125, 125,
130, 130, 130, 130, 133, 133, 133, 136, 136, 136, 139, 139, 139, 143, 143, 143,
143, 146, 146, 146, 149, 149, 149, 152, 152, 152, 155, 155, 155, 159, 159, 159,
159, 162, 162, 162, 165, 165, 165, 168, 168, 168, 172, 172, 172, 172, 175, 175,
175, 178, 178, 178, 181, 181, 181, 184, 184, 184, 188, 188, 188, 188, 191, 191,
191, 194, 194, 194, 197, 197, 197, 201, 201, 201, 201, 204, 204, 204, 207, 207,
207, 210, 210, 210, 213, 213, 213, 217, 217, 217, 217, 220, 220, 220, 223, 223,
223, 226, 226, 226, 230, 230, 230, 230, 233, 233, 233, 236, 236, 236, 239, 239,
239, 242, 242, 242, 246, 246, 246, 246, 249, 249, 249, 252, 252, 252, 255, 255
},
{
0, 0, 2, 2, 5, 5, 5, 8, 8, 8, 10, 10, 13, 13, 13, 16,
16, 16, 18, 18, 21, 21, 21, 24, 24, 24, 26, 26, 29, 29, 29, 32,
32, 32, 35, 35, 35, 37, 37, 40, 40, 40, 43, 43, 43, 45, 45, 48,
48, 48, 51, 51, 51, 53, 53, 56, 56, 56, 59, 59, 59, 61, 61, 64,
64, 64, 67, 67, 67, 70, 70, 70, 72, 72, 75, 75, 75, 78, 78, 78,
80, 80, 83, 83, 83, 86, 86, 86, 88, 88, 91, 91, 91, 94, 94, 94,
96, 96, 99, 99, 99, 102, 102, 102, 104, 104, 107, 107, 107, 110, 110, 110,
112, 112, 115, 115, 115, 118, 118, 118, 120, 120, 123, 123, 123, 126, 126, 126,
129, 129, 129, 132, 132, 132, 135, 135, 137, 137, 137, 140, 140, 140, 143, 143,
145, 145, 145, 148, 148, 148, 151, 151, 153, 153, 153, 156, 156, 156, 159, 159,
161, 161, 161, 164, 164, 164, 167, 167, 169, 169, 169, 172, 172, 172, 175, 175,
177, 177, 177, 180, 180, 180, 183, 183, 185, 185, 185, 188, 188, 188, 191, 191,
191, 194, 194, 196, 196, 196, 199, 199, 199, 202, 202, 204, 204, 204, 207, 207,
207, 210, 210, 212, 212, 212, 215, 215, 215, 218, 218, 220, 220, 220, 223, 223,
223, 226, 226, 226, 229, 229, 231, 231, 231, 234, 234, 234, 237, 237, 239, 239,
239, 242, 242, 242, 245, 245, 247, 247, 247, 250, 250, 250, 253, 253, 255, 255
},
{
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28, 30, 30,
32, 32, 34, 34, 36, 36, 38, 38, 40, 40, 42, 42, 44, 44, 46, 46,
48, 48, 50, 50, 52, 52, 54, 54, 56, 56, 58, 58, 60, 60, 62, 62,
64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 74, 74, 76, 76, 78, 78,
80, 80, 82, 82, 84, 84, 86, 86, 88, 88, 90, 90, 92, 92, 94, 94,
96, 96, 98, 98, 100, 100, 102, 102, 104, 104, 106, 106, 108, 108, 110, 110,
112, 112, 114, 114, 116, 116, 118, 118, 120, 120, 122, 122, 124, 124, 126, 126,
129, 129, 131, 131, 133, 133, 135, 135, 137, 137, 139, 139, 141, 141, 143, 143,
145, 145, 147, 147, 149, 149, 151, 151, 153, 153, 155, 155, 157, 157, 159, 159,
161, 161, 163, 163, 165, 165, 167, 167, 169, 169, 171, 171, 173, 173, 175, 175,
177, 177, 179, 179, 181, 181, 183, 183, 185, 185, 187, 187, 189, 189, 191, 191,
193, 193, 195, 195, 197, 197, 199, 199, 201, 201, 203, 203, 205, 205, 207, 207,
209, 209, 211, 211, 213, 213, 215, 215, 217, 217, 219, 219, 221, 221, 223, 223,
225, 225, 227, 227, 229, 229, 231, 231, 233, 233, 235, 235, 237, 237, 239, 239,
241, 241, 243, 243, 245, 245, 247, 247, 249, 249, 251, 251, 253, 253, 255, 255
},
{
0, 1, 1, 3, 4, 4, 6, 6, 8, 9, 9, 11, 12, 12, 14, 14,
16, 17, 17, 19, 20, 20, 22, 22, 24, 25, 25, 27, 28, 28, 30, 30,
32, 33, 33, 35, 36, 36, 38, 38, 40, 41, 41, 43, 44, 44, 46, 46,
48, 49, 49, 51, 52, 52, 54, 54, 56, 57, 57, 59, 60, 60, 62, 62,
64, 65, 65, 67, 68, 68, 70, 70, 72, 73, 73, 75, 76, 76, 78, 78,
80, 81, 81, 83, 84, 84, 86, 86, 88, 89, 89, 91, 92, 92, 94, 94,
96, 97, 97, 99, 100, 100, 102, 102, 104, 105, 105, 107, 108, 108, 110, 110,
112, 113, 113, 115, 116, 116, 118, 118, 120, 121, 121, 123, 124, 124, 126, 126,
129, 129, 131, 131, 132, 134, 134, 135, 137, 137, 139, 139, 140, 142, 142, 143,
145, 145, 147, 147, 148, 150, 150, 151, 153, 153, 155, 155, 156, 158, 158, 159,
161, 161, 163, 163, 164, 166, 166, 167, 169, 169, 171, 171, 172, 174, 174, 175,
177, 177, 179, 179, 180, 182, 182, 183, 185, 185, 187, 187, 188, 190, 190, 191,
193, 193, 195, 195, 196, 198, 198, 199, 201, 201, 203, 203, 204, 206, 206, 207,
209, 209, 211, 211, 212, 214, 214, 215, 217, 217, 219, 219, 220, 222, 222, 223,
225, 225, 227, 227, 228, 230, 230, 231, 233, 233, 235, 235, 236, 238, 238, 239,
241, 241, 243, 243, 244, 246, 246, 247, 249, 249, 251, 251, 252, 254, 254, 255
},
{
0, 1, 2, 2, 4, 5, 6, 6, 8, 9, 10, 10, 12, 13, 14, 14,
16, 17, 18, 18, 20, 21, 22, 22, 24, 25, 26, 26, 28, 29, 30, 30,
32, 33, 34, 34, 36, 37, 38, 38, 40, 41, 42, 42, 44, 45, 46, 46,
48, 49, 50, 50, 52, 53, 54, 54, 56, 57, 58, 58, 60, 61, 62, 62,
64, 65, 66, 66, 68, 69, 70, 70, 72, 73, 74, 74, 76, 77, 78, 78,
80, 81, 82, 82, 84, 85, 86, 86, 88, 89, 90, 90, 92, 93, 94, 94,
96, 97, 98, 98, 100, 101, 102, 102, 104, 105, 106, 106, 108, 109, 110, 110,
112, 113, 114, 114, 116, 117, 118, 118, 120, 121, 122, 122, 124, 125, 126, 126,
129, 129, 130, 131, 133, 133, 134, 135, 137, 137, 138, 139, 141, 141, 142, 143,
145, 145, 146, 147, 149, 149, 150, 151, 153, 153, 154, 155, 157, 157, 158, 159,
161, 161, 162, 163, 165, 165, 166, 167, 169, 169, 170, 171, 173, 173, 174, 175,
177, 177, 178, 179, 181, 181, 182, 183, 185, 185, 186, 187, 189, 189, 190, 191,
193, 193, 194, 195, 197, 197, 198, 199, 201, 201, 202, 203, 205, 205, 206, 207,
209, 209, 210, 211, 213, 213, 214, 215, 217, 217, 218, 219, 221, 221, 222, 223,
225, 225, 226, 227, 229, 229, 230, 231, 233, 233, 234, 235, 237, 237, 238, 239,
241, 241, 242, 243, 245, 245, 246, 247, 249, 249, 250, 251, 253, 253, 254, 255
},
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
}
};
// Starts from QUANT_6
// Scrambled
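// Editor's note: "scrambled" means the output values are quantization
// indices in the order the integer sequence encoding physically packs them,
// rather than in ascending numeric order.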
const uint8_t color_uquant_to_scrambled_pquant_tables[17][256] {
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
},
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
},
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
},
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
},
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15
},
{
0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1
},
{
0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1
},
{
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8,
8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10,
10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12,
12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13,
14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15,
16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
19, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21,
21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23,
23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25,
25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29,
29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31
},
{
0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16,
16, 24, 24, 24, 24, 24, 24, 32, 32, 32, 32, 32, 32, 32, 2, 2,
2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 18, 18, 18, 18, 18,
18, 26, 26, 26, 26, 26, 26, 26, 34, 34, 34, 34, 34, 34, 4, 4,
4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 20, 20, 20, 20, 20,
20, 20, 28, 28, 28, 28, 28, 28, 36, 36, 36, 36, 36, 36, 36, 6,
6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 14, 22, 22, 22, 22,
22, 22, 30, 30, 30, 30, 30, 30, 30, 38, 38, 38, 38, 38, 38, 38,
39, 39, 39, 39, 39, 39, 39, 31, 31, 31, 31, 31, 31, 31, 23, 23,
23, 23, 23, 23, 15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7,
7, 37, 37, 37, 37, 37, 37, 37, 29, 29, 29, 29, 29, 29, 21, 21,
21, 21, 21, 21, 21, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5, 5,
5, 5, 35, 35, 35, 35, 35, 35, 27, 27, 27, 27, 27, 27, 27, 19,
19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3,
3, 3, 33, 33, 33, 33, 33, 33, 33, 25, 25, 25, 25, 25, 25, 17,
17, 17, 17, 17, 17, 17, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1
},
{
0, 0, 0, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 2, 2,
2, 2, 2, 18, 18, 18, 18, 18, 18, 34, 34, 34, 34, 34, 4, 4,
4, 4, 4, 4, 20, 20, 20, 20, 20, 36, 36, 36, 36, 36, 6, 6,
6, 6, 6, 6, 22, 22, 22, 22, 22, 38, 38, 38, 38, 38, 38, 8,
8, 8, 8, 8, 24, 24, 24, 24, 24, 24, 40, 40, 40, 40, 40, 10,
10, 10, 10, 10, 26, 26, 26, 26, 26, 26, 42, 42, 42, 42, 42, 12,
12, 12, 12, 12, 12, 28, 28, 28, 28, 28, 44, 44, 44, 44, 44, 14,
14, 14, 14, 14, 14, 30, 30, 30, 30, 30, 46, 46, 46, 46, 46, 46,
47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 15, 15, 15, 15, 15,
15, 45, 45, 45, 45, 45, 29, 29, 29, 29, 29, 13, 13, 13, 13, 13,
13, 43, 43, 43, 43, 43, 27, 27, 27, 27, 27, 27, 11, 11, 11, 11,
11, 41, 41, 41, 41, 41, 25, 25, 25, 25, 25, 25, 9, 9, 9, 9,
9, 39, 39, 39, 39, 39, 39, 23, 23, 23, 23, 23, 7, 7, 7, 7,
7, 7, 37, 37, 37, 37, 37, 21, 21, 21, 21, 21, 5, 5, 5, 5,
5, 5, 35, 35, 35, 35, 35, 19, 19, 19, 19, 19, 19, 3, 3, 3,
3, 3, 33, 33, 33, 33, 33, 17, 17, 17, 17, 17, 17, 1, 1, 1
},
{
0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4,
4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8,
8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16,
16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23,
24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27,
28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31,
32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39,
40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43,
44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47,
47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51,
51, 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55,
55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59,
59, 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63
},
{
0, 0, 16, 16, 16, 32, 32, 32, 48, 48, 48, 48, 64, 64, 64, 2,
2, 2, 18, 18, 18, 34, 34, 34, 50, 50, 50, 50, 66, 66, 66, 4,
4, 4, 20, 20, 20, 36, 36, 36, 36, 52, 52, 52, 68, 68, 68, 6,
6, 6, 22, 22, 22, 38, 38, 38, 38, 54, 54, 54, 70, 70, 70, 8,
8, 8, 24, 24, 24, 24, 40, 40, 40, 56, 56, 56, 72, 72, 72, 10,
10, 10, 26, 26, 26, 26, 42, 42, 42, 58, 58, 58, 74, 74, 74, 12,
12, 12, 12, 28, 28, 28, 44, 44, 44, 60, 60, 60, 76, 76, 76, 14,
14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 78, 78, 78, 78,
79, 79, 79, 79, 63, 63, 63, 47, 47, 47, 31, 31, 31, 15, 15, 15,
15, 77, 77, 77, 61, 61, 61, 45, 45, 45, 29, 29, 29, 13, 13, 13,
13, 75, 75, 75, 59, 59, 59, 43, 43, 43, 27, 27, 27, 27, 11, 11,
11, 73, 73, 73, 57, 57, 57, 41, 41, 41, 25, 25, 25, 25, 9, 9,
9, 71, 71, 71, 55, 55, 55, 39, 39, 39, 39, 23, 23, 23, 7, 7,
7, 69, 69, 69, 53, 53, 53, 37, 37, 37, 37, 21, 21, 21, 5, 5,
5, 67, 67, 67, 51, 51, 51, 51, 35, 35, 35, 19, 19, 19, 3, 3,
3, 65, 65, 65, 49, 49, 49, 49, 33, 33, 33, 17, 17, 17, 1, 1
},
{
0, 0, 32, 32, 64, 64, 64, 2, 2, 2, 34, 34, 66, 66, 66, 4,
4, 4, 36, 36, 68, 68, 68, 6, 6, 6, 38, 38, 70, 70, 70, 8,
8, 8, 40, 40, 40, 72, 72, 10, 10, 10, 42, 42, 42, 74, 74, 12,
12, 12, 44, 44, 44, 76, 76, 14, 14, 14, 46, 46, 46, 78, 78, 16,
16, 16, 48, 48, 48, 80, 80, 80, 18, 18, 50, 50, 50, 82, 82, 82,
20, 20, 52, 52, 52, 84, 84, 84, 22, 22, 54, 54, 54, 86, 86, 86,
24, 24, 56, 56, 56, 88, 88, 88, 26, 26, 58, 58, 58, 90, 90, 90,
28, 28, 60, 60, 60, 92, 92, 92, 30, 30, 62, 62, 62, 94, 94, 94,
95, 95, 95, 63, 63, 63, 31, 31, 93, 93, 93, 61, 61, 61, 29, 29,
91, 91, 91, 59, 59, 59, 27, 27, 89, 89, 89, 57, 57, 57, 25, 25,
87, 87, 87, 55, 55, 55, 23, 23, 85, 85, 85, 53, 53, 53, 21, 21,
83, 83, 83, 51, 51, 51, 19, 19, 81, 81, 81, 49, 49, 49, 17, 17,
17, 79, 79, 47, 47, 47, 15, 15, 15, 77, 77, 45, 45, 45, 13, 13,
13, 75, 75, 43, 43, 43, 11, 11, 11, 73, 73, 41, 41, 41, 9, 9,
9, 71, 71, 71, 39, 39, 7, 7, 7, 69, 69, 69, 37, 37, 5, 5,
5, 67, 67, 67, 35, 35, 3, 3, 3, 65, 65, 65, 33, 33, 1, 1
},
{
0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15,
16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23,
24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31,
32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39,
40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47,
48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55,
56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63,
64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71,
72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87,
88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 95, 95,
96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 103,
104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119,
120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127
},
{
0, 32, 32, 64, 96, 96, 128, 128, 2, 34, 34, 66, 98, 98, 130, 130,
4, 36, 36, 68, 100, 100, 132, 132, 6, 38, 38, 70, 102, 102, 134, 134,
8, 40, 40, 72, 104, 104, 136, 136, 10, 42, 42, 74, 106, 106, 138, 138,
12, 44, 44, 76, 108, 108, 140, 140, 14, 46, 46, 78, 110, 110, 142, 142,
16, 48, 48, 80, 112, 112, 144, 144, 18, 50, 50, 82, 114, 114, 146, 146,
20, 52, 52, 84, 116, 116, 148, 148, 22, 54, 54, 86, 118, 118, 150, 150,
24, 56, 56, 88, 120, 120, 152, 152, 26, 58, 58, 90, 122, 122, 154, 154,
28, 60, 60, 92, 124, 124, 156, 156, 30, 62, 62, 94, 126, 126, 158, 158,
159, 159, 127, 127, 95, 63, 63, 31, 157, 157, 125, 125, 93, 61, 61, 29,
155, 155, 123, 123, 91, 59, 59, 27, 153, 153, 121, 121, 89, 57, 57, 25,
151, 151, 119, 119, 87, 55, 55, 23, 149, 149, 117, 117, 85, 53, 53, 21,
147, 147, 115, 115, 83, 51, 51, 19, 145, 145, 113, 113, 81, 49, 49, 17,
143, 143, 111, 111, 79, 47, 47, 15, 141, 141, 109, 109, 77, 45, 45, 13,
139, 139, 107, 107, 75, 43, 43, 11, 137, 137, 105, 105, 73, 41, 41, 9,
135, 135, 103, 103, 71, 39, 39, 7, 133, 133, 101, 101, 69, 37, 37, 5,
131, 131, 99, 99, 67, 35, 35, 3, 129, 129, 97, 97, 65, 33, 33, 1
},
{
0, 64, 128, 128, 2, 66, 130, 130, 4, 68, 132, 132, 6, 70, 134, 134,
8, 72, 136, 136, 10, 74, 138, 138, 12, 76, 140, 140, 14, 78, 142, 142,
16, 80, 144, 144, 18, 82, 146, 146, 20, 84, 148, 148, 22, 86, 150, 150,
24, 88, 152, 152, 26, 90, 154, 154, 28, 92, 156, 156, 30, 94, 158, 158,
32, 96, 160, 160, 34, 98, 162, 162, 36, 100, 164, 164, 38, 102, 166, 166,
40, 104, 168, 168, 42, 106, 170, 170, 44, 108, 172, 172, 46, 110, 174, 174,
48, 112, 176, 176, 50, 114, 178, 178, 52, 116, 180, 180, 54, 118, 182, 182,
56, 120, 184, 184, 58, 122, 186, 186, 60, 124, 188, 188, 62, 126, 190, 190,
191, 191, 127, 63, 189, 189, 125, 61, 187, 187, 123, 59, 185, 185, 121, 57,
183, 183, 119, 55, 181, 181, 117, 53, 179, 179, 115, 51, 177, 177, 113, 49,
175, 175, 111, 47, 173, 173, 109, 45, 171, 171, 107, 43, 169, 169, 105, 41,
167, 167, 103, 39, 165, 165, 101, 37, 163, 163, 99, 35, 161, 161, 97, 33,
159, 159, 95, 31, 157, 157, 93, 29, 155, 155, 91, 27, 153, 153, 89, 25,
151, 151, 87, 23, 149, 149, 85, 21, 147, 147, 83, 19, 145, 145, 81, 17,
143, 143, 79, 15, 141, 141, 77, 13, 139, 139, 75, 11, 137, 137, 73, 9,
135, 135, 71, 7, 133, 133, 69, 5, 131, 131, 67, 3, 129, 129, 65, 1
},
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
}
};
#endif
// Scrambled pquant to uquant tables, starting from QUANT_6
static const uint8_t color_scrambled_pquant_to_uquant_q6[6] {
0, 255, 51, 204, 102, 153
};
static const uint8_t color_scrambled_pquant_to_uquant_q8[8] {
0, 36, 73, 109, 146, 182, 219, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q10[10] {
0, 255, 28, 227, 56, 199, 84, 171, 113, 142
};
static const uint8_t color_scrambled_pquant_to_uquant_q12[12] {
0, 255, 69, 186, 23, 232, 92, 163, 46, 209, 116, 139
};
static const uint8_t color_scrambled_pquant_to_uquant_q16[16] {
0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q20[20] {
0, 255, 67, 188, 13, 242, 80, 175, 27, 228, 94, 161, 40, 215, 107, 148,
54, 201, 121, 134
};
static const uint8_t color_scrambled_pquant_to_uquant_q24[24] {
0, 255, 33, 222, 66, 189, 99, 156, 11, 244, 44, 211, 77, 178, 110, 145,
22, 233, 55, 200, 88, 167, 121, 134
};
static const uint8_t color_scrambled_pquant_to_uquant_q32[32] {
0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123,
132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q40[40] {
0, 255, 32, 223, 65, 190, 97, 158, 6, 249, 39, 216, 71, 184, 104, 151,
13, 242, 45, 210, 78, 177, 110, 145, 19, 236, 52, 203, 84, 171, 117, 138,
26, 229, 58, 197, 91, 164, 123, 132
};
static const uint8_t color_scrambled_pquant_to_uquant_q48[48] {
0, 255, 16, 239, 32, 223, 48, 207, 65, 190, 81, 174, 97, 158, 113, 142,
5, 250, 21, 234, 38, 217, 54, 201, 70, 185, 86, 169, 103, 152, 119, 136,
11, 244, 27, 228, 43, 212, 59, 196, 76, 179, 92, 163, 108, 147, 124, 131
};
static const uint8_t color_scrambled_pquant_to_uquant_q64[64] {
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
65, 69, 73, 77, 81, 85, 89, 93, 97, 101, 105, 109, 113, 117, 121, 125,
130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190,
	195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q80[80] {
0, 255, 16, 239, 32, 223, 48, 207, 64, 191, 80, 175, 96, 159, 112, 143,
3, 252, 19, 236, 35, 220, 51, 204, 67, 188, 83, 172, 100, 155, 116, 139,
6, 249, 22, 233, 38, 217, 54, 201, 71, 184, 87, 168, 103, 152, 119, 136,
9, 246, 25, 230, 42, 213, 58, 197, 74, 181, 90, 165, 106, 149, 122, 133,
13, 242, 29, 226, 45, 210, 61, 194, 77, 178, 93, 162, 109, 146, 125, 130
};
static const uint8_t color_scrambled_pquant_to_uquant_q96[96] {
0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
2, 253, 10, 245, 18, 237, 26, 229, 35, 220, 43, 212, 51, 204, 59, 196,
67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
5, 250, 13, 242, 21, 234, 29, 226, 37, 218, 45, 210, 53, 202, 61, 194,
70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129
};
static const uint8_t color_scrambled_pquant_to_uquant_q128[128] {
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q160[160] {
0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
1, 254, 9, 246, 17, 238, 25, 230, 33, 222, 41, 214, 49, 206, 57, 198,
65, 190, 73, 182, 81, 174, 89, 166, 97, 158, 105, 150, 113, 142, 121, 134,
3, 252, 11, 244, 19, 236, 27, 228, 35, 220, 43, 212, 51, 204, 59, 196,
67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
4, 251, 12, 243, 20, 235, 28, 227, 36, 219, 44, 211, 52, 203, 60, 195,
68, 187, 76, 179, 84, 171, 92, 163, 100, 155, 108, 147, 116, 139, 124, 131,
6, 249, 14, 241, 22, 233, 30, 225, 38, 217, 46, 209, 54, 201, 62, 193,
70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129
};
static const uint8_t color_scrambled_pquant_to_uquant_q192[192] {
0, 255, 4, 251, 8, 247, 12, 243, 16, 239, 20, 235, 24, 231, 28, 227,
32, 223, 36, 219, 40, 215, 44, 211, 48, 207, 52, 203, 56, 199, 60, 195,
64, 191, 68, 187, 72, 183, 76, 179, 80, 175, 84, 171, 88, 167, 92, 163,
96, 159, 100, 155, 104, 151, 108, 147, 112, 143, 116, 139, 120, 135, 124, 131,
1, 254, 5, 250, 9, 246, 13, 242, 17, 238, 21, 234, 25, 230, 29, 226,
33, 222, 37, 218, 41, 214, 45, 210, 49, 206, 53, 202, 57, 198, 61, 194,
65, 190, 69, 186, 73, 182, 77, 178, 81, 174, 85, 170, 89, 166, 93, 162,
97, 158, 101, 154, 105, 150, 109, 146, 113, 142, 117, 138, 121, 134, 125, 130,
2, 253, 6, 249, 10, 245, 14, 241, 18, 237, 22, 233, 26, 229, 30, 225,
34, 221, 38, 217, 42, 213, 46, 209, 50, 205, 54, 201, 58, 197, 62, 193,
66, 189, 70, 185, 74, 181, 78, 177, 82, 173, 86, 169, 90, 165, 94, 161,
98, 157, 102, 153, 106, 149, 110, 145, 114, 141, 118, 137, 122, 133, 126, 129
};
static const uint8_t color_scrambled_pquant_to_uquant_q256[256] {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};
const uint8_t* color_scrambled_pquant_to_uquant_tables[17] {
color_scrambled_pquant_to_uquant_q6,
color_scrambled_pquant_to_uquant_q8,
color_scrambled_pquant_to_uquant_q10,
color_scrambled_pquant_to_uquant_q12,
color_scrambled_pquant_to_uquant_q16,
color_scrambled_pquant_to_uquant_q20,
color_scrambled_pquant_to_uquant_q24,
color_scrambled_pquant_to_uquant_q32,
color_scrambled_pquant_to_uquant_q40,
color_scrambled_pquant_to_uquant_q48,
color_scrambled_pquant_to_uquant_q64,
color_scrambled_pquant_to_uquant_q80,
color_scrambled_pquant_to_uquant_q96,
color_scrambled_pquant_to_uquant_q128,
color_scrambled_pquant_to_uquant_q160,
color_scrambled_pquant_to_uquant_q192,
color_scrambled_pquant_to_uquant_q256
};
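// Indexing sketch (illustrative): a packed color value v stored at
// quantization level q unscrambles back to the 0-255 range via
// color_scrambled_pquant_to_uquant_tables[q - QUANT_6][v], as used by
// physical_to_symbolic() later in this commit.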
// The quant_mode_table[integer_count/2][bits] gives us the quantization level for a given integer
// count and number of bits that the integer may fit into.
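// For example (illustrative): with 8 color integers and 40 bits of storage,
// quant_mode_table[8 / 2][40] yields the densest quantization level whose ISE
// encoding of 8 values fits in 40 bits - here QUANT_32, since eight 5-bit
// values fill exactly 40 bits. Entries of -1 mean no level fits.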
const int8_t quant_mode_table[10][128] {
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
},
{
-1, -1, 0, 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7,
8, 8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, 1, 1,
2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7,
8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13,
14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5,
5, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10,
10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14,
15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11,
12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6,
6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9,
9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13,
13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10,
11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13,
14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4,
4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,
6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9,
9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17
}
};

thirdparty/astcenc/astcenc_symbolic_physical.cpp vendored Normal file

@ -0,0 +1,534 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for converting between symbolic and physical encodings.
*/
#include "astcenc_internal.h"
#include <cassert>
/**
* @brief Write up to 8 bits at an arbitrary bit offset.
*
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so
* may span two separate bytes in memory.
*
* @param value The value to write.
* @param bitcount The number of bits to write, starting from LSB.
* @param bitoffset The bit offset to store at, between 0 and 7.
* @param[in,out] ptr The data pointer to write to.
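 *
 * For example (illustrative): write_bits(0x5, 3, 7, ptr) lands the low bit of
 * the value in the top bit of ptr[0] and the remaining two bits in the low
 * bits of ptr[1].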
*/
static inline void write_bits(
int value,
int bitcount,
int bitoffset,
uint8_t* ptr
) {
int mask = (1 << bitcount) - 1;
value &= mask;
ptr += bitoffset >> 3;
bitoffset &= 7;
value <<= bitoffset;
mask <<= bitoffset;
mask = ~mask;
ptr[0] &= mask;
ptr[0] |= value;
ptr[1] &= mask >> 8;
ptr[1] |= value >> 8;
}
/**
* @brief Read up to 8 bits at an arbitrary bit offset.
*
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
* span two separate bytes in memory.
*
* @param bitcount The number of bits to read.
* @param bitoffset The bit offset to read from, between 0 and 7.
* @param[in,out] ptr The data pointer to read from.
*
* @return The read value.
*/
static inline int read_bits(
int bitcount,
int bitoffset,
const uint8_t* ptr
) {
int mask = (1 << bitcount) - 1;
ptr += bitoffset >> 3;
bitoffset &= 7;
int value = ptr[0] | (ptr[1] << 8);
value >>= bitoffset;
value &= mask;
return value;
}
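// Round-trip sketch (illustrative, not part of the library): writing and then
// reading the same bit span recovers the original value.
//
//     uint8_t buf[2] { 0, 0 };
//     write_bits(0x15, 5, 6, buf);
//     assert(read_bits(5, 6, buf) == 0x15);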
/**
* @brief Reverse bits in a byte.
*
* @param p The value to reverse.
*
* @return The reversed result.
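 *
 * For example, bitrev8(0x01) returns 0x80 and bitrev8(0xB4) returns 0x2D.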
*/
static inline int bitrev8(int p)
{
p = ((p & 0x0F) << 4) | ((p >> 4) & 0x0F);
p = ((p & 0x33) << 2) | ((p >> 2) & 0x33);
p = ((p & 0x55) << 1) | ((p >> 1) & 0x55);
return p;
}
/* See header for documentation. */
void symbolic_to_physical(
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
physical_compressed_block& pcb
) {
assert(scb.block_type != SYM_BTYPE_ERROR);
// Constant color block using UNORM16 colors
if (scb.block_type == SYM_BTYPE_CONST_U16)
{
// There is currently no attempt to coalesce larger void-extents
static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++)
{
pcb.data[i] = cbytes[i];
}
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
}
return;
}
// Constant color block using FP16 colors
if (scb.block_type == SYM_BTYPE_CONST_F16)
{
// There is currently no attempt to coalesce larger void-extents
static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++)
{
pcb.data[i] = cbytes[i];
}
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
}
return;
}
unsigned int partition_count = scb.partition_count;
// Compress the weights.
// They are encoded as an ordinary integer-sequence, then bit-reversed
uint8_t weightbuf[16] { 0 };
const auto& bm = bsd.get_block_mode(scb.block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int weight_count = di.weight_count;
quant_method weight_quant_method = bm.get_weight_quant_mode();
float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method));
int is_dual_plane = bm.is_dual_plane;
const auto& qat = quant_and_xfer_tables[weight_quant_method];
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
uint8_t weights[64];
if (is_dual_plane)
{
for (int i = 0; i < weight_count; i++)
{
float uqw = static_cast<float>(scb.weights[i]);
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
int qwi = static_cast<int>(qw + 0.5f);
weights[2 * i] = qat.scramble_map[qwi];
uqw = static_cast<float>(scb.weights[i + WEIGHTS_PLANE2_OFFSET]);
qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
qwi = static_cast<int>(qw + 0.5f);
weights[2 * i + 1] = qat.scramble_map[qwi];
}
}
else
{
for (int i = 0; i < weight_count; i++)
{
float uqw = static_cast<float>(scb.weights[i]);
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
int qwi = static_cast<int>(qw + 0.5f);
weights[i] = qat.scramble_map[qwi];
}
}
encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
for (int i = 0; i < 16; i++)
{
pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
}
write_bits(scb.block_mode, 11, 0, pcb.data);
write_bits(partition_count - 1, 2, 11, pcb.data);
int below_weights_pos = 128 - bits_for_weights;
// Encode partition index and color endpoint types for blocks with 2+ partitions
if (partition_count > 1)
{
write_bits(scb.partition_index, 6, 13, pcb.data);
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb.data);
if (scb.color_formats_matched)
{
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
}
else
{
// Check endpoint types for each partition to determine the lowest class present
int low_class = 4;
for (unsigned int i = 0; i < partition_count; i++)
{
int class_of_format = scb.color_formats[i] >> 2;
low_class = astc::min(class_of_format, low_class);
}
if (low_class == 3)
{
low_class = 2;
}
int encoded_type = low_class + 1;
int bitpos = 2;
for (unsigned int i = 0; i < partition_count; i++)
{
int classbit_of_format = (scb.color_formats[i] >> 2) - low_class;
encoded_type |= classbit_of_format << bitpos;
bitpos++;
}
for (unsigned int i = 0; i < partition_count; i++)
{
int lowbits_of_format = scb.color_formats[i] & 3;
encoded_type |= lowbits_of_format << bitpos;
bitpos += 2;
}
int encoded_type_lowpart = encoded_type & 0x3F;
int encoded_type_highpart = encoded_type >> 6;
int encoded_type_highpart_size = (3 * partition_count) - 4;
int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb.data);
below_weights_pos -= encoded_type_highpart_size;
}
}
else
{
write_bits(scb.color_formats[0], 4, 13, pcb.data);
}
// In dual-plane mode, encode the color component of the second plane of weights
if (is_dual_plane)
{
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb.data);
}
// Encode the color components
uint8_t values_to_encode[32];
int valuecount_to_encode = 0;
const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
for (unsigned int i = 0; i < scb.partition_count; i++)
{
int vals = 2 * (scb.color_formats[i] >> 2) + 2;
assert(vals <= 8);
for (int j = 0; j < vals; j++)
{
values_to_encode[j + valuecount_to_encode] = pack_table[scb.color_values[i][j]];
}
valuecount_to_encode += vals;
}
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb.data,
scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
}
/* See header for documentation. */
void physical_to_symbolic(
const block_size_descriptor& bsd,
const physical_compressed_block& pcb,
symbolic_compressed_block& scb
) {
uint8_t bswapped[16];
scb.block_type = SYM_BTYPE_NONCONST;
// Extract header fields
int block_mode = read_bits(11, 0, pcb.data);
if ((block_mode & 0x1FF) == 0x1FC)
{
// Constant color block
// Check what format the data has
if (block_mode & 0x200)
{
scb.block_type = SYM_BTYPE_CONST_F16;
}
else
{
scb.block_type = SYM_BTYPE_CONST_U16;
}
scb.partition_count = 0;
for (int i = 0; i < 4; i++)
{
scb.constant_color[i] = pcb.data[2 * i + 8] | (pcb.data[2 * i + 9] << 8);
}
// Additionally, check that the void-extent coordinates are valid
if (bsd.zdim == 1)
{
// 2D void-extent
int rsvbits = read_bits(2, 10, pcb.data);
if (rsvbits != 3)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
int vx_low_s = read_bits(8, 12, pcb.data) | (read_bits(5, 12 + 8, pcb.data) << 8);
int vx_high_s = read_bits(8, 25, pcb.data) | (read_bits(5, 25 + 8, pcb.data) << 8);
int vx_low_t = read_bits(8, 38, pcb.data) | (read_bits(5, 38 + 8, pcb.data) << 8);
int vx_high_t = read_bits(8, 51, pcb.data) | (read_bits(5, 51 + 8, pcb.data) << 8);
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
}
else
{
// 3D void-extent
int vx_low_s = read_bits(9, 10, pcb.data);
int vx_high_s = read_bits(9, 19, pcb.data);
int vx_low_t = read_bits(9, 28, pcb.data);
int vx_high_t = read_bits(9, 37, pcb.data);
int vx_low_p = read_bits(9, 46, pcb.data);
int vx_high_p = read_bits(9, 55, pcb.data);
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
}
return;
}
unsigned int packed_index = bsd.block_mode_packed_index[block_mode];
if (packed_index == BLOCK_BAD_BLOCK_MODE)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
const auto& bm = bsd.get_block_mode(block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int weight_count = di.weight_count;
promise(weight_count > 0);
quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
int is_dual_plane = bm.is_dual_plane;
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
int partition_count = read_bits(2, 11, pcb.data) + 1;
promise(partition_count > 0);
scb.block_mode = static_cast<uint16_t>(block_mode);
scb.partition_count = static_cast<uint8_t>(partition_count);
for (int i = 0; i < 16; i++)
{
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb.data[15 - i]));
}
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
int below_weights_pos = 128 - bits_for_weights;
uint8_t indices[64];
const auto& qat = quant_and_xfer_tables[weight_quant_method];
decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
if (is_dual_plane)
{
for (int i = 0; i < weight_count; i++)
{
scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]];
scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]];
}
}
else
{
for (int i = 0; i < weight_count; i++)
{
scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]];
}
}
if (is_dual_plane && partition_count == 4)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
scb.color_formats_matched = 0;
// Determine the format of each endpoint pair
int color_formats[BLOCK_MAX_PARTITIONS];
int encoded_type_highpart_size = 0;
if (partition_count == 1)
{
color_formats[0] = read_bits(4, 13, pcb.data);
scb.partition_index = 0;
}
else
{
encoded_type_highpart_size = (3 * partition_count) - 4;
below_weights_pos -= encoded_type_highpart_size;
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pcb.data) << 6);
int baseclass = encoded_type & 0x3;
if (baseclass == 0)
{
for (int i = 0; i < partition_count; i++)
{
color_formats[i] = (encoded_type >> 2) & 0xF;
}
below_weights_pos += encoded_type_highpart_size;
scb.color_formats_matched = 1;
encoded_type_highpart_size = 0;
}
else
{
int bitpos = 2;
baseclass--;
for (int i = 0; i < partition_count; i++)
{
color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2;
bitpos++;
}
for (int i = 0; i < partition_count; i++)
{
color_formats[i] |= (encoded_type >> bitpos) & 3;
bitpos += 2;
}
}
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb.data) | (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb.data) << 6));
}
for (int i = 0; i < partition_count; i++)
{
scb.color_formats[i] = static_cast<uint8_t>(color_formats[i]);
}
// Determine number of color endpoint integers
int color_integer_count = 0;
for (int i = 0; i < partition_count; i++)
{
int endpoint_class = color_formats[i] >> 2;
color_integer_count += (endpoint_class + 1) * 2;
}
if (color_integer_count > 18)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
// Determine the color endpoint format to use
static const int color_bits_arr[5] { -1, 115 - 4, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS };
int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size;
if (is_dual_plane)
{
color_bits -= 2;
}
if (color_bits < 0)
{
color_bits = 0;
}
int color_quant_level = quant_mode_table[color_integer_count >> 1][color_bits];
if (color_quant_level < QUANT_6)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
// Unpack the integer color values and assign to endpoints
scb.quant_mode = static_cast<quant_method>(color_quant_level);
uint8_t values_to_decode[32];
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb.data,
values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
int valuecount_to_decode = 0;
const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
for (int i = 0; i < partition_count; i++)
{
int vals = 2 * (color_formats[i] >> 2) + 2;
for (int j = 0; j < vals; j++)
{
scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
}
valuecount_to_decode += vals;
}
// Fetch the component used by the second plane of weights when in dual-plane mode.
scb.plane2_component = -1;
if (is_dual_plane)
{
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data));
}
}

570
thirdparty/astcenc/astcenc_vecmathlib.h vendored Normal file

@ -0,0 +1,570 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/*
* This module implements vector support for floats, ints, and vector lane
* control masks. It provides access to both explicit vector width types, and
* flexible N-wide types where N can be determined at compile time.
*
* The design of this module encourages use of vector length agnostic code, via
* the vint, vfloat, and vmask types. These will take on the widest SIMD vector
 * width that is available at compile time. The current vector width is
* accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
*
* Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
* These are provided primarily for prototyping and algorithm debug of VLA
* implementations.
*
* Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
* types. These are provided for use by VLA code, but are also expected to be
 * used as a fixed-width type, and are backed by a reference C++ fallback for
* use on platforms without SIMD intrinsics.
*
* Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provided for use by VLA code, and are not expected to be
* used as a fixed-width type in normal code. No reference C implementation is
* provided on platforms without underlying SIMD intrinsics.
*
* With the current implementation ISA support is provided for:
*
* * 1-wide for scalar reference.
* * 4-wide for Armv8-A NEON.
* * 4-wide for x86-64 SSE2.
* * 4-wide for x86-64 SSE4.1.
* * 8-wide for x86-64 AVX2.
*/
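// A vector-length agnostic loop sketch (illustrative, not library code; the
// names data, sum and count are hypothetical, and remainder handling is
// omitted):
//
//     for (unsigned int i = 0; i < round_down_to_simd_multiple_vla(count); i += ASTCENC_SIMD_WIDTH)
//     {
//         vfloat value = loada(data + i);
//         haccumulate(sum, value);
//     }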
#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
#include <immintrin.h>
#elif ASTCENC_NEON != 0
#include <arm_neon.h>
#endif
#if !defined(__clang__) && defined(_MSC_VER)
#define ASTCENC_SIMD_INLINE __forceinline
#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#else
#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif
#if ASTCENC_AVX >= 2
/* If we have AVX2 expose 8-wide VLA. */
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"
#include "astcenc_vecmathlib_avx2_8.h"
#define ASTCENC_SIMD_WIDTH 8
using vfloat = vfloat8;
#if defined(ASTCENC_NO_INVARIANCE)
using vfloatacc = vfloat8;
#else
using vfloatacc = vfloat4;
#endif
using vint = vint8;
using vmask = vmask8;
constexpr auto loada = vfloat8::loada;
constexpr auto load1 = vfloat8::load1;
#elif ASTCENC_SSE >= 20
/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
#elif ASTCENC_NEON > 0
/* If we have NEON expose 4-wide VLA. */
#include "astcenc_vecmathlib_neon_4.h"
#include "astcenc_vecmathlib_common_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
#else
// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
// Note: We no longer expose the 1-wide scalar fallback because it is not
// invariant with the 4-wide path due to algorithms that use horizontal
// operations that accumulate a local vector sum before accumulating into
// a running sum.
//
// For 4 items adding into an accumulator using 1-wide vectors the sum is:
//
// result = ((((sum + l0) + l1) + l2) + l3)
//
// ... whereas the accumulator for a 4-wide vector sum is:
//
// result = sum + ((l0 + l2) + (l1 + l3))
//
// In "normal maths" this is the same, but the floating point reassociation
// differences mean that these will not produce the same result.
#include "astcenc_vecmathlib_none_4.h"
#include "astcenc_vecmathlib_common_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
#endif
/**
* @brief Round a count down to the largest multiple of 8.
*
* @param count The unrounded value.
*
* @return The rounded value.
*/
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
{
return count & static_cast<unsigned int>(~(8 - 1));
}
/**
* @brief Round a count down to the largest multiple of 4.
*
* @param count The unrounded value.
*
* @return The rounded value.
*/
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
{
return count & static_cast<unsigned int>(~(4 - 1));
}
/**
* @brief Round a count down to the largest multiple of the SIMD width.
*
 * Assumes that the vector width is a power of two.
*
* @param count The unrounded value.
*
* @return The rounded value.
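 *
 * For example, with ASTCENC_SIMD_WIDTH == 8 a count of 27 rounds down to 24.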
*/
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
{
return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
}
/**
* @brief Round a count up to the largest multiple of the SIMD width.
*
 * Assumes that the vector width is a power of two.
*
* @param count The unrounded value.
*
* @return The rounded value.
*/
ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
{
unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
return multiples * ASTCENC_SIMD_WIDTH;
}
/**
* @brief Return @c a with lanes negated if the @c b lane is negative.
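 *
 * For example, lanes a = (2.0f, -3.0f) with b = (-1.0f, 5.0f) give (-2.0f, -3.0f).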
*/
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
vint ia = float_as_int(a);
vint ib = float_as_int(b);
vint sign_mask(static_cast<int>(0x80000000));
vint r = ia ^ (ib & sign_mask);
return int_as_float(r);
}
/**
* @brief Return fast, but approximate, vector atan(x).
*
* Max error of this implementation is 0.004883.
*/
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
vmask c = abs(x) > vfloat(1.0f);
vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
vfloat y = select(x, vfloat(1.0f) / x, c);
y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
return select(y, z - y, c);
}
/**
* @brief Return fast, but approximate, vector atan2(x, y).
*/
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
vfloat z = atan(abs(y / x));
vmask xmask = vmask(float_as_int(x).m);
return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
}
/**
* @brief Factory that returns a unit length 4 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
return vfloat4(0.5f);
}
/**
* @brief Factory that returns a unit length 3 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
float val = 0.577350258827209473f;
return vfloat4(val, val, val, 0.0f);
}
/**
* @brief Factory that returns a unit length 2 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
float val = 0.707106769084930420f;
return vfloat4(val, val, 0.0f, 0.0f);
}
/**
* @brief Factory that returns a 3 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
return vfloat4(a, b, c, 0.0f);
}
/**
* @brief Factory that returns a 2 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
return vfloat4(a, b, 0.0f, 0.0f);
}
/**
* @brief Normalize a non-zero length vector to unit length.
*/
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
vfloat4 length = dot(a, a);
return a / sqrt(length);
}
/**
* @brief Normalize a vector, returning @c safe if len is zero.
*/
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
vfloat4 length = dot(a, a);
if (length.lane<0>() != 0.0f)
{
return a / sqrt(length);
}
return safe;
}
#define POLY0(x, c0) ( c0)
#define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0)
#define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0)
#define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0)
#define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0)
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
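// The POLYn macros expand to a Horner-scheme evaluation; for example,
// POLY2(x, c0, c1, c2) evaluates ((c2 * x) + c1) * x + c0.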
/**
* @brief Compute an approximate exp2(x) for each lane in the vector.
*
* Based on 5th degree minimax polynomials, ported from this blog
* https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
*/
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
x = clamp(-126.99999f, 129.0f, x);
vint4 ipart = float_to_int(x - 0.5f);
vfloat4 fpart = x - int_to_float(ipart);
// Integer contrib, using 1 << ipart
vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
vfloat4 fexp = POLY5(fpart,
9.9999994e-1f,
6.9315308e-1f,
2.4015361e-1f,
5.5826318e-2f,
8.9893397e-3f,
1.8775767e-3f);
return iexp * fexp;
}
/**
* @brief Compute an approximate log2(x) for each lane in the vector.
*
* Based on 5th degree minimax polynomials, ported from this blog
* https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
*/
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
vint4 exp(0x7F800000);
vint4 mant(0x007FFFFF);
vint4 one(0x3F800000);
vint4 i = float_as_int(x);
vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
vfloat4 m = int_as_float((i & mant) | one);
// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
vfloat4 p = POLY4(m,
2.8882704548164776201f,
-2.52074962577807006663f,
1.48116647521213171641f,
-0.465725644288844778798f,
0.0596515482674574969533f);
// Increases the polynomial degree, but ensures that log2(1) == 0
p = p * (m - 1.0f);
return p + e;
}
/**
* @brief Compute an approximate pow(x, y) for each lane in the vector.
*
* Power function based on the exp2(log2(x) * y) transform.
*/
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
vmask4 zero_mask = y == vfloat4(0.0f);
vfloat4 estimate = exp2(log2(x) * y);
// Guarantee that y == 0 returns exactly 1.0f
return select(estimate, vfloat4(1.0f), zero_mask);
}
/**
* @brief Count the leading zeros for each lane in @c a.
*
* Valid for all data values of @c a; will return a per-lane value [0, 32].
*/
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
// This function is a horrible abuse of floating point exponents to convert
// the original integer value into a 2^N encoding we can recover easily.
// Convert to float without risk of rounding up by keeping only top 8 bits.
	// This trick is guaranteed to keep the top 8 bits and clear the 9th.
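	// For example, an input lane of 1 converts to 1.0f (biased exponent 127),
	// giving (127 + 31) - 127 = 31; an input of 0 gives 158, clamped to 32.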
a = (~lsr<8>(a)) & a;
a = float_as_int(int_to_float(a));
// Extract and unbias exponent
a = vint4(127 + 31) - lsr<23>(a);
// Clamp result to a valid 32-bit range
return clamp(0, 32, a);
}
/**
* @brief Return lanewise 2^a for each lane in @c a.
*
* Use of signed int means that this is only valid for values in range [0, 31].
*/
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
	// 2^30 is the largest power of two that can be represented in a signed int
assert(all(a < vint4(31)));
// This function is a horrible abuse of floating point to use the exponent
// and float conversion to generate a 2^N multiple.
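	// For example, a lane value of 5 biases to exponent 132; reinterpreted as
	// a float this is 32.0f, which converts back to the integer 32 == 2^5.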
// Bias the exponent
vint4 exp = a + 127;
exp = lsl<23>(exp);
// Reinterpret the bits as a float, and then convert to an int
vfloat4 f = int_as_float(exp);
return float_to_int(f);
}
/**
* @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
*/
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
vint4 fp16_one = vint4(0x3C00);
vint4 fp16_small = lsl<8>(p);
vmask4 is_one = p == vint4(0xFFFF);
vmask4 is_small = p < vint4(4);
// Manually inline clz() on Visual Studio to avoid release build codegen bug
// see https://github.com/ARM-software/astc-encoder/issues/259
#if !defined(__clang__) && defined(_MSC_VER)
vint4 a = (~lsr<8>(p)) & p;
a = float_as_int(int_to_float(a));
a = vint4(127 + 31) - lsr<23>(a);
vint4 lz = clamp(0, 32, a) - 16;
#else
vint4 lz = clz(p) - 16;
#endif
p = p * two_to_the_n(lz + 1);
p = p & vint4(0xFFFF);
p = lsr<6>(p);
p = p | lsl<10>(vint4(14) - lz);
vint4 r = select(p, fp16_one, is_one);
r = select(r, fp16_small, is_small);
return r;
}
/**
* @brief Convert 16-bit LNS to float16.
*/
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
vint4 mc = p & 0x7FF;
vint4 ec = lsr<11>(p);
vint4 mc_512 = mc * 3;
vmask4 mask_512 = mc < vint4(512);
vint4 mc_1536 = mc * 4 - 512;
vmask4 mask_1536 = mc < vint4(1536);
vint4 mc_else = mc * 5 - 2048;
vint4 mt = mc_else;
mt = select(mt, mc_1536, mask_1536);
mt = select(mt, mc_512, mask_512);
vint4 res = lsl<10>(ec) | lsr<3>(mt);
return min(res, vint4(0x7BFF));
}
/**
* @brief Extract mantissa and exponent of a float value.
*
* @param a The input value.
* @param[out] exp The output exponent.
*
* @return The mantissa.
*/
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
// Interpret the bits as an integer
vint4 ai = float_as_int(a);
// Extract and unbias the exponent
exp = (lsr<23>(ai) & 0xFF) - 126;
// Extract and unbias the mantissa
vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
return int_as_float(manti);
}
/**
* @brief Convert float to 16-bit LNS.
*/
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
vint4 exp;
vfloat4 mant = frexp(a, exp);
// Do these early before we start messing about ...
vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
vmask4 mask_infinity = a >= vfloat4(65536.0f);
// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
vmask4 exp_lt_m13 = exp < vint4(-13);
vfloat4 a1a = a * 33554432.0f;
vint4 expa = vint4::zero();
vfloat4 a1b = (mant - 0.5f) * 4096;
vint4 expb = exp + 14;
a = select(a1b, a1a, exp_lt_m13);
exp = select(expb, expa, exp_lt_m13);
vmask4 a_lt_384 = a < vfloat4(384.0f);
vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
vfloat4 a2a = a * (4.0f / 3.0f);
vfloat4 a2b = a + 128.0f;
vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
a = a2c;
a = select(a, a2b, a_lt_1408);
a = select(a, a2a, a_lt_384);
a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
a = select(a, vfloat4(65535.0f), mask_infinity);
a = select(a, vfloat4::zero(), mask_underflow_nan);
return a;
}
namespace astc
{
static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
return pow(vfloat4(x), vfloat4(y)).lane<0>();
}
}
#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED

File diff suppressed because it is too large

thirdparty/astcenc/astcenc_vecmathlib_common_4.h vendored Normal file

@ -0,0 +1,423 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Generic 4x32-bit vector functions.
*
* This module implements generic 4-wide vector functions that are valid for
* all instruction sets, typically implemented using lower level 4-wide
* operations that are ISA-specific.
*/
#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
#ifndef ASTCENC_SIMD_INLINE
#error "Include astcenc_vecmathlib.h, do not include directly"
#endif
#include <cstdio>
// ============================================================================
// vmask4 operators and functions
// ============================================================================
/**
* @brief True if any lanes are enabled, false otherwise.
*/
ASTCENC_SIMD_INLINE bool any(vmask4 a)
{
return mask(a) != 0;
}
/**
* @brief True if all lanes are enabled, false otherwise.
*/
ASTCENC_SIMD_INLINE bool all(vmask4 a)
{
return mask(a) == 0xF;
}
// ============================================================================
// vint4 operators and functions
// ============================================================================
/**
* @brief Overload: vector by scalar addition.
*/
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
{
return a + vint4(b);
}
/**
* @brief Overload: vector by vector incremental addition.
*/
ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
{
a = a + b;
return a;
}
/**
* @brief Overload: vector by scalar subtraction.
*/
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
{
return a - vint4(b);
}
/**
* @brief Overload: vector by scalar multiplication.
*/
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
{
return a * vint4(b);
}
/**
* @brief Overload: vector by scalar bitwise or.
*/
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
{
return a | vint4(b);
}
/**
* @brief Overload: vector by scalar bitwise and.
*/
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
{
return a & vint4(b);
}
/**
* @brief Overload: vector by scalar bitwise xor.
*/
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
{
return a ^ vint4(b);
}
/**
* @brief Return the clamped value between min and max.
*/
ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
{
return min(max(a, vint4(minv)), vint4(maxv));
}
/**
* @brief Return the horizontal sum of RGB vector lanes as a scalar.
*/
ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
{
return a.lane<0>() + a.lane<1>() + a.lane<2>();
}
// ============================================================================
// vfloat4 operators and functions
// ============================================================================
/**
* @brief Overload: vector by vector incremental addition.
*/
ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
{
a = a + b;
return a;
}
/**
* @brief Overload: vector by scalar addition.
*/
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
{
return a + vfloat4(b);
}
/**
* @brief Overload: vector by scalar subtraction.
*/
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
{
return a - vfloat4(b);
}
/**
* @brief Overload: vector by scalar multiplication.
*/
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
{
return a * vfloat4(b);
}
/**
* @brief Overload: scalar by vector multiplication.
*/
ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
{
return vfloat4(a) * b;
}
/**
* @brief Overload: vector by scalar division.
*/
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
{
return a / vfloat4(b);
}
/**
* @brief Overload: scalar by vector division.
*/
ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
{
return vfloat4(a) / b;
}
/**
* @brief Return the min vector of a vector and a scalar.
*
* If either lane value is NaN, @c b will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
{
return min(a, vfloat4(b));
}
/**
* @brief Return the max vector of a vector and a scalar.
*
* If either lane value is NaN, @c b will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
{
return max(a, vfloat4(b));
}
/**
* @brief Return the clamped value between min and max.
*
 * It is assumed that neither @c min nor @c max is a NaN value. If @c a is NaN
* then @c min will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
{
// Do not reorder - the second operand is returned if either input is NaN
return min(max(a, minv), maxv);
}
/**
* @brief Return the clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
{
// Do not reorder - the second operand is returned if either input is NaN
return min(max(a, vfloat4::zero()), maxv);
}
/**
* @brief Return the clamped value between 0.0f and 1.0f.
*
* If @c a is NaN then zero will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
{
// Do not reorder - the second operand is returned if either input is NaN
return min(max(a, vfloat4::zero()), 1.0f);
}
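/**
 * @brief Illustrative sketch, not part of the upstream library.
 *
 * Demonstrates why the operand ordering above matters: min() and max()
 * return the second operand for a NaN lane, so a NaN input to clampzo()
 * falls through to zero instead of propagating. The function name and the
 * runtime-NaN construction are editorial additions.
 */
ASTCENC_SIMD_INLINE float clampzo_nan_example()
{
float zero = 0.0f;       // Runtime zero so 0.0f / 0.0f is not constant-folded
vfloat4 a(0.0f / zero, -0.5f, 0.5f, 2.0f);
vfloat4 r = clampzo(a);  // Lanes: 0.0, 0.0, 0.5, 1.0
return r.lane<0>();      // 0.0f, not NaN
}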
/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
{
return hmin(a).lane<0>();
}
/**
* @brief Return the horizontal min of RGB vector lanes as a scalar.
*/
ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
{
a.set_lane<3>(a.lane<0>());
return hmin_s(a);
}
/**
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
{
return hmax(a).lane<0>();
}
/**
* @brief Accumulate lane-wise sums for a vector.
*/
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
{
accum = accum + a;
}
/**
* @brief Accumulate lane-wise sums for a masked vector.
*/
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
{
a = select(vfloat4::zero(), a, m);
haccumulate(accum, a);
}
/**
* @brief Return the horizontal sum of RGB vector lanes as a scalar.
*/
ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
{
return a.lane<0>() + a.lane<1>() + a.lane<2>();
}
#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
/**
* @brief Return the dot product for the full 4 lanes, returning scalar.
*/
ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
{
vfloat4 m = a * b;
return hadd_s(m);
}
/**
* @brief Return the dot product for the full 4 lanes, returning vector.
*/
ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
{
vfloat4 m = a * b;
return vfloat4(hadd_s(m));
}
/**
* @brief Return the dot product for the bottom 3 lanes, returning scalar.
*/
ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
{
vfloat4 m = a * b;
return hadd_rgb_s(m);
}
/**
* @brief Return the dot product for the bottom 3 lanes, returning vector.
*/
ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
{
vfloat4 m = a * b;
float d3 = hadd_rgb_s(m);
return vfloat4(d3, d3, d3, 0.0f);
}
#endif
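/**
 * @brief Illustrative sketch, not part of the upstream library.
 *
 * A worked example of the dot product reductions above:
 * dot_s((1,2,3,4), (4,3,2,1)) sums 4 + 6 + 6 + 4 = 20, while dot3_s()
 * drops lane 3 and yields 16. The function name is an editorial addition.
 */
ASTCENC_SIMD_INLINE float dot_example()
{
vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b(4.0f, 3.0f, 2.0f, 1.0f);
float full = dot_s(a, b);  // 20.0f
float rgb = dot3_s(a, b);  // 16.0f
return full - rgb;         // 4.0f, the lane 3 contribution
}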
#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
/**
* @brief Population bit count.
*
 * @param v The value whose set bits are counted.
*
* @return The number of 1 bits.
*/
static inline int popcount(uint64_t v)
{
uint64_t mask1 = 0x5555555555555555ULL;
uint64_t mask2 = 0x3333333333333333ULL;
uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;
v -= (v >> 1) & mask1;
v = (v & mask2) + ((v >> 2) & mask2);
v += v >> 4;
v &= mask3;
v *= 0x0101010101010101ULL;
v >>= 56;
return static_cast<int>(v);
}
#endif
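/**
 * @brief Illustrative sketch, not part of the upstream library.
 *
 * Cross-checks the SWAR popcount above against a naive bit loop; both
 * count 8 set bits in 0xF0F0, for example. The function name is an
 * editorial addition.
 */
static inline bool popcount_matches_naive(uint64_t v)
{
int naive = 0;
for (uint64_t t = v; t != 0; t >>= 1)
{
naive += static_cast<int>(t & 1);
}
return popcount(v) == naive;
}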
/**
* @brief Apply signed bit transfer.
*
* @param input0 The first encoded endpoint.
* @param input1 The second encoded endpoint.
*/
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
vint4& input0,
vint4& input1
) {
input1 = lsr<1>(input1) | (input0 & 0x80);
input0 = lsr<1>(input0) & 0x3F;
vmask4 mask = (input0 & 0x20) != vint4::zero();
input0 = select(input0, input0 - 0x40, mask);
}
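/**
 * @brief Illustrative sketch, not part of the upstream library.
 *
 * One lane worked through bit_transfer_signed(): for input0 = 254 and
 * input1 = 64, input0 becomes the sign-extended 6-bit field 0x3F = -1,
 * and input1 becomes (64 >> 1) | 0x80 = 160, carrying input0's top bit.
 * The function name is an editorial addition.
 */
ASTCENC_SIMD_INLINE void bit_transfer_signed_example()
{
vint4 input0(254);
vint4 input1(64);
bit_transfer_signed(input0, input1);
printf("%d %d\n", input0.lane<0>(), input1.lane<0>()); // Prints "-1 160"
}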
/**
* @brief Debug function to print a vector of ints.
*/
ASTCENC_SIMD_INLINE void print(vint4 a)
{
alignas(16) int v[4];
storea(a, v);
printf("v4_i32:\n %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3]);
}
/**
 * @brief Debug function to print a vector of ints in hexadecimal.
*/
ASTCENC_SIMD_INLINE void printx(vint4 a)
{
alignas(16) int v[4];
storea(a, v);
printf("v4_i32:\n %08x %08x %08x %08x\n",
v[0], v[1], v[2], v[3]);
}
/**
* @brief Debug function to print a vector of floats.
*/
ASTCENC_SIMD_INLINE void print(vfloat4 a)
{
alignas(16) float v[4];
storea(a, v);
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]),
static_cast<double>(v[2]), static_cast<double>(v[3]));
}
/**
 * @brief Debug function to print a vector mask as 0/1 lane values.
*/
ASTCENC_SIMD_INLINE void print(vmask4 a)
{
print(select(vint4(0), vint4(1), a));
}
#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,479 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Functions for angular-sum algorithm for weight alignment.
*
* This algorithm works as follows:
* - we compute a complex number P as (cos s*i, sin s*i) for each weight,
* where i is the input value and s is a scaling factor based on the spacing between the weights.
* - we then add together complex numbers for all the weights.
* - we then compute the length and angle of the resulting sum.
*
* This should produce the following results:
* - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
* - even distribution results in a vector of length 0.
* - all samples identical results in perfect alignment for every scaling.
*
* For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
* should then result in some scalings standing out as having particularly good alignment factors;
* we can use this to produce a set of candidate scale/shift values for various quantization levels;
 * these candidates are then tried in full to select the best one (see the
 * scalar sketch after prepare_angular_tables() below).
*/
#include "astcenc_internal.h"
#include "astcenc_vecmathlib.h"
#include <cstdio>
#include <cassert>
#include <cstring>
static constexpr unsigned int ANGULAR_STEPS { 32 };
static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
"ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
static_assert(ANGULAR_STEPS >= 32,
"ANGULAR_STEPS must be at least max(steps_for_quant_level)");
// Store a reduced sin/cos table for 64 possible weight values; this causes
// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
static constexpr unsigned int SINCOS_STEPS { 64 };
static const uint8_t steps_for_quant_level[12] {
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
};
alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
#if defined(ASTCENC_DIAGNOSTICS)
static bool print_once { true };
#endif
/* See header for documentation. */
void prepare_angular_tables()
{
for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
{
float angle_step = static_cast<float>(i + 1);
for (unsigned int j = 0; j < SINCOS_STEPS; j++)
{
sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
}
}
}
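/**
 * @brief Illustrative scalar sketch, not part of the upstream library.
 *
 * The angular-sum idea from the header comment, without SIMD or the
 * precomputed tables: sum a unit vector (cos, sin) per weight for one
 * candidate step count and measure the length of the result. A length
 * near @c count means the weights align well with that grid spacing;
 * a length near zero means they are evenly smeared across it. The
 * function name is an editorial addition.
 */
static float angular_alignment_sketch(const float* weights, unsigned int count, float steps)
{
float sum_x = 0.0f;
float sum_y = 0.0f;
for (unsigned int i = 0; i < count; i++)
{
float angle = 2.0f * astc::PI * weights[i] * steps;
sum_x += cosf(angle);
sum_y += sinf(angle);
}
return sqrtf(sum_x * sum_x + sum_y * sum_y);
}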
/**
* @brief Compute the angular alignment factors and offsets.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_angular_steps The maximum number of steps to be tested.
* @param[out] offsets The output angular offsets array.
*/
static void compute_angular_offsets(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_angular_steps,
float* offsets
) {
promise(weight_count > 0);
promise(max_angular_steps > 0);
alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
// Precompute isample; arrays are always allocated 64 elements long
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
// Adding 2^23 and reinterpreting the bits extracts the round-to-nearest int
vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f);
vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1));
storea(isample, isamplev + i);
}
// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
vfloat mult = vfloat(1.0f / (2.0f * astc::PI));
for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
{
vfloat anglesum_x = vfloat::zero();
vfloat anglesum_y = vfloat::zero();
for (unsigned int j = 0; j < weight_count; j++)
{
int isample = isamplev[j];
anglesum_x += loada(cos_table[isample] + i);
anglesum_y += loada(sin_table[isample] + i);
}
vfloat angle = atan2(anglesum_y, anglesum_x);
vfloat ofs = angle * mult;
storea(ofs, offsets + i);
}
}
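/**
 * @brief Illustrative sketch, not part of the upstream library.
 *
 * The 2^23 rounding trick used above, in scalar form: adding 12582912.0f
 * (1.5 * 2^23) pins the float exponent so the low mantissa bits hold the
 * round-to-nearest integer, e.g. 2.6f -> 3. The function name is an
 * editorial addition.
 */
static inline int round_to_int_bias_sketch(float v)
{
float biased = v + 12582912.0f;
int bits;
std::memcpy(&bits, &biased, sizeof(bits)); // Reinterpret without UB
return bits & 0xFFFF; // Valid for small non-negative v
}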
/**
* @brief For a given step size compute the lowest and highest weight.
*
* Compute the lowest and highest weight that results from quantizing using the given stepsize and
* offset, and then compute the resulting error. The cut errors indicate the error that results from
 * forcing the samples at the lowest or highest weight value one step up or down.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_angular_steps The maximum number of steps to be tested.
* @param max_quant_steps The maximum quantization level to be tested.
* @param offsets The angular offsets array.
* @param[out] lowest_weight Per angular step, the lowest weight.
* @param[out] weight_span Per angular step, the span between lowest and highest weight.
* @param[out] error Per angular step, the error.
* @param[out] cut_low_weight_error Per angular step, the low weight cut error.
* @param[out] cut_high_weight_error Per angular step, the high weight cut error.
*/
static void compute_lowest_and_highest_weight(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_angular_steps,
unsigned int max_quant_steps,
const float* offsets,
float* lowest_weight,
int* weight_span,
float* error,
float* cut_low_weight_error,
float* cut_high_weight_error
) {
promise(weight_count > 0);
promise(max_angular_steps > 0);
vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
{
vfloat minidx(128.0f);
vfloat maxidx(-128.0f);
vfloat errval = vfloat::zero();
vfloat cut_low_weight_err = vfloat::zero();
vfloat cut_high_weight_err = vfloat::zero();
vfloat offset = loada(offsets + sp);
for (unsigned int j = 0; j < weight_count; j++)
{
vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
vfloat svalrte = round(sval);
vfloat diff = sval - svalrte;
errval += diff * diff;
// Reset tracker on min hit
vmask mask = svalrte < minidx;
minidx = select(minidx, svalrte, mask);
cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
// Accumulate on min hit
mask = svalrte == minidx;
vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
cut_low_weight_err = select(cut_low_weight_err, accum, mask);
// Reset tracker on max hit
mask = svalrte > maxidx;
maxidx = select(maxidx, svalrte, mask);
cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
// Accumulate on max hit
mask = svalrte == maxidx;
accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
cut_high_weight_err = select(cut_high_weight_err, accum, mask);
}
// Write out min weight and weight span; clamp span to a usable range
vint span = float_to_int(maxidx - minidx + vfloat(1));
span = min(span, vint(max_quant_steps + 3));
span = max(span, vint(2));
storea(minidx, lowest_weight + sp);
storea(span, weight_span + sp);
// cut_low_weight_err and cut_high_weight_err hold the error that results from
// forcing the samples at the lowest/highest weight one step up or down.
vfloat ssize = 1.0f / rcp_stepsize;
vfloat errscale = ssize * ssize;
storea(errval * errscale, error + sp);
storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
}
}
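/**
 * @brief Illustrative note, not part of the upstream library.
 *
 * The cut error increments above follow from expanding the squared error:
 * a sample sitting diff away from its rounded grid point contributes
 * diff^2. Forcing the lowest samples one grid step up gives (diff - 1)^2,
 * a delta of 1 - 2 * diff; forcing the highest samples one step down gives
 * (diff + 1)^2, a delta of 1 + 2 * diff. These are exactly the per-lane
 * increments accumulated in the loop. The function name is an editorial
 * addition.
 */
static inline float cut_error_delta_sketch(float diff, bool cut_high)
{
return cut_high ? (1.0f + 2.0f * diff) : (1.0f - 2.0f * diff);
}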
/**
* @brief The main function for the angular algorithm.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_quant_level The maximum quantization level to be tested.
* @param[out] low_value Per angular step, the lowest weight value.
* @param[out] high_value Per angular step, the highest weight value.
*/
static void compute_angular_endpoints_for_quant_levels(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_quant_level,
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
) {
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
compute_angular_offsets(weight_count, dec_weight_ideal_value,
max_angular_steps, angular_offsets);
alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS];
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
max_angular_steps, max_quant_steps,
angular_offsets, lowest_weight, weight_span, error,
cut_low_weight_error, cut_high_weight_error);
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
// branches can become selects. This involves some integer to float casts, but the values are
 * small enough that they never round the wrong way.
vfloat4 best_results[36];
// Initialize the array to some safe defaults
promise(max_quant_steps > 0);
for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
{
// Lane<0> = Best error
// Lane<1> = Best scale; -1 indicates no solution found
// Lane<2> = Cut low weight
best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
}
promise(max_angular_steps > 0);
for (unsigned int i = 0; i < max_angular_steps; i++)
{
float i_flt = static_cast<float>(i);
int idx_span = weight_span[i];
float error_cut_low = error[i] + cut_low_weight_error[i];
float error_cut_high = error[i] + cut_high_weight_error[i];
float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
// Check best error against record N
vfloat4 best_result = best_results[idx_span];
vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
best_results[idx_span] = select(best_result, new_result, mask);
// Check best error against record N-1 with either cut low or cut high
best_result = best_results[idx_span - 1];
new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
best_result = select(best_result, new_result, mask);
new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
best_results[idx_span - 1] = select(best_result, new_result, mask);
// Check best error against record N-2 with both cut low and high
best_result = best_results[idx_span - 2];
new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
best_results[idx_span - 2] = select(best_result, new_result, mask);
}
for (unsigned int i = 0; i <= max_quant_level; i++)
{
unsigned int q = steps_for_quant_level[i];
int bsi = static_cast<int>(best_results[q].lane<1>());
// Did we find anything?
#if defined(ASTCENC_DIAGNOSTICS)
if ((bsi < 0) && print_once)
{
print_once = false;
printf("INFO: Unable to find full encoding within search error limit.\n\n");
}
#endif
bsi = astc::max(0, bsi);
float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
float hwi = lwi + static_cast<float>(q) - 1.0f;
float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
low_value[i] = (angular_offsets[bsi] + lwi) * stepsize;
high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
}
}
/* See header for documentation. */
void compute_angular_endpoints_1plane(
bool only_always,
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
unsigned int max_weight_quant,
compression_working_buffers& tmpbuf
) {
float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
: bsd.decimation_mode_count_selected;
promise(max_decimation_modes > 0);
for (unsigned int i = 0; i < max_decimation_modes; i++)
{
const decimation_mode& dm = bsd.decimation_modes[i];
if (!dm.is_ref_1_plane(static_cast<quant_method>(max_weight_quant)))
{
continue;
}
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
unsigned int max_precision = dm.maxprec_1plane;
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
{
max_precision = TUNE_MAX_ANGULAR_QUANT;
}
if (max_precision > max_weight_quant)
{
max_precision = max_weight_quant;
}
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values[i], high_values[i]);
}
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
: bsd.block_mode_count_1plane_selected;
promise(max_block_modes > 0);
for (unsigned int i = 0; i < max_block_modes; i++)
{
const block_mode& bm = bsd.block_modes[i];
assert(!bm.is_dual_plane);
unsigned int quant_mode = bm.quant_mode;
unsigned int decim_mode = bm.decimation_mode;
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
{
low_value[i] = low_values[decim_mode][quant_mode];
high_value[i] = high_values[decim_mode][quant_mode];
}
else
{
low_value[i] = 0.0f;
high_value[i] = 1.0f;
}
}
}
/* See header for documentation. */
void compute_angular_endpoints_2planes(
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
unsigned int max_weight_quant,
compression_working_buffers& tmpbuf
) {
float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
promise(bsd.decimation_mode_count_selected > 0);
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
{
const decimation_mode& dm = bsd.decimation_modes[i];
if (!dm.is_ref_2_plane(static_cast<quant_method>(max_weight_quant)))
{
continue;
}
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
unsigned int max_precision = dm.maxprec_2planes;
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
{
max_precision = TUNE_MAX_ANGULAR_QUANT;
}
if (max_precision > max_weight_quant)
{
max_precision = max_weight_quant;
}
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values1[i], high_values1[i]);
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
max_precision, low_values2[i], high_values2[i]);
}
unsigned int start = bsd.block_mode_count_1plane_selected;
unsigned int end = bsd.block_mode_count_1plane_2plane_selected;
for (unsigned int i = start; i < end; i++)
{
const block_mode& bm = bsd.block_modes[i];
unsigned int quant_mode = bm.quant_mode;
unsigned int decim_mode = bm.decimation_mode;
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
{
low_value1[i] = low_values1[decim_mode][quant_mode];
high_value1[i] = high_values1[decim_mode][quant_mode];
low_value2[i] = low_values2[decim_mode][quant_mode];
high_value2[i] = high_values2[decim_mode][quant_mode];
}
else
{
low_value1[i] = 0.0f;
high_value1[i] = 1.0f;
low_value2[i] = 0.0f;
high_value2[i] = 1.0f;
}
}
}
#endif

View File

@ -0,0 +1,147 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Data tables for quantization transfer.
*/
#include "astcenc_internal.h"
#define _ 0 // Using _ to indicate an entry that will not be used.
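// Editorial note, an assumed reading of the packed fourth member of each
// entry below: each 0xNNPP value sits at the index of an unquantized value,
// with the low byte PP holding the previous unquantized value and the high
// byte NN the next one. For example, in QUANT_3 the entry 0x4020 at index 64
// says the previous value is 0x20 (32) and the next is 0x40 (64, clamped).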
const quant_and_transfer_table quant_and_xfer_tables[12] {
// QUANT_2, range 0..1
{
{0, 64},
{0, 1},
{0, 64},
{0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
0x4000}
},
// QUANT_3, range 0..2
{
{0, 32, 64},
{0, 1, 2},
{0, 32, 64},
{0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,0x4020}
},
// QUANT_4, range 0..3
{
{0, 21, 43, 64},
{0, 1, 2, 3},
{0, 21, 43, 64},
{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,_,_,0x402b}
},
// QUANT_5, range 0..4
{
{0, 16, 32, 48, 64},
{0, 1, 2, 3, 4},
{0, 16, 32, 48, 64},
{0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,0x4030}
},
// QUANT_6, range 0..5
{
{0, 12, 25, 39, 52, 64},
{0, 2, 4, 5, 3, 1},
{0, 64, 12, 52, 25, 39},
{0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_,
0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
_,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
},
// QUANT_8, range 0..7
{
{0, 9, 18, 27, 37, 46, 55, 64},
{0, 1, 2, 3, 4, 5, 6, 7},
{0, 9, 18, 27, 37, 46, 55, 64},
{0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_,
_,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
},
// QUANT_10, range 0..9
{
{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
{0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_,
0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_,
_,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
_,0x4039}
},
// QUANT_12, range 0..11
{
{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
{0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_,
0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_,
0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
},
// QUANT_16, range 0..15
{
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
{0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_,
0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_,
_,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
_,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
},
// QUANT_20, range 0..19
{
{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
{0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_,
0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_,
0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_,
0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
},
// QUANT_24, range 0..23
{
{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
{0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08,
_,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_,
0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_,
0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d,
_,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
0x403b,_,0x403e}
},
// QUANT_32, range 0..31
{
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
{0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_,
0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_,
0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_,
0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_,
0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_,
0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_,
0x403c,_,0x403e}
}
};