Add ASTC compression and decompression with Arm astcenc.

Co-authored-by: Gordon A Macpherson <gordon.a.macpherson@gmail.com> Co-authored-by: Rémi Verschelde <rverschelde@gmail.com>
2024-11-24 21:22:48 +00:00 · 2022-12-20 10:54:01 -08:00 · 2022-12-20 10:54:01 -08:00 · 696346f4cc
commit 696346f4cc
parent 14fdd28de9
44 changed files with 29247 additions and 6 deletions
--- a/COPYRIGHT.txt
+++ b/COPYRIGHT.txt
@ -141,6 +141,11 @@ Comment: AMD FidelityFX Super Resolution
 Copyright: 2021, Advanced Micro Devices, Inc.
 License: Expat

+Files: ./thirdparty/astcenc/
+Comment: Arm ASTC Encoder
+Copyright: 2011-2023, Arm Limited
+License: Apache-2.0
+
 Files: ./thirdparty/basis_universal/
 Comment: Basis Universal
 Copyright: 2022, Binomial LLC.
--- a/modules/astcenc/SCsub
+++ b/modules/astcenc/SCsub
@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+Import("env")
+Import("env_modules")
+
+env_astcenc = env_modules.Clone()
+
+# Thirdparty source files
+
+thirdparty_obj = []
+
+thirdparty_dir = "#thirdparty/astcenc/"
+thirdparty_sources = [
+    "astcenc_averages_and_directions.cpp",
+    "astcenc_block_sizes.cpp",
+    "astcenc_color_quantize.cpp",
+    "astcenc_color_unquantize.cpp",
+    "astcenc_compress_symbolic.cpp",
+    "astcenc_compute_variance.cpp",
+    "astcenc_decompress_symbolic.cpp",
+    "astcenc_diagnostic_trace.cpp",
+    "astcenc_entry.cpp",
+    "astcenc_find_best_partitioning.cpp",
+    "astcenc_ideal_endpoints_and_weights.cpp",
+    "astcenc_image.cpp",
+    "astcenc_integer_sequence.cpp",
+    "astcenc_mathlib.cpp",
+    "astcenc_mathlib_softfloat.cpp",
+    "astcenc_partition_tables.cpp",
+    "astcenc_percentile_tables.cpp",
+    "astcenc_pick_best_endpoint_format.cpp",
+    "astcenc_platform_isa_detection.cpp",
+    "astcenc_quantization.cpp",
+    "astcenc_symbolic_physical.cpp",
+    "astcenc_weight_align.cpp",
+    "astcenc_weight_quant_xfer_tables.cpp",
+]
+thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
+
+env_astcenc.Prepend(CPPPATH=[thirdparty_dir])
+
+env_thirdparty = env_astcenc.Clone()
+env_thirdparty.disable_warnings()
+env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources)
+env.modules_sources += thirdparty_obj
+
+# Godot source files
+
+module_obj = []
+
+env_astcenc.add_source_files(module_obj, "*.cpp")
+env.modules_sources += module_obj
+
+# Needed to force rebuilding the module files when the thirdparty library is updated.
+env.Depends(module_obj, thirdparty_obj)
--- a/modules/astcenc/config.py
+++ b/modules/astcenc/config.py
@ -0,0 +1,6 @@
+def can_build(env, platform):
+    return env.editor_build
+
+
+def configure(env):
+    pass
--- a/modules/astcenc/image_compress_astcenc.cpp
+++ b/modules/astcenc/image_compress_astcenc.cpp
@ -0,0 +1,251 @@
+/**************************************************************************/
+/*  image_compress_astcenc.cpp                                            */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "image_compress_astcenc.h"
+
+#include "core/os/os.h"
+#include "core/string/print_string.h"
+
+#include <astcenc.h>
+
+void _compress_astc(Image *r_img, float p_lossy_quality, Image::ASTCFormat p_format) {
+	uint64_t start_time = OS::get_singleton()->get_ticks_msec();
+
+	// TODO: See how to handle lossy quality.
+
+	Image::Format img_format = r_img->get_format();
+	if (img_format >= Image::FORMAT_DXT1) {
+		return; // Do not compress, already compressed.
+	}
+
+	bool is_hdr = false;
+	if ((img_format >= Image::FORMAT_RH) && (img_format <= Image::FORMAT_RGBE9995)) {
+		is_hdr = true;
+		r_img->convert(Image::FORMAT_RGBAF);
+	} else {
+		r_img->convert(Image::FORMAT_RGBA8);
+	}
+
+	// Determine encoder output format from our enum.
+
+	Image::Format target_format = Image::FORMAT_RGBA8;
+	astcenc_profile profile = ASTCENC_PRF_LDR;
+	unsigned int block_x = 4;
+	unsigned int block_y = 4;
+
+	if (p_format == Image::ASTCFormat::ASTC_FORMAT_4x4) {
+		if (is_hdr) {
+			target_format = Image::FORMAT_ASTC_4x4_HDR;
+			profile = ASTCENC_PRF_HDR;
+		} else {
+			target_format = Image::FORMAT_ASTC_4x4;
+		}
+	} else if (p_format == Image::ASTCFormat::ASTC_FORMAT_8x8) {
+		if (is_hdr) {
+			target_format = Image::FORMAT_ASTC_8x8_HDR;
+			profile = ASTCENC_PRF_HDR;
+		} else {
+			target_format = Image::FORMAT_ASTC_8x8;
+		}
+		block_x = 8;
+		block_y = 8;
+	}
+
+	// Compress image data and (if required) mipmaps.
+
+	const bool mipmaps = r_img->has_mipmaps();
+	int width = r_img->get_width();
+	int height = r_img->get_height();
+
+	print_verbose(vformat("astcenc: Encoding image size %dx%d to format %s%s.", width, height, Image::get_format_name(target_format), mipmaps ? ", with mipmaps" : ""));
+
+	// Initialize astcenc.
+
+	astcenc_config config;
+	config.block_x = block_x;
+	config.block_y = block_y;
+	config.profile = profile;
+	const float quality = ASTCENC_PRE_MEDIUM;
+
+	astcenc_error status = astcenc_config_init(profile, block_x, block_y, block_x, quality, 0, &config);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: Configuration initialization failed: %s.", astcenc_get_error_string(status)));
+
+	// Context allocation.
+
+	astcenc_context *context;
+	const unsigned int thread_count = OS::get_singleton()->get_processor_count();
+
+	status = astcenc_context_alloc(&config, thread_count, &context);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: Context allocation failed: %s.", astcenc_get_error_string(status)));
+
+	// Compress image.
+
+	Vector<uint8_t> image_data = r_img->get_data();
+	uint8_t *slices = image_data.ptrw();
+
+	astcenc_image image;
+	image.dim_x = width;
+	image.dim_y = height;
+	image.dim_z = 1;
+	image.data_type = ASTCENC_TYPE_U8;
+	if (is_hdr) {
+		image.data_type = ASTCENC_TYPE_F32;
+	}
+	image.data = reinterpret_cast<void **>(&slices);
+
+	// Compute the number of ASTC blocks in each dimension.
+	unsigned int block_count_x = (width + block_x - 1) / block_x;
+	unsigned int block_count_y = (height + block_y - 1) / block_y;
+	size_t comp_len = block_count_x * block_count_y * 16;
+
+	Vector<uint8_t> compressed_data;
+	compressed_data.resize(comp_len);
+	compressed_data.fill(0);
+
+	const astcenc_swizzle swizzle = {
+		ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
+	};
+
+	status = astcenc_compress_image(context, &image, &swizzle, compressed_data.ptrw(), comp_len, 0);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: ASTC image compression failed: %s.", astcenc_get_error_string(status)));
+
+	// Replace original image with compressed one.
+
+	r_img->set_data(width, height, mipmaps, target_format, compressed_data);
+
+	print_verbose(vformat("astcenc: Encoding took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
+}
+
+void _decompress_astc(Image *r_img) {
+	uint64_t start_time = OS::get_singleton()->get_ticks_msec();
+
+	// Determine decompression parameters from image format.
+
+	Image::Format img_format = r_img->get_format();
+	bool is_hdr = false;
+	unsigned int block_x = 0;
+	unsigned int block_y = 0;
+	if (img_format == Image::FORMAT_ASTC_4x4) {
+		block_x = 4;
+		block_y = 4;
+		is_hdr = false;
+	} else if (img_format == Image::FORMAT_ASTC_4x4_HDR) {
+		block_x = 4;
+		block_y = 4;
+		is_hdr = true;
+	} else if (img_format == Image::FORMAT_ASTC_8x8) {
+		block_x = 8;
+		block_y = 8;
+		is_hdr = false;
+	} else if (img_format == Image::FORMAT_ASTC_8x8_HDR) {
+		block_x = 8;
+		block_y = 8;
+		is_hdr = true;
+	} else {
+		ERR_FAIL_MSG("astcenc: Cannot decompress Image with a non-ASTC format.");
+	}
+
+	// Initialize astcenc.
+
+	astcenc_profile profile = ASTCENC_PRF_LDR;
+	if (is_hdr) {
+		profile = ASTCENC_PRF_HDR;
+	}
+	astcenc_config config;
+	const float quality = ASTCENC_PRE_MEDIUM;
+
+	astcenc_error status = astcenc_config_init(profile, block_x, block_y, block_x, quality, 0, &config);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: Configuration initialization failed: %s.", astcenc_get_error_string(status)));
+
+	// Context allocation.
+
+	astcenc_context *context = nullptr;
+	const unsigned int thread_count = OS::get_singleton()->get_processor_count();
+
+	status = astcenc_context_alloc(&config, thread_count, &context);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: Context allocation failed: %s.", astcenc_get_error_string(status)));
+
+	// Decompress image.
+
+	const bool mipmaps = r_img->has_mipmaps();
+	int width = r_img->get_width();
+	int height = r_img->get_height();
+
+	astcenc_image image;
+	image.dim_x = width;
+	image.dim_y = height;
+	image.dim_z = 1;
+	image.data_type = ASTCENC_TYPE_U8;
+	Image::Format target_format = Image::FORMAT_RGBA8;
+	if (is_hdr) {
+		target_format = Image::FORMAT_RGBAF;
+		image.data_type = ASTCENC_TYPE_F32;
+	}
+
+	Vector<uint8_t> image_data = r_img->get_data();
+
+	Vector<uint8_t> new_image_data;
+	new_image_data.resize(Image::get_image_data_size(width, height, target_format, false));
+	new_image_data.fill(0);
+	uint8_t *slices = new_image_data.ptrw();
+	image.data = reinterpret_cast<void **>(&slices);
+
+	const astcenc_swizzle swizzle = {
+		ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
+	};
+
+	status = astcenc_decompress_image(context, image_data.ptr(), image_data.size(), &image, &swizzle, 0);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: ASTC decompression failed: %s.", astcenc_get_error_string(status)));
+	ERR_FAIL_COND_MSG(image.dim_z > 1,
+			"astcenc: ASTC decompression failed because this is a 3D texture, which is not supported.");
+
+	// Replace original image with compressed one.
+
+	Image::Format image_format = Image::FORMAT_RGBA8;
+	if (image.data_type == ASTCENC_TYPE_F32) {
+		image_format = Image::FORMAT_RGBAF;
+	} else if (image.data_type == ASTCENC_TYPE_U8) {
+		image_format = Image::FORMAT_RGBA8;
+	} else if (image.data_type == ASTCENC_TYPE_F16) {
+		image_format = Image::FORMAT_RGBAH;
+	} else {
+		ERR_FAIL_MSG("astcenc: ASTC decompression failed with an unknown format.");
+	}
+
+	r_img->set_data(image.dim_x, image.dim_y, mipmaps, image_format, new_image_data);
+
+	print_verbose(vformat("astcenc: Decompression took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
+}
--- a/modules/astcenc/image_compress_astcenc.h
+++ b/modules/astcenc/image_compress_astcenc.h
@ -0,0 +1,39 @@
+/**************************************************************************/
+/*  image_compress_astcenc.h                                              */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#ifndef IMAGE_COMPRESS_ASTCENC_H
+#define IMAGE_COMPRESS_ASTCENC_H
+
+#include "core/io/image.h"
+
+void _compress_astc(Image *r_img, float p_lossy_quality, Image::ASTCFormat p_format);
+void _decompress_astc(Image *r_img);
+
+#endif // IMAGE_COMPRESS_ASTCENC_H
--- a/modules/astcenc/register_types.cpp
+++ b/modules/astcenc/register_types.cpp
@ -0,0 +1,48 @@
+/**************************************************************************/
+/*  register_types.cpp                                                    */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "register_types.h"
+
+#include "image_compress_astcenc.h"
+
+void initialize_astcenc_module(ModuleInitializationLevel p_level) {
+	if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
+		return;
+	}
+
+	Image::_image_compress_astc_func = _compress_astc;
+	Image::_image_decompress_astc = _decompress_astc;
+}
+
+void uninitialize_astcenc_module(ModuleInitializationLevel p_level) {
+	if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
+		return;
+	}
+}
--- a/modules/astcenc/register_types.h
+++ b/modules/astcenc/register_types.h
@ -0,0 +1,39 @@
+/**************************************************************************/
+/*  register_types.h                                                      */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#ifndef ASTCENC_REGISTER_TYPES_H
+#define ASTCENC_REGISTER_TYPES_H
+
+#include "modules/register_module_types.h"
+
+void initialize_astcenc_module(ModuleInitializationLevel p_level);
+void uninitialize_astcenc_module(ModuleInitializationLevel p_level);
+
+#endif // ASTCENC_REGISTER_TYPES_H
--- a/modules/etcpak/image_compress_etcpak.cpp
+++ b/modules/etcpak/image_compress_etcpak.cpp
@ -33,8 +33,8 @@
 #include "core/os/os.h"
 #include "core/string/print_string.h"

-#include "thirdparty/etcpak/ProcessDxtc.hpp"
-#include "thirdparty/etcpak/ProcessRGB.hpp"
+#include <ProcessDxtc.hpp>
+#include <ProcessRGB.hpp>

 EtcpakType _determine_etc_type(Image::UsedChannels p_channels) {
 	switch (p_channels) {
@ -130,7 +130,7 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua
 	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5) {
 		target_format = Image::FORMAT_DXT5;
 	} else {
-		ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format.");
+		ERR_FAIL_MSG("Invalid or unsupported etcpak compression format, not ETC or DXT.");
 	}

 	// Compress image data and (if required) mipmaps.
@ -171,7 +171,7 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua

 	const uint8_t *src_read = r_img->get_data().ptr();

-	print_verbose(vformat("ETCPAK: Encoding image size %dx%d to format %s.", width, height, Image::get_format_name(target_format)));
+	print_verbose(vformat("etcpak: Encoding image size %dx%d to format %s%s.", width, height, Image::get_format_name(target_format), mipmaps ? ", with mipmaps" : ""));

 	int dest_size = Image::get_image_data_size(width, height, target_format, mipmaps);
 	Vector<uint8_t> dest_data;
@ -232,12 +232,12 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua
 		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5 || p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5_RA_AS_RG) {
 			CompressDxt5(src_mip_read, dest_mip_write, blocks, mip_w);
 		} else {
-			ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format.");
+			ERR_FAIL_MSG("etcpak: Invalid or unsupported compression format.");
 		}
 	}

 	// Replace original image with compressed one.
 	r_img->set_data(width, height, mipmaps, target_format, dest_data);

-	print_verbose(vformat("ETCPAK encode took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
+	print_verbose(vformat("etcpak: Encoding took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
 }
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@ -17,6 +17,18 @@ Files extracted from upstream source:
 - `license.txt`


+## astcenc
+
+- Upstream: https://github.com/ARM-software/astc-encoder
+- Version: 4.3.0 (ec83dda79fcefe07f69cdae7ed980d169bf2c4d4, 2023)
+- License: Apache 2.0
+
+Files extracted from upstream source:
+
+- `astcenc_*` and `astcenc.h` files from `Source`
+- `LICENSE.txt`
+
+
 ## basis_universal

 - Upstream: https://github.com/BinomialLLC/basis_universal
--- a/thirdparty/astcenc/LICENSE.txt
+++ b/thirdparty/astcenc/LICENSE.txt
@ -0,0 +1,175 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
--- a/thirdparty/astcenc/astcenc.h
+++ b/thirdparty/astcenc/astcenc.h
@ -0,0 +1,815 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief The core astcenc codec library interface.
+ *
+ * This interface is the entry point to the core astcenc codec. It aims to be easy to use for
+ * non-experts, but also to allow experts to have fine control over the compressor heuristics if
+ * needed. The core codec only handles compression and decompression, transferring all inputs and
+ * outputs via memory buffers. To catch obvious input/output buffer sizing issues, which can cause
+ * security and stability problems, all transfer buffers are explicitly sized.
+ *
+ * While the aim is that we keep this interface mostly stable, it should be viewed as a mutable
+ * interface tied to a specific source version. We are not trying to maintain backwards
+ * compatibility across codec versions.
+ *
+ * The API state management is based around an explicit context object, which is the context for all
+ * allocated memory resources needed to compress and decompress a single image. A context can be
+ * used to sequentially compress multiple images using the same configuration, allowing setup
+ * overheads to be amortized over multiple images, which is particularly important when images are
+ * small.
+ *
+ * Multi-threading can be used two ways.
+ *
+ *     * An application wishing to process multiple images in parallel can allocate multiple
+ *       contexts and assign each context to a thread.
+ *     * An application wishing to process a single image in using multiple threads can configure
+ *       contexts for multi-threaded use, and invoke astcenc_compress/decompress() once per thread
+ *       for faster processing. The caller is responsible for creating the worker threads, and
+ *       synchronizing between images.
+ *
+ * Threading
+ * =========
+ *
+ * In pseudo-code, the usage for manual user threading looks like this:
+ *
+ *     // Configure the compressor run
+ *     astcenc_config my_config;
+ *     astcenc_config_init(..., &my_config);
+ *
+ *     // Power users can tweak <my_config> settings here ...
+ *
+ *     // Allocate working state given config and thread_count
+ *     astcenc_context* my_context;
+ *     astcenc_context_alloc(&my_config, thread_count, &my_context);
+ *
+ *     // Compress each image using these config settings
+ *     foreach image:
+ *         // For each thread in the thread pool
+ *         for i in range(0, thread_count):
+ *             astcenc_compress_image(my_context, &my_input, my_output, i);
+ *
+ *         astcenc_compress_reset(my_context);
+ *
+ *     // Clean up
+ *     astcenc_context_free(my_context);
+ *
+ * Images
+ * ======
+ *
+ * The codec supports compressing single images, which can be either 2D images or volumetric 3D
+ * images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
+ * texture arrays, or sliced 3D textures.
+ *
+ * Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
+ * half-float, or 32-bit float, as indicated by the data_type field.
+ *
+ * Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
+ *
+ * Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
+ * within an image slice is always tightly packed without padding. Addressing looks like this:
+ *
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4    ]   // Red
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1]   // Green
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 2]   // Blue
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 3]   // Alpha
+ *
+ * Common compressor usage
+ * =======================
+ *
+ * One of the most important things for coding image quality is to align the input data component
+ * count with the ASTC color endpoint mode. This avoids wasting bits encoding components you don't
+ * actually need in the endpoint colors.
+ *
+ *         | Input data   | Encoding swizzle | Sampling swizzle |
+ *         | ------------ | ---------------- | ---------------- |
+ *         | 1 component  | RRR1             | .[rgb]           |
+ *         | 2 components | RRRG             | .[rgb]a          |
+ *         | 3 components | RGB1             | .rgb             |
+ *         | 4 components | RGBA             | .rgba            |
+ *
+ * The 1 and 2 component modes recommend sampling from "g" to recover the luminance value as this
+ * provide best compatibility with other texture formats where the green component may be stored at
+ * higher precision than the others, such as RGB565. For ASTC any of the RGB components can be used;
+ * the luminance endpoint component will be returned for all three.
+ *
+ * When using the normal map compression mode ASTC will store normals as a two component X+Y map.
+ * Input images must contain unit-length normalized and should be passed in using a two component
+ * swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers prefer
+ * to use GGGR for compatability with BC5n which will work just as well. The Z component can be
+ * recovered programmatically in shader code, using knowledge that the vector is unit length and
+ * that Z must be positive for a tangent-space normal map.
+ *
+ * Decompress-only usage
+ * =====================
+ *
+ * For some use cases it is useful to have a cut-down context and/or library which supports
+ * decompression but not compression.
+ *
+ * A context can be made decompress-only using the ASTCENC_FLG_DECOMPRESS_ONLY flag when the context
+ * is allocated. These contexts have lower dynamic memory footprint than a full context.
+ *
+ * The entire library can be made decompress-only by building the files with the define
+ * ASTCENC_DECOMPRESS_ONLY set. In this build the context will be smaller, and the library will
+ * exclude the functionality which is only needed for compression. This reduces the binary size by
+ * ~180KB. For these builds contexts must be created with the ASTCENC_FLG_DECOMPRESS_ONLY flag.
+ *
+ * Note that context structures returned by a library built as decompress-only are incompatible with
+ * a library built with compression included, and visa versa, as they have different sizes and
+ * memory layout.
+ *
+ * Self-decompress-only usage
+ * ==========================
+ *
+ * ASTC is a complex format with a large search space. The parts of this search space that are
+ * searched is determined by heuristics that are, in part, tied to the quality level used when
+ * creating the context.
+ *
+ * A normal context is capable of decompressing any ASTC texture, including those generated by other
+ * compressors with unknown heuristics. This is the most flexible implementation, but forces the
+ * data tables used by the codec to include entries that are not needed during compression. This
+ * can slow down context creation by a significant amount, especially for the faster compression
+ * modes where few data table entries are actually used. To optimize this use case the context can
+ * be created with the ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will
+ * only be asked to decompress images that it compressed itself, allowing the data tables to
+ * exclude entries that are not needed by the current compression configuration. This reduces the
+ * size of the context data tables in memory and improves context creation performance. Note that,
+ * as of the 3.6 release, this flag no longer affects compression performance.
+ *
+ * Using this flag while attempting to decompress an valid image which was created by another
+ * compressor, or even another astcenc compressor version or configuration, may result in blocks
+ * returning as solid magenta or NaN value error blocks.
+ */
+
+#ifndef ASTCENC_INCLUDED
+#define ASTCENC_INCLUDED
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ASTCENC_DYNAMIC_LIBRARY)
+	#if defined(_MSC_VER)
+		#define ASTCENC_PUBLIC extern "C" __declspec(dllexport)
+	#else
+		#define ASTCENC_PUBLIC extern "C" __attribute__ ((visibility ("default")))
+	#endif
+#else
+	#define ASTCENC_PUBLIC
+#endif
+
+/* ============================================================================
+    Data declarations
+============================================================================ */
+
+/**
+ * @brief An opaque structure; see astcenc_internal.h for definition.
+ */
+struct astcenc_context;
+
+/**
+ * @brief A codec API error code.
+ */
+enum astcenc_error {
+	/** @brief The call was successful. */
+	ASTCENC_SUCCESS = 0,
+	/** @brief The call failed due to low memory, or undersized I/O buffers. */
+	ASTCENC_ERR_OUT_OF_MEM,
+	/** @brief The call failed due to the build using fast math. */
+	ASTCENC_ERR_BAD_CPU_FLOAT,
+	/** @brief The call failed due to the build using an unsupported ISA. */
+	ASTCENC_ERR_BAD_CPU_ISA,
+	/** @brief The call failed due to an out-of-spec parameter. */
+	ASTCENC_ERR_BAD_PARAM,
+	/** @brief The call failed due to an out-of-spec block size. */
+	ASTCENC_ERR_BAD_BLOCK_SIZE,
+	/** @brief The call failed due to an out-of-spec color profile. */
+	ASTCENC_ERR_BAD_PROFILE,
+	/** @brief The call failed due to an out-of-spec quality value. */
+	ASTCENC_ERR_BAD_QUALITY,
+	/** @brief The call failed due to an out-of-spec component swizzle. */
+	ASTCENC_ERR_BAD_SWIZZLE,
+	/** @brief The call failed due to an out-of-spec flag set. */
+	ASTCENC_ERR_BAD_FLAGS,
+	/** @brief The call failed due to the context not supporting the operation. */
+	ASTCENC_ERR_BAD_CONTEXT,
+	/** @brief The call failed due to unimplemented functionality. */
+	ASTCENC_ERR_NOT_IMPLEMENTED,
+#if defined(ASTCENC_DIAGNOSTICS)
+	/** @brief The call failed due to an issue with diagnostic tracing. */
+	ASTCENC_ERR_DTRACE_FAILURE,
+#endif
+};
+
+/**
+ * @brief A codec color profile.
+ */
+enum astcenc_profile {
+	/** @brief The LDR sRGB color profile. */
+	ASTCENC_PRF_LDR_SRGB = 0,
+	/** @brief The LDR linear color profile. */
+	ASTCENC_PRF_LDR,
+	/** @brief The HDR RGB with LDR alpha color profile. */
+	ASTCENC_PRF_HDR_RGB_LDR_A,
+	/** @brief The HDR RGBA color profile. */
+	ASTCENC_PRF_HDR
+};
+
+/** @brief The fastest, lowest quality, search preset. */
+static const float ASTCENC_PRE_FASTEST = 0.0f;
+
+/** @brief The fast search preset. */
+static const float ASTCENC_PRE_FAST = 10.0f;
+
+/** @brief The medium quality search preset. */
+static const float ASTCENC_PRE_MEDIUM = 60.0f;
+
+/** @brief The thorough quality search preset. */
+static const float ASTCENC_PRE_THOROUGH = 98.0f;
+
+/** @brief The thorough quality search preset. */
+static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
+
+/** @brief The exhaustive, highest quality, search preset. */
+static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
+
+/**
+ * @brief A codec component swizzle selector.
+ */
+enum astcenc_swz
+{
+	/** @brief Select the red component. */
+	ASTCENC_SWZ_R = 0,
+	/** @brief Select the green component. */
+	ASTCENC_SWZ_G = 1,
+	/** @brief Select the blue component. */
+	ASTCENC_SWZ_B = 2,
+	/** @brief Select the alpha component. */
+	ASTCENC_SWZ_A = 3,
+	/** @brief Use a constant zero component. */
+	ASTCENC_SWZ_0 = 4,
+	/** @brief Use a constant one component. */
+	ASTCENC_SWZ_1 = 5,
+	/** @brief Use a reconstructed normal vector Z component. */
+	ASTCENC_SWZ_Z = 6
+};
+
+/**
+ * @brief A texel component swizzle.
+ */
+struct astcenc_swizzle
+{
+	/** @brief The red component selector. */
+	astcenc_swz r;
+	/** @brief The green component selector. */
+	astcenc_swz g;
+	/** @brief The blue component selector. */
+	astcenc_swz b;
+	/** @brief The alpha component selector. */
+	astcenc_swz a;
+};
+
+/**
+ * @brief A texel component data format.
+ */
+enum astcenc_type
+{
+	/** @brief Unorm 8-bit data per component. */
+	ASTCENC_TYPE_U8 = 0,
+	/** @brief 16-bit float per component. */
+	ASTCENC_TYPE_F16 = 1,
+	/** @brief 32-bit float per component. */
+	ASTCENC_TYPE_F32 = 2
+};
+
+/**
+ * @brief Enable normal map compression.
+ *
+ * Input data will be treated a two component normal map, storing X and Y, and the codec will
+ * optimize for angular error rather than simple linear PSNR. In this mode the input swizzle should
+ * be e.g. rrrg (the default ordering for ASTC normals on the command line) or gggr (the ordering
+ * used by BC5n).
+ */
+static const unsigned int ASTCENC_FLG_MAP_NORMAL          = 1 << 0;
+
+/**
+ * @brief Enable alpha weighting.
+ *
+ * The input alpha value is used for transparency, so errors in the RGB components are weighted by
+ * the transparency level. This allows the codec to more accurately encode the alpha value in areas
+ * where the color value is less significant.
+ */
+static const unsigned int ASTCENC_FLG_USE_ALPHA_WEIGHT     = 1 << 2;
+
+/**
+ * @brief Enable perceptual error metrics.
+ *
+ * This mode enables perceptual compression mode, which will optimize for perceptual error rather
+ * than best PSNR. Only some input modes support perceptual error metrics.
+ */
+static const unsigned int ASTCENC_FLG_USE_PERCEPTUAL       = 1 << 3;
+
+/**
+ * @brief Create a decompression-only context.
+ *
+ * This mode disables support for compression. This enables context allocation to skip some
+ * transient buffer allocation, resulting in lower memory usage.
+ */
+static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY      = 1 << 4;
+
+/**
+ * @brief Create a self-decompression context.
+ *
+ * This mode configures the compressor so that it is only guaranteed to be able to decompress images
+ * that were actually created using the current context. This is the common case for compression use
+ * cases, and setting this flag enables additional optimizations, but does mean that the context
+ * cannot reliably decompress arbitrary ASTC images.
+ */
+static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
+
+/**
+ * @brief Enable RGBM map compression.
+ *
+ * Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
+ * format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
+ * compression function, this flag is only used to control the use of RGBM-specific heuristics and
+ * error metrics.
+ *
+ * IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
+ * M values can round to zero due to quantization and result in black or white pixels. It is highly
+ * recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
+ * 16 or 32). Applying this threshold reduces the number of very dark colors that can be
+ * represented, but is still higher precision than 8-bit LDR.
+ *
+ * When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
+ * factor used during reconstruction. This defaults to 5 when in RGBM mode.
+ *
+ * It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
+ * scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
+ * matching the default scale factor.
+ */
+static const unsigned int ASTCENC_FLG_MAP_RGBM             = 1 << 6;
+
+/**
+ * @brief The bit mask of all valid flags.
+ */
+static const unsigned int ASTCENC_ALL_FLAGS =
+                              ASTCENC_FLG_MAP_NORMAL |
+                              ASTCENC_FLG_MAP_RGBM |
+                              ASTCENC_FLG_USE_ALPHA_WEIGHT |
+                              ASTCENC_FLG_USE_PERCEPTUAL |
+                              ASTCENC_FLG_DECOMPRESS_ONLY |
+                              ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
+
+/**
+ * @brief The config structure.
+ *
+ * This structure will initially be populated by a call to astcenc_config_init, but power users may
+ * modify it before calling astcenc_context_alloc. See astcenccli_toplevel_help.cpp for full user
+ * documentation of the power-user settings.
+ *
+ * Note for any settings which are associated with a specific color component, the value in the
+ * config applies to the component that exists after any compression data swizzle is applied.
+ */
+struct astcenc_config
+{
+	/** @brief The color profile. */
+	astcenc_profile profile;
+
+	/** @brief The set of set flags. */
+	unsigned int flags;
+
+	/** @brief The ASTC block size X dimension. */
+	unsigned int block_x;
+
+	/** @brief The ASTC block size Y dimension. */
+	unsigned int block_y;
+
+	/** @brief The ASTC block size Z dimension. */
+	unsigned int block_z;
+
+	/** @brief The red component weight scale for error weighting (-cw). */
+	float cw_r_weight;
+
+	/** @brief The green component weight scale for error weighting (-cw). */
+	float cw_g_weight;
+
+	/** @brief The blue component weight scale for error weighting (-cw). */
+	float cw_b_weight;
+
+	/** @brief The alpha component weight scale for error weighting (-cw). */
+	float cw_a_weight;
+
+	/**
+	 * @brief The radius for any alpha-weight scaling (-a).
+	 *
+	 * It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
+	 * will be sampled using linear texture filtering to minimize color bleed out of transparent
+	 * texels that are adjacent to non-transparent texels.
+	 */
+	unsigned int a_scale_radius;
+
+	/** @brief The RGBM scale factor for the shared multiplier (-rgbm). */
+	float rgbm_m_scale;
+
+	/**
+	 * @brief The maximum number of partitions searched (-partitioncountlimit).
+	 *
+	 * Valid values are between 1 and 4.
+	 */
+	unsigned int tune_partition_count_limit;
+
+	/**
+	 * @brief The maximum number of partitions searched (-2partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_2partition_index_limit;
+
+	/**
+	 * @brief The maximum number of partitions searched (-3partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_3partition_index_limit;
+
+	/**
+	 * @brief The maximum number of partitions searched (-4partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_4partition_index_limit;
+
+	/**
+	 * @brief The maximum centile for block modes searched (-blockmodelimit).
+	 *
+	 * Valid values are between 1 and 100.
+	 */
+	unsigned int tune_block_mode_limit;
+
+	/**
+	 * @brief The maximum iterative refinements applied (-refinementlimit).
+	 *
+	 * Valid values are between 1 and N; there is no technical upper limit
+	 * but little benefit is expected after N=4.
+	 */
+	unsigned int tune_refinement_limit;
+
+	/**
+	 * @brief The number of trial candidates per mode search (-candidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES (default 4).
+	 */
+	unsigned int tune_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-2partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_2partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-3partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_3partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-4partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_4partitioning_candidate_limit;
+
+	/**
+	 * @brief The dB threshold for stopping block search (-dblimit).
+	 *
+	 * This option is ineffective for HDR textures.
+	 */
+	float tune_db_limit;
+
+	/**
+	 * @brief The amount of MSE overshoot needed to early-out trials.
+	 *
+	 * The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
+	 * the high probability block modes. This can short-cut compression for simple blocks.
+	 *
+	 * The second early-out is for refinement trials, where we can exit refinement once quality is
+	 * reached.
+	 */
+	float tune_mse_overshoot;
+
+	/**
+	 * @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
+	 *
+	 * This option is further scaled for normal maps, so it skips less often.
+	 */
+	float tune_2_partition_early_out_limit_factor;
+
+	/**
+	 * @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
+	 *
+	 * This option is further scaled for normal maps, so it skips less often.
+	 */
+	float tune_3_partition_early_out_limit_factor;
+
+	/**
+	 * @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
+	 *
+	 * This option is ineffective for normal maps.
+	 */
+	float tune_2_plane_early_out_limit_correlation;
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	/**
+	 * @brief The path to save the diagnostic trace data to.
+	 *
+	 * This option is not part of the public API, and requires special builds
+	 * of the library.
+	 */
+	const char* trace_file_path;
+#endif
+};
+
+/**
+ * @brief An uncompressed 2D or 3D image.
+ *
+ * 3D image are passed in as an array of 2D slices. Each slice has identical
+ * size and color format.
+ */
+struct astcenc_image
+{
+	/** @brief The X dimension of the image, in texels. */
+	unsigned int dim_x;
+
+	/** @brief The Y dimension of the image, in texels. */
+	unsigned int dim_y;
+
+	/** @brief The Z dimension of the image, in texels. */
+	unsigned int dim_z;
+
+	/** @brief The data type per component. */
+	astcenc_type data_type;
+
+	/** @brief The array of 2D slices, of length @c dim_z. */
+	void** data;
+};
+
+/**
+ * @brief A block encoding metadata query result.
+ *
+ * If the block is an error block or a constant color block or an error block all fields other than
+ * the profile, block dimensions, and error/constant indicator will be zero.
+ */
+struct astcenc_block_info
+{
+	/** @brief The block encoding color profile. */
+	astcenc_profile profile;
+
+	/** @brief The number of texels in the X dimension. */
+	unsigned int block_x;
+
+	/** @brief The number of texels in the Y dimension. */
+	unsigned int block_y;
+
+	/** @brief The number of texel in the Z dimension. */
+	unsigned int block_z;
+
+	/** @brief The number of texels in the block. */
+	unsigned int texel_count;
+
+	/** @brief True if this block is an error block. */
+	bool is_error_block;
+
+	/** @brief True if this block is a constant color block. */
+	bool is_constant_block;
+
+	/** @brief True if this block is an HDR block. */
+	bool is_hdr_block;
+
+	/** @brief True if this block uses two weight planes. */
+	bool is_dual_plane_block;
+
+	/** @brief The number of partitions if not constant color. */
+	unsigned int partition_count;
+
+	/** @brief The partition index if 2 - 4 partitions used. */
+	unsigned int partition_index;
+
+	/** @brief The component index of the second plane if dual plane. */
+	unsigned int dual_plane_component;
+
+	/** @brief The color endpoint encoding mode for each partition. */
+	unsigned int color_endpoint_modes[4];
+
+	/** @brief The number of color endpoint quantization levels. */
+	unsigned int color_level_count;
+
+	/** @brief The number of weight quantization levels. */
+	unsigned int weight_level_count;
+
+	/** @brief The number of weights in the X dimension. */
+	unsigned int weight_x;
+
+	/** @brief The number of weights in the Y dimension. */
+	unsigned int weight_y;
+
+	/** @brief The number of weights in the Z dimension. */
+	unsigned int weight_z;
+
+	/** @brief The unpacked color endpoints for each partition. */
+	float color_endpoints[4][2][4];
+
+	/** @brief The per-texel interpolation weights for the block. */
+	float weight_values_plane1[216];
+
+	/** @brief The per-texel interpolation weights for the block. */
+	float weight_values_plane2[216];
+
+	/** @brief The per-texel partition assignments for the block. */
+	uint8_t partition_assignment[216];
+};
+
+/**
+ * Populate a codec config based on default settings.
+ *
+ * Power users can edit the returned config struct to fine tune before allocating the context.
+ *
+ * @param      profile   Color profile.
+ * @param      block_x   ASTC block size X dimension.
+ * @param      block_y   ASTC block size Y dimension.
+ * @param      block_z   ASTC block size Z dimension.
+ * @param      quality   Search quality preset / effort level. Either an
+ *                       @c ASTCENC_PRE_* value, or a effort level between 0
+ *                       and 100. Performance is not linear between 0 and 100.
+
+ * @param      flags     A valid set of @c ASTCENC_FLG_* flag bits.
+ * @param[out] config    Output config struct to populate.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if the inputs are invalid
+ * either individually, or in combination.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_config_init(
+	astcenc_profile profile,
+	unsigned int block_x,
+	unsigned int block_y,
+	unsigned int block_z,
+	float quality,
+	unsigned int flags,
+	astcenc_config* config);
+
+/**
+ * @brief Allocate a new codec context based on a config.
+ *
+ * This function allocates all of the memory resources and threads needed by the codec. This can be
+ * slow, so it is recommended that contexts are reused to serially compress or decompress multiple
+ * images to amortize setup cost.
+ *
+ * Contexts can be allocated to support only decompression using the @c ASTCENC_FLG_DECOMPRESS_ONLY
+ * flag when creating the configuration. The compression functions will fail if invoked. For a
+ * decompress-only library build the @c ASTCENC_FLG_DECOMPRESS_ONLY flag must be set when creating
+ * any context.
+ *
+ * @param[in]  config         Codec config.
+ * @param      thread_count   Thread count to configure for.
+ * @param[out] context        Location to store an opaque context pointer.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if context creation failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_context_alloc(
+	const astcenc_config* config,
+	unsigned int thread_count,
+	astcenc_context** context);
+
+/**
+ * @brief Compress an image.
+ *
+ * A single context can only compress or decompress a single image at a time.
+ *
+ * For a context configured for multi-threading, any set of the N threads can call this function.
+ * Work will be dynamically scheduled across the threads available. Each thread must have a unique
+ * @c thread_index.
+ *
+ * @param         context        Codec context.
+ * @param[in,out] image          An input image, in 2D slices.
+ * @param         swizzle        Compression data swizzle, applied before compression.
+ * @param[out]    data_out       Pointer to output data array.
+ * @param         data_len       Length of the output data array.
+ * @param         thread_index   Thread index [0..N-1] of calling thread.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if compression failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
+	astcenc_context* context,
+	astcenc_image* image,
+	const astcenc_swizzle* swizzle,
+	uint8_t* data_out,
+	size_t data_len,
+	unsigned int thread_index);
+
+/**
+ * @brief Reset the codec state for a new compression.
+ *
+ * The caller is responsible for synchronizing threads in the worker thread pool. This function must
+ * only be called when all threads have exited the @c astcenc_compress_image() function for image N,
+ * but before any thread enters it for image N + 1.
+ *
+ * Calling this is not required (but won't hurt), if the context is created for single threaded use.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
+	astcenc_context* context);
+
+/**
+ * @brief Decompress an image.
+ *
+ * @param         context        Codec context.
+ * @param[in]     data           Pointer to compressed data.
+ * @param         data_len       Length of the compressed data, in bytes.
+ * @param[in,out] image_out      Output image.
+ * @param         swizzle        Decompression data swizzle, applied after decompression.
+ * @param         thread_index   Thread index [0..N-1] of calling thread.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if decompression failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_decompress_image(
+	astcenc_context* context,
+	const uint8_t* data,
+	size_t data_len,
+	astcenc_image* image_out,
+	const astcenc_swizzle* swizzle,
+	unsigned int thread_index);
+
+/**
+ * @brief Reset the codec state for a new decompression.
+ *
+ * The caller is responsible for synchronizing threads in the worker thread pool. This function must
+ * only be called when all threads have exited the @c astcenc_decompress_image() function for image
+ * N, but before any thread enters it for image N + 1.
+ *
+ * Calling this is not required (but won't hurt), if the context is created for single threaded use.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_decompress_reset(
+	astcenc_context* context);
+
+/**
+ * Free the compressor context.
+ *
+ * @param context   The codec context.
+ */
+ASTCENC_PUBLIC void astcenc_context_free(
+	astcenc_context* context);
+
+/**
+ * @brief Provide a high level summary of a block's encoding.
+ *
+ * This feature is primarily useful for codec developers but may be useful for developers building
+ * advanced content packaging pipelines.
+ *
+ * @param context   Codec context.
+ * @param data      One block of compressed ASTC data.
+ * @param info      The output info structure to populate.
+ *
+ * @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
+ *         function will return success even if the block itself was an error block encoding, as the
+ *         decode was correctly handled.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_get_block_info(
+	astcenc_context* context,
+	const uint8_t data[16],
+	astcenc_block_info* info);
+
+/**
+ * @brief Get a printable string for specific status code.
+ *
+ * @param status   The status value.
+ *
+ * @return A human readable nul-terminated string.
+ */
+ASTCENC_PUBLIC const char* astcenc_get_error_string(
+	astcenc_error status);
+
+#endif
--- a/thirdparty/astcenc/astcenc_averages_and_directions.cpp
+++ b/thirdparty/astcenc/astcenc_averages_and_directions.cpp
@ -0,0 +1,995 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for finding dominant direction of a set of colors.
+ */
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Compute the average RGB color of each partition.
+ *
+ * The algorithm here uses a vectorized sequential scan and per-partition
+ * color accumulators, using select() to mask texel lanes in other partitions.
+ *
+ * We only accumulate sums for N-1 partitions during the scan; the value for
+ * the last partition can be computed given that we know the block-wide average
+ * already.
+ *
+ * Because of this we could reduce the loop iteration count so it "just" spans
+ * the max texel index needed for the N-1 partitions, which could need fewer
+ * iterations than the full block texel count. However, this makes the loop
+ * count erratic and causes more branch mispredictions so is a net loss.
+ *
+ * @param      pi         The partitioning to use.
+ * @param      blk        The block data to process.
+ * @param[out] averages   The output averages. Unused partition indices will
+ *                        not be initialized, and lane<3> will be zero.
+ */
+static void compute_partition_averages_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	vfloat4 averages[BLOCK_MAX_PARTITIONS]
+) {
+	unsigned int partition_count = pi.partition_count;
+	unsigned int texel_count = blk.texel_count;
+	promise(texel_count > 0);
+
+	// For 1 partition just use the precomputed mean
+	if (partition_count == 1)
+	{
+		averages[0] = blk.data_mean.swz<0, 1, 2>();
+	}
+	// For 2 partitions scan results for partition 0, compute partition 1
+	else if (partition_count == 2)
+	{
+		vfloatacc pp_avg_rgb[3] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgb[0], data_r, p0_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgb[1], data_g, p0_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgb[2], data_b, p0_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
+		                           hadd_s(pp_avg_rgb[1]),
+		                           hadd_s(pp_avg_rgb[2]));
+
+		vfloat4 p1_total = block_total - p0_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+	}
+	// For 3 partitions scan results for partition 0/1, compute partition 2
+	else if (partition_count == 3)
+	{
+		vfloatacc pp_avg_rgb[2][3] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+			vmask p1_mask = lane_mask & (texel_partition == vint(1));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
+			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
+			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
+			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
+		                           hadd_s(pp_avg_rgb[0][1]),
+		                           hadd_s(pp_avg_rgb[0][2]));
+
+		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
+		                           hadd_s(pp_avg_rgb[1][1]),
+		                           hadd_s(pp_avg_rgb[1][2]));
+
+		vfloat4 p2_total = block_total - p0_total - p1_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
+	}
+	else
+	{
+		// For 4 partitions scan results for partition 0/1/2, compute partition 3
+		vfloatacc pp_avg_rgb[3][3] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+			vmask p1_mask = lane_mask & (texel_partition == vint(1));
+			vmask p2_mask = lane_mask & (texel_partition == vint(2));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
+			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
+			haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
+			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
+			haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
+			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
+			haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
+		                           hadd_s(pp_avg_rgb[0][1]),
+		                           hadd_s(pp_avg_rgb[0][2]));
+
+		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
+		                           hadd_s(pp_avg_rgb[1][1]),
+		                           hadd_s(pp_avg_rgb[1][2]));
+
+		vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
+		                           hadd_s(pp_avg_rgb[2][1]),
+		                           hadd_s(pp_avg_rgb[2][2]));
+
+		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
+		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
+	}
+}
+
+/**
+ * @brief Compute the average RGBA color of each partition.
+ *
+ * The algorithm here uses a vectorized sequential scan and per-partition
+ * color accumulators, using select() to mask texel lanes in other partitions.
+ *
+ * We only accumulate sums for N-1 partitions during the scan; the value for
+ * the last partition can be computed given that we know the block-wide average
+ * already.
+ *
+ * Because of this we could reduce the loop iteration count so it "just" spans
+ * the max texel index needed for the N-1 partitions, which could need fewer
+ * iterations than the full block texel count. However, this makes the loop
+ * count erratic and causes more branch mispredictions so is a net loss.
+ *
+ * @param      pi         The partitioning to use.
+ * @param      blk        The block data to process.
+ * @param[out] averages   The output averages. Unused partition indices will
+ *                        not be initialized.
+ */
+static void compute_partition_averages_rgba(
+	const partition_info& pi,
+	const image_block& blk,
+	vfloat4 averages[BLOCK_MAX_PARTITIONS]
+) {
+	unsigned int partition_count = pi.partition_count;
+	unsigned int texel_count = blk.texel_count;
+	promise(texel_count > 0);
+
+	// For 1 partition just use the precomputed mean
+	if (partition_count == 1)
+	{
+		averages[0] = blk.data_mean;
+	}
+	// For 2 partitions scan results for partition 0, compute partition 1
+	else if (partition_count == 2)
+	{
+		vfloat4 pp_avg_rgba[4] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgba[0], data_r, p0_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgba[1], data_g, p0_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgba[2], data_b, p0_mask);
+
+			vfloat data_a = loada(blk.data_a + i);
+			haccumulate(pp_avg_rgba[3], data_a, p0_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
+		                           hadd_s(pp_avg_rgba[1]),
+		                           hadd_s(pp_avg_rgba[2]),
+		                           hadd_s(pp_avg_rgba[3]));
+
+		vfloat4 p1_total = block_total - p0_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+	}
+	// For 3 partitions scan results for partition 0/1, compute partition 2
+	else if (partition_count == 3)
+	{
+		vfloat4 pp_avg_rgba[2][4] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+			vmask p1_mask = lane_mask & (texel_partition == vint(1));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
+			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
+			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
+			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
+
+			vfloat data_a = loada(blk.data_a + i);
+			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
+			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
+		                           hadd_s(pp_avg_rgba[0][1]),
+		                           hadd_s(pp_avg_rgba[0][2]),
+		                           hadd_s(pp_avg_rgba[0][3]));
+
+		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
+		                           hadd_s(pp_avg_rgba[1][1]),
+		                           hadd_s(pp_avg_rgba[1][2]),
+		                           hadd_s(pp_avg_rgba[1][3]));
+
+		vfloat4 p2_total = block_total - p0_total - p1_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
+	}
+	else
+	{
+		// For 4 partitions scan results for partition 0/1/2, compute partition 3
+		vfloat4 pp_avg_rgba[3][4] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+			vmask p1_mask = lane_mask & (texel_partition == vint(1));
+			vmask p2_mask = lane_mask & (texel_partition == vint(2));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
+			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
+			haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
+			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
+			haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
+			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
+			haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
+
+			vfloat data_a = loada(blk.data_a + i);
+			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
+			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
+			haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
+		                           hadd_s(pp_avg_rgba[0][1]),
+		                           hadd_s(pp_avg_rgba[0][2]),
+		                           hadd_s(pp_avg_rgba[0][3]));
+
+		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
+		                           hadd_s(pp_avg_rgba[1][1]),
+		                           hadd_s(pp_avg_rgba[1][2]),
+		                           hadd_s(pp_avg_rgba[1][3]));
+
+		vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
+		                           hadd_s(pp_avg_rgba[2][1]),
+		                           hadd_s(pp_avg_rgba[2][2]),
+		                           hadd_s(pp_avg_rgba[2][3]));
+
+		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
+		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_4_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	// Pre-compute partition_averages
+	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
+	compute_partition_averages_rgba(pi, blk, partition_averages);
+
+	for (int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		vfloat4 average = partition_averages[partition];
+		pm[partition].avg = average;
+
+		vfloat4 sum_xp = vfloat4::zero();
+		vfloat4 sum_yp = vfloat4::zero();
+		vfloat4 sum_zp = vfloat4::zero();
+		vfloat4 sum_wp = vfloat4::zero();
+
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			unsigned int iwt = texel_indexes[i];
+			vfloat4 texel_datum = blk.texel(iwt);
+			texel_datum = texel_datum - average;
+
+			vfloat4 zero = vfloat4::zero();
+
+			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
+			sum_xp += select(zero, texel_datum, tdm0);
+
+			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
+			sum_yp += select(zero, texel_datum, tdm1);
+
+			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
+			sum_zp += select(zero, texel_datum, tdm2);
+
+			vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
+			sum_wp += select(zero, texel_datum, tdm3);
+		}
+
+		vfloat4 prod_xp = dot(sum_xp, sum_xp);
+		vfloat4 prod_yp = dot(sum_yp, sum_yp);
+		vfloat4 prod_zp = dot(sum_zp, sum_zp);
+		vfloat4 prod_wp = dot(sum_wp, sum_wp);
+
+		vfloat4 best_vector = sum_xp;
+		vfloat4 best_sum = prod_xp;
+
+		vmask4 mask = prod_yp > best_sum;
+		best_vector = select(best_vector, sum_yp, mask);
+		best_sum = select(best_sum, prod_yp, mask);
+
+		mask = prod_zp > best_sum;
+		best_vector = select(best_vector, sum_zp, mask);
+		best_sum = select(best_sum, prod_zp, mask);
+
+		mask = prod_wp > best_sum;
+		best_vector = select(best_vector, sum_wp, mask);
+
+		pm[partition].dir = best_vector;
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_3_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	unsigned int omitted_component,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	// Pre-compute partition_averages
+	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
+	compute_partition_averages_rgba(pi, blk, partition_averages);
+
+	const float* data_vr = blk.data_r;
+	const float* data_vg = blk.data_g;
+	const float* data_vb = blk.data_b;
+
+	// TODO: Data-driven permute would be useful to avoid this ...
+	if (omitted_component == 0)
+	{
+		partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
+		partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
+		partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
+		partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
+
+		data_vr = blk.data_g;
+		data_vg = blk.data_b;
+		data_vb = blk.data_a;
+	}
+	else if (omitted_component == 1)
+	{
+		partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
+		partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
+		partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
+		partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
+
+		data_vg = blk.data_b;
+		data_vb = blk.data_a;
+	}
+	else if (omitted_component == 2)
+	{
+		partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
+		partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
+		partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
+		partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
+
+		data_vb = blk.data_a;
+	}
+	else
+	{
+		partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
+		partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
+		partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
+		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
+	}
+
+	unsigned int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		vfloat4 average = partition_averages[partition];
+		pm[partition].avg = average;
+
+		vfloat4 sum_xp = vfloat4::zero();
+		vfloat4 sum_yp = vfloat4::zero();
+		vfloat4 sum_zp = vfloat4::zero();
+
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			unsigned int iwt = texel_indexes[i];
+
+			vfloat4 texel_datum = vfloat3(data_vr[iwt],
+			                              data_vg[iwt],
+			                              data_vb[iwt]);
+			texel_datum = texel_datum - average;
+
+			vfloat4 zero = vfloat4::zero();
+
+			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
+			sum_xp += select(zero, texel_datum, tdm0);
+
+			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
+			sum_yp += select(zero, texel_datum, tdm1);
+
+			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
+			sum_zp += select(zero, texel_datum, tdm2);
+		}
+
+		vfloat4 prod_xp = dot(sum_xp, sum_xp);
+		vfloat4 prod_yp = dot(sum_yp, sum_yp);
+		vfloat4 prod_zp = dot(sum_zp, sum_zp);
+
+		vfloat4 best_vector = sum_xp;
+		vfloat4 best_sum = prod_xp;
+
+		vmask4 mask = prod_yp > best_sum;
+		best_vector = select(best_vector, sum_yp, mask);
+		best_sum = select(best_sum, prod_yp, mask);
+
+		mask = prod_zp > best_sum;
+		best_vector = select(best_vector, sum_zp, mask);
+
+		pm[partition].dir = best_vector;
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_3_comp_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	unsigned int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	// Pre-compute partition_averages
+	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
+	compute_partition_averages_rgb(pi, blk, partition_averages);
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		vfloat4 average = partition_averages[partition];
+		pm[partition].avg = average;
+
+		vfloat4 sum_xp = vfloat4::zero();
+		vfloat4 sum_yp = vfloat4::zero();
+		vfloat4 sum_zp = vfloat4::zero();
+
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			unsigned int iwt = texel_indexes[i];
+
+			vfloat4 texel_datum = blk.texel3(iwt);
+			texel_datum = texel_datum - average;
+
+			vfloat4 zero = vfloat4::zero();
+
+			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
+			sum_xp += select(zero, texel_datum, tdm0);
+
+			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
+			sum_yp += select(zero, texel_datum, tdm1);
+
+			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
+			sum_zp += select(zero, texel_datum, tdm2);
+		}
+
+		vfloat4 prod_xp = dot(sum_xp, sum_xp);
+		vfloat4 prod_yp = dot(sum_yp, sum_yp);
+		vfloat4 prod_zp = dot(sum_zp, sum_zp);
+
+		vfloat4 best_vector = sum_xp;
+		vfloat4 best_sum = prod_xp;
+
+		vmask4 mask = prod_yp > best_sum;
+		best_vector = select(best_vector, sum_yp, mask);
+		best_sum = select(best_sum, prod_yp, mask);
+
+		mask = prod_zp > best_sum;
+		best_vector = select(best_vector, sum_zp, mask);
+
+		pm[partition].dir = best_vector;
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_2_comp(
+	const partition_info& pt,
+	const image_block& blk,
+	unsigned int component1,
+	unsigned int component2,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	vfloat4 average;
+
+	const float* data_vr = nullptr;
+	const float* data_vg = nullptr;
+
+	if (component1 == 0 && component2 == 1)
+	{
+		average = blk.data_mean.swz<0, 1>();
+
+		data_vr = blk.data_r;
+		data_vg = blk.data_g;
+	}
+	else if (component1 == 0 && component2 == 2)
+	{
+		average = blk.data_mean.swz<0, 2>();
+
+		data_vr = blk.data_r;
+		data_vg = blk.data_b;
+	}
+	else // (component1 == 1 && component2 == 2)
+	{
+		assert(component1 == 1 && component2 == 2);
+
+		average = blk.data_mean.swz<1, 2>();
+
+		data_vr = blk.data_g;
+		data_vg = blk.data_b;
+	}
+
+	unsigned int partition_count = pt.partition_count;
+	promise(partition_count > 0);
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pt.texels_of_partition[partition];
+		unsigned int texel_count = pt.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		// Only compute a partition mean if more than one partition
+		if (partition_count > 1)
+		{
+			average = vfloat4::zero();
+			for (unsigned int i = 0; i < texel_count; i++)
+			{
+				unsigned int iwt = texel_indexes[i];
+				average += vfloat2(data_vr[iwt], data_vg[iwt]);
+			}
+
+			average = average / static_cast<float>(texel_count);
+		}
+
+		pm[partition].avg = average;
+
+		vfloat4 sum_xp = vfloat4::zero();
+		vfloat4 sum_yp = vfloat4::zero();
+
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			unsigned int iwt = texel_indexes[i];
+			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
+			texel_datum = texel_datum - average;
+
+			vfloat4 zero = vfloat4::zero();
+
+			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
+			sum_xp += select(zero, texel_datum, tdm0);
+
+			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
+			sum_yp += select(zero, texel_datum, tdm1);
+		}
+
+		vfloat4 prod_xp = dot(sum_xp, sum_xp);
+		vfloat4 prod_yp = dot(sum_yp, sum_yp);
+
+		vfloat4 best_vector = sum_xp;
+		vfloat4 best_sum = prod_xp;
+
+		vmask4 mask = prod_yp > best_sum;
+		best_vector = select(best_vector, sum_yp, mask);
+
+		pm[partition].dir = best_vector;
+	}
+}
+
+/* See header for documentation. */
+void compute_error_squared_rgba(
+	const partition_info& pi,
+	const image_block& blk,
+	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
+	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
+	float uncor_lengths[BLOCK_MAX_PARTITIONS],
+	float samec_lengths[BLOCK_MAX_PARTITIONS],
+	float& uncor_error,
+	float& samec_error
+) {
+	unsigned int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	vfloatacc uncor_errorsumv = vfloatacc::zero();
+	vfloatacc samec_errorsumv = vfloatacc::zero();
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+
+		float uncor_loparam = 1e10f;
+		float uncor_hiparam = -1e10f;
+
+		float samec_loparam = 1e10f;
+		float samec_hiparam = -1e10f;
+
+		processed_line4 l_uncor = uncor_plines[partition];
+		processed_line4 l_samec = samec_plines[partition];
+
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		// Vectorize some useful scalar inputs
+		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
+		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
+		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
+		vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
+
+		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
+		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
+		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
+		vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
+
+		vfloat l_samec_bs0(l_samec.bs.lane<0>());
+		vfloat l_samec_bs1(l_samec.bs.lane<1>());
+		vfloat l_samec_bs2(l_samec.bs.lane<2>());
+		vfloat l_samec_bs3(l_samec.bs.lane<3>());
+
+		assert(all(l_samec.amod == vfloat4(0.0f)));
+
+		vfloat uncor_loparamv(1e10f);
+		vfloat uncor_hiparamv(-1e10f);
+
+		vfloat samec_loparamv(1e10f);
+		vfloat samec_hiparamv(-1e10f);
+
+		vfloat ew_r(blk.channel_weight.lane<0>());
+		vfloat ew_g(blk.channel_weight.lane<1>());
+		vfloat ew_b(blk.channel_weight.lane<2>());
+		vfloat ew_a(blk.channel_weight.lane<3>());
+
+		// This implementation over-shoots, but this is safe as we initialize the texel_indexes
+		// array to extend the last value. This means min/max are not impacted, but we need to mask
+		// out the dummy values when we compute the line weighting.
+		vint lane_ids = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vmask mask = lane_ids < vint(texel_count);
+			vint texel_idxs(texel_indexes + i);
+
+			vfloat data_r = gatherf(blk.data_r, texel_idxs);
+			vfloat data_g = gatherf(blk.data_g, texel_idxs);
+			vfloat data_b = gatherf(blk.data_b, texel_idxs);
+			vfloat data_a = gatherf(blk.data_a, texel_idxs);
+
+			vfloat uncor_param = (data_r * l_uncor_bs0)
+			                   + (data_g * l_uncor_bs1)
+			                   + (data_b * l_uncor_bs2)
+			                   + (data_a * l_uncor_bs3);
+
+			uncor_loparamv = min(uncor_param, uncor_loparamv);
+			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
+
+			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+			                   + (uncor_param * l_uncor_bs0);
+			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+			                   + (uncor_param * l_uncor_bs1);
+			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+			                   + (uncor_param * l_uncor_bs2);
+			vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
+			                   + (uncor_param * l_uncor_bs3);
+
+			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+			                 + (ew_g * uncor_dist1 * uncor_dist1)
+			                 + (ew_b * uncor_dist2 * uncor_dist2)
+			                 + (ew_a * uncor_dist3 * uncor_dist3);
+
+			haccumulate(uncor_errorsumv, uncor_err, mask);
+
+			// Process samechroma data
+			vfloat samec_param = (data_r * l_samec_bs0)
+			                   + (data_g * l_samec_bs1)
+			                   + (data_b * l_samec_bs2)
+			                   + (data_a * l_samec_bs3);
+
+			samec_loparamv = min(samec_param, samec_loparamv);
+			samec_hiparamv = max(samec_param, samec_hiparamv);
+
+			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
+			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
+			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
+			vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
+
+			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+			                 + (ew_g * samec_dist1 * samec_dist1)
+			                 + (ew_b * samec_dist2 * samec_dist2)
+			                 + (ew_a * samec_dist3 * samec_dist3);
+
+			haccumulate(samec_errorsumv, samec_err, mask);
+
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
+		}
+
+		uncor_loparam = hmin_s(uncor_loparamv);
+		uncor_hiparam = hmax_s(uncor_hiparamv);
+
+		samec_loparam = hmin_s(samec_loparamv);
+		samec_hiparam = hmax_s(samec_hiparamv);
+
+		float uncor_linelen = uncor_hiparam - uncor_loparam;
+		float samec_linelen = samec_hiparam - samec_loparam;
+
+		// Turn very small numbers and NaNs into a small number
+		uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
+		samec_lengths[partition] = astc::max(samec_linelen, 1e-7f);
+	}
+
+	uncor_error = hadd_s(uncor_errorsumv);
+	samec_error = hadd_s(samec_errorsumv);
+}
+
+/* See header for documentation. */
+void compute_error_squared_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
+	float& uncor_error,
+	float& samec_error
+) {
+	unsigned int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	vfloatacc uncor_errorsumv = vfloatacc::zero();
+	vfloatacc samec_errorsumv = vfloatacc::zero();
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		partition_lines3& pl = plines[partition];
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		float uncor_loparam = 1e10f;
+		float uncor_hiparam = -1e10f;
+
+		float samec_loparam = 1e10f;
+		float samec_hiparam = -1e10f;
+
+		processed_line3 l_uncor = pl.uncor_pline;
+		processed_line3 l_samec = pl.samec_pline;
+
+		// This implementation is an example vectorization of this function.
+		// It works for - the codec is a 2-4% faster than not vectorizing - but
+		// the benefit is limited by the use of gathers and register pressure
+
+		// Vectorize some useful scalar inputs
+		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
+		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
+		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
+
+		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
+		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
+		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
+
+		vfloat l_samec_bs0(l_samec.bs.lane<0>());
+		vfloat l_samec_bs1(l_samec.bs.lane<1>());
+		vfloat l_samec_bs2(l_samec.bs.lane<2>());
+
+		assert(all(l_samec.amod == vfloat4(0.0f)));
+
+		vfloat uncor_loparamv(1e10f);
+		vfloat uncor_hiparamv(-1e10f);
+
+		vfloat samec_loparamv(1e10f);
+		vfloat samec_hiparamv(-1e10f);
+
+		vfloat ew_r(blk.channel_weight.lane<0>());
+		vfloat ew_g(blk.channel_weight.lane<1>());
+		vfloat ew_b(blk.channel_weight.lane<2>());
+
+		// This implementation over-shoots, but this is safe as we initialize the weights array
+		// to extend the last value. This means min/max are not impacted, but we need to mask
+		// out the dummy values when we compute the line weighting.
+		vint lane_ids = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vmask mask = lane_ids < vint(texel_count);
+			vint texel_idxs(texel_indexes + i);
+
+			vfloat data_r = gatherf(blk.data_r, texel_idxs);
+			vfloat data_g = gatherf(blk.data_g, texel_idxs);
+			vfloat data_b = gatherf(blk.data_b, texel_idxs);
+
+			vfloat uncor_param = (data_r * l_uncor_bs0)
+			                   + (data_g * l_uncor_bs1)
+			                   + (data_b * l_uncor_bs2);
+
+			uncor_loparamv = min(uncor_param, uncor_loparamv);
+			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
+
+			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+			                   + (uncor_param * l_uncor_bs0);
+			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+			                   + (uncor_param * l_uncor_bs1);
+			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+			                   + (uncor_param * l_uncor_bs2);
+
+			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+			                 + (ew_g * uncor_dist1 * uncor_dist1)
+			                 + (ew_b * uncor_dist2 * uncor_dist2);
+
+			haccumulate(uncor_errorsumv, uncor_err, mask);
+
+			// Process samechroma data
+			vfloat samec_param = (data_r * l_samec_bs0)
+			                   + (data_g * l_samec_bs1)
+			                   + (data_b * l_samec_bs2);
+
+			samec_loparamv = min(samec_param, samec_loparamv);
+			samec_hiparamv = max(samec_param, samec_hiparamv);
+
+			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
+			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
+			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
+
+			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+			                 + (ew_g * samec_dist1 * samec_dist1)
+			                 + (ew_b * samec_dist2 * samec_dist2);
+
+			haccumulate(samec_errorsumv, samec_err, mask);
+
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
+		}
+
+		uncor_loparam = hmin_s(uncor_loparamv);
+		uncor_hiparam = hmax_s(uncor_hiparamv);
+
+		samec_loparam = hmin_s(samec_loparamv);
+		samec_hiparam = hmax_s(samec_hiparamv);
+
+		float uncor_linelen = uncor_hiparam - uncor_loparam;
+		float samec_linelen = samec_hiparam - samec_loparam;
+
+		// Turn very small numbers and NaNs into a small number
+		pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f);
+		pl.samec_line_len = astc::max(samec_linelen, 1e-7f);
+	}
+
+	uncor_error = hadd_s(uncor_errorsumv);
+	samec_error = hadd_s(samec_errorsumv);
+}
+
+#endif
--- a/thirdparty/astcenc/astcenc_block_sizes.cpp
+++ b/thirdparty/astcenc/astcenc_block_sizes.cpp
--- a/thirdparty/astcenc/astcenc_color_quantize.cpp
+++ b/thirdparty/astcenc/astcenc_color_quantize.cpp
--- a/thirdparty/astcenc/astcenc_color_unquantize.cpp
+++ b/thirdparty/astcenc/astcenc_color_unquantize.cpp
@ -0,0 +1,941 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#include <utility>
+
+/**
+ * @brief Functions for color unquantization.
+ */
+
+#include "astcenc_internal.h"
+
+/**
+ * @brief Un-blue-contract a color.
+ *
+ * This function reverses any applied blue contraction.
+ *
+ * @param input   The input color that has been blue-contracted.
+ *
+ * @return The uncontracted color.
+ */
+static ASTCENC_SIMD_INLINE vint4 uncontract_color(
+	vint4 input
+) {
+	vmask4 mask(true, true, false, false);
+	vint4 bc0 = asr<1>(input + input.lane<2>());
+	return select(input, bc0, mask);
+}
+
+/**
+ * @brief Unpack an LDR RGBA color that uses delta encoding.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color deltas.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgba_delta_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	// Apply bit transfer
+	bit_transfer_signed(input1, input0);
+
+	// Apply blue-uncontraction if needed
+	int rgb_sum = hadd_rgb_s(input1);
+	input1 = input1 + input0;
+	if (rgb_sum < 0)
+	{
+		input0 = uncontract_color(input0);
+		input1 = uncontract_color(input1);
+		std::swap(input0, input1);
+	}
+
+	output0 = clamp(0, 255, input0);
+	output1 = clamp(0, 255, input1);
+}
+
+/**
+ * @brief Unpack an LDR RGB color that uses delta encoding.
+ *
+ * Output alpha set to 255.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color deltas.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_delta_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	rgba_delta_unpack(input0, input1, output0, output1);
+	output0.set_lane<3>(255);
+	output1.set_lane<3>(255);
+}
+
+/**
+ * @brief Unpack an LDR RGBA color that uses direct encoding.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgba_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	// Apply blue-uncontraction if needed
+	if (hadd_rgb_s(input0) > hadd_rgb_s(input1))
+	{
+		input0 = uncontract_color(input0);
+		input1 = uncontract_color(input1);
+		std::swap(input0, input1);
+	}
+
+	output0 = input0;
+	output1 = input1;
+}
+
+/**
+ * @brief Unpack an LDR RGB color that uses direct encoding.
+ *
+ * Output alpha set to 255.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	rgba_unpack(input0, input1, output0, output1);
+	output0.set_lane<3>(255);
+	output1.set_lane<3>(255);
+}
+
+/**
+ * @brief Unpack an LDR RGBA color that uses scaled encoding.
+ *
+ * Note only the RGB channels use the scaled encoding, alpha uses direct.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      alpha1    The packed endpoint 1 alpha value.
+ * @param      scale     The packed quantized scale.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_scale_alpha_unpack(
+	vint4 input0,
+	uint8_t alpha1,
+	uint8_t scale,
+	vint4& output0,
+	vint4& output1
+) {
+	output1 = input0;
+	output1.set_lane<3>(alpha1);
+
+	output0 = asr<8>(input0 * scale);
+	output0.set_lane<3>(input0.lane<3>());
+}
+
+/**
+ * @brief Unpack an LDR RGB color that uses scaled encoding.
+ *
+ * Output alpha is 255.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      scale     The packed scale.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_scale_unpack(
+	vint4 input0,
+	int scale,
+	vint4& output0,
+	vint4& output1
+) {
+	output1 = input0;
+	output1.set_lane<3>(255);
+
+	output0 = asr<8>(input0 * scale);
+	output0.set_lane<3>(255);
+}
+
+/**
+ * @brief Unpack an LDR L color that uses direct encoding.
+ *
+ * Output alpha is 255.
+ *
+ * @param      input     The packed endpoints.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	int lum0 = input[0];
+	int lum1 = input[1];
+	output0 = vint4(lum0, lum0, lum0, 255);
+	output1 = vint4(lum1, lum1, lum1, 255);
+}
+
+/**
+ * @brief Unpack an LDR L color that uses delta encoding.
+ *
+ * Output alpha is 255.
+ *
+ * @param      input     The packed endpoints (L0, L1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_delta_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	int v0 = input[0];
+	int v1 = input[1];
+	int l0 = (v0 >> 2) | (v1 & 0xC0);
+	int l1 = l0 + (v1 & 0x3F);
+
+	l1 = astc::min(l1, 255);
+
+	output0 = vint4(l0, l0, l0, 255);
+	output1 = vint4(l1, l1, l1, 255);
+}
+
+/**
+ * @brief Unpack an LDR LA color that uses direct encoding.
+ *
+ * @param      input     The packed endpoints (L0, L1, A0, A1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_alpha_unpack(
+	const uint8_t input[4],
+	vint4& output0,
+	vint4& output1
+) {
+	int lum0 = input[0];
+	int lum1 = input[1];
+	int alpha0 = input[2];
+	int alpha1 = input[3];
+	output0 = vint4(lum0, lum0, lum0, alpha0);
+	output1 = vint4(lum1, lum1, lum1, alpha1);
+}
+
+/**
+ * @brief Unpack an LDR LA color that uses delta encoding.
+ *
+ * @param      input     The packed endpoints (L0, L1, A0, A1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_alpha_delta_unpack(
+	const uint8_t input[4],
+	vint4& output0,
+	vint4& output1
+) {
+	int lum0 = input[0];
+	int lum1 = input[1];
+	int alpha0 = input[2];
+	int alpha1 = input[3];
+
+	lum0 |= (lum1 & 0x80) << 1;
+	alpha0 |= (alpha1 & 0x80) << 1;
+	lum1 &= 0x7F;
+	alpha1 &= 0x7F;
+
+	if (lum1 & 0x40)
+	{
+		lum1 -= 0x80;
+	}
+
+	if (alpha1 & 0x40)
+	{
+		alpha1 -= 0x80;
+	}
+
+	lum0 >>= 1;
+	lum1 >>= 1;
+	alpha0 >>= 1;
+	alpha1 >>= 1;
+	lum1 += lum0;
+	alpha1 += alpha0;
+
+	lum1 = astc::clamp(lum1, 0, 255);
+	alpha1 = astc::clamp(alpha1, 0, 255);
+
+	output0 = vint4(lum0, lum0, lum0, alpha0);
+	output1 = vint4(lum1, lum1, lum1, alpha1);
+}
+
+/**
+ * @brief Unpack an HDR RGB + offset encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgbo_unpack(
+	const uint8_t input[4],
+	vint4& output0,
+	vint4& output1
+) {
+	int v0 = input[0];
+	int v1 = input[1];
+	int v2 = input[2];
+	int v3 = input[3];
+
+	int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
+
+	int majcomp;
+	int mode;
+	if ((modeval & 0xC) != 0xC)
+	{
+		majcomp = modeval >> 2;
+		mode = modeval & 3;
+	}
+	else if (modeval != 0xF)
+	{
+		majcomp = modeval & 3;
+		mode = 4;
+	}
+	else
+	{
+		majcomp = 0;
+		mode = 5;
+	}
+
+	int red = v0 & 0x3F;
+	int green = v1 & 0x1F;
+	int blue = v2 & 0x1F;
+	int scale = v3 & 0x1F;
+
+	int bit0 = (v1 >> 6) & 1;
+	int bit1 = (v1 >> 5) & 1;
+	int bit2 = (v2 >> 6) & 1;
+	int bit3 = (v2 >> 5) & 1;
+	int bit4 = (v3 >> 7) & 1;
+	int bit5 = (v3 >> 6) & 1;
+	int bit6 = (v3 >> 5) & 1;
+
+	int ohcomp = 1 << mode;
+
+	if (ohcomp & 0x30)
+		green |= bit0 << 6;
+	if (ohcomp & 0x3A)
+		green |= bit1 << 5;
+	if (ohcomp & 0x30)
+		blue |= bit2 << 6;
+	if (ohcomp & 0x3A)
+		blue |= bit3 << 5;
+
+	if (ohcomp & 0x3D)
+		scale |= bit6 << 5;
+	if (ohcomp & 0x2D)
+		scale |= bit5 << 6;
+	if (ohcomp & 0x04)
+		scale |= bit4 << 7;
+
+	if (ohcomp & 0x3B)
+		red |= bit4 << 6;
+	if (ohcomp & 0x04)
+		red |= bit3 << 6;
+
+	if (ohcomp & 0x10)
+		red |= bit5 << 7;
+	if (ohcomp & 0x0F)
+		red |= bit2 << 7;
+
+	if (ohcomp & 0x05)
+		red |= bit1 << 8;
+	if (ohcomp & 0x0A)
+		red |= bit0 << 8;
+
+	if (ohcomp & 0x05)
+		red |= bit0 << 9;
+	if (ohcomp & 0x02)
+		red |= bit6 << 9;
+
+	if (ohcomp & 0x01)
+		red |= bit3 << 10;
+	if (ohcomp & 0x02)
+		red |= bit5 << 10;
+
+	// expand to 12 bits.
+	static const int shamts[6] { 1, 1, 2, 3, 4, 5 };
+	int shamt = shamts[mode];
+	red <<= shamt;
+	green <<= shamt;
+	blue <<= shamt;
+	scale <<= shamt;
+
+	// on modes 0 to 4, the values stored for "green" and "blue" are differentials,
+	// not absolute values.
+	if (mode != 5)
+	{
+		green = red - green;
+		blue = red - blue;
+	}
+
+	// switch around components.
+	int temp;
+	switch (majcomp)
+	{
+	case 1:
+		temp = red;
+		red = green;
+		green = temp;
+		break;
+	case 2:
+		temp = red;
+		red = blue;
+		blue = temp;
+		break;
+	default:
+		break;
+	}
+
+	int red0 = red - scale;
+	int green0 = green - scale;
+	int blue0 = blue - scale;
+
+	// clamp to [0,0xFFF].
+	if (red < 0)
+		red = 0;
+	if (green < 0)
+		green = 0;
+	if (blue < 0)
+		blue = 0;
+
+	if (red0 < 0)
+		red0 = 0;
+	if (green0 < 0)
+		green0 = 0;
+	if (blue0 < 0)
+		blue0 = 0;
+
+	output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
+	output1 = vint4(red << 4, green << 4, blue << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR RGB direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgb_unpack(
+	const uint8_t input[6],
+	vint4& output0,
+	vint4& output1
+) {
+
+	int v0 = input[0];
+	int v1 = input[1];
+	int v2 = input[2];
+	int v3 = input[3];
+	int v4 = input[4];
+	int v5 = input[5];
+
+	// extract all the fixed-placement bitfields
+	int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
+
+	int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1);
+
+	if (majcomp == 3)
+	{
+		output0 = vint4(v0 << 8, v2 << 8, (v4 & 0x7F) << 9, 0x7800);
+		output1 = vint4(v1 << 8, v3 << 8, (v5 & 0x7F) << 9, 0x7800);
+		return;
+	}
+
+	int a = v0 | ((v1 & 0x40) << 2);
+	int b0 = v2 & 0x3f;
+	int b1 = v3 & 0x3f;
+	int c = v1 & 0x3f;
+	int d0 = v4 & 0x7f;
+	int d1 = v5 & 0x7f;
+
+	// get hold of the number of bits in 'd0' and 'd1'
+	static const int dbits_tab[8] { 7, 6, 7, 6, 5, 6, 5, 6 };
+	int dbits = dbits_tab[modeval];
+
+	// extract six variable-placement bits
+	int bit0 = (v2 >> 6) & 1;
+	int bit1 = (v3 >> 6) & 1;
+	int bit2 = (v4 >> 6) & 1;
+	int bit3 = (v5 >> 6) & 1;
+	int bit4 = (v4 >> 5) & 1;
+	int bit5 = (v5 >> 5) & 1;
+
+	// and prepend the variable-placement bits depending on mode.
+	int ohmod = 1 << modeval;	// one-hot-mode
+	if (ohmod & 0xA4)
+		a |= bit0 << 9;
+	if (ohmod & 0x8)
+		a |= bit2 << 9;
+	if (ohmod & 0x50)
+		a |= bit4 << 9;
+
+	if (ohmod & 0x50)
+		a |= bit5 << 10;
+	if (ohmod & 0xA0)
+		a |= bit1 << 10;
+
+	if (ohmod & 0xC0)
+		a |= bit2 << 11;
+
+	if (ohmod & 0x4)
+		c |= bit1 << 6;
+	if (ohmod & 0xE8)
+		c |= bit3 << 6;
+
+	if (ohmod & 0x20)
+		c |= bit2 << 7;
+
+	if (ohmod & 0x5B)
+	{
+		b0 |= bit0 << 6;
+		b1 |= bit1 << 6;
+	}
+
+	if (ohmod & 0x12)
+	{
+		b0 |= bit2 << 7;
+		b1 |= bit3 << 7;
+	}
+
+	if (ohmod & 0xAF)
+	{
+		d0 |= bit4 << 5;
+		d1 |= bit5 << 5;
+	}
+
+	if (ohmod & 0x5)
+	{
+		d0 |= bit2 << 6;
+		d1 |= bit3 << 6;
+	}
+
+	// sign-extend 'd0' and 'd1'
+	// note: this code assumes that signed right-shift actually sign-fills, not zero-fills.
+	int32_t d0x = d0;
+	int32_t d1x = d1;
+	int sx_shamt = 32 - dbits;
+	d0x <<= sx_shamt;
+	d0x >>= sx_shamt;
+	d1x <<= sx_shamt;
+	d1x >>= sx_shamt;
+	d0 = d0x;
+	d1 = d1x;
+
+	// expand all values to 12 bits, with left-shift as needed.
+	int val_shamt = (modeval >> 1) ^ 3;
+	a <<= val_shamt;
+	b0 <<= val_shamt;
+	b1 <<= val_shamt;
+	c <<= val_shamt;
+	d0 <<= val_shamt;
+	d1 <<= val_shamt;
+
+	// then compute the actual color values.
+	int red1 = a;
+	int green1 = a - b0;
+	int blue1 = a - b1;
+	int red0 = a - c;
+	int green0 = a - b0 - c - d0;
+	int blue0 = a - b1 - c - d1;
+
+	// clamp the color components to [0,2^12 - 1]
+	red0 = astc::clamp(red0, 0, 4095);
+	green0 = astc::clamp(green0, 0, 4095);
+	blue0 = astc::clamp(blue0, 0, 4095);
+
+	red1 = astc::clamp(red1, 0, 4095);
+	green1 = astc::clamp(green1, 0, 4095);
+	blue1 = astc::clamp(blue1, 0, 4095);
+
+	// switch around the color components
+	int temp0, temp1;
+	switch (majcomp)
+	{
+	case 1:					// switch around red and green
+		temp0 = red0;
+		temp1 = red1;
+		red0 = green0;
+		red1 = green1;
+		green0 = temp0;
+		green1 = temp1;
+		break;
+	case 2:					// switch around red and blue
+		temp0 = red0;
+		temp1 = red1;
+		red0 = blue0;
+		red1 = blue1;
+		blue0 = temp0;
+		blue1 = temp1;
+		break;
+	case 0:					// no switch
+		break;
+	}
+
+	output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
+	output1 = vint4(red1 << 4, green1 << 4, blue1 << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR RGB + LDR A direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgb_ldr_alpha_unpack(
+	const uint8_t input[8],
+	vint4& output0,
+	vint4& output1
+) {
+	hdr_rgb_unpack(input, output0, output1);
+
+	int v6 = input[6];
+	int v7 = input[7];
+	output0.set_lane<3>(v6);
+	output1.set_lane<3>(v7);
+}
+
+/**
+ * @brief Unpack an HDR L (small range) direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_luminance_small_range_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	int v0 = input[0];
+	int v1 = input[1];
+
+	int y0, y1;
+	if (v0 & 0x80)
+	{
+		y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
+		y1 = (v1 & 0x1F) << 2;
+	}
+	else
+	{
+		y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
+		y1 = (v1 & 0xF) << 1;
+	}
+
+	y1 += y0;
+	if (y1 > 0xFFF)
+	{
+		y1 = 0xFFF;
+	}
+
+	output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
+	output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR L (large range) direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_luminance_large_range_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	int v0 = input[0];
+	int v1 = input[1];
+
+	int y0, y1;
+	if (v1 >= v0)
+	{
+		y0 = v0 << 4;
+		y1 = v1 << 4;
+	}
+	else
+	{
+		y0 = (v1 << 4) + 8;
+		y1 = (v0 << 4) - 8;
+	}
+
+	output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
+	output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR A direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_alpha_unpack(
+	const uint8_t input[2],
+	int& output0,
+	int& output1
+) {
+
+	int v6 = input[0];
+	int v7 = input[1];
+
+	int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
+	v6 &= 0x7F;
+	v7 &= 0x7F;
+	if (selector == 3)
+	{
+		output0 = v6 << 5;
+		output1 = v7 << 5;
+	}
+	else
+	{
+		v6 |= (v7 << (selector + 1)) & 0x780;
+		v7 &= (0x3f >> selector);
+		v7 ^= 32 >> selector;
+		v7 -= 32 >> selector;
+		v6 <<= (4 - selector);
+		v7 <<= (4 - selector);
+		v7 += v6;
+
+		if (v7 < 0)
+		{
+			v7 = 0;
+		}
+		else if (v7 > 0xFFF)
+		{
+			v7 = 0xFFF;
+		}
+
+		output0 = v6;
+		output1 = v7;
+	}
+
+	output0 <<= 4;
+	output1 <<= 4;
+}
+
+/**
+ * @brief Unpack an HDR RGBA direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgb_hdr_alpha_unpack(
+	const uint8_t input[8],
+	vint4& output0,
+	vint4& output1
+) {
+	hdr_rgb_unpack(input, output0, output1);
+
+	int alpha0, alpha1;
+	hdr_alpha_unpack(input + 6, alpha0, alpha1);
+
+	output0.set_lane<3>(alpha0);
+	output1.set_lane<3>(alpha1);
+}
+
+/* See header for documentation. */
+void unpack_color_endpoints(
+	astcenc_profile decode_mode,
+	int format,
+	const uint8_t* input,
+	bool& rgb_hdr,
+	bool& alpha_hdr,
+	vint4& output0,
+	vint4& output1
+) {
+	// Assume no NaNs and LDR endpoints unless set later
+	rgb_hdr = false;
+	alpha_hdr = false;
+
+	bool alpha_hdr_default = false;
+
+	switch (format)
+	{
+	case FMT_LUMINANCE:
+		luminance_unpack(input, output0, output1);
+		break;
+
+	case FMT_LUMINANCE_DELTA:
+		luminance_delta_unpack(input, output0, output1);
+		break;
+
+	case FMT_HDR_LUMINANCE_SMALL_RANGE:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_luminance_small_range_unpack(input, output0, output1);
+		break;
+
+	case FMT_HDR_LUMINANCE_LARGE_RANGE:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_luminance_large_range_unpack(input, output0, output1);
+		break;
+
+	case FMT_LUMINANCE_ALPHA:
+		luminance_alpha_unpack(input, output0, output1);
+		break;
+
+	case FMT_LUMINANCE_ALPHA_DELTA:
+		luminance_alpha_delta_unpack(input, output0, output1);
+		break;
+
+	case FMT_RGB_SCALE:
+		{
+			vint4 input0q(input[0], input[1], input[2], 0);
+			uint8_t scale = input[3];
+			rgb_scale_unpack(input0q, scale, output0, output1);
+		}
+		break;
+
+	case FMT_RGB_SCALE_ALPHA:
+		{
+			vint4 input0q(input[0], input[1], input[2], input[4]);
+			uint8_t alpha1q = input[5];
+			uint8_t scaleq = input[3];
+			rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1);
+		}
+		break;
+
+	case FMT_HDR_RGB_SCALE:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_rgbo_unpack(input, output0, output1);
+		break;
+
+	case FMT_RGB:
+		{
+			vint4 input0q(input[0], input[2], input[4], 0);
+			vint4 input1q(input[1], input[3], input[5], 0);
+			rgb_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_RGB_DELTA:
+		{
+			vint4 input0q(input[0], input[2], input[4], 0);
+			vint4 input1q(input[1], input[3], input[5], 0);
+			rgb_delta_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_HDR_RGB:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_rgb_unpack(input, output0, output1);
+		break;
+
+	case FMT_RGBA:
+		{
+			vint4 input0q(input[0], input[2], input[4], input[6]);
+			vint4 input1q(input[1], input[3], input[5], input[7]);
+			rgba_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_RGBA_DELTA:
+		{
+			vint4 input0q(input[0], input[2], input[4], input[6]);
+			vint4 input1q(input[1], input[3], input[5], input[7]);
+			rgba_delta_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_HDR_RGB_LDR_ALPHA:
+		rgb_hdr = true;
+		hdr_rgb_ldr_alpha_unpack(input, output0, output1);
+		break;
+
+	case FMT_HDR_RGBA:
+		rgb_hdr = true;
+		alpha_hdr = true;
+		hdr_rgb_hdr_alpha_unpack(input, output0, output1);
+		break;
+	}
+
+	// Assign a correct default alpha
+	if (alpha_hdr_default)
+	{
+		if (decode_mode == ASTCENC_PRF_HDR)
+		{
+			output0.set_lane<3>(0x7800);
+			output1.set_lane<3>(0x7800);
+			alpha_hdr = true;
+		}
+		else
+		{
+			output0.set_lane<3>(0x00FF);
+			output1.set_lane<3>(0x00FF);
+			alpha_hdr = false;
+		}
+	}
+
+	vint4 ldr_scale(257);
+	vint4 hdr_scale(1);
+	vint4 output_scale = ldr_scale;
+
+	// An LDR profile image
+	if ((decode_mode == ASTCENC_PRF_LDR) ||
+	    (decode_mode == ASTCENC_PRF_LDR_SRGB))
+	{
+		// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB
+		if (rgb_hdr == true)
+		{
+			output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
+			output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
+			output_scale = hdr_scale;
+
+			rgb_hdr = false;
+			alpha_hdr = false;
+		}
+	}
+	// An HDR profile image
+	else
+	{
+		vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
+		output_scale = select(ldr_scale, hdr_scale, hdr_lanes);
+	}
+
+	output0 = output0 * output_scale;
+	output1 = output1 * output_scale;
+}
--- a/thirdparty/astcenc/astcenc_compress_symbolic.cpp
+++ b/thirdparty/astcenc/astcenc_compress_symbolic.cpp
--- a/thirdparty/astcenc/astcenc_compute_variance.cpp
+++ b/thirdparty/astcenc/astcenc_compute_variance.cpp
@ -0,0 +1,472 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions to calculate variance per component in a NxN footprint.
+ *
+ * We need N to be parametric, so the routine below uses summed area tables in order to execute in
+ * O(1) time independent of how big N is.
+ *
+ * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
+ * perform a binary reduction, and then distributes the results. This method means that there is no
+ * serial dependency between a given element and the next one, and also significantly improves
+ * numerical stability allowing us to use floats rather than doubles.
+ */
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
+ *
+ * This will take an input array of the form:
+ *     v0, v1, v2, ...
+ * ... and modify in-place to turn it into a prefix-sum array of the form:
+ *     v0, v0+v1, v0+v1+v2, ...
+ *
+ * @param d      The array to prefix-sum.
+ * @param items  The number of items in the array.
+ * @param stride The item spacing in the array; i.e. dense arrays should use 1.
+ */
+static void brent_kung_prefix_sum(
+	vfloat4* d,
+	size_t items,
+	int stride
+) {
+	if (items < 2)
+		return;
+
+	size_t lc_stride = 2;
+	size_t log2_stride = 1;
+
+	// The reduction-tree loop
+	do {
+		size_t step = lc_stride >> 1;
+		size_t start = lc_stride - 1;
+		size_t iters = items >> log2_stride;
+
+		vfloat4 *da = d + (start * stride);
+		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
+		size_t ofs_stride = stride << log2_stride;
+
+		while (iters)
+		{
+			*da = *da + da[ofs];
+			da += ofs_stride;
+			iters--;
+		}
+
+		log2_stride += 1;
+		lc_stride <<= 1;
+	} while (lc_stride <= items);
+
+	// The expansion-tree loop
+	do {
+		log2_stride -= 1;
+		lc_stride >>= 1;
+
+		size_t step = lc_stride >> 1;
+		size_t start = step + lc_stride - 1;
+		size_t iters = (items - step) >> log2_stride;
+
+		vfloat4 *da = d + (start * stride);
+		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
+		size_t ofs_stride = stride << log2_stride;
+
+		while (iters)
+		{
+			*da = *da + da[ofs];
+			da += ofs_stride;
+			iters--;
+		}
+	} while (lc_stride > 2);
+}
+
+/* See header for documentation. */
+void compute_pixel_region_variance(
+	astcenc_contexti& ctx,
+	const pixel_region_args& arg
+) {
+	// Unpack the memory structure into local variables
+	const astcenc_image* img = arg.img;
+	astcenc_swizzle swz = arg.swz;
+	bool have_z = arg.have_z;
+
+	int size_x = arg.size_x;
+	int size_y = arg.size_y;
+	int size_z = arg.size_z;
+
+	int offset_x = arg.offset_x;
+	int offset_y = arg.offset_y;
+	int offset_z = arg.offset_z;
+
+	int alpha_kernel_radius = arg.alpha_kernel_radius;
+
+	float*   input_alpha_averages = ctx.input_alpha_averages;
+	vfloat4* work_memory = arg.work_memory;
+
+	// Compute memory sizes and dimensions that we need
+	int kernel_radius = alpha_kernel_radius;
+	int kerneldim = 2 * kernel_radius + 1;
+	int kernel_radius_xy = kernel_radius;
+	int kernel_radius_z = have_z ? kernel_radius : 0;
+
+	int padsize_x = size_x + kerneldim;
+	int padsize_y = size_y + kerneldim;
+	int padsize_z = size_z + (have_z ? kerneldim : 0);
+	int sizeprod = padsize_x * padsize_y * padsize_z;
+
+	int zd_start = have_z ? 1 : 0;
+
+	vfloat4 *varbuf1 = work_memory;
+	vfloat4 *varbuf2 = work_memory + sizeprod;
+
+	// Scaling factors to apply to Y and Z for accesses into the work buffers
+	int yst = padsize_x;
+	int zst = padsize_x * padsize_y;
+
+	// Scaling factors to apply to Y and Z for accesses into result buffers
+	int ydt = img->dim_x;
+	int zdt = img->dim_x * img->dim_y;
+
+	// Macros to act as accessor functions for the work-memory
+	#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
+	#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
+
+	// Load N and N^2 values into the work buffers
+	if (img->data_type == ASTCENC_TYPE_U8)
+	{
+		// Swizzle data structure 4 = ZERO, 5 = ONE
+		uint8_t data[6];
+		data[ASTCENC_SWZ_0] = 0;
+		data[ASTCENC_SWZ_1] = 255;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					uint8_t r = data[swz.r];
+					uint8_t g = data[swz.g];
+					uint8_t b = data[swz.b];
+					uint8_t a = data[swz.a];
+
+					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
+					                     g * (1.0f / 255.0f),
+					                     b * (1.0f / 255.0f),
+					                     a * (1.0f / 255.0f));
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+	else if (img->data_type == ASTCENC_TYPE_F16)
+	{
+		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
+		uint16_t data[6];
+		data[ASTCENC_SWZ_0] = 0;
+		data[ASTCENC_SWZ_1] = 0x3C00;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+					vfloat4 d = float16_to_float(di);
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+	else // if (img->data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img->data_type == ASTCENC_TYPE_F32);
+
+		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
+		float data[6];
+		data[ASTCENC_SWZ_0] = 0.0f;
+		data[ASTCENC_SWZ_1] = 1.0f;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			float* data32 = static_cast<float*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					float r = data[swz.r];
+					float g = data[swz.g];
+					float b = data[swz.b];
+					float a = data[swz.a];
+
+					vfloat4 d(r, g, b, a);
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+
+	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
+	vfloat4 vbz = vfloat4::zero();
+	for (int z = 0; z < padsize_z; z++)
+	{
+		for (int y = 0; y < padsize_y; y++)
+		{
+			VARBUF1(z, y, 0) = vbz;
+			VARBUF2(z, y, 0) = vbz;
+		}
+
+		for (int x = 0; x < padsize_x; x++)
+		{
+			VARBUF1(z, 0, x) = vbz;
+			VARBUF2(z, 0, x) = vbz;
+		}
+	}
+
+	if (have_z)
+	{
+		for (int y = 0; y < padsize_y; y++)
+		{
+			for (int x = 0; x < padsize_x; x++)
+			{
+				VARBUF1(0, y, x) = vbz;
+				VARBUF2(0, y, x) = vbz;
+			}
+		}
+	}
+
+	// Generate summed-area tables for N and N^2; this is done in-place, using
+	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
+	for (int z = zd_start; z < padsize_z; z++)
+	{
+		for (int y = 1; y < padsize_y; y++)
+		{
+			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
+			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
+		}
+	}
+
+	for (int z = zd_start; z < padsize_z; z++)
+	{
+		for (int x = 1; x < padsize_x; x++)
+		{
+			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
+			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
+		}
+	}
+
+	if (have_z)
+	{
+		for (int y = 1; y < padsize_y; y++)
+		{
+			for (int x = 1; x < padsize_x; x++)
+			{
+				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
+				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
+			}
+		}
+	}
+
+	// Compute a few constants used in the variance-calculation.
+	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
+	float alpha_rsamples;
+
+	if (have_z)
+	{
+		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
+	}
+	else
+	{
+		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
+	}
+
+	// Use the summed-area tables to compute variance for each neighborhood
+	if (have_z)
+	{
+		for (int z = 0; z < size_z; z++)
+		{
+			int z_src = z + kernel_radius_z;
+			int z_dst = z + offset_z;
+			int z_low  = z_src - alpha_kernel_radius;
+			int z_high = z_src + alpha_kernel_radius + 1;
+
+			for (int y = 0; y < size_y; y++)
+			{
+				int y_src = y + kernel_radius_xy;
+				int y_dst = y + offset_y;
+				int y_low  = y_src - alpha_kernel_radius;
+				int y_high = y_src + alpha_kernel_radius + 1;
+
+				for (int x = 0; x < size_x; x++)
+				{
+					int x_src = x + kernel_radius_xy;
+					int x_dst = x + offset_x;
+					int x_low  = x_src - alpha_kernel_radius;
+					int x_high = x_src + alpha_kernel_radius + 1;
+
+					// Summed-area table lookups for alpha average
+					float vasum = (  VARBUF1(z_high, y_low,  x_low).lane<3>()
+					               - VARBUF1(z_high, y_low,  x_high).lane<3>()
+					               - VARBUF1(z_high, y_high, x_low).lane<3>()
+					               + VARBUF1(z_high, y_high, x_high).lane<3>()) -
+					              (  VARBUF1(z_low,  y_low,  x_low).lane<3>()
+					               - VARBUF1(z_low,  y_low,  x_high).lane<3>()
+					               - VARBUF1(z_low,  y_high, x_low).lane<3>()
+					               + VARBUF1(z_low,  y_high, x_high).lane<3>());
+
+					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
+					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
+				}
+			}
+		}
+	}
+	else
+	{
+		for (int y = 0; y < size_y; y++)
+		{
+			int y_src = y + kernel_radius_xy;
+			int y_dst = y + offset_y;
+			int y_low  = y_src - alpha_kernel_radius;
+			int y_high = y_src + alpha_kernel_radius + 1;
+
+			for (int x = 0; x < size_x; x++)
+			{
+				int x_src = x + kernel_radius_xy;
+				int x_dst = x + offset_x;
+				int x_low  = x_src - alpha_kernel_radius;
+				int x_high = x_src + alpha_kernel_radius + 1;
+
+				// Summed-area table lookups for alpha average
+				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
+				            - VARBUF1(0, y_low,  x_high).lane<3>()
+				            - VARBUF1(0, y_high, x_low).lane<3>()
+				            + VARBUF1(0, y_high, x_high).lane<3>();
+
+				int out_index = y_dst * ydt + x_dst;
+				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
+			}
+		}
+	}
+}
+
+/* See header for documentation. */
+unsigned int init_compute_averages(
+	const astcenc_image& img,
+	unsigned int alpha_kernel_radius,
+	const astcenc_swizzle& swz,
+	avg_args& ag
+) {
+	unsigned int size_x = img.dim_x;
+	unsigned int size_y = img.dim_y;
+	unsigned int size_z = img.dim_z;
+
+	// Compute maximum block size and from that the working memory buffer size
+	unsigned int kernel_radius = alpha_kernel_radius;
+	unsigned int kerneldim = 2 * kernel_radius + 1;
+
+	bool have_z = (size_z > 1);
+	unsigned int max_blk_size_xy = have_z ? 16 : 32;
+	unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);
+
+	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
+	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);
+
+	// Perform block-wise averages calculations across the image
+	// Initialize fields which are not populated until later
+	ag.arg.size_x = 0;
+	ag.arg.size_y = 0;
+	ag.arg.size_z = 0;
+	ag.arg.offset_x = 0;
+	ag.arg.offset_y = 0;
+	ag.arg.offset_z = 0;
+	ag.arg.work_memory = nullptr;
+
+	ag.arg.img = &img;
+	ag.arg.swz = swz;
+	ag.arg.have_z = have_z;
+	ag.arg.alpha_kernel_radius = alpha_kernel_radius;
+
+	ag.img_size_x = size_x;
+	ag.img_size_y = size_y;
+	ag.img_size_z = size_z;
+	ag.blk_size_xy = max_blk_size_xy;
+	ag.blk_size_z = max_blk_size_z;
+	ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
+
+	// The parallel task count
+	unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
+	unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
+	return z_tasks * y_tasks;
+}
+
+#endif
--- a/thirdparty/astcenc/astcenc_decompress_symbolic.cpp
+++ b/thirdparty/astcenc/astcenc_decompress_symbolic.cpp
@ -0,0 +1,623 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions to decompress a symbolic block.
+ */
+
+#include "astcenc_internal.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+/**
+ * @brief Compute the integer linear interpolation of two color endpoints.
+ *
+ * @param decode_mode   The ASTC profile (linear or sRGB)
+ * @param color0        The endpoint0 color.
+ * @param color1        The endpoint1 color.
+ * @param weights        The interpolation weight (between 0 and 64).
+ *
+ * @return The interpolated color.
+ */
+static vint4 lerp_color_int(
+	astcenc_profile decode_mode,
+	vint4 color0,
+	vint4 color1,
+	vint4 weights
+) {
+	vint4 weight1 = weights;
+	vint4 weight0 = vint4(64) - weight1;
+
+	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
+	{
+		color0 = asr<8>(color0);
+		color1 = asr<8>(color1);
+	}
+
+	vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
+	color = asr<6>(color);
+
+	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
+	{
+		color = color * vint4(257);
+	}
+
+	return color;
+}
+
+
+/**
+ * @brief Convert integer color value into a float value for the decoder.
+ *
+ * @param data       The integer color value post-interpolation.
+ * @param lns_mask   If set treat lane as HDR (LNS) else LDR (unorm16).
+ *
+ * @return The float color value.
+ */
+static inline vfloat4 decode_texel(
+	vint4 data,
+	vmask4 lns_mask
+) {
+	vint4 color_lns = vint4::zero();
+	vint4 color_unorm = vint4::zero();
+
+	if (any(lns_mask))
+	{
+		color_lns = lns_to_sf16(data);
+	}
+
+	if (!all(lns_mask))
+	{
+		color_unorm = unorm16_to_sf16(data);
+	}
+
+	// Pick components and then convert to FP16
+	vint4 datai = select(color_unorm, color_lns, lns_mask);
+	return float16_to_float(datai);
+}
+
+/* See header for documentation. */
+void unpack_weights(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const decimation_info& di,
+	bool is_dual_plane,
+	int weights_plane1[BLOCK_MAX_TEXELS],
+	int weights_plane2[BLOCK_MAX_TEXELS]
+) {
+	// Safe to overshoot as all arrays are allocated to full size
+	if (!is_dual_plane)
+	{
+		// Build full 64-entry weight lookup table
+		vint4 tab0(reinterpret_cast<const int*>(scb.weights +  0));
+		vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16));
+		vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32));
+		vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48));
+
+		vint tab0p, tab1p, tab2p, tab3p;
+		vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
+
+		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint summed_value(8);
+			vint weight_count(di.texel_weight_count + i);
+			int max_weight_count = hmax(weight_count).lane<0>();
+
+			promise(max_weight_count > 0);
+			for (int j = 0; j < max_weight_count; j++)
+			{
+				vint texel_weights(di.texel_weights_tr[j] + i);
+				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
+
+				summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
+			}
+
+			store(lsr<4>(summed_value), weights_plane1 + i);
+		}
+	}
+	else
+	{
+		// Build a 32-entry weight lookup table per plane
+		// Plane 1
+		vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights +  0));
+		vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16));
+		vint tab0_plane1p, tab1_plane1p;
+		vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
+
+		// Plane 2
+		vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32));
+		vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48));
+		vint tab0_plane2p, tab1_plane2p;
+		vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
+
+		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint sum_plane1(8);
+			vint sum_plane2(8);
+
+			vint weight_count(di.texel_weight_count + i);
+			int max_weight_count = hmax(weight_count).lane<0>();
+
+			promise(max_weight_count > 0);
+			for (int j = 0; j < max_weight_count; j++)
+			{
+				vint texel_weights(di.texel_weights_tr[j] + i);
+				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
+
+				sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
+				sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
+			}
+
+			store(lsr<4>(sum_plane1), weights_plane1 + i);
+			store(lsr<4>(sum_plane2), weights_plane2 + i);
+		}
+	}
+}
+
+/**
+ * @brief Return an FP32 NaN value for use in error colors.
+ *
+ * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
+ *
+ * @return The float color value.
+ */
+static float error_color_nan()
+{
+	if32 v;
+	v.u = 0xFFFFE000U;
+	return v.f;
+}
+
+/* See header for documentation. */
+void decompress_symbolic_block(
+	astcenc_profile decode_mode,
+	const block_size_descriptor& bsd,
+	int xpos,
+	int ypos,
+	int zpos,
+	const symbolic_compressed_block& scb,
+	image_block& blk
+) {
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	blk.data_min = vfloat4::zero();
+	blk.data_mean = vfloat4::zero();
+	blk.data_max = vfloat4::zero();
+	blk.grayscale = false;
+
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		for (unsigned int i = 0; i < bsd.texel_count; i++)
+		{
+			blk.data_r[i] = error_color_nan();
+			blk.data_g[i] = error_color_nan();
+			blk.data_b[i] = error_color_nan();
+			blk.data_a[i] = error_color_nan();
+			blk.rgb_lns[i] = 0;
+			blk.alpha_lns[i] = 0;
+		}
+
+		return;
+	}
+
+	if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
+	    (scb.block_type == SYM_BTYPE_CONST_U16))
+	{
+		vfloat4 color;
+		uint8_t use_lns = 0;
+
+		// UNORM16 constant color block
+		if (scb.block_type == SYM_BTYPE_CONST_U16)
+		{
+			vint4 colori(scb.constant_color);
+
+			// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
+			// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
+			if (decode_mode == ASTCENC_PRF_LDR_SRGB)
+			{
+				colori = asr<8>(colori) * 257;
+			}
+
+			vint4 colorf16 = unorm16_to_sf16(colori);
+			color = float16_to_float(colorf16);
+		}
+		// FLOAT16 constant color block
+		else
+		{
+			switch (decode_mode)
+			{
+			case ASTCENC_PRF_LDR_SRGB:
+			case ASTCENC_PRF_LDR:
+				color = vfloat4(error_color_nan());
+				break;
+			case ASTCENC_PRF_HDR_RGB_LDR_A:
+			case ASTCENC_PRF_HDR:
+				// Constant-color block; unpack from FP16 to FP32.
+				color = float16_to_float(vint4(scb.constant_color));
+				use_lns = 1;
+				break;
+			}
+		}
+
+		for (unsigned int i = 0; i < bsd.texel_count; i++)
+		{
+			blk.data_r[i] = color.lane<0>();
+			blk.data_g[i] = color.lane<1>();
+			blk.data_b[i] = color.lane<2>();
+			blk.data_a[i] = color.lane<3>();
+			blk.rgb_lns[i] = use_lns;
+			blk.alpha_lns[i] = use_lns;
+		}
+
+		return;
+	}
+
+	// Get the appropriate partition-table entry
+	int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	// Get the appropriate block descriptors
+	const auto& bm = bsd.get_block_mode(scb.block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
+
+	// Unquantize and undecimate the weights
+	int plane1_weights[BLOCK_MAX_TEXELS];
+	int plane2_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
+
+	// Now that we have endpoint colors and weights, we can unpack texel colors
+	int plane2_component = scb.plane2_component;
+	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
+
+	for (int i = 0; i < partition_count; i++)
+	{
+		// Decode the color endpoints for this partition
+		vint4 ep0;
+		vint4 ep1;
+		bool rgb_lns;
+		bool a_lns;
+
+		unpack_color_endpoints(decode_mode,
+		                       scb.color_formats[i],
+		                       scb.color_values[i],
+		                       rgb_lns, a_lns,
+		                       ep0, ep1);
+
+		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
+
+		int texel_count = pi.partition_texel_count[i];
+		for (int j = 0; j < texel_count; j++)
+		{
+			int tix = pi.texels_of_partition[i][j];
+			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
+			vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
+			vfloat4 colorf = decode_texel(color, lns_mask);
+
+			blk.data_r[tix] = colorf.lane<0>();
+			blk.data_g[tix] = colorf.lane<1>();
+			blk.data_b[tix] = colorf.lane<2>();
+			blk.data_a[tix] = colorf.lane<3>();
+		}
+	}
+}
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/* See header for documentation. */
+float compute_symbolic_block_difference_2plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk
+) {
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		return ERROR_CALC_DEFAULT;
+	}
+
+	assert(scb.block_mode >= 0);
+	assert(scb.partition_count == 1);
+	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
+
+	// Get the appropriate block descriptor
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	// Unquantize and undecimate the weights
+	int plane1_weights[BLOCK_MAX_TEXELS];
+	int plane2_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
+
+	vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
+
+	vfloat4 summa = vfloat4::zero();
+
+	// Decode the color endpoints for this partition
+	vint4 ep0;
+	vint4 ep1;
+	bool rgb_lns;
+	bool a_lns;
+
+	unpack_color_endpoints(config.profile,
+	                       scb.color_formats[0],
+	                       scb.color_values[0],
+	                       rgb_lns, a_lns,
+	                       ep0, ep1);
+
+	// Unpack and compute error for each texel in the partition
+	unsigned int texel_count = bsd.texel_count;
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
+		vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
+
+		vfloat4 color = int_to_float(colori);
+		vfloat4 oldColor = blk.texel(i);
+
+		// Compare error using a perceptual decode metric for RGBM textures
+		if (config.flags & ASTCENC_FLG_MAP_RGBM)
+		{
+			// Fail encodings that result in zero weight M pixels. Note that this can cause
+			// "interesting" artifacts if we reject all useful encodings - we typically get max
+			// brightness encodings instead which look just as bad. We recommend users apply a
+			// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
+			// getting small M values post-quantization, but we can't prove it would never
+			// happen, especially at low bit rates ...
+			if (color.lane<3>() == 0.0f)
+			{
+				return -ERROR_CALC_DEFAULT;
+			}
+
+			// Compute error based on decoded RGBM color
+			color = vfloat4(
+				color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
+				color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
+				color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
+				1.0f
+			);
+
+			oldColor = vfloat4(
+				oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
+				oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
+				oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
+				1.0f
+			);
+		}
+
+		vfloat4 error = oldColor - color;
+		error = min(abs(error), 1e15f);
+		error = error * error;
+
+		summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
+	}
+
+	return summa.lane<0>();
+}
+
+/* See header for documentation. */
+float compute_symbolic_block_difference_1plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk
+) {
+	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
+
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		return ERROR_CALC_DEFAULT;
+	}
+
+	assert(scb.block_mode >= 0);
+
+	// Get the appropriate partition-table entry
+	unsigned int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	// Get the appropriate block descriptor
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	// Unquantize and undecimate the weights
+	int plane1_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
+
+	vfloat4 summa = vfloat4::zero();
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		// Decode the color endpoints for this partition
+		vint4 ep0;
+		vint4 ep1;
+		bool rgb_lns;
+		bool a_lns;
+
+		unpack_color_endpoints(config.profile,
+		                       scb.color_formats[i],
+		                       scb.color_values[i],
+		                       rgb_lns, a_lns,
+		                       ep0, ep1);
+
+		// Unpack and compute error for each texel in the partition
+		unsigned int texel_count = pi.partition_texel_count[i];
+		for (unsigned int j = 0; j < texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			vint4 colori = lerp_color_int(config.profile, ep0, ep1,
+			                              vint4(plane1_weights[tix]));
+
+			vfloat4 color = int_to_float(colori);
+			vfloat4 oldColor = blk.texel(tix);
+
+			// Compare error using a perceptual decode metric for RGBM textures
+			if (config.flags & ASTCENC_FLG_MAP_RGBM)
+			{
+				// Fail encodings that result in zero weight M pixels. Note that this can cause
+				// "interesting" artifacts if we reject all useful encodings - we typically get max
+				// brightness encodings instead which look just as bad. We recommend users apply a
+				// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
+				// getting small M values post-quantization, but we can't prove it would never
+				// happen, especially at low bit rates ...
+				if (color.lane<3>() == 0.0f)
+				{
+					return -ERROR_CALC_DEFAULT;
+				}
+
+				// Compute error based on decoded RGBM color
+				color = vfloat4(
+					color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
+					color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
+					color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
+					1.0f
+				);
+
+				oldColor = vfloat4(
+					oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
+					oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
+					oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
+					1.0f
+				);
+			}
+
+			vfloat4 error = oldColor - color;
+			error = min(abs(error), 1e15f);
+			error = error * error;
+
+			summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
+		}
+	}
+
+	return summa.lane<0>();
+}
+
+/* See header for documentation. */
+float compute_symbolic_block_difference_1plane_1partition(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk
+) {
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		return ERROR_CALC_DEFAULT;
+	}
+
+	assert(scb.block_mode >= 0);
+	assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
+
+	// Get the appropriate block descriptor
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	// Unquantize and undecimate the weights
+	alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
+
+	// Decode the color endpoints for this partition
+	vint4 ep0;
+	vint4 ep1;
+	bool rgb_lns;
+	bool a_lns;
+
+	unpack_color_endpoints(config.profile,
+	                       scb.color_formats[0],
+	                       scb.color_values[0],
+	                       rgb_lns, a_lns,
+	                       ep0, ep1);
+
+
+	// Pre-shift sRGB so things round correctly
+	if (config.profile == ASTCENC_PRF_LDR_SRGB)
+	{
+		ep0 = asr<8>(ep0);
+		ep1 = asr<8>(ep1);
+	}
+
+	// Unpack and compute error for each texel in the partition
+	vfloatacc summav = vfloatacc::zero();
+
+	vint lane_id = vint::lane_id();
+	vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
+
+	unsigned int texel_count = bsd.texel_count;
+	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		// Compute EP1 contribution
+		vint weight1 = vint::loada(plane1_weights + i);
+		vint ep1_r = vint(ep1.lane<0>()) * weight1;
+		vint ep1_g = vint(ep1.lane<1>()) * weight1;
+		vint ep1_b = vint(ep1.lane<2>()) * weight1;
+		vint ep1_a = vint(ep1.lane<3>()) * weight1;
+
+		// Compute EP0 contribution
+		vint weight0 = vint(64) - weight1;
+		vint ep0_r = vint(ep0.lane<0>()) * weight0;
+		vint ep0_g = vint(ep0.lane<1>()) * weight0;
+		vint ep0_b = vint(ep0.lane<2>()) * weight0;
+		vint ep0_a = vint(ep0.lane<3>()) * weight0;
+
+		// Shift so things round correctly
+		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
+		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
+		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
+		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
+
+		// Compute color diff
+		vfloat color_r = int_to_float(colori_r);
+		vfloat color_g = int_to_float(colori_g);
+		vfloat color_b = int_to_float(colori_b);
+		vfloat color_a = int_to_float(colori_a);
+
+		vfloat color_orig_r = loada(blk.data_r + i);
+		vfloat color_orig_g = loada(blk.data_g + i);
+		vfloat color_orig_b = loada(blk.data_b + i);
+		vfloat color_orig_a = loada(blk.data_a + i);
+
+		vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
+		vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
+		vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
+		vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
+
+		// Compute squared error metric
+		color_error_r = color_error_r * color_error_r;
+		color_error_g = color_error_g * color_error_g;
+		color_error_b = color_error_b * color_error_b;
+		color_error_a = color_error_a * color_error_a;
+
+		vfloat metric = color_error_r * blk.channel_weight.lane<0>()
+		              + color_error_g * blk.channel_weight.lane<1>()
+		              + color_error_b * blk.channel_weight.lane<2>()
+		              + color_error_a * blk.channel_weight.lane<3>();
+
+		// Mask off bad lanes
+		vmask mask = lane_id < vint(texel_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+		haccumulate(summav, metric, mask);
+	}
+
+	return hadd_s(summav);
+}
+
+#endif
--- a/thirdparty/astcenc/astcenc_diagnostic_trace.cpp
+++ b/thirdparty/astcenc/astcenc_diagnostic_trace.cpp
@ -0,0 +1,230 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for the library entrypoint.
+ */
+
+#if defined(ASTCENC_DIAGNOSTICS)
+
+#include <cassert>
+#include <cstdarg>
+#include <cstdio>
+#include <string>
+
+#include "astcenc_diagnostic_trace.h"
+
+/** @brief The global trace logger. */
+static TraceLog* g_TraceLog = nullptr;
+
+/** @brief The JSON indentation level. */
+static const size_t g_trace_indent = 2;
+
+TraceLog::TraceLog(
+	const char* file_name):
+	m_file(file_name, std::ofstream::out | std::ofstream::binary)
+{
+	assert(!g_TraceLog);
+	g_TraceLog = this;
+	m_root = new TraceNode("root");
+}
+
+/* See header for documentation. */
+TraceNode* TraceLog::get_current_leaf()
+{
+	if (m_stack.size())
+	{
+		return m_stack.back();
+	}
+
+	return nullptr;
+}
+
+/* See header for documentation. */
+size_t TraceLog::get_depth()
+{
+	return m_stack.size();
+}
+
+/* See header for documentation. */
+TraceLog::~TraceLog()
+{
+	assert(g_TraceLog == this);
+	delete m_root;
+	g_TraceLog = nullptr;
+}
+
+/* See header for documentation. */
+TraceNode::TraceNode(
+	const char* format,
+	...
+) {
+	// Format the name string
+	constexpr size_t bufsz = 256;
+	char buffer[bufsz];
+
+	va_list args;
+	va_start (args, format);
+	vsnprintf (buffer, bufsz, format, args);
+	va_end (args);
+
+	// Guarantee there is a nul terminator
+	buffer[bufsz - 1] = 0;
+
+	// Generate the node
+	TraceNode* parent = g_TraceLog->get_current_leaf();
+	size_t depth = g_TraceLog->get_depth();
+	g_TraceLog->m_stack.push_back(this);
+
+	bool comma = parent && parent->m_attrib_count;
+	auto& out = g_TraceLog->m_file;
+
+	if (parent)
+	{
+		parent->m_attrib_count++;
+	}
+
+	if (comma)
+	{
+		out << ',';
+	}
+
+	if (depth)
+	{
+		out << '\n';
+	}
+
+	size_t out_indent = (depth * 2) * g_trace_indent;
+	size_t in_indent = (depth * 2 + 1) * g_trace_indent;
+
+	std::string out_indents("");
+	if (out_indent)
+	{
+		out_indents = std::string(out_indent, ' ');
+	}
+
+	std::string in_indents(in_indent, ' ');
+
+	out << out_indents << "[ \"node\", \"" << buffer << "\",\n";
+	out << in_indents << "[";
+}
+
+/* See header for documentation. */
+void TraceNode::add_attrib(
+	std::string type,
+	std::string key,
+	std::string value
+) {
+	(void)type;
+
+	size_t depth = g_TraceLog->get_depth();
+	size_t indent = (depth * 2) * g_trace_indent;
+	auto& out = g_TraceLog->m_file;
+	bool comma = m_attrib_count;
+	m_attrib_count++;
+
+	if (comma)
+	{
+		out << ',';
+	}
+
+	out << '\n';
+	out << std::string(indent, ' ') << "[ "
+	                                << "\"" << key << "\", "
+	                                << value << " ]";
+}
+
+/* See header for documentation. */
+TraceNode::~TraceNode()
+{
+	g_TraceLog->m_stack.pop_back();
+
+	auto& out = g_TraceLog->m_file;
+	size_t depth = g_TraceLog->get_depth();
+	size_t out_indent = (depth * 2) * g_trace_indent;
+	size_t in_indent = (depth * 2 + 1) * g_trace_indent;
+
+	std::string out_indents("");
+	if (out_indent)
+	{
+		out_indents = std::string(out_indent, ' ');
+	}
+
+	std::string in_indents(in_indent, ' ');
+
+	if (m_attrib_count)
+	{
+		out << "\n" << in_indents;
+	}
+	out << "]\n";
+
+	out << out_indents << "]";
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	const char* format,
+	...
+) {
+	constexpr size_t bufsz = 256;
+	char buffer[bufsz];
+
+	va_list args;
+	va_start (args, format);
+	vsnprintf (buffer, bufsz, format, args);
+	va_end (args);
+
+	// Guarantee there is a nul terminator
+	buffer[bufsz - 1] = 0;
+
+	std::string value = "\"" + std::string(buffer) + "\"";
+
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("str", key, value);
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	float value
+) {
+  	char buffer[256];
+	sprintf(buffer, "%.20g", (double)value);
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("float", key, buffer);
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	int value
+) {
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("int", key, std::to_string(value));
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	unsigned int value
+) {
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("int", key, std::to_string(value));
+}
+
+#endif
--- a/thirdparty/astcenc/astcenc_diagnostic_trace.h
+++ b/thirdparty/astcenc/astcenc_diagnostic_trace.h
@ -0,0 +1,219 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief This module provides a set of diagnostic tracing utilities.
+ *
+ * Overview
+ * ========
+ *
+ * The built-in diagnostic trace tool generates a hierarchical JSON tree structure. The tree
+ * hierarchy contains three levels:
+ *
+ *    - block
+ *        - pass
+ *           - candidate
+ *
+ * One block node exists for each compressed block in the image. One pass node exists for each major
+ * pass (N partition, M planes, O components) applied to a block. One candidate node exists for each
+ * encoding candidate trialed for a pass.
+ *
+ * Each node contains both the hierarchy but also a number of attributes which explain the behavior.
+ * For example, the block node contains the block coordinates in the image, the pass explains the
+ * pass configuration, and the candidate will explain the candidate encoding such as weight
+ * decimation, refinement error, etc.
+ *
+ * Trace Nodes are designed as scope-managed C++ objects with stack-like push/pop behavior.
+ * Constructing a trace node on the stack will automatically add it to the current node as a child,
+ * and then make it the current node. Destroying the current node will pop the stack and set the
+ * parent to the current node. This provides a robust mechanism for ensuring reliable nesting in the
+ * tree structure.
+ *
+ * A set of utility macros are provided to add attribute annotations to the current trace node.
+ *
+ * Usage
+ * =====
+ *
+ * Create Trace Nodes on the stack using the @c TRACE_NODE() macro. This will compile-out completely
+ * in builds with diagnostics disabled.
+ *
+ * Add annotations to the current trace node using the @c trace_add_data() macro. This will
+ * similarly compile out completely in builds with diagnostics disabled.
+ *
+ * If you need to add additional code to support diagnostics-only behavior wrap
+ * it in preprocessor guards:
+ *
+ *     #if defined(ASTCENC_DIAGNOSTICS)
+ *     #endif
+ */
+
+#ifndef ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
+#define ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
+
+#if defined(ASTCENC_DIAGNOSTICS)
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+/**
+ * @brief Class representing a single node in the trace hierarchy.
+ */
+class TraceNode
+{
+public:
+	/**
+	 * @brief Construct a new node.
+	 *
+	 * Constructing a node will push to the the top of the stack, automatically making it a child of
+	 * the current node, and then setting it to become the current node.
+	 *
+	 * @param format   The format template for the node name.
+	 * @param ...      The format parameters.
+	 */
+	TraceNode(const char* format, ...);
+
+	/**
+	 * @brief Add an attribute to this node.
+	 *
+	 * Note that no quoting is applied to the @c value, so if quoting is needed it must be done by
+	 * the caller.
+	 *
+	 * @param type    The type of the attribute.
+	 * @param key     The key of the attribute.
+	 * @param value   The value of the attribute.
+	 */
+	void add_attrib(std::string type, std::string key, std::string value);
+
+	/**
+	 * @brief Destroy this node.
+	 *
+	 * Destroying a node will pop it from the top of the stack, making its parent the current node.
+	 * It is invalid behavior to destroy a node that is not the current node; usage must conform to
+	 * stack push-pop semantics.
+	 */
+	~TraceNode();
+
+	/**
+	 * @brief The number of attributes and child nodes in this node.
+	 */
+	unsigned int m_attrib_count { 0 };
+};
+
+/**
+ * @brief Class representing the trace log file being written.
+ */
+class TraceLog
+{
+public:
+	/**
+	 * @brief Create a new trace log.
+	 *
+	 * The trace log is global; there can be only one at a time.
+	 *
+	 * @param file_name   The name of the file to write.
+	 */
+	TraceLog(const char* file_name);
+
+	/**
+	 * @brief Detroy the trace log.
+	 *
+	 * Trace logs MUST be cleanly destroyed to ensure the file gets written.
+	 */
+	~TraceLog();
+
+	/**
+	 * @brief Get the current child node.
+	 *
+	 * @return The current leaf node.
+	 */
+	TraceNode* get_current_leaf();
+
+	/**
+	 * @brief Get the stack depth of the current child node.
+	 *
+	 * @return The current leaf node stack depth.
+	 */
+	size_t get_depth();
+
+	/**
+	 * @brief The file stream to write to.
+	 */
+	std::ofstream m_file;
+
+	/**
+	 * @brief The stack of nodes (newest at the back).
+	 */
+	std::vector<TraceNode*> m_stack;
+
+private:
+	/**
+	 * @brief The root node in the JSON file.
+	 */
+	TraceNode* m_root;
+};
+
+/**
+ * @brief Utility macro to create a trace node on the stack.
+ *
+ * @param name     The variable name to use.
+ * @param ...      The name template and format parameters.
+ */
+#define TRACE_NODE(name, ...) TraceNode name(__VA_ARGS__);
+
+/**
+ * @brief Add a string annotation to the current node.
+ *
+ * @param key      The name of the attribute.
+ * @param format   The format template for the attribute value.
+ * @param ...      The format parameters.
+ */
+void trace_add_data(const char* key, const char* format, ...);
+
+/**
+ * @brief Add a float annotation to the current node.
+ *
+ * @param key     The name of the attribute.
+ * @param value   The value of the attribute.
+ */
+void trace_add_data(const char* key, float value);
+
+/**
+ * @brief Add an integer annotation to the current node.
+ *
+ * @param key     The name of the attribute.
+ * @param value   The value of the attribute.
+ */
+void trace_add_data(const char* key, int value);
+
+/**
+ * @brief Add an unsigned integer annotation to the current node.
+ *
+ * @param key     The name of the attribute.
+ * @param value   The value of the attribute.
+ */
+void trace_add_data(const char* key, unsigned int value);
+
+#else
+
+#define TRACE_NODE(name, ...)
+
+#define trace_add_data(...)
+
+#endif
+
+#endif
--- a/thirdparty/astcenc/astcenc_entry.cpp
+++ b/thirdparty/astcenc/astcenc_entry.cpp
--- a/thirdparty/astcenc/astcenc_find_best_partitioning.cpp
+++ b/thirdparty/astcenc/astcenc_find_best_partitioning.cpp
@ -0,0 +1,780 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for finding best partition for a block.
+ *
+ * The partition search operates in two stages. The first pass uses kmeans clustering to group
+ * texels into an ideal partitioning for the requested partition count, and then compares that
+ * against the 1024 partitionings generated by the ASTC partition hash function. The generated
+ * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
+ * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
+ * partitionings that actually generate fewer than the requested partition count, but only the top
+ * N candidates are actually put through a more detailed search. N is determined by the compressor
+ * quality preset.
+ *
+ * For the detailed search, each candidate is checked against two possible encoding methods:
+ *
+ *   - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
+ *   - The best partitioning assuming same chroma colors (RGB + scale endpoints).
+ *
+ * This is implemented by computing the compute mean color and dominant direction for each
+ * partition. This defines two lines, both of which go through the mean color value.
+ *
+ * - One line has a direction defined by the dominant direction; this is used to assess the error
+ *   from using an uncorrelated color representation.
+ * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
+ *   (RGB + scale) color representation.
+ *
+ * The best candidate is selected by computing the squared-errors that result from using these
+ * lines for endpoint selection.
+ */
+
+#include <limits>
+#include "astcenc_internal.h"
+
+/**
+ * @brief Pick some initial kmeans cluster centers.
+ *
+ * @param      blk               The image block color data to compress.
+ * @param      texel_count       The number of texels in the block.
+ * @param      partition_count   The number of partitions in the block.
+ * @param[out] cluster_centers   The initial partition cluster center colors.
+ */
+static void kmeans_init(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	unsigned int clusters_selected = 0;
+	float distances[BLOCK_MAX_TEXELS];
+
+	// Pick a random sample as first cluster center; 145897 from random.org
+	unsigned int sample = 145897 % texel_count;
+	vfloat4 center_color = blk.texel(sample);
+	cluster_centers[clusters_selected] = center_color;
+	clusters_selected++;
+
+	// Compute the distance to the first cluster center
+	float distance_sum = 0.0f;
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		vfloat4 color = blk.texel(i);
+		vfloat4 diff = color - center_color;
+		float distance = dot_s(diff * diff, blk.channel_weight);
+		distance_sum += distance;
+		distances[i] = distance;
+	}
+
+	// More numbers from random.org for weighted-random center selection
+	const float cluster_cutoffs[9] {
+		0.626220f, 0.932770f, 0.275454f,
+		0.318558f, 0.240113f, 0.009190f,
+		0.347661f, 0.731960f, 0.156391f
+	};
+
+	unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
+
+	// Pick the remaining samples as needed
+	while (true)
+	{
+		// Pick the next center in a weighted-random fashion.
+		float summa = 0.0f;
+		float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
+		for (sample = 0; sample < texel_count; sample++)
+		{
+			summa += distances[sample];
+			if (summa >= distance_cutoff)
+			{
+				break;
+			}
+		}
+
+		// Clamp to a valid range and store the selected cluster center
+		sample = astc::min(sample, texel_count - 1);
+
+		center_color = blk.texel(sample);
+		cluster_centers[clusters_selected++] = center_color;
+		if (clusters_selected >= partition_count)
+		{
+			break;
+		}
+
+		// Compute the distance to the new cluster center, keep the min dist
+		distance_sum = 0.0f;
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			vfloat4 color = blk.texel(i);
+			vfloat4 diff = color - center_color;
+			float distance = dot_s(diff * diff, blk.channel_weight);
+			distance = astc::min(distance, distances[i]);
+			distance_sum += distance;
+			distances[i] = distance;
+		}
+	}
+}
+
+/**
+ * @brief Assign texels to clusters, based on a set of chosen center points.
+ *
+ * @param      blk                  The image block color data to compress.
+ * @param      texel_count          The number of texels in the block.
+ * @param      partition_count      The number of partitions in the block.
+ * @param      cluster_centers      The partition cluster center colors.
+ * @param[out] partition_of_texel   The partition assigned for each texel.
+ */
+static void kmeans_assign(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
+	uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
+
+	// Find the best partition for every texel
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		float best_distance = std::numeric_limits<float>::max();
+		unsigned int best_partition = 0;
+
+		vfloat4 color = blk.texel(i);
+		for (unsigned int j = 0; j < partition_count; j++)
+		{
+			vfloat4 diff = color - cluster_centers[j];
+			float distance = dot_s(diff * diff, blk.channel_weight);
+			if (distance < best_distance)
+			{
+				best_distance = distance;
+				best_partition = j;
+			}
+		}
+
+		partition_of_texel[i] = static_cast<uint8_t>(best_partition);
+		partition_texel_count[best_partition]++;
+	}
+
+	// It is possible to get a situation where a partition ends up without any texels. In this case,
+	// assign texel N to partition N. This is silly, but ensures that every partition retains at
+	// least one texel. Reassigning a texel in this manner may cause another partition to go empty,
+	// so if we actually did a reassignment, run the whole loop over again.
+	bool problem_case;
+	do
+	{
+		problem_case = false;
+		for (unsigned int i = 0; i < partition_count; i++)
+		{
+			if (partition_texel_count[i] == 0)
+			{
+				partition_texel_count[partition_of_texel[i]]--;
+				partition_texel_count[i]++;
+				partition_of_texel[i] = static_cast<uint8_t>(i);
+				problem_case = true;
+			}
+		}
+	} while (problem_case);
+}
+
+/**
+ * @brief Compute new cluster centers based on their center of gravity.
+ *
+ * @param       blk                  The image block color data to compress.
+ * @param       texel_count          The number of texels in the block.
+ * @param       partition_count      The number of partitions in the block.
+ * @param[out]  cluster_centers      The new cluster center colors.
+ * @param       partition_of_texel   The partition assigned for each texel.
+ */
+static void kmeans_update(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
+	const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
+		vfloat4::zero(),
+		vfloat4::zero(),
+		vfloat4::zero(),
+		vfloat4::zero()
+	};
+
+	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
+
+	// Find the center-of-gravity in each cluster
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		uint8_t partition = partition_of_texel[i];
+		color_sum[partition] += blk.texel(i);
+		partition_texel_count[partition]++;
+	}
+
+	// Set the center of gravity to be the new cluster center
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
+		cluster_centers[i] = color_sum[i] * scale;
+	}
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 2-partition mode.
+ *
+ * @param a   The texel assignment bitvector for the block.
+ * @param b   The texel assignment bitvector for the partition table.
+ *
+ * @return    The number of bit mismatches.
+ */
+static inline unsigned int partition_mismatch2(
+	const uint64_t a[2],
+	const uint64_t b[2]
+) {
+	int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
+	int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
+	return astc::min(v1, v2);
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 3-partition mode.
+ *
+ * @param a   The texel assignment bitvector for the block.
+ * @param b   The texel assignment bitvector for the partition table.
+ *
+ * @return    The number of bit mismatches.
+ */
+static inline unsigned int partition_mismatch3(
+	const uint64_t a[3],
+	const uint64_t b[3]
+) {
+	int p00 = popcount(a[0] ^ b[0]);
+	int p01 = popcount(a[0] ^ b[1]);
+	int p02 = popcount(a[0] ^ b[2]);
+
+	int p10 = popcount(a[1] ^ b[0]);
+	int p11 = popcount(a[1] ^ b[1]);
+	int p12 = popcount(a[1] ^ b[2]);
+
+	int p20 = popcount(a[2] ^ b[0]);
+	int p21 = popcount(a[2] ^ b[1]);
+	int p22 = popcount(a[2] ^ b[2]);
+
+	int s0 = p11 + p22;
+	int s1 = p12 + p21;
+	int v0 = astc::min(s0, s1) + p00;
+
+	int s2 = p10 + p22;
+	int s3 = p12 + p20;
+	int v1 = astc::min(s2, s3) + p01;
+
+	int s4 = p10 + p21;
+	int s5 = p11 + p20;
+	int v2 = astc::min(s4, s5) + p02;
+
+	return astc::min(v0, v1, v2);
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 4-partition mode.
+ *
+ * @param a   The texel assignment bitvector for the block.
+ * @param b   The texel assignment bitvector for the partition table.
+ *
+ * @return    The number of bit mismatches.
+ */
+static inline unsigned int partition_mismatch4(
+	const uint64_t a[4],
+	const uint64_t b[4]
+) {
+	int p00 = popcount(a[0] ^ b[0]);
+	int p01 = popcount(a[0] ^ b[1]);
+	int p02 = popcount(a[0] ^ b[2]);
+	int p03 = popcount(a[0] ^ b[3]);
+
+	int p10 = popcount(a[1] ^ b[0]);
+	int p11 = popcount(a[1] ^ b[1]);
+	int p12 = popcount(a[1] ^ b[2]);
+	int p13 = popcount(a[1] ^ b[3]);
+
+	int p20 = popcount(a[2] ^ b[0]);
+	int p21 = popcount(a[2] ^ b[1]);
+	int p22 = popcount(a[2] ^ b[2]);
+	int p23 = popcount(a[2] ^ b[3]);
+
+	int p30 = popcount(a[3] ^ b[0]);
+	int p31 = popcount(a[3] ^ b[1]);
+	int p32 = popcount(a[3] ^ b[2]);
+	int p33 = popcount(a[3] ^ b[3]);
+
+	int mx23 = astc::min(p22 + p33, p23 + p32);
+	int mx13 = astc::min(p21 + p33, p23 + p31);
+	int mx12 = astc::min(p21 + p32, p22 + p31);
+	int mx03 = astc::min(p20 + p33, p23 + p30);
+	int mx02 = astc::min(p20 + p32, p22 + p30);
+	int mx01 = astc::min(p21 + p30, p20 + p31);
+
+	int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
+	int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
+	int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
+	int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
+
+	return astc::min(v0, v1, v2, v3);
+}
+
+using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
+
+/**
+ * @brief Count the partition table mismatches vs the data clustering.
+ *
+ * @param      bsd               The block size information.
+ * @param      partition_count   The number of partitions in the block.
+ * @param      bitmaps           The block texel partition assignment patterns.
+ * @param[out] mismatch_counts   The array storing per partitioning mismatch counts.
+ */
+static void count_partition_mismatch_bits(
+	const block_size_descriptor& bsd,
+	unsigned int partition_count,
+	const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
+	unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
+) {
+	unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
+	promise(active_count > 0);
+
+	if (partition_count == 2)
+	{
+		for (unsigned int i = 0; i < active_count; i++)
+		{
+			mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
+		}
+	}
+	else if (partition_count == 3)
+	{
+		for (unsigned int i = 0; i < active_count; i++)
+		{
+			mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
+		}
+	}
+	else
+	{
+		for (unsigned int i = 0; i < active_count; i++)
+		{
+			mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
+		}
+	}
+}
+
+/**
+ * @brief Use counting sort on the mismatch array to sort partition candidates.
+ *
+ * @param      partitioning_count   The number of packed partitionings.
+ * @param      mismatch_count       Partitioning mismatch counts, in index order.
+ * @param[out] partition_ordering   Partition index values, in mismatch order.
+ *
+ * @return The number of active partitions in this selection.
+ */
+static unsigned int get_partition_ordering_by_mismatch_bits(
+	unsigned int partitioning_count,
+	const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
+	unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+	promise(partitioning_count > 0);
+	unsigned int mscount[256] { 0 };
+
+	// Create the histogram of mismatch counts
+	for (unsigned int i = 0; i < partitioning_count; i++)
+	{
+		mscount[mismatch_count[i]]++;
+	}
+
+	unsigned int active_count = partitioning_count - mscount[255];
+
+	// Create a running sum from the histogram array
+	// Cells store previous values only; i.e. exclude self after sum
+	unsigned int summa = 0;
+	for (unsigned int i = 0; i < 256; i++)
+	{
+		unsigned int cnt = mscount[i];
+		mscount[i] = summa;
+		summa += cnt;
+	}
+
+	// Use the running sum as the index, incrementing after read to allow
+	// sequential entries with the same count
+	for (unsigned int i = 0; i < partitioning_count; i++)
+	{
+		unsigned int idx = mscount[mismatch_count[i]]++;
+		partition_ordering[idx] = i;
+	}
+
+	return active_count;
+}
+
+/**
+ * @brief Use k-means clustering to compute a partition ordering for a block..
+ *
+ * @param      bsd                  The block size information.
+ * @param      blk                  The image block color data to compress.
+ * @param      partition_count      The desired number of partitions in the block.
+ * @param[out] partition_ordering   The list of recommended partition indices, in priority order.
+ *
+ * @return The number of active partitionings in this selection.
+ */
+static unsigned int compute_kmeans_partition_ordering(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
+	uint8_t texel_partitions[BLOCK_MAX_TEXELS];
+
+	// Use three passes of k-means clustering to partition the block data
+	for (unsigned int i = 0; i < 3; i++)
+	{
+		if (i == 0)
+		{
+			kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
+		}
+		else
+		{
+			kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
+		}
+
+		kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
+	}
+
+	// Construct the block bitmaps of texel assignments to each partition
+	uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
+	unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
+	promise(texels_to_process > 0);
+	for (unsigned int i = 0; i < texels_to_process; i++)
+	{
+		unsigned int idx = bsd.kmeans_texels[i];
+		bitmaps[texel_partitions[idx]] |= 1ULL << i;
+	}
+
+	// Count the mismatch between the block and the format's partition tables
+	unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS];
+	count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
+
+	// Sort the partitions based on the number of mismatched bits
+	return get_partition_ordering_by_mismatch_bits(
+	    bsd.partitioning_count_selected[partition_count - 1],
+	    mismatch_counts, partition_ordering);
+}
+
+/**
+ * @brief Insert a partitioning into an order list of results, sorted by error.
+ *
+ * @param      max_values      The max number of entries in the best result arrays.
+ * @param      this_error      The error of the new entry.
+ * @param      this_partition  The partition ID of the new entry.
+ * @param[out] best_errors     The array of best error values.
+ * @param[out] best_partitions The array of best partition values.
+ */
+static void insert_result(
+	unsigned int max_values,
+	float this_error,
+	unsigned int this_partition,
+	float* best_errors,
+	unsigned int* best_partitions)
+{
+	promise(max_values > 0);
+
+	// Don't bother searching if the current worst error beats the new error
+	if (this_error >= best_errors[max_values - 1])
+	{
+		return;
+	}
+
+	// Else insert into the list in error-order
+	for (unsigned int i = 0; i < max_values; i++)
+	{
+		// Existing result is better - move on ...
+		if (this_error > best_errors[i])
+		{
+			continue;
+		}
+
+		// Move existing results down one
+		for (unsigned int j = max_values - 1; j > i; j--)
+		{
+			best_errors[j] = best_errors[j - 1];
+			best_partitions[j] = best_partitions[j - 1];
+		}
+
+		// Insert new result
+		best_errors[i] = this_error;
+		best_partitions[i] = this_partition;
+		break;
+	}
+}
+
+/* See header for documentation. */
+unsigned int find_best_partition_candidates(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	unsigned int partition_search_limit,
+	unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
+	unsigned int requested_candidates
+) {
+	// Constant used to estimate quantization error for a given partitioning; the optimal value for
+	// this depends on bitrate. These values have been determined empirically.
+	unsigned int texels_per_block = bsd.texel_count;
+	float weight_imprecision_estim = 0.055f;
+	if (texels_per_block <= 20)
+	{
+		weight_imprecision_estim = 0.03f;
+	}
+	else if (texels_per_block <= 31)
+	{
+		weight_imprecision_estim = 0.04f;
+	}
+	else if (texels_per_block <= 41)
+	{
+		weight_imprecision_estim = 0.05f;
+	}
+
+	promise(partition_count > 0);
+	promise(partition_search_limit > 0);
+
+	weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
+
+	unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
+	unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
+	partition_search_limit = astc::min(partition_search_limit, sequence_len);
+	requested_candidates = astc::min(partition_search_limit, requested_candidates);
+
+	bool uses_alpha = !blk.is_constant_channel(3);
+
+	// Partitioning errors assuming uncorrelated-chrominance endpoints
+	float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
+	unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
+
+	// Partitioning errors assuming same-chrominance endpoints
+	float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
+	unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
+
+	for (unsigned int i = 0; i < requested_candidates; i++)
+	{
+		uncor_best_errors[i] = ERROR_CALC_DEFAULT;
+		samec_best_errors[i] = ERROR_CALC_DEFAULT;
+	}
+
+	if (uses_alpha)
+	{
+		for (unsigned int i = 0; i < partition_search_limit; i++)
+		{
+			unsigned int partition = partition_sequence[i];
+			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
+
+			// Compute weighting to give to each component in each partition
+			partition_metrics pms[BLOCK_MAX_PARTITIONS];
+
+			compute_avgs_and_dirs_4_comp(pi, blk, pms);
+
+			line4 uncor_lines[BLOCK_MAX_PARTITIONS];
+			line4 samec_lines[BLOCK_MAX_PARTITIONS];
+
+			processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
+			processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
+
+			float uncor_line_lens[BLOCK_MAX_PARTITIONS];
+			float samec_line_lens[BLOCK_MAX_PARTITIONS];
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				partition_metrics& pm = pms[j];
+
+				uncor_lines[j].a = pm.avg;
+				uncor_lines[j].b = normalize_safe(pm.dir, unit4());
+
+				uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
+				uncor_plines[j].bs = uncor_lines[j].b;
+
+				samec_lines[j].a = vfloat4::zero();
+				samec_lines[j].b = normalize_safe(pm.avg, unit4());
+
+				samec_plines[j].amod = vfloat4::zero();
+				samec_plines[j].bs = samec_lines[j].b;
+			}
+
+			float uncor_error = 0.0f;
+			float samec_error = 0.0f;
+
+			compute_error_squared_rgba(pi,
+			                           blk,
+			                           uncor_plines,
+			                           samec_plines,
+			                           uncor_line_lens,
+			                           samec_line_lens,
+			                           uncor_error,
+			                           samec_error);
+
+			// Compute an estimate of error introduced by weight quantization imprecision.
+			// This error is computed as follows, for each partition
+			//     1: compute the principal-axis vector (full length) in error-space
+			//     2: convert the principal-axis vector to regular RGB-space
+			//     3: scale the vector by a constant that estimates average quantization error
+			//     4: for each texel, square the vector, then do a dot-product with the texel's
+			//        error weight; sum up the results across all texels.
+			//     4(optimized): square the vector once, then do a dot-product with the average
+			//        texel error, then multiply by the number of texels.
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				float tpp = static_cast<float>(pi.partition_texel_count[j]);
+				vfloat4 error_weights(tpp * weight_imprecision_estim);
+
+				vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j];
+				vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j];
+
+				uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
+				samec_error += dot_s(samec_vector * samec_vector, error_weights);
+			}
+
+			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
+		}
+	}
+	else
+	{
+		for (unsigned int i = 0; i < partition_search_limit; i++)
+		{
+			unsigned int partition = partition_sequence[i];
+			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
+
+			// Compute weighting to give to each component in each partition
+			partition_metrics pms[BLOCK_MAX_PARTITIONS];
+			compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
+
+			partition_lines3 plines[BLOCK_MAX_PARTITIONS];
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				partition_metrics& pm = pms[j];
+				partition_lines3& pl = plines[j];
+
+				pl.uncor_line.a = pm.avg;
+				pl.uncor_line.b = normalize_safe(pm.dir, unit3());
+
+				pl.samec_line.a = vfloat4::zero();
+				pl.samec_line.b = normalize_safe(pm.avg, unit3());
+
+				pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
+				pl.uncor_pline.bs   = pl.uncor_line.b;
+
+				pl.samec_pline.amod = vfloat4::zero();
+				pl.samec_pline.bs   = pl.samec_line.b;
+			}
+
+			float uncor_error = 0.0f;
+			float samec_error = 0.0f;
+
+			compute_error_squared_rgb(pi,
+			                          blk,
+			                          plines,
+			                          uncor_error,
+			                          samec_error);
+
+			// Compute an estimate of error introduced by weight quantization imprecision.
+			// This error is computed as follows, for each partition
+			//     1: compute the principal-axis vector (full length) in error-space
+			//     2: convert the principal-axis vector to regular RGB-space
+			//     3: scale the vector by a constant that estimates average quantization error
+			//     4: for each texel, square the vector, then do a dot-product with the texel's
+			//        error weight; sum up the results across all texels.
+			//     4(optimized): square the vector once, then do a dot-product with the average
+			//        texel error, then multiply by the number of texels.
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				partition_lines3& pl = plines[j];
+
+				float tpp = static_cast<float>(pi.partition_texel_count[j]);
+				vfloat4 error_weights(tpp * weight_imprecision_estim);
+
+				vfloat4 uncor_vector = pl.uncor_line.b * pl.uncor_line_len;
+				vfloat4 samec_vector = pl.samec_line.b * pl.samec_line_len;
+
+				uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
+				samec_error += dot3_s(samec_vector * samec_vector, error_weights);
+			}
+
+			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
+		}
+	}
+
+	bool best_is_uncor = uncor_best_partitions[0] > samec_best_partitions[0];
+
+	unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
+	for (unsigned int i = 0; i < requested_candidates; i++)
+	{
+		if (best_is_uncor)
+		{
+			interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
+			interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
+		}
+		else
+		{
+			interleave[2 * i] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
+			interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
+		}
+	}
+
+	uint64_t bitmasks[1024/64] { 0 };
+	unsigned int emitted = 0;
+
+	// Deduplicate the first "requested" entries
+	for (unsigned int i = 0; i < requested_candidates * 2;  i++)
+	{
+		unsigned int partition = interleave[i];
+
+		unsigned int word = partition / 64;
+		unsigned int bit = partition % 64;
+
+		bool written = bitmasks[word] & (1ull << bit);
+
+		if (!written)
+		{
+			best_partitions[emitted] = partition;
+			bitmasks[word] |= 1ull << bit;
+			emitted++;
+
+			if (emitted == requested_candidates)
+			{
+				break;
+			}
+		}
+	}
+
+	return emitted;
+}
+
+#endif
--- a/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp
+++ b/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp
--- a/thirdparty/astcenc/astcenc_image.cpp
+++ b/thirdparty/astcenc/astcenc_image.cpp
@ -0,0 +1,558 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for creating in-memory ASTC image structures.
+ */
+
+#include <cassert>
+#include <cstring>
+
+#include "astcenc_internal.h"
+
+/**
+ * @brief Loader pipeline function type for data fetch from memory.
+ */
+using pixel_loader = vfloat4(*)(const void*, int);
+
+/**
+ * @brief Loader pipeline function type for swizzling data in a vector.
+ */
+using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
+
+/**
+ * @brief Loader pipeline function type for converting data in a vector to LNS.
+ */
+using pixel_converter = vfloat4(*)(vfloat4, vmask4);
+
+/**
+ * @brief Load a 8-bit UNORM texel from a data array.
+ *
+ * @param data          The data pointer.
+ * @param base_offset   The index offset to the start of the pixel.
+ */
+static vfloat4 load_texel_u8(
+	const void* data,
+	int base_offset
+) {
+	const uint8_t* data8 = static_cast<const uint8_t*>(data);
+	return int_to_float(vint4(data8 + base_offset)) / 255.0f;
+}
+
+/**
+ * @brief Load a 16-bit fp16 texel from a data array.
+ *
+ * @param data          The data pointer.
+ * @param base_offset   The index offset to the start of the pixel.
+ */
+static vfloat4 load_texel_f16(
+	const void* data,
+	int base_offset
+) {
+	const uint16_t* data16 = static_cast<const uint16_t*>(data);
+	int r = data16[base_offset    ];
+	int g = data16[base_offset + 1];
+	int b = data16[base_offset + 2];
+	int a = data16[base_offset + 3];
+	return float16_to_float(vint4(r, g, b, a));
+}
+
+/**
+ * @brief Load a 32-bit float texel from a data array.
+ *
+ * @param data          The data pointer.
+ * @param base_offset   The index offset to the start of the pixel.
+ */
+static vfloat4 load_texel_f32(
+	const void* data,
+	int base_offset
+) {
+	const float* data32 = static_cast<const float*>(data);
+	return vfloat4(data32 + base_offset);
+}
+
+/**
+ * @brief Dummy no-op swizzle function.
+ *
+ * @param data   The source RGBA vector to swizzle.
+ * @param swz    The swizzle to use.
+ */
+static vfloat4 swz_texel_skip(
+	vfloat4 data,
+	const astcenc_swizzle& swz
+) {
+	(void)swz;
+	return data;
+}
+
+/**
+ * @brief Swizzle a texel into a new arrangement.
+ *
+ * @param data   The source RGBA vector to swizzle.
+ * @param swz    The swizzle to use.
+ */
+static vfloat4 swz_texel(
+	vfloat4 data,
+	const astcenc_swizzle& swz
+) {
+	alignas(16) float datas[6];
+
+	storea(data, datas);
+	datas[ASTCENC_SWZ_0] = 0.0f;
+	datas[ASTCENC_SWZ_1] = 1.0f;
+
+	return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
+}
+
+/**
+ * @brief Encode a texel that is entirely LDR linear.
+ *
+ * @param data       The RGBA data to encode.
+ * @param lns_mask   The mask for the HDR channels than need LNS encoding.
+ */
+static vfloat4 encode_texel_unorm(
+	vfloat4 data,
+	vmask4 lns_mask
+) {
+	(void)lns_mask;
+	return data * 65535.0f;
+}
+
+/**
+ * @brief Encode a texel that includes at least some HDR LNS texels.
+ *
+ * @param data       The RGBA data to encode.
+ * @param lns_mask   The mask for the HDR channels than need LNS encoding.
+ */
+static vfloat4 encode_texel_lns(
+	vfloat4 data,
+	vmask4 lns_mask
+) {
+	vfloat4 datav_unorm = data * 65535.0f;
+	vfloat4 datav_lns = float_to_lns(data);
+	return select(datav_unorm, datav_lns, lns_mask);
+}
+
+/* See header for documentation. */
+void load_image_block(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	unsigned int xsize = img.dim_x;
+	unsigned int ysize = img.dim_y;
+	unsigned int zsize = img.dim_z;
+
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	// True if any non-identity swizzle
+	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
+	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
+
+	int idx = 0;
+
+	vfloat4 data_min(1e38f);
+	vfloat4 data_mean(0.0f);
+	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
+	vfloat4 data_max(-1e38f);
+	vmask4 grayscalev(true);
+
+	// This works because we impose the same choice everywhere during encode
+	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
+	                  (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
+	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
+	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
+	vmask4 lns_mask = use_lns != vint4::zero();
+
+	// Set up the function pointers for loading pipeline as needed
+	pixel_loader loader = load_texel_u8;
+	if (img.data_type == ASTCENC_TYPE_F16)
+	{
+		loader = load_texel_f16;
+	}
+	else if  (img.data_type == ASTCENC_TYPE_F32)
+	{
+		loader = load_texel_f32;
+	}
+
+	pixel_swizzler swizzler = swz_texel_skip;
+	if (needs_swz)
+	{
+		swizzler = swz_texel;
+	}
+
+	pixel_converter converter = encode_texel_unorm;
+	if (any(lns_mask))
+	{
+		converter = encode_texel_lns;
+	}
+
+	for (unsigned int z = 0; z < bsd.zdim; z++)
+	{
+		unsigned int zi = astc::min(zpos + z, zsize - 1);
+		void* plane = img.data[zi];
+
+		for (unsigned int y = 0; y < bsd.ydim; y++)
+		{
+			unsigned int yi = astc::min(ypos + y, ysize - 1);
+
+			for (unsigned int x = 0; x < bsd.xdim; x++)
+			{
+				unsigned int xi = astc::min(xpos + x, xsize - 1);
+
+				vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
+				datav = swizzler(datav, swz);
+				datav = converter(datav, lns_mask);
+
+				// Compute block metadata
+				data_min = min(data_min, datav);
+				data_mean += datav * data_mean_scale;
+				data_max = max(data_max, datav);
+
+				grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
+
+				blk.data_r[idx] = datav.lane<0>();
+				blk.data_g[idx] = datav.lane<1>();
+				blk.data_b[idx] = datav.lane<2>();
+				blk.data_a[idx] = datav.lane<3>();
+
+				blk.rgb_lns[idx] = rgb_lns;
+				blk.alpha_lns[idx] = a_lns;
+
+				idx++;
+			}
+		}
+	}
+
+	// Reverse the encoding so we store origin block in the original format
+	vfloat4 data_enc = blk.texel(0);
+	vfloat4 data_enc_unorm = data_enc / 65535.0f;
+	vfloat4 data_enc_lns = vfloat4::zero();
+
+	if (rgb_lns || a_lns)
+	{
+		data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
+	}
+
+	blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
+
+	// Store block metadata
+	blk.data_min = data_min;
+	blk.data_mean = data_mean;
+	blk.data_max = data_max;
+	blk.grayscale = all(grayscalev);
+}
+
+/* See header for documentation. */
+void load_image_block_fast_ldr(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	(void)swz;
+	(void)decode_mode;
+
+	unsigned int xsize = img.dim_x;
+	unsigned int ysize = img.dim_y;
+
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	vfloat4 data_min(1e38f);
+	vfloat4 data_mean = vfloat4::zero();
+	vfloat4 data_max(-1e38f);
+	vmask4 grayscalev(true);
+	int idx = 0;
+
+	const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
+	for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
+	{
+		unsigned int yi = astc::min(y, ysize - 1);
+
+		for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
+		{
+			unsigned int xi = astc::min(x, xsize - 1);
+
+			vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));
+			vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
+
+			// Compute block metadata
+			data_min = min(data_min, datav);
+			data_mean += datav;
+			data_max = max(data_max, datav);
+
+			grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
+
+			blk.data_r[idx] = datav.lane<0>();
+			blk.data_g[idx] = datav.lane<1>();
+			blk.data_b[idx] = datav.lane<2>();
+			blk.data_a[idx] = datav.lane<3>();
+
+			idx++;
+		}
+	}
+
+	// Reverse the encoding so we store origin block in the original format
+	blk.origin_texel = blk.texel(0) / 65535.0f;
+
+	// Store block metadata
+	blk.rgb_lns[0] = 0;
+	blk.alpha_lns[0] = 0;
+	blk.data_min = data_min;
+	blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
+	blk.data_max = data_max;
+	blk.grayscale = all(grayscalev);
+}
+
+/* See header for documentation. */
+void store_image_block(
+	astcenc_image& img,
+	const image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	unsigned int x_size = img.dim_x;
+	unsigned int x_start = xpos;
+	unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
+	unsigned int x_count = x_end - x_start;
+	unsigned int x_nudge = bsd.xdim - x_count;
+
+	unsigned int y_size = img.dim_y;
+	unsigned int y_start = ypos;
+	unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
+	unsigned int y_count = y_end - y_start;
+	unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
+
+	unsigned int z_size = img.dim_z;
+	unsigned int z_start = zpos;
+	unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
+
+	// True if any non-identity swizzle
+	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
+	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
+
+	// True if any swizzle uses Z reconstruct
+	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
+	               (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
+
+	int idx = 0;
+	if (img.data_type == ASTCENC_TYPE_U8)
+	{
+		for (unsigned int z = z_start; z < z_end; z++)
+		{
+			// Fetch the image plane
+			uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
+
+			for (unsigned int y = y_start; y < y_end; y++)
+			{
+				uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
+
+				for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
+				{
+					unsigned int max_texels = ASTCENC_SIMD_WIDTH;
+					unsigned int used_texels = astc::min(x_count - x, max_texels);
+
+					// Unaligned load as rows are not always SIMD_WIDTH long
+					vfloat data_r(blk.data_r + idx);
+					vfloat data_g(blk.data_g + idx);
+					vfloat data_b(blk.data_b + idx);
+					vfloat data_a(blk.data_a + idx);
+
+					vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
+					vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
+					vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
+					vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
+
+					if (needs_swz)
+					{
+						vint swizzle_table[7];
+						swizzle_table[ASTCENC_SWZ_0] = vint(0);
+						swizzle_table[ASTCENC_SWZ_1] = vint(255);
+						swizzle_table[ASTCENC_SWZ_R] = data_ri;
+						swizzle_table[ASTCENC_SWZ_G] = data_gi;
+						swizzle_table[ASTCENC_SWZ_B] = data_bi;
+						swizzle_table[ASTCENC_SWZ_A] = data_ai;
+
+						if (needs_z)
+						{
+							vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
+							vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
+							vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
+							data_z = max(data_z, 0.0f);
+							data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
+
+							swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
+						}
+
+						data_ri = swizzle_table[swz.r];
+						data_gi = swizzle_table[swz.g];
+						data_bi = swizzle_table[swz.b];
+						data_ai = swizzle_table[swz.a];
+					}
+
+					// Errors are NaN encoded - convert to magenta error color
+					// Branch is OK here - it is almost never true so predicts well
+					vmask nan_mask = data_r != data_r;
+					if (any(nan_mask))
+					{
+						data_ri = select(data_ri, vint(0xFF), nan_mask);
+						data_gi = select(data_gi, vint(0x00), nan_mask);
+						data_bi = select(data_bi, vint(0xFF), nan_mask);
+						data_ai = select(data_ai, vint(0xFF), nan_mask);
+					}
+
+					vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
+					vmask store_mask = vint::lane_id() < vint(used_texels);
+					store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask);
+
+					data8_row += ASTCENC_SIMD_WIDTH * 4;
+					idx += used_texels;
+				}
+				idx += x_nudge;
+			}
+			idx += y_nudge;
+		}
+	}
+	else if (img.data_type == ASTCENC_TYPE_F16)
+	{
+		for (unsigned int z = z_start; z < z_end; z++)
+		{
+			// Fetch the image plane
+			uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
+
+			for (unsigned int y = y_start; y < y_end; y++)
+			{
+				uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
+
+				for (unsigned int x = 0; x < x_count; x++)
+				{
+					vint4 color;
+
+					// NaNs are handled inline - no need to special case
+					if (needs_swz)
+					{
+						float data[7];
+						data[ASTCENC_SWZ_0] = 0.0f;
+						data[ASTCENC_SWZ_1] = 1.0f;
+						data[ASTCENC_SWZ_R] = blk.data_r[idx];
+						data[ASTCENC_SWZ_G] = blk.data_g[idx];
+						data[ASTCENC_SWZ_B] = blk.data_b[idx];
+						data[ASTCENC_SWZ_A] = blk.data_a[idx];
+
+						if (needs_z)
+						{
+							float xN = (data[0] * 2.0f) - 1.0f;
+							float yN = (data[3] * 2.0f) - 1.0f;
+							float zN = 1.0f - xN * xN - yN * yN;
+							if (zN < 0.0f)
+							{
+								zN = 0.0f;
+							}
+							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
+						}
+
+						vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+						color = float_to_float16(colorf);
+					}
+					else
+					{
+						vfloat4 colorf = blk.texel(idx);
+						color = float_to_float16(colorf);
+					}
+
+					// TODO: Vectorize with store N shorts?
+					data16_row[0] = static_cast<uint16_t>(color.lane<0>());
+					data16_row[1] = static_cast<uint16_t>(color.lane<1>());
+					data16_row[2] = static_cast<uint16_t>(color.lane<2>());
+					data16_row[3] = static_cast<uint16_t>(color.lane<3>());
+					data16_row += 4;
+					idx++;
+				}
+				idx += x_nudge;
+			}
+			idx += y_nudge;
+		}
+	}
+	else // if (img.data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img.data_type == ASTCENC_TYPE_F32);
+
+		for (unsigned int z = z_start; z < z_end; z++)
+		{
+			// Fetch the image plane
+			float* data32 = static_cast<float*>(img.data[z]);
+
+			for (unsigned int y = y_start; y < y_end; y++)
+			{
+				float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
+
+				for (unsigned int x = 0; x < x_count; x++)
+				{
+					vfloat4 color = blk.texel(idx);
+
+					// NaNs are handled inline - no need to special case
+					if (needs_swz)
+					{
+						float data[7];
+						data[ASTCENC_SWZ_0] = 0.0f;
+						data[ASTCENC_SWZ_1] = 1.0f;
+						data[ASTCENC_SWZ_R] = color.lane<0>();
+						data[ASTCENC_SWZ_G] = color.lane<1>();
+						data[ASTCENC_SWZ_B] = color.lane<2>();
+						data[ASTCENC_SWZ_A] = color.lane<3>();
+
+						if (needs_z)
+						{
+							float xN = (data[0] * 2.0f) - 1.0f;
+							float yN = (data[3] * 2.0f) - 1.0f;
+							float zN = 1.0f - xN * xN - yN * yN;
+							if (zN < 0.0f)
+							{
+								zN = 0.0f;
+							}
+							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
+						}
+
+						color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+					}
+
+					store(color, data32_row);
+					data32_row += 4;
+					idx++;
+				}
+				idx += x_nudge;
+			}
+			idx += y_nudge;
+		}
+	}
+}
--- a/thirdparty/astcenc/astcenc_integer_sequence.cpp
+++ b/thirdparty/astcenc/astcenc_integer_sequence.cpp
@ -0,0 +1,739 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for encoding/decoding Bounded Integer Sequence Encoding.
+ */
+
+#include "astcenc_internal.h"
+
+#include <array>
+
+/** @brief Unpacked quint triplets <low,middle,high> for each packed value */
+// TODO: Bitpack these into a uint16_t?
+static const uint8_t quints_of_integer[128][3] {
+	{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
+	{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
+	{0, 1, 0}, {1, 1, 0}, {2, 1, 0}, {3, 1, 0},
+	{4, 1, 0}, {1, 4, 0}, {4, 4, 1}, {4, 4, 4},
+	{0, 2, 0}, {1, 2, 0}, {2, 2, 0}, {3, 2, 0},
+	{4, 2, 0}, {2, 4, 0}, {4, 4, 2}, {4, 4, 4},
+	{0, 3, 0}, {1, 3, 0}, {2, 3, 0}, {3, 3, 0},
+	{4, 3, 0}, {3, 4, 0}, {4, 4, 3}, {4, 4, 4},
+	{0, 0, 1}, {1, 0, 1}, {2, 0, 1}, {3, 0, 1},
+	{4, 0, 1}, {0, 4, 1}, {4, 0, 4}, {0, 4, 4},
+	{0, 1, 1}, {1, 1, 1}, {2, 1, 1}, {3, 1, 1},
+	{4, 1, 1}, {1, 4, 1}, {4, 1, 4}, {1, 4, 4},
+	{0, 2, 1}, {1, 2, 1}, {2, 2, 1}, {3, 2, 1},
+	{4, 2, 1}, {2, 4, 1}, {4, 2, 4}, {2, 4, 4},
+	{0, 3, 1}, {1, 3, 1}, {2, 3, 1}, {3, 3, 1},
+	{4, 3, 1}, {3, 4, 1}, {4, 3, 4}, {3, 4, 4},
+	{0, 0, 2}, {1, 0, 2}, {2, 0, 2}, {3, 0, 2},
+	{4, 0, 2}, {0, 4, 2}, {2, 0, 4}, {3, 0, 4},
+	{0, 1, 2}, {1, 1, 2}, {2, 1, 2}, {3, 1, 2},
+	{4, 1, 2}, {1, 4, 2}, {2, 1, 4}, {3, 1, 4},
+	{0, 2, 2}, {1, 2, 2}, {2, 2, 2}, {3, 2, 2},
+	{4, 2, 2}, {2, 4, 2}, {2, 2, 4}, {3, 2, 4},
+	{0, 3, 2}, {1, 3, 2}, {2, 3, 2}, {3, 3, 2},
+	{4, 3, 2}, {3, 4, 2}, {2, 3, 4}, {3, 3, 4},
+	{0, 0, 3}, {1, 0, 3}, {2, 0, 3}, {3, 0, 3},
+	{4, 0, 3}, {0, 4, 3}, {0, 0, 4}, {1, 0, 4},
+	{0, 1, 3}, {1, 1, 3}, {2, 1, 3}, {3, 1, 3},
+	{4, 1, 3}, {1, 4, 3}, {0, 1, 4}, {1, 1, 4},
+	{0, 2, 3}, {1, 2, 3}, {2, 2, 3}, {3, 2, 3},
+	{4, 2, 3}, {2, 4, 3}, {0, 2, 4}, {1, 2, 4},
+	{0, 3, 3}, {1, 3, 3}, {2, 3, 3}, {3, 3, 3},
+	{4, 3, 3}, {3, 4, 3}, {0, 3, 4}, {1, 3, 4}
+};
+
+/** @brief Packed quint values for each unpacked value, indexed [hi][mid][lo]. */
+static const uint8_t integer_of_quints[5][5][5] {
+	{
+		{0, 1, 2, 3, 4},
+		{8, 9, 10, 11, 12},
+		{16, 17, 18, 19, 20},
+		{24, 25, 26, 27, 28},
+		{5, 13, 21, 29, 6}
+	},
+	{
+		{32, 33, 34, 35, 36},
+		{40, 41, 42, 43, 44},
+		{48, 49, 50, 51, 52},
+		{56, 57, 58, 59, 60},
+		{37, 45, 53, 61, 14}
+	},
+	{
+		{64, 65, 66, 67, 68},
+		{72, 73, 74, 75, 76},
+		{80, 81, 82, 83, 84},
+		{88, 89, 90, 91, 92},
+		{69, 77, 85, 93, 22}
+	},
+	{
+		{96, 97, 98, 99, 100},
+		{104, 105, 106, 107, 108},
+		{112, 113, 114, 115, 116},
+		{120, 121, 122, 123, 124},
+		{101, 109, 117, 125, 30}
+	},
+	{
+		{102, 103, 70, 71, 38},
+		{110, 111, 78, 79, 46},
+		{118, 119, 86, 87, 54},
+		{126, 127, 94, 95, 62},
+		{39, 47, 55, 63, 31}
+	}
+};
+
+/** @brief Unpacked trit quintuplets <low,...,high> for each packed value */
+// TODO: Bitpack these into a uint16_t?
+static const uint8_t trits_of_integer[256][5] {
+	{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
+	{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
+	{0, 2, 0, 0, 0}, {1, 2, 0, 0, 0}, {2, 2, 0, 0, 0}, {2, 0, 2, 0, 0},
+	{0, 2, 2, 0, 0}, {1, 2, 2, 0, 0}, {2, 2, 2, 0, 0}, {2, 0, 2, 0, 0},
+	{0, 0, 1, 0, 0}, {1, 0, 1, 0, 0}, {2, 0, 1, 0, 0}, {0, 1, 2, 0, 0},
+	{0, 1, 1, 0, 0}, {1, 1, 1, 0, 0}, {2, 1, 1, 0, 0}, {1, 1, 2, 0, 0},
+	{0, 2, 1, 0, 0}, {1, 2, 1, 0, 0}, {2, 2, 1, 0, 0}, {2, 1, 2, 0, 0},
+	{0, 0, 0, 2, 2}, {1, 0, 0, 2, 2}, {2, 0, 0, 2, 2}, {0, 0, 2, 2, 2},
+	{0, 0, 0, 1, 0}, {1, 0, 0, 1, 0}, {2, 0, 0, 1, 0}, {0, 0, 2, 1, 0},
+	{0, 1, 0, 1, 0}, {1, 1, 0, 1, 0}, {2, 1, 0, 1, 0}, {1, 0, 2, 1, 0},
+	{0, 2, 0, 1, 0}, {1, 2, 0, 1, 0}, {2, 2, 0, 1, 0}, {2, 0, 2, 1, 0},
+	{0, 2, 2, 1, 0}, {1, 2, 2, 1, 0}, {2, 2, 2, 1, 0}, {2, 0, 2, 1, 0},
+	{0, 0, 1, 1, 0}, {1, 0, 1, 1, 0}, {2, 0, 1, 1, 0}, {0, 1, 2, 1, 0},
+	{0, 1, 1, 1, 0}, {1, 1, 1, 1, 0}, {2, 1, 1, 1, 0}, {1, 1, 2, 1, 0},
+	{0, 2, 1, 1, 0}, {1, 2, 1, 1, 0}, {2, 2, 1, 1, 0}, {2, 1, 2, 1, 0},
+	{0, 1, 0, 2, 2}, {1, 1, 0, 2, 2}, {2, 1, 0, 2, 2}, {1, 0, 2, 2, 2},
+	{0, 0, 0, 2, 0}, {1, 0, 0, 2, 0}, {2, 0, 0, 2, 0}, {0, 0, 2, 2, 0},
+	{0, 1, 0, 2, 0}, {1, 1, 0, 2, 0}, {2, 1, 0, 2, 0}, {1, 0, 2, 2, 0},
+	{0, 2, 0, 2, 0}, {1, 2, 0, 2, 0}, {2, 2, 0, 2, 0}, {2, 0, 2, 2, 0},
+	{0, 2, 2, 2, 0}, {1, 2, 2, 2, 0}, {2, 2, 2, 2, 0}, {2, 0, 2, 2, 0},
+	{0, 0, 1, 2, 0}, {1, 0, 1, 2, 0}, {2, 0, 1, 2, 0}, {0, 1, 2, 2, 0},
+	{0, 1, 1, 2, 0}, {1, 1, 1, 2, 0}, {2, 1, 1, 2, 0}, {1, 1, 2, 2, 0},
+	{0, 2, 1, 2, 0}, {1, 2, 1, 2, 0}, {2, 2, 1, 2, 0}, {2, 1, 2, 2, 0},
+	{0, 2, 0, 2, 2}, {1, 2, 0, 2, 2}, {2, 2, 0, 2, 2}, {2, 0, 2, 2, 2},
+	{0, 0, 0, 0, 2}, {1, 0, 0, 0, 2}, {2, 0, 0, 0, 2}, {0, 0, 2, 0, 2},
+	{0, 1, 0, 0, 2}, {1, 1, 0, 0, 2}, {2, 1, 0, 0, 2}, {1, 0, 2, 0, 2},
+	{0, 2, 0, 0, 2}, {1, 2, 0, 0, 2}, {2, 2, 0, 0, 2}, {2, 0, 2, 0, 2},
+	{0, 2, 2, 0, 2}, {1, 2, 2, 0, 2}, {2, 2, 2, 0, 2}, {2, 0, 2, 0, 2},
+	{0, 0, 1, 0, 2}, {1, 0, 1, 0, 2}, {2, 0, 1, 0, 2}, {0, 1, 2, 0, 2},
+	{0, 1, 1, 0, 2}, {1, 1, 1, 0, 2}, {2, 1, 1, 0, 2}, {1, 1, 2, 0, 2},
+	{0, 2, 1, 0, 2}, {1, 2, 1, 0, 2}, {2, 2, 1, 0, 2}, {2, 1, 2, 0, 2},
+	{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 0, 2, 2, 2},
+	{0, 0, 0, 0, 1}, {1, 0, 0, 0, 1}, {2, 0, 0, 0, 1}, {0, 0, 2, 0, 1},
+	{0, 1, 0, 0, 1}, {1, 1, 0, 0, 1}, {2, 1, 0, 0, 1}, {1, 0, 2, 0, 1},
+	{0, 2, 0, 0, 1}, {1, 2, 0, 0, 1}, {2, 2, 0, 0, 1}, {2, 0, 2, 0, 1},
+	{0, 2, 2, 0, 1}, {1, 2, 2, 0, 1}, {2, 2, 2, 0, 1}, {2, 0, 2, 0, 1},
+	{0, 0, 1, 0, 1}, {1, 0, 1, 0, 1}, {2, 0, 1, 0, 1}, {0, 1, 2, 0, 1},
+	{0, 1, 1, 0, 1}, {1, 1, 1, 0, 1}, {2, 1, 1, 0, 1}, {1, 1, 2, 0, 1},
+	{0, 2, 1, 0, 1}, {1, 2, 1, 0, 1}, {2, 2, 1, 0, 1}, {2, 1, 2, 0, 1},
+	{0, 0, 1, 2, 2}, {1, 0, 1, 2, 2}, {2, 0, 1, 2, 2}, {0, 1, 2, 2, 2},
+	{0, 0, 0, 1, 1}, {1, 0, 0, 1, 1}, {2, 0, 0, 1, 1}, {0, 0, 2, 1, 1},
+	{0, 1, 0, 1, 1}, {1, 1, 0, 1, 1}, {2, 1, 0, 1, 1}, {1, 0, 2, 1, 1},
+	{0, 2, 0, 1, 1}, {1, 2, 0, 1, 1}, {2, 2, 0, 1, 1}, {2, 0, 2, 1, 1},
+	{0, 2, 2, 1, 1}, {1, 2, 2, 1, 1}, {2, 2, 2, 1, 1}, {2, 0, 2, 1, 1},
+	{0, 0, 1, 1, 1}, {1, 0, 1, 1, 1}, {2, 0, 1, 1, 1}, {0, 1, 2, 1, 1},
+	{0, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 1, 1}, {1, 1, 2, 1, 1},
+	{0, 2, 1, 1, 1}, {1, 2, 1, 1, 1}, {2, 2, 1, 1, 1}, {2, 1, 2, 1, 1},
+	{0, 1, 1, 2, 2}, {1, 1, 1, 2, 2}, {2, 1, 1, 2, 2}, {1, 1, 2, 2, 2},
+	{0, 0, 0, 2, 1}, {1, 0, 0, 2, 1}, {2, 0, 0, 2, 1}, {0, 0, 2, 2, 1},
+	{0, 1, 0, 2, 1}, {1, 1, 0, 2, 1}, {2, 1, 0, 2, 1}, {1, 0, 2, 2, 1},
+	{0, 2, 0, 2, 1}, {1, 2, 0, 2, 1}, {2, 2, 0, 2, 1}, {2, 0, 2, 2, 1},
+	{0, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {2, 2, 2, 2, 1}, {2, 0, 2, 2, 1},
+	{0, 0, 1, 2, 1}, {1, 0, 1, 2, 1}, {2, 0, 1, 2, 1}, {0, 1, 2, 2, 1},
+	{0, 1, 1, 2, 1}, {1, 1, 1, 2, 1}, {2, 1, 1, 2, 1}, {1, 1, 2, 2, 1},
+	{0, 2, 1, 2, 1}, {1, 2, 1, 2, 1}, {2, 2, 1, 2, 1}, {2, 1, 2, 2, 1},
+	{0, 2, 1, 2, 2}, {1, 2, 1, 2, 2}, {2, 2, 1, 2, 2}, {2, 1, 2, 2, 2},
+	{0, 0, 0, 1, 2}, {1, 0, 0, 1, 2}, {2, 0, 0, 1, 2}, {0, 0, 2, 1, 2},
+	{0, 1, 0, 1, 2}, {1, 1, 0, 1, 2}, {2, 1, 0, 1, 2}, {1, 0, 2, 1, 2},
+	{0, 2, 0, 1, 2}, {1, 2, 0, 1, 2}, {2, 2, 0, 1, 2}, {2, 0, 2, 1, 2},
+	{0, 2, 2, 1, 2}, {1, 2, 2, 1, 2}, {2, 2, 2, 1, 2}, {2, 0, 2, 1, 2},
+	{0, 0, 1, 1, 2}, {1, 0, 1, 1, 2}, {2, 0, 1, 1, 2}, {0, 1, 2, 1, 2},
+	{0, 1, 1, 1, 2}, {1, 1, 1, 1, 2}, {2, 1, 1, 1, 2}, {1, 1, 2, 1, 2},
+	{0, 2, 1, 1, 2}, {1, 2, 1, 1, 2}, {2, 2, 1, 1, 2}, {2, 1, 2, 1, 2},
+	{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 1, 2, 2, 2}
+};
+
+/** @brief Packed trit values for each unpacked value, indexed [hi][][][][lo]. */
+static const uint8_t integer_of_trits[3][3][3][3][3] {
+	{
+		{
+			{
+				{0, 1, 2},
+				{4, 5, 6},
+				{8, 9, 10}
+			},
+			{
+				{16, 17, 18},
+				{20, 21, 22},
+				{24, 25, 26}
+			},
+			{
+				{3, 7, 15},
+				{19, 23, 27},
+				{12, 13, 14}
+			}
+		},
+		{
+			{
+				{32, 33, 34},
+				{36, 37, 38},
+				{40, 41, 42}
+			},
+			{
+				{48, 49, 50},
+				{52, 53, 54},
+				{56, 57, 58}
+			},
+			{
+				{35, 39, 47},
+				{51, 55, 59},
+				{44, 45, 46}
+			}
+		},
+		{
+			{
+				{64, 65, 66},
+				{68, 69, 70},
+				{72, 73, 74}
+			},
+			{
+				{80, 81, 82},
+				{84, 85, 86},
+				{88, 89, 90}
+			},
+			{
+				{67, 71, 79},
+				{83, 87, 91},
+				{76, 77, 78}
+			}
+		}
+	},
+	{
+		{
+			{
+				{128, 129, 130},
+				{132, 133, 134},
+				{136, 137, 138}
+			},
+			{
+				{144, 145, 146},
+				{148, 149, 150},
+				{152, 153, 154}
+			},
+			{
+				{131, 135, 143},
+				{147, 151, 155},
+				{140, 141, 142}
+			}
+		},
+		{
+			{
+				{160, 161, 162},
+				{164, 165, 166},
+				{168, 169, 170}
+			},
+			{
+				{176, 177, 178},
+				{180, 181, 182},
+				{184, 185, 186}
+			},
+			{
+				{163, 167, 175},
+				{179, 183, 187},
+				{172, 173, 174}
+			}
+		},
+		{
+			{
+				{192, 193, 194},
+				{196, 197, 198},
+				{200, 201, 202}
+			},
+			{
+				{208, 209, 210},
+				{212, 213, 214},
+				{216, 217, 218}
+			},
+			{
+				{195, 199, 207},
+				{211, 215, 219},
+				{204, 205, 206}
+			}
+		}
+	},
+	{
+		{
+			{
+				{96, 97, 98},
+				{100, 101, 102},
+				{104, 105, 106}
+			},
+			{
+				{112, 113, 114},
+				{116, 117, 118},
+				{120, 121, 122}
+			},
+			{
+				{99, 103, 111},
+				{115, 119, 123},
+				{108, 109, 110}
+			}
+		},
+		{
+			{
+				{224, 225, 226},
+				{228, 229, 230},
+				{232, 233, 234}
+			},
+			{
+				{240, 241, 242},
+				{244, 245, 246},
+				{248, 249, 250}
+			},
+			{
+				{227, 231, 239},
+				{243, 247, 251},
+				{236, 237, 238}
+			}
+		},
+		{
+			{
+				{28, 29, 30},
+				{60, 61, 62},
+				{92, 93, 94}
+			},
+			{
+				{156, 157, 158},
+				{188, 189, 190},
+				{220, 221, 222}
+			},
+			{
+				{31, 63, 127},
+				{159, 191, 255},
+				{252, 253, 254}
+			}
+		}
+	}
+};
+
+/**
+ * @brief The number of bits, trits, and quints needed for a quant level.
+ */
+struct btq_count
+{
+	/** @brief The number of bits. */
+	uint8_t bits:6;
+
+	/** @brief The number of trits. */
+	uint8_t trits:1;
+
+	/** @brief The number of quints. */
+	uint8_t quints:1;
+};
+
+/**
+ * @brief The table of bits, trits, and quints needed for a quant encode.
+ */
+static const std::array<btq_count, 21> btq_counts {{
+	{ 1, 0, 0 }, // QUANT_2
+	{ 0, 1, 0 }, // QUANT_3
+	{ 2, 0, 0 }, // QUANT_4
+	{ 0, 0, 1 }, // QUANT_5
+	{ 1, 1, 0 }, // QUANT_6
+	{ 3, 0, 0 }, // QUANT_8
+	{ 1, 0, 1 }, // QUANT_10
+	{ 2, 1, 0 }, // QUANT_12
+	{ 4, 0, 0 }, // QUANT_16
+	{ 2, 0, 1 }, // QUANT_20
+	{ 3, 1, 0 }, // QUANT_24
+	{ 5, 0, 0 }, // QUANT_32
+	{ 3, 0, 1 }, // QUANT_40
+	{ 4, 1, 0 }, // QUANT_48
+	{ 6, 0, 0 }, // QUANT_64
+	{ 4, 0, 1 }, // QUANT_80
+	{ 5, 1, 0 }, // QUANT_96
+	{ 7, 0, 0 }, // QUANT_128
+	{ 5, 0, 1 }, // QUANT_160
+	{ 6, 1, 0 }, // QUANT_192
+	{ 8, 0, 0 }  // QUANT_256
+}};
+
+/**
+ * @brief The sequence scale, round, and divisors needed to compute sizing.
+ *
+ * The length of a quantized sequence in bits is:
+ *     (scale * <sequence_len> + round) / divisor
+ */
+struct ise_size
+{
+	/** @brief The scaling parameter. */
+	uint8_t scale:6;
+
+	/** @brief The divisor parameter. */
+	uint8_t divisor:2;
+};
+
+/**
+ * @brief The table of scale, round, and divisors needed for quant sizing.
+ */
+static const std::array<ise_size, 21> ise_sizes {{
+	{  1, 0 }, // QUANT_2
+	{  8, 2 }, // QUANT_3
+	{  2, 0 }, // QUANT_4
+	{  7, 1 }, // QUANT_5
+	{ 13, 2 }, // QUANT_6
+	{  3, 0 }, // QUANT_8
+	{ 10, 1 }, // QUANT_10
+	{ 18, 2 }, // QUANT_12
+	{  4, 0 }, // QUANT_16
+	{ 13, 1 }, // QUANT_20
+	{ 23, 2 }, // QUANT_24
+	{  5, 0 }, // QUANT_32
+	{ 16, 1 }, // QUANT_40
+	{ 28, 2 }, // QUANT_48
+	{  6, 0 }, // QUANT_64
+	{ 19, 1 }, // QUANT_80
+	{ 33, 2 }, // QUANT_96
+	{  7, 0 }, // QUANT_128
+	{ 22, 1 }, // QUANT_160
+	{ 38, 2 }, // QUANT_192
+	{  8, 0 }  // QUANT_256
+}};
+
+/* See header for documentation. */
+unsigned int get_ise_sequence_bitcount(
+	unsigned int character_count,
+	quant_method quant_level
+) {
+	// Cope with out-of bounds values - input might be invalid
+	if (static_cast<size_t>(quant_level) >= ise_sizes.size())
+	{
+		// Arbitrary large number that's more than an ASTC block can hold
+		return 1024;
+	}
+
+	auto& entry = ise_sizes[quant_level];
+	unsigned int divisor = (entry.divisor << 1) + 1;
+	return (entry.scale * character_count + divisor - 1) / divisor;
+}
+
+/**
+ * @brief Write up to 8 bits at an arbitrary bit offset.
+ *
+ * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
+ * span two separate bytes in memory.
+ *
+ * @param         value       The value to write.
+ * @param         bitcount    The number of bits to write, starting from LSB.
+ * @param         bitoffset   The bit offset to store at, between 0 and 7.
+ * @param[in,out] ptr         The data pointer to write to.
+ */
+static inline void write_bits(
+	unsigned int value,
+	unsigned int bitcount,
+	unsigned int bitoffset,
+	uint8_t ptr[2]
+) {
+	unsigned int mask = (1 << bitcount) - 1;
+	value &= mask;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+	value <<= bitoffset;
+	mask <<= bitoffset;
+	mask = ~mask;
+
+	ptr[0] &= mask;
+	ptr[0] |= value;
+	ptr[1] &= mask >> 8;
+	ptr[1] |= value >> 8;
+}
+
+/**
+ * @brief Read up to 8 bits at an arbitrary bit offset.
+ *
+ * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
+ * span two separate bytes in memory.
+ *
+ * @param         bitcount    The number of bits to read.
+ * @param         bitoffset   The bit offset to read from, between 0 and 7.
+ * @param[in,out] ptr         The data pointer to read from.
+ *
+ * @return The read value.
+ */
+static inline unsigned int read_bits(
+	unsigned int bitcount,
+	unsigned int bitoffset,
+	const uint8_t* ptr
+) {
+	unsigned int mask = (1 << bitcount) - 1;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+	unsigned int value = ptr[0] | (ptr[1] << 8);
+	value >>= bitoffset;
+	value &= mask;
+	return value;
+}
+
+/* See header for documentation. */
+void encode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset
+) {
+	promise(character_count > 0);
+
+	unsigned int bits = btq_counts[quant_level].bits;
+	unsigned int trits = btq_counts[quant_level].trits;
+	unsigned int quints = btq_counts[quant_level].quints;
+	unsigned int mask = (1 << bits) - 1;
+
+	// Write out trits and bits
+	if (trits)
+	{
+		unsigned int i = 0;
+		unsigned int full_trit_blocks = character_count / 5;
+
+		for (unsigned int j = 0; j < full_trit_blocks; j++)
+		{
+			unsigned int i4 = input_data[i + 4] >> bits;
+			unsigned int i3 = input_data[i + 3] >> bits;
+			unsigned int i2 = input_data[i + 2] >> bits;
+			unsigned int i1 = input_data[i + 1] >> bits;
+			unsigned int i0 = input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
+
+			// The max size of a trit bit count is 6, so we can always safely
+			// pack a single MX value with the following 1 or 2 T bits.
+			uint8_t pack;
+
+			// Element 0 + T0 + T1
+			pack = (input_data[i++] & mask) | (((T >> 0) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 1 + T2 + T3
+			pack = (input_data[i++] & mask) | (((T >> 2) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 2 + T4
+			pack = (input_data[i++] & mask) | (((T >> 4) & 0x1) << bits);
+			write_bits(pack, bits + 1, bit_offset, output_data);
+			bit_offset += bits + 1;
+
+			// Element 3 + T5 + T6
+			pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 4 + T7
+			pack = (input_data[i++] & mask) | (((T >> 7) & 0x1) << bits);
+			write_bits(pack, bits + 1, bit_offset, output_data);
+			bit_offset += bits + 1;
+		}
+
+		// Loop tail for a partial block
+		if (i != character_count)
+		{
+			// i4 cannot be present - we know the block is partial
+			// i0 must be present - we know the block isn't empty
+			unsigned int i4 =                            0;
+			unsigned int i3 = i + 3 >= character_count ? 0 : input_data[i + 3] >> bits;
+			unsigned int i2 = i + 2 >= character_count ? 0 : input_data[i + 2] >> bits;
+			unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
+			unsigned int i0 =                                input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
+
+			for (unsigned int j = 0; i < character_count; i++, j++)
+			{
+				// Truncated table as this iteration is always partital
+				static const uint8_t tbits[4]  { 2, 2, 1, 2 };
+				static const uint8_t tshift[4] { 0, 2, 4, 5 };
+
+				uint8_t pack = (input_data[i] & mask) |
+				               (((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
+
+				write_bits(pack, bits + tbits[j], bit_offset, output_data);
+				bit_offset += bits + tbits[j];
+			}
+		}
+	}
+	// Write out quints and bits
+	else if (quints)
+	{
+		unsigned int i = 0;
+		unsigned int full_quint_blocks = character_count / 3;
+
+		for (unsigned int j = 0; j < full_quint_blocks; j++)
+		{
+			unsigned int i2 = input_data[i + 2] >> bits;
+			unsigned int i1 = input_data[i + 1] >> bits;
+			unsigned int i0 = input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_quints[i2][i1][i0];
+
+			// The max size of a quint bit count is 5, so we can always safely
+			// pack a single M value with the following 2 or 3 T bits.
+			uint8_t pack;
+
+			// Element 0
+			pack = (input_data[i++] & mask) | (((T >> 0) & 0x7) << bits);
+			write_bits(pack, bits + 3, bit_offset, output_data);
+			bit_offset += bits + 3;
+
+			// Element 1
+			pack = (input_data[i++] & mask) | (((T >> 3) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 2
+			pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+		}
+
+		// Loop tail for a partial block
+		if (i != character_count)
+		{
+			// i2 cannot be present - we know the block is partial
+			// i0 must be present - we know the block isn't empty
+			unsigned int i2 =                            0;
+			unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
+			unsigned int i0 =                                input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_quints[i2][i1][i0];
+
+			for (unsigned int j = 0; i < character_count; i++, j++)
+			{
+				// Truncated table as this iteration is always partital
+				static const uint8_t tbits[2]  { 3, 2 };
+				static const uint8_t tshift[2] { 0, 3 };
+
+				uint8_t pack = (input_data[i] & mask) |
+				               (((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
+
+				write_bits(pack, bits + tbits[j], bit_offset, output_data);
+				bit_offset += bits + tbits[j];
+			}
+		}
+	}
+	// Write out just bits
+	else
+	{
+		for (unsigned int i = 0; i < character_count; i++)
+		{
+			write_bits(input_data[i], bits, bit_offset, output_data);
+			bit_offset += bits;
+		}
+	}
+}
+
+/* See header for documentation. */
+void decode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset
+) {
+	promise(character_count > 0);
+
+	// Note: due to how the trit/quint-block unpacking is done in this function, we may write more
+	// temporary results than the number of outputs. The maximum actual number of results is 64 bit,
+	// but we keep 4 additional character_count of padding.
+	uint8_t results[68];
+	uint8_t tq_blocks[22] { 0 }; // Trit-blocks or quint-blocks, must be zeroed
+
+	unsigned int bits = btq_counts[quant_level].bits;
+	unsigned int trits = btq_counts[quant_level].trits;
+	unsigned int quints = btq_counts[quant_level].quints;
+
+	unsigned int lcounter = 0;
+	unsigned int hcounter = 0;
+
+	// Collect bits for each element, as well as bits for any trit-blocks and quint-blocks.
+	for (unsigned int i = 0; i < character_count; i++)
+	{
+		results[i] = static_cast<uint8_t>(read_bits(bits, bit_offset, input_data));
+		bit_offset += bits;
+
+		if (trits)
+		{
+			static const uint8_t bits_to_read[5]  { 2, 2, 1, 2, 1 };
+			static const uint8_t block_shift[5]   { 0, 2, 4, 5, 7 };
+			static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 };
+			static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 };
+			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
+			bit_offset += bits_to_read[lcounter];
+			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
+			hcounter += hcounter_incr[lcounter];
+			lcounter = next_lcounter[lcounter];
+		}
+
+		if (quints)
+		{
+			static const uint8_t bits_to_read[3]  { 3, 2, 2 };
+			static const uint8_t block_shift[3]   { 0, 3, 5 };
+			static const uint8_t next_lcounter[3] { 1, 2, 0 };
+			static const uint8_t hcounter_incr[3] { 0, 0, 1 };
+			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
+			bit_offset += bits_to_read[lcounter];
+			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
+			hcounter += hcounter_incr[lcounter];
+			lcounter = next_lcounter[lcounter];
+		}
+	}
+
+	// Unpack trit-blocks or quint-blocks as needed
+	if (trits)
+	{
+		unsigned int trit_blocks = (character_count + 4) / 5;
+		promise(trit_blocks > 0);
+		for (unsigned int i = 0; i < trit_blocks; i++)
+		{
+			const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
+			results[5 * i    ] |= tritptr[0] << bits;
+			results[5 * i + 1] |= tritptr[1] << bits;
+			results[5 * i + 2] |= tritptr[2] << bits;
+			results[5 * i + 3] |= tritptr[3] << bits;
+			results[5 * i + 4] |= tritptr[4] << bits;
+		}
+	}
+
+	if (quints)
+	{
+		unsigned int quint_blocks = (character_count + 2) / 3;
+		promise(quint_blocks > 0);
+		for (unsigned int i = 0; i < quint_blocks; i++)
+		{
+			const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
+			results[3 * i    ] |= quintptr[0] << bits;
+			results[3 * i + 1] |= quintptr[1] << bits;
+			results[3 * i + 2] |= quintptr[2] << bits;
+		}
+	}
+
+	for (unsigned int i = 0; i < character_count; i++)
+	{
+		output_data[i] = results[i];
+	}
+}
--- a/thirdparty/astcenc/astcenc_internal.h
+++ b/thirdparty/astcenc/astcenc_internal.h
--- a/thirdparty/astcenc/astcenc_internal_entry.h
+++ b/thirdparty/astcenc/astcenc_internal_entry.h
@ -0,0 +1,273 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions and data declarations for the outer context.
+ *
+ * The outer context includes thread-pool management, which is slower to
+ * compile due to increased use of C++ stdlib. The inner context used in the
+ * majority of the codec library does not include this.
+ */
+
+#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
+#define ASTCENC_INTERNAL_ENTRY_INCLUDED
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+
+#include "astcenc_internal.h"
+
+/* ============================================================================
+  Parallel execution control
+============================================================================ */
+
+/**
+ * @brief A simple counter-based manager for parallel task execution.
+ *
+ * The task processing execution consists of:
+ *
+ *     * A single-threaded init stage.
+ *     * A multi-threaded processing stage.
+ *     * A condition variable so threads can wait for processing completion.
+ *
+ * The init stage will be executed by the first thread to arrive in the critical section, there is
+ * no main thread in the thread pool.
+ *
+ * The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
+ * basis. Threads may each therefore executed different numbers of tasks, depending on their
+ * processing complexity. The task queue and the task tickets are just counters; the caller must map
+ * these integers to an actual processing partition in a specific problem domain.
+ *
+ * The exit wait condition is needed to ensure processing has finished before a worker thread can
+ * progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
+ * because there are no new tasks to assign to it while other worker threads are still processing.
+ * Calling @c wait() will ensure that all other worker have finished before the thread can proceed.
+ *
+ * The basic usage model:
+ *
+ *     // --------- From single-threaded code ---------
+ *
+ *     // Reset the tracker state
+ *     manager->reset()
+ *
+ *     // --------- From multi-threaded code ---------
+ *
+ *     // Run the stage init; only first thread actually runs the lambda
+ *     manager->init(<lambda>)
+ *
+ *     do
+ *     {
+ *         // Request a task assignment
+ *         uint task_count;
+ *         uint base_index = manager->get_tasks(<granule>, task_count);
+ *
+ *         // Process any tasks we were given (task_count <= granule size)
+ *         if (task_count)
+ *         {
+ *             // Run the user task processing code for N tasks here
+ *             ...
+ *
+ *             // Flag these tasks as complete
+ *             manager->complete_tasks(task_count);
+ *         }
+ *     } while (task_count);
+ *
+ *     // Wait for all threads to complete tasks before progressing
+ *     manager->wait()
+ *
+  *     // Run the stage term; only first thread actually runs the lambda
+ *     manager->term(<lambda>)
+ */
+class ParallelManager
+{
+private:
+	/** @brief Lock used for critical section and condition synchronization. */
+	std::mutex m_lock;
+
+	/** @brief True if the stage init() step has been executed. */
+	bool m_init_done;
+
+	/** @brief True if the stage term() step has been executed. */
+	bool m_term_done;
+
+	/** @brief Condition variable for tracking stage processing completion. */
+	std::condition_variable m_complete;
+
+	/** @brief Number of tasks started, but not necessarily finished. */
+	std::atomic<unsigned int> m_start_count;
+
+	/** @brief Number of tasks finished. */
+	unsigned int m_done_count;
+
+	/** @brief Number of tasks that need to be processed. */
+	unsigned int m_task_count;
+
+public:
+	/** @brief Create a new ParallelManager. */
+	ParallelManager()
+	{
+		reset();
+	}
+
+	/**
+	 * @brief Reset the tracker for a new processing batch.
+	 *
+	 * This must be called from single-threaded code before starting the multi-threaded processing
+	 * operations.
+	 */
+	void reset()
+	{
+		m_init_done = false;
+		m_term_done = false;
+		m_start_count = 0;
+		m_done_count = 0;
+		m_task_count = 0;
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage init step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * initialization. Other threads will block and wait for it to complete.
+	 *
+	 * @param init_func   Callable which executes the stage initialization. It must return the
+	 *                    total number of tasks in the stage.
+	 */
+	void init(std::function<unsigned int(void)> init_func)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_init_done)
+		{
+			m_task_count = init_func();
+			m_init_done = true;
+		}
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage init step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * initialization. Other threads will block and wait for it to complete.
+	 *
+	 * @param task_count   Total number of tasks needing processing.
+	 */
+	void init(unsigned int task_count)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_init_done)
+		{
+			m_task_count = task_count;
+			m_init_done = true;
+		}
+	}
+
+	/**
+	 * @brief Request a task assignment.
+	 *
+	 * Assign up to @c granule tasks to the caller for processing.
+	 *
+	 * @param      granule   Maximum number of tasks that can be assigned.
+	 * @param[out] count     Actual number of tasks assigned, or zero if no tasks were assigned.
+	 *
+	 * @return Task index of the first assigned task; assigned tasks increment from this.
+	 */
+	unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
+	{
+		unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
+		if (base >= m_task_count)
+		{
+			count = 0;
+			return 0;
+		}
+
+		count = astc::min(m_task_count - base, granule);
+		return base;
+	}
+
+	/**
+	 * @brief Complete a task assignment.
+	 *
+	 * Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this
+	 * completes the processing of the stage.
+	 *
+	 * @param count   The number of completed tasks.
+	 */
+	void complete_task_assignment(unsigned int count)
+	{
+		// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
+		// update here and the wait() for other threads
+		std::unique_lock<std::mutex> lck(m_lock);
+		this->m_done_count += count;
+		if (m_done_count == m_task_count)
+		{
+			lck.unlock();
+			m_complete.notify_all();
+		}
+	}
+
+	/**
+	 * @brief Wait for stage processing to complete.
+	 */
+	void wait()
+	{
+		std::unique_lock<std::mutex> lck(m_lock);
+		m_complete.wait(lck, [this]{ return m_done_count == m_task_count; });
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage term step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * work pool termination. Caller must have called @c wait() prior to calling this function to
+	 * ensure that processing is complete.
+	 *
+	 * @param term_func   Callable which executes the stage termination.
+	 */
+	void term(std::function<void(void)> term_func)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_term_done)
+		{
+			term_func();
+			m_term_done = true;
+		}
+	}
+};
+
+/**
+ * @brief The astcenc compression context.
+ */
+struct astcenc_context
+{
+	/** @brief The context internal state. */
+	astcenc_contexti context;
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	/** @brief The parallel manager for averages computation. */
+	ParallelManager manage_avg;
+
+	/** @brief The parallel manager for compression. */
+	ParallelManager manage_compress;
+#endif
+
+	/** @brief The parallel manager for decompression. */
+	ParallelManager manage_decompress;
+};
+
+#endif
--- a/thirdparty/astcenc/astcenc_mathlib.cpp
+++ b/thirdparty/astcenc/astcenc_mathlib.cpp
@ -0,0 +1,48 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#include "astcenc_mathlib.h"
+
+/**
+ * @brief 64-bit rotate left.
+ *
+ * @param val   The value to rotate.
+ * @param count The rotation, in bits.
+ */
+static inline uint64_t rotl(uint64_t val, int count)
+{
+	return (val << count) | (val >> (64 - count));
+}
+
+/* See header for documentation. */
+void astc::rand_init(uint64_t state[2])
+{
+	state[0] = 0xfaf9e171cea1ec6bULL;
+	state[1] = 0xf1b318cc06af5d71ULL;
+}
+
+/* See header for documentation. */
+uint64_t astc::rand(uint64_t state[2])
+{
+	uint64_t s0 = state[0];
+	uint64_t s1 = state[1];
+	uint64_t res = s0 + s1;
+	s1 ^= s0;
+	state[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16);
+	state[1] = rotl(s1, 37);
+	return res;
+}
--- a/thirdparty/astcenc/astcenc_mathlib.h
+++ b/thirdparty/astcenc/astcenc_mathlib.h
@ -0,0 +1,478 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/*
+ * This module implements a variety of mathematical data types and library
+ * functions used by the codec.
+ */
+
+#ifndef ASTC_MATHLIB_H_INCLUDED
+#define ASTC_MATHLIB_H_INCLUDED
+
+#include <cassert>
+#include <cstdint>
+#include <cmath>
+
+#ifndef ASTCENC_POPCNT
+  #if defined(__POPCNT__)
+    #define ASTCENC_POPCNT 1
+  #else
+    #define ASTCENC_POPCNT 0
+  #endif
+#endif
+
+#ifndef ASTCENC_F16C
+  #if defined(__F16C__)
+    #define ASTCENC_F16C 1
+  #else
+    #define ASTCENC_F16C 0
+  #endif
+#endif
+
+#ifndef ASTCENC_SSE
+  #if defined(__SSE4_2__)
+    #define ASTCENC_SSE 42
+  #elif defined(__SSE4_1__)
+    #define ASTCENC_SSE 41
+  #elif defined(__SSE3__)
+    #define ASTCENC_SSE 30
+  #elif defined(__SSE2__)
+    #define ASTCENC_SSE 20
+  #else
+    #define ASTCENC_SSE 0
+  #endif
+#endif
+
+#ifndef ASTCENC_AVX
+  #if defined(__AVX2__)
+    #define ASTCENC_AVX 2
+  #elif defined(__AVX__)
+    #define ASTCENC_AVX 1
+  #else
+    #define ASTCENC_AVX 0
+  #endif
+#endif
+
+#ifndef ASTCENC_NEON
+  #if defined(__aarch64__)
+    #define ASTCENC_NEON 1
+  #else
+    #define ASTCENC_NEON 0
+  #endif
+#endif
+
+#if ASTCENC_AVX
+  #define ASTCENC_VECALIGN 32
+#else
+  #define ASTCENC_VECALIGN 16
+#endif
+
+#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
+	#include <immintrin.h>
+#endif
+
+/* ============================================================================
+  Fast math library; note that many of the higher-order functions in this set
+  use approximations which are less accurate, but faster, than <cmath> standard
+  library equivalents.
+
+  Note: Many of these are not necessarily faster than simple C versions when
+  used on a single scalar value, but are included for testing purposes as most
+  have an option based on SSE intrinsics and therefore provide an obvious route
+  to future vectorization.
+============================================================================ */
+
+// Union for manipulation of float bit patterns
+typedef union
+{
+	uint32_t u;
+	int32_t s;
+	float f;
+} if32;
+
+// These are namespaced to avoid colliding with C standard library functions.
+namespace astc
+{
+
+static const float PI          = 3.14159265358979323846f;
+static const float PI_OVER_TWO = 1.57079632679489661923f;
+
+/**
+ * @brief SP float absolute value.
+ *
+ * @param v   The value to make absolute.
+ *
+ * @return The absolute value.
+ */
+static inline float fabs(float v)
+{
+	return std::fabs(v);
+}
+
+/**
+ * @brief Test if a float value is a nan.
+ *
+ * @param v    The value test.
+ *
+ * @return Zero is not a NaN, non-zero otherwise.
+ */
+static inline bool isnan(float v)
+{
+	return v != v;
+}
+
+/**
+ * @brief Return the minimum of two values.
+ *
+ * For floats, NaNs are turned into @c q.
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ *
+ * @return The smallest value.
+ */
+template<typename T>
+static inline T min(T p, T q)
+{
+	return p < q ? p : q;
+}
+
+/**
+ * @brief Return the minimum of three values.
+ *
+ * For floats, NaNs are turned into @c r.
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @param r   The third value to compare.
+ *
+ * @return The smallest value.
+ */
+template<typename T>
+static inline T min(T p, T q, T r)
+{
+	return min(min(p, q), r);
+}
+
+/**
+ * @brief Return the minimum of four values.
+ *
+ * For floats, NaNs are turned into @c s.
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @param r   The third value to compare.
+ * @param s   The fourth value to compare.
+ *
+ * @return The smallest value.
+ */
+template<typename T>
+static inline T min(T p, T q, T r, T s)
+{
+	return min(min(p, q), min(r, s));
+}
+
+/**
+ * @brief Return the maximum of two values.
+ *
+ * For floats, NaNs are turned into @c q.
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ *
+ * @return The largest value.
+ */
+template<typename T>
+static inline T max(T p, T q)
+{
+	return p > q ? p : q;
+}
+
+/**
+ * @brief Return the maximum of three values.
+ *
+ * For floats, NaNs are turned into @c r.
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @param r   The third value to compare.
+ *
+ * @return The largest value.
+ */
+template<typename T>
+static inline T max(T p, T q, T r)
+{
+	return max(max(p, q), r);
+}
+
+/**
+ * @brief Return the maximum of four values.
+ *
+ * For floats, NaNs are turned into @c s.
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @param r   The third value to compare.
+ * @param s   The fourth value to compare.
+ *
+ * @return The largest value.
+ */
+template<typename T>
+static inline T max(T p, T q, T r, T s)
+{
+	return max(max(p, q), max(r, s));
+}
+
+/**
+ * @brief Clamp a value value between @c mn and @c mx.
+ *
+ * For floats, NaNs are turned into @c mn.
+ *
+ * @param v      The value to clamp.
+ * @param mn     The min value (inclusive).
+ * @param mx     The max value (inclusive).
+ *
+ * @return The clamped value.
+ */
+template<typename T>
+inline T clamp(T v, T mn, T mx)
+{
+	// Do not reorder; correct NaN handling relies on the fact that comparison
+	// with NaN returns false and will fall-though to the "min" value.
+	if (v > mx) return mx;
+	if (v > mn) return v;
+	return mn;
+}
+
+/**
+ * @brief Clamp a float value between 0.0f and 1.0f.
+ *
+ * NaNs are turned into 0.0f.
+ *
+ * @param v   The value to clamp.
+ *
+ * @return The clamped value.
+ */
+static inline float clamp1f(float v)
+{
+	return astc::clamp(v, 0.0f, 1.0f);
+}
+
+/**
+ * @brief Clamp a float value between 0.0f and 255.0f.
+ *
+ * NaNs are turned into 0.0f.
+ *
+ * @param v  The value to clamp.
+ *
+ * @return The clamped value.
+ */
+static inline float clamp255f(float v)
+{
+	return astc::clamp(v, 0.0f, 255.0f);
+}
+
+/**
+ * @brief SP float round-down.
+ *
+ * @param v   The value to round.
+ *
+ * @return The rounded value.
+ */
+static inline float flt_rd(float v)
+{
+	return std::floor(v);
+}
+
+/**
+ * @brief SP float round-to-nearest and convert to integer.
+ *
+ * @param v   The value to round.
+ *
+ * @return The rounded value.
+ */
+static inline int flt2int_rtn(float v)
+{
+
+	return static_cast<int>(v + 0.5f);
+}
+
+/**
+ * @brief SP float round down and convert to integer.
+ *
+ * @param v   The value to round.
+ *
+ * @return The rounded value.
+ */
+static inline int flt2int_rd(float v)
+{
+	return static_cast<int>(v);
+}
+
+/**
+ * @brief SP float bit-interpreted as an integer.
+ *
+ * @param v   The value to bitcast.
+ *
+ * @return The converted value.
+ */
+static inline int float_as_int(float v)
+{
+	union { int a; float b; } u;
+	u.b = v;
+	return u.a;
+}
+
+/**
+ * @brief Integer bit-interpreted as an SP float.
+ *
+ * @param v   The value to bitcast.
+ *
+ * @return The converted value.
+ */
+static inline float int_as_float(int v)
+{
+	union { int a; float b; } u;
+	u.a = v;
+	return u.b;
+}
+
+/**
+ * @brief Fast approximation of 1.0 / sqrt(val).
+ *
+ * @param v   The input value.
+ *
+ * @return The approximated result.
+ */
+static inline float rsqrt(float v)
+{
+	return 1.0f / std::sqrt(v);
+}
+
+/**
+ * @brief Fast approximation of sqrt(val).
+ *
+ * @param v   The input value.
+ *
+ * @return The approximated result.
+ */
+static inline float sqrt(float v)
+{
+	return std::sqrt(v);
+}
+
+/**
+ * @brief Extract mantissa and exponent of a float value.
+ *
+ * @param      v      The input value.
+ * @param[out] expo   The output exponent.
+ *
+ * @return The mantissa.
+ */
+static inline float frexp(float v, int* expo)
+{
+	if32 p;
+	p.f = v;
+	*expo = ((p.u >> 23) & 0xFF) - 126;
+	p.u = (p.u & 0x807fffff) | 0x3f000000;
+	return p.f;
+}
+
+/**
+ * @brief Initialize the seed structure for a random number generator.
+ *
+ * Important note: For the purposes of ASTC we want sets of random numbers to
+ * use the codec, but we want the same seed value across instances and threads
+ * to ensure that image output is stable across compressor runs and across
+ * platforms. Every PRNG created by this call will therefore return the same
+ * sequence of values ...
+ *
+ * @param state The state structure to initialize.
+ */
+void rand_init(uint64_t state[2]);
+
+/**
+ * @brief Return the next random number from the generator.
+ *
+ * This RNG is an implementation of the "xoroshoro-128+ 1.0" PRNG, based on the
+ * public-domain implementation given by David Blackman & Sebastiano Vigna at
+ * http://vigna.di.unimi.it/xorshift/xoroshiro128plus.c
+ *
+ * @param state The state structure to use/update.
+ */
+uint64_t rand(uint64_t state[2]);
+
+}
+
+/* ============================================================================
+  Softfloat library with fp32 and fp16 conversion functionality.
+============================================================================ */
+#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
+	/* narrowing float->float conversions */
+	uint16_t float_to_sf16(float val);
+	float sf16_to_float(uint16_t val);
+#endif
+
+/*********************************
+  Vector library
+*********************************/
+#include "astcenc_vecmathlib.h"
+
+/*********************************
+  Declaration of line types
+*********************************/
+// parametric line, 2D: The line is given by line = a + b * t.
+
+struct line2
+{
+	vfloat4 a;
+	vfloat4 b;
+};
+
+// parametric line, 3D
+struct line3
+{
+	vfloat4 a;
+	vfloat4 b;
+};
+
+struct line4
+{
+	vfloat4 a;
+	vfloat4 b;
+};
+
+
+struct processed_line2
+{
+	vfloat4 amod;
+	vfloat4 bs;
+};
+
+struct processed_line3
+{
+	vfloat4 amod;
+	vfloat4 bs;
+};
+
+struct processed_line4
+{
+	vfloat4 amod;
+	vfloat4 bs;
+};
+
+#endif
--- a/thirdparty/astcenc/astcenc_mathlib_softfloat.cpp
+++ b/thirdparty/astcenc/astcenc_mathlib_softfloat.cpp
@ -0,0 +1,411 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Soft-float library for IEEE-754.
+ */
+#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
+
+#include "astcenc_mathlib.h"
+
+/*	sized soft-float types. These are mapped to the sized integer
+    types of C99, instead of C's floating-point types; this is because
+    the library needs to maintain exact, bit-level control on all
+    operations on these data types. */
+typedef uint16_t sf16;
+typedef uint32_t sf32;
+
+/******************************************
+  helper functions and their lookup tables
+ ******************************************/
+/* count leading zeros functions. Only used when the input is nonzero. */
+
+#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
+#elif defined(__arm__) && defined(__ARMCC_VERSION)
+#elif defined(__arm__) && defined(__GNUC__)
+#else
+	/* table used for the slow default versions. */
+	static const uint8_t clz_table[256] =
+	{
+		8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	};
+#endif
+
+/*
+   32-bit count-leading-zeros function: use the Assembly instruction whenever possible. */
+static uint32_t clz32(uint32_t inp)
+{
+	#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
+		uint32_t bsr;
+		__asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));
+		return 31 - bsr;
+	#else
+		#if defined(__arm__) && defined(__ARMCC_VERSION)
+			return __clz(inp);			/* armcc builtin */
+		#else
+			#if defined(__arm__) && defined(__GNUC__)
+				uint32_t lz;
+				__asm__("clz %0, %1": "=r"(lz):"r"(inp));
+				return lz;
+			#else
+				/* slow default version */
+				uint32_t summa = 24;
+				if (inp >= UINT32_C(0x10000))
+				{
+					inp >>= 16;
+					summa -= 16;
+				}
+				if (inp >= UINT32_C(0x100))
+				{
+					inp >>= 8;
+					summa -= 8;
+				}
+				return summa + clz_table[inp];
+			#endif
+		#endif
+	#endif
+}
+
+/* the five rounding modes that IEEE-754r defines */
+typedef enum
+{
+	SF_UP = 0,				/* round towards positive infinity */
+	SF_DOWN = 1,			/* round towards negative infinity */
+	SF_TOZERO = 2,			/* round towards zero */
+	SF_NEARESTEVEN = 3,		/* round toward nearest value; if mid-between, round to even value */
+	SF_NEARESTAWAY = 4		/* round toward nearest value; if mid-between, round away from zero */
+} roundmode;
+
+
+static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
+{
+	uint32_t vl1 = UINT32_C(1) << shamt;
+	uint32_t inp2 = inp + (vl1 >> 1);	/* added 0.5 ULP */
+	uint32_t msk = (inp | UINT32_C(1)) & vl1;	/* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */
+	msk--;						/* negative if even, nonnegative if odd. */
+	inp2 -= (msk >> 31);		/* subtract epsilon before shift if even. */
+	inp2 >>= shamt;
+	return inp2;
+}
+
+static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
+{
+	uint32_t vl1 = (UINT32_C(1) << shamt) >> 1;
+	inp += vl1;
+	inp >>= shamt;
+	return inp;
+}
+
+static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
+{
+	uint32_t vl1 = UINT32_C(1) << shamt;
+	inp += vl1;
+	inp--;
+	inp >>= shamt;
+	return inp;
+}
+
+/* convert from FP16 to FP32. */
+static sf32 sf16_to_sf32(sf16 inp)
+{
+	uint32_t inpx = inp;
+
+	/*
+		This table contains, for every FP16 sign/exponent value combination,
+		the difference between the input FP16 value and the value obtained
+		by shifting the correct FP32 result right by 13 bits.
+		This table allows us to handle every case except denormals and NaN
+		with just 1 table lookup, 2 shifts and 1 add.
+	*/
+
+	#define WITH_MSB(a) (UINT32_C(a) | (1u << 31))
+	static const uint32_t tbl[64] =
+	{
+		WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
+		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
+		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
+		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
+		WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
+		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
+		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
+		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
+	};
+
+	uint32_t res = tbl[inpx >> 10];
+	res += inpx;
+
+	/* Normal cases: MSB of 'res' not set. */
+	if ((res & WITH_MSB(0)) == 0)
+	{
+		return res << 13;
+	}
+
+	/* Infinity and Zero: 10 LSB of 'res' not set. */
+	if ((res & 0x3FF) == 0)
+	{
+		return res << 13;
+	}
+
+	/* NaN: the exponent field of 'inp' is non-zero. */
+	if ((inpx & 0x7C00) != 0)
+	{
+		/* All NaNs are quietened. */
+		return (res << 13) | 0x400000;
+	}
+
+	/* Denormal cases */
+	uint32_t sign = (inpx & 0x8000) << 16;
+	uint32_t mskval = inpx & 0x7FFF;
+	uint32_t leadingzeroes = clz32(mskval);
+	mskval <<= leadingzeroes;
+	return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
+}
+
+/* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
+static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
+{
+	/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
+	static const uint8_t tab[512] {
+		0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+		20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+		30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
+
+		5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+		25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
+		35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
+	};
+
+	/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
+	   size. */
+	static const uint32_t tabx[60] {
+		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
+		UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
+		UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
+		UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
+		UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
+		UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
+	};
+
+	uint32_t p;
+	uint32_t idx = rmode + tab[inp >> 23];
+	uint32_t vlx = tabx[idx];
+	switch (idx)
+	{
+		/*
+			Positive number which may be Infinity or NaN.
+			We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
+			(If we don't do this quieting, then a NaN  that is distinguished only by having
+			its low-order bits set, would be turned into an INF. */
+	case 50:
+	case 51:
+	case 52:
+	case 53:
+	case 54:
+	case 55:
+	case 56:
+	case 57:
+	case 58:
+	case 59:
+		/*
+			the input value is 0x7F800000 or 0xFF800000 if it is INF.
+			By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
+			For NaNs, however, this operation will keep bit 23 with the value 1.
+			We can then extract bit 23, and logical-OR bit 9 of the result with this
+			bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
+			of the mantissa is set.)
+		*/
+		p = (inp - 1) & UINT32_C(0x800000);	/* zero if INF, nonzero if NaN. */
+		return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
+		/*
+			positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
+			If it is, then return 0, else return 1 (the smallest representable nonzero number)
+		*/
+	case 0:
+		/*
+			-inp will set the MSB if the input number is nonzero.
+			Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
+		*/
+		return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
+
+		/*
+			negative, exponent = , round-mode == DOWN, need to check whether number is
+			actually 0. If it is, return 0x8000 ( float -0.0 )
+			Else return the smallest negative number ( 0x8001 ) */
+	case 6:
+		/*
+			in this case 'vlx' is 0x80000000. By subtracting the input value from it,
+			we obtain a value that is 0 if the input value is in fact zero and has
+			the MSB set if it isn't. We then right-shift the value by 31 places to
+			get a value that is 0 if the input is -0.0 and 1 otherwise.
+		*/
+		return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
+
+		/*
+			for all other cases involving underflow/overflow, we don't need to
+			do actual tests; we just return 'vlx'.
+		*/
+	case 1:
+	case 2:
+	case 3:
+	case 4:
+	case 5:
+	case 7:
+	case 8:
+	case 9:
+	case 10:
+	case 11:
+	case 12:
+	case 13:
+	case 14:
+	case 15:
+	case 16:
+	case 17:
+	case 18:
+	case 19:
+	case 40:
+	case 41:
+	case 42:
+	case 43:
+	case 44:
+	case 45:
+	case 46:
+	case 47:
+	case 48:
+	case 49:
+		return static_cast<sf16>(vlx);
+
+		/*
+			for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
+			FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
+			baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
+			from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
+			for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
+			except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
+
+		/* normal number, all rounding modes except round-to-nearest-even: */
+	case 30:
+	case 31:
+	case 32:
+	case 34:
+	case 35:
+	case 36:
+	case 37:
+	case 39:
+		return static_cast<sf16>((inp + vlx) >> 13);
+
+		/* normal number, round-to-nearest-even. */
+	case 33:
+	case 38:
+		p = inp + vlx;
+		p += (inp >> 13) & 1;
+		return static_cast<sf16>(p >> 13);
+
+		/*
+			the various denormal cases. These are not expected to be common, so their performance is a bit
+			less important. For each of these cases, we need to extract an exponent and a mantissa
+			(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
+			depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
+			sign of the resulting denormal number.
+		*/
+	case 21:
+	case 22:
+	case 25:
+	case 27:
+		/* denormal, round towards zero. */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
+	case 20:
+	case 26:
+		/* denormal, round away from zero. */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
+	case 24:
+	case 29:
+		/* denormal, round to nearest-away */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
+	case 23:
+	case 28:
+		/* denormal, round to nearest-even. */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
+	}
+
+	return 0;
+}
+
+/* convert from soft-float to native-float */
+float sf16_to_float(uint16_t p)
+{
+	if32 i;
+	i.u = sf16_to_sf32(p);
+	return i.f;
+}
+
+/* convert from native-float to soft-float */
+uint16_t float_to_sf16(float p)
+{
+	if32 i;
+	i.f = p;
+	return sf32_to_sf16(i.u, SF_NEARESTEVEN);
+}
+
+#endif
--- a/thirdparty/astcenc/astcenc_partition_tables.cpp
+++ b/thirdparty/astcenc/astcenc_partition_tables.cpp
@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for generating partition tables on demand.
+ */
+
+#include "astcenc_internal.h"
+
+/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */
+#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64)
+
+/**
+ * @brief Generate a canonical representation of a partition pattern.
+ *
+ * The returned value stores two bits per texel, for up to 6x6x6 texels, where the two bits store
+ * the remapped texel index. Remapping ensures that we only match on the partition pattern,
+ * independent of the partition order generated by the hash.
+ *
+ * @param      texel_count          The number of texels in the block.
+ * @param      partition_of_texel   The partition assignments, in hash order.
+ * @param[out] bit_pattern          The output bit pattern representation.
+ */
+static void generate_canonical_partitioning(
+	unsigned int texel_count,
+	const uint8_t* partition_of_texel,
+	uint64_t bit_pattern[BIT_PATTERN_WORDS]
+) {
+	// Clear the pattern
+	for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++)
+	{
+		bit_pattern[i] = 0;
+	}
+
+	// Store a mapping to reorder the raw partitions so that the partitions are ordered such
+	// that the lowest texel index in partition N is smaller than the lowest texel index in
+	// partition N + 1.
+	int mapped_index[BLOCK_MAX_PARTITIONS];
+	int map_weight_count = 0;
+
+	for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
+	{
+		mapped_index[i] = -1;
+	}
+
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		int index = partition_of_texel[i];
+		if (mapped_index[index] < 0)
+		{
+			mapped_index[index] = map_weight_count++;
+		}
+
+		uint64_t xlat_index = mapped_index[index];
+		bit_pattern[i >> 5] |= xlat_index << (2 * (i & 0x1F));
+	}
+}
+
+/**
+ * @brief Compare two canonical patterns to see if they are the same.
+ *
+ * @param part1   The first canonical bit pattern to check.
+ * @param part2   The second canonical bit pattern to check.
+ *
+ * @return @c true if the patterns are the same, @c false otherwise.
+ */
+static bool compare_canonical_partitionings(
+	const uint64_t part1[BIT_PATTERN_WORDS],
+	const uint64_t part2[BIT_PATTERN_WORDS]
+) {
+	return (part1[0] == part2[0])
+#if BIT_PATTERN_WORDS > 1
+	    && (part1[1] == part2[1])
+#endif
+#if BIT_PATTERN_WORDS > 2
+	    && (part1[2] == part2[2])
+#endif
+#if BIT_PATTERN_WORDS > 3
+	    && (part1[3] == part2[3])
+#endif
+#if BIT_PATTERN_WORDS > 4
+	    && (part1[4] == part2[4])
+#endif
+#if BIT_PATTERN_WORDS > 5
+	    && (part1[5] == part2[5])
+#endif
+#if BIT_PATTERN_WORDS > 6
+	    && (part1[6] == part2[6])
+#endif
+	    ;
+}
+
+/**
+ * @brief Hash function used for procedural partition assignment.
+ *
+ * @param inp   The hash seed.
+ *
+ * @return The hashed value.
+ */
+static uint32_t hash52(
+	uint32_t inp
+) {
+	inp ^= inp >> 15;
+
+	// (2^4 + 1) * (2^7 + 1) * (2^17 - 1)
+	inp *= 0xEEDE0891;
+	inp ^= inp >> 5;
+	inp += inp << 16;
+	inp ^= inp >> 7;
+	inp ^= inp >> 3;
+	inp ^= inp << 6;
+	inp ^= inp >> 17;
+	return inp;
+}
+
+/**
+ * @brief Select texel assignment for a single coordinate.
+ *
+ * @param seed              The seed - the partition index from the block.
+ * @param x                 The texel X coordinate in the block.
+ * @param y                 The texel Y coordinate in the block.
+ * @param z                 The texel Z coordinate in the block.
+ * @param partition_count   The total partition count of this encoding.
+ * @param small_block       @c true if the block has fewer than 32 texels.
+ *
+ * @return The assigned partition index for this texel.
+ */
+static uint8_t select_partition(
+	int seed,
+	int x,
+	int y,
+	int z,
+	int partition_count,
+	bool small_block
+) {
+	// For small blocks bias the coordinates to get better distribution
+	if (small_block)
+	{
+		x <<= 1;
+		y <<= 1;
+		z <<= 1;
+	}
+
+	seed += (partition_count - 1) * 1024;
+
+	uint32_t rnum = hash52(seed);
+
+	uint8_t seed1 = rnum & 0xF;
+	uint8_t seed2 = (rnum >> 4) & 0xF;
+	uint8_t seed3 = (rnum >> 8) & 0xF;
+	uint8_t seed4 = (rnum >> 12) & 0xF;
+	uint8_t seed5 = (rnum >> 16) & 0xF;
+	uint8_t seed6 = (rnum >> 20) & 0xF;
+	uint8_t seed7 = (rnum >> 24) & 0xF;
+	uint8_t seed8 = (rnum >> 28) & 0xF;
+	uint8_t seed9 = (rnum >> 18) & 0xF;
+	uint8_t seed10 = (rnum >> 22) & 0xF;
+	uint8_t seed11 = (rnum >> 26) & 0xF;
+	uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
+
+	// Squaring all the seeds in order to bias their distribution towards lower values.
+	seed1 *= seed1;
+	seed2 *= seed2;
+	seed3 *= seed3;
+	seed4 *= seed4;
+	seed5 *= seed5;
+	seed6 *= seed6;
+	seed7 *= seed7;
+	seed8 *= seed8;
+	seed9 *= seed9;
+	seed10 *= seed10;
+	seed11 *= seed11;
+	seed12 *= seed12;
+
+	int sh1, sh2;
+	if (seed & 1)
+	{
+		sh1 = (seed & 2 ? 4 : 5);
+		sh2 = (partition_count == 3 ? 6 : 5);
+	}
+	else
+	{
+		sh1 = (partition_count == 3 ? 6 : 5);
+		sh2 = (seed & 2 ? 4 : 5);
+	}
+
+	int sh3 = (seed & 0x10) ? sh1 : sh2;
+
+	seed1 >>= sh1;
+	seed2 >>= sh2;
+	seed3 >>= sh1;
+	seed4 >>= sh2;
+	seed5 >>= sh1;
+	seed6 >>= sh2;
+	seed7 >>= sh1;
+	seed8 >>= sh2;
+
+	seed9 >>= sh3;
+	seed10 >>= sh3;
+	seed11 >>= sh3;
+	seed12 >>= sh3;
+
+	int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
+	int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
+	int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
+	int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+
+	// Apply the saw
+	a &= 0x3F;
+	b &= 0x3F;
+	c &= 0x3F;
+	d &= 0x3F;
+
+	// Remove some of the components if we are to output < 4 partitions.
+	if (partition_count <= 3)
+	{
+		d = 0;
+	}
+
+	if (partition_count <= 2)
+	{
+		c = 0;
+	}
+
+	if (partition_count <= 1)
+	{
+		b = 0;
+	}
+
+	uint8_t partition;
+	if (a >= b && a >= c && a >= d)
+	{
+		partition = 0;
+	}
+	else if (b >= c && b >= d)
+	{
+		partition = 1;
+	}
+	else if (c >= d)
+	{
+		partition = 2;
+	}
+	else
+	{
+		partition = 3;
+	}
+
+	return partition;
+}
+
+/**
+ * @brief Generate a single partition info structure.
+ *
+ * @param[out] bsd                     The block size information.
+ * @param      partition_count         The partition count of this partitioning.
+ * @param      partition_index         The partition index / seed of this partitioning.
+ * @param      partition_remap_index   The remapped partition index of this partitioning.
+ * @param[out] pi                      The partition info structure to populate.
+ *
+ * @return True if this is a useful partition index, False if we can skip it.
+ */
+static bool generate_one_partition_info_entry(
+	block_size_descriptor& bsd,
+	unsigned int partition_count,
+	unsigned int partition_index,
+	unsigned int partition_remap_index,
+	partition_info& pi
+) {
+	int texels_per_block = bsd.texel_count;
+	bool small_block = texels_per_block < 32;
+
+	uint8_t *partition_of_texel = pi.partition_of_texel;
+
+	// Assign texels to partitions
+	int texel_idx = 0;
+	int counts[BLOCK_MAX_PARTITIONS] { 0 };
+	for (unsigned int z = 0; z < bsd.zdim; z++)
+	{
+		for (unsigned int y = 0; y <  bsd.ydim; y++)
+		{
+			for (unsigned int x = 0; x <  bsd.xdim; x++)
+			{
+				uint8_t part = select_partition(partition_index, x, y, z, partition_count, small_block);
+				pi.texels_of_partition[part][counts[part]++] = static_cast<uint8_t>(texel_idx++);
+				*partition_of_texel++ = part;
+			}
+		}
+	}
+
+	// Fill loop tail so we can overfetch later
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		int ptex_count = counts[i];
+		int ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
+		for (int j = ptex_count; j < ptex_count_simd; j++)
+		{
+			pi.texels_of_partition[i][j] = pi.texels_of_partition[i][ptex_count - 1];
+		}
+	}
+
+	// Populate the actual procedural partition count
+	if (counts[0] == 0)
+	{
+		pi.partition_count = 0;
+	}
+	else if (counts[1] == 0)
+	{
+		pi.partition_count = 1;
+	}
+	else if (counts[2] == 0)
+	{
+		pi.partition_count = 2;
+	}
+	else if (counts[3] == 0)
+	{
+		pi.partition_count = 3;
+	}
+	else
+	{
+		pi.partition_count = 4;
+	}
+
+	// Populate the partition index
+	pi.partition_index = static_cast<uint16_t>(partition_index);
+
+	// Populate the coverage bitmaps for 2/3/4 partitions
+	uint64_t* bitmaps { nullptr };
+	if (partition_count == 2)
+	{
+		bitmaps = bsd.coverage_bitmaps_2[partition_remap_index];
+	}
+	else if (partition_count == 3)
+	{
+		bitmaps = bsd.coverage_bitmaps_3[partition_remap_index];
+	}
+	else if (partition_count == 4)
+	{
+		bitmaps = bsd.coverage_bitmaps_4[partition_remap_index];
+	}
+
+	for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
+	{
+		pi.partition_texel_count[i] = static_cast<uint8_t>(counts[i]);
+	}
+
+	// Valid partitionings have texels in all of the requested partitions
+	bool valid = pi.partition_count == partition_count;
+
+	if (bitmaps)
+	{
+		// Populate the partition coverage bitmap
+		for (unsigned int i = 0; i < partition_count; i++)
+		{
+			bitmaps[i] = 0ULL;
+		}
+
+		unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
+		for (unsigned int i = 0; i < texels_to_process; i++)
+		{
+			unsigned int idx = bsd.kmeans_texels[i];
+			bitmaps[pi.partition_of_texel[idx]] |= 1ULL << i;
+		}
+	}
+
+	return valid;
+}
+
+static void build_partition_table_for_one_partition_count(
+	block_size_descriptor& bsd,
+	bool can_omit_partitionings,
+	unsigned int partition_count_cutoff,
+	unsigned int partition_count,
+	partition_info* ptab,
+	uint64_t* canonical_patterns
+) {
+	unsigned int next_index = 0;
+	bsd.partitioning_count_selected[partition_count - 1] = 0;
+	bsd.partitioning_count_all[partition_count - 1] = 0;
+
+	// Skip tables larger than config max partition count if we can omit modes
+	if (can_omit_partitionings && (partition_count > partition_count_cutoff))
+	{
+		return;
+	}
+
+	// Iterate through twice
+	//   - Pass 0: Keep selected partitionings
+	//   - Pass 1: Keep non-selected partitionings (skip if in omit mode)
+	unsigned int max_iter = can_omit_partitionings ? 1 : 2;
+
+	// Tracker for things we built in the first iteration
+	uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 };
+	for (unsigned int x = 0; x < max_iter; x++)
+	{
+		for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++)
+		{
+			// Don't include things we built in the first pass
+			if ((x == 1) && build[i])
+			{
+				continue;
+			}
+
+			bool keep_useful = generate_one_partition_info_entry(bsd, partition_count, i, next_index, ptab[next_index]);
+			if ((x == 0) && !keep_useful)
+			{
+				continue;
+			}
+
+			generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * BIT_PATTERN_WORDS);
+			bool keep_canonical = true;
+			for (unsigned int j = 0; j < next_index; j++)
+			{
+				bool match = compare_canonical_partitionings(canonical_patterns + next_index * BIT_PATTERN_WORDS, canonical_patterns +  j * BIT_PATTERN_WORDS);
+				if (match)
+				{
+					keep_canonical = false;
+					break;
+				}
+			}
+
+			if (keep_useful && keep_canonical)
+			{
+				if (x == 0)
+				{
+					bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
+					bsd.partitioning_count_selected[partition_count - 1]++;
+					bsd.partitioning_count_all[partition_count - 1]++;
+					build[i] = 1;
+					next_index++;
+				}
+			}
+			else
+			{
+				if (x == 1)
+				{
+					bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
+					bsd.partitioning_count_all[partition_count - 1]++;
+					next_index++;
+				}
+			}
+		}
+	}
+}
+
+/* See header for documentation. */
+void init_partition_tables(
+	block_size_descriptor& bsd,
+	bool can_omit_partitionings,
+	unsigned int partition_count_cutoff
+) {
+	partition_info* par_tab2 = bsd.partitionings;
+	partition_info* par_tab3 = par_tab2 + BLOCK_MAX_PARTITIONINGS;
+	partition_info* par_tab4 = par_tab3 + BLOCK_MAX_PARTITIONINGS;
+	partition_info* par_tab1 = par_tab4 + BLOCK_MAX_PARTITIONINGS;
+
+	generate_one_partition_info_entry(bsd, 1, 0, 0, *par_tab1);
+	bsd.partitioning_count_selected[0] = 1;
+	bsd.partitioning_count_all[0] = 1;
+
+	uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS];
+
+	build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 2, par_tab2, canonical_patterns);
+	build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 3, par_tab3, canonical_patterns);
+	build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 4, par_tab4, canonical_patterns);
+
+	delete[] canonical_patterns;
+}
--- a/thirdparty/astcenc/astcenc_percentile_tables.cpp
+++ b/thirdparty/astcenc/astcenc_percentile_tables.cpp
--- a/thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp
+++ b/thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp
--- a/thirdparty/astcenc/astcenc_platform_isa_detection.cpp
+++ b/thirdparty/astcenc/astcenc_platform_isa_detection.cpp
@ -0,0 +1,166 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Platform-specific function implementations.
+ *
+ * This module contains functions for querying the host extended ISA support.
+ */
+
+// Include before the defines below to pick up any auto-setup based on compiler
+// built-in config, if not being set explicitly by the build system
+#include "astcenc_internal.h"
+
+#if (ASTCENC_SSE > 0)    || (ASTCENC_AVX > 0) || \
+    (ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
+
+static bool g_init { false };
+
+/** Does this CPU support SSE 4.1? Set to -1 if not yet initialized. */
+static bool g_cpu_has_sse41 { false };
+
+/** Does this CPU support AVX2? Set to -1 if not yet initialized. */
+static bool g_cpu_has_avx2 { false };
+
+/** Does this CPU support POPCNT? Set to -1 if not yet initialized. */
+static bool g_cpu_has_popcnt { false };
+
+/** Does this CPU support F16C? Set to -1 if not yet initialized. */
+static bool g_cpu_has_f16c { false };
+
+/* ============================================================================
+   Platform code for Visual Studio
+============================================================================ */
+#if !defined(__clang__) && defined(_MSC_VER)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <intrin.h>
+
+/**
+ * @brief Detect platform CPU ISA support and update global trackers.
+ */
+static void detect_cpu_isa()
+{
+	int data[4];
+
+	__cpuid(data, 0);
+	int num_id = data[0];
+
+	if (num_id >= 1)
+	{
+		__cpuidex(data, 1, 0);
+		// SSE41 = Bank 1, ECX, bit 19
+		g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
+		// POPCNT = Bank 1, ECX, bit 23
+		g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
+		// F16C = Bank 1, ECX, bit 29
+		g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
+	}
+
+	if (num_id >= 7)
+	{
+		__cpuidex(data, 7, 0);
+		// AVX2 = Bank 7, EBX, bit 5
+		g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
+	}
+
+	// Ensure state bits are updated before init flag is updated
+	MemoryBarrier();
+	g_init = true;
+}
+
+/* ============================================================================
+   Platform code for GCC and Clang
+============================================================================ */
+#else
+#include <cpuid.h>
+
+/**
+ * @brief Detect platform CPU ISA support and update global trackers.
+ */
+static void detect_cpu_isa()
+{
+	unsigned int data[4];
+
+	if (__get_cpuid_count(1, 0, &data[0], &data[1], &data[2], &data[3]))
+	{
+		// SSE41 = Bank 1, ECX, bit 19
+		g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
+		// POPCNT = Bank 1, ECX, bit 23
+		g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
+		// F16C = Bank 1, ECX, bit 29
+		g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
+	}
+
+	g_cpu_has_avx2 = 0;
+	if (__get_cpuid_count(7, 0, &data[0], &data[1], &data[2], &data[3]))
+	{
+		// AVX2 = Bank 7, EBX, bit 5
+		g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
+	}
+
+	// Ensure state bits are updated before init flag is updated
+	__sync_synchronize();
+	g_init = true;
+}
+#endif
+
+/* See header for documentation. */
+bool cpu_supports_popcnt()
+{
+	if (!g_init)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_popcnt;
+}
+
+/* See header for documentation. */
+bool cpu_supports_f16c()
+{
+	if (!g_init)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_f16c;
+}
+
+/* See header for documentation. */
+bool cpu_supports_sse41()
+{
+	if (!g_init)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_sse41;
+}
+
+/* See header for documentation. */
+bool cpu_supports_avx2()
+{
+	if (!g_init)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_avx2;
+}
+
+#endif
--- a/thirdparty/astcenc/astcenc_quantization.cpp
+++ b/thirdparty/astcenc/astcenc_quantization.cpp
@ -0,0 +1,904 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions and data tables for numeric quantization..
+ */
+
+#include "astcenc_internal.h"
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+// Starts from QUANT_6
+// Not scrambled
+const uint8_t color_unquant_to_uquant_tables[17][256] {
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+		153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+		255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,
+		 36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,
+		 36,  36,  36,  36,  36,  36,  36,  73,  73,  73,  73,  73,  73,  73,  73,  73,
+		 73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,
+		 73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73, 109, 109, 109, 109,
+		109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
+		109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
+		146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
+		146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
+		146, 146, 146, 146, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
+		182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
+		182, 182, 182, 182, 182, 182, 182, 182, 182, 219, 219, 219, 219, 219, 219, 219,
+		219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
+		219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 255, 255, 255,
+		255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  28,
+		 28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
+		 28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  56,  56,  56,  56,  56,
+		 56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,
+		 56,  56,  56,  56,  56,  56,  56,  84,  84,  84,  84,  84,  84,  84,  84,  84,
+		 84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,
+		 84,  84,  84, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
+		113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
+		142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
+		142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 171, 171, 171,
+		171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171,
+		171, 171, 171, 171, 171, 171, 171, 171, 171, 199, 199, 199, 199, 199, 199, 199,
+		199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
+		199, 199, 199, 199, 199, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
+		227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
+		227, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  23,  23,  23,  23,
+		 23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,
+		 23,  23,  23,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,
+		 46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  69,  69,  69,  69,  69,  69,
+		 69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,
+		 69,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,
+		 92,  92,  92,  92,  92,  92,  92,  92,  92, 116, 116, 116, 116, 116, 116, 116,
+		116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
+		139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139,
+		139, 139, 139, 139, 139, 139, 139, 163, 163, 163, 163, 163, 163, 163, 163, 163,
+		163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 186,
+		186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
+		186, 186, 186, 186, 186, 186, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209,
+		209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 232, 232, 232,
+		232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
+		232, 232, 232, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,  17,  17,  17,  17,  17,  17,  17,
+		 17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  34,  34,  34,  34,  34,  34,
+		 34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  68,  68,  68,  68,
+		 68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  85,  85,  85,
+		 85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119,
+		119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
+		136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
+		136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
+		170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
+		187, 187, 187, 187, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221,
+		221, 221, 221, 221, 221, 221, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238,
+		238, 238, 238, 238, 238, 238, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,  13,  13,  13,  13,  13,  13,  13,  13,  13,
+		 13,  13,  13,  13,  13,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,
+		 27,  27,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,
+		 54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  67,  67,  67,
+		 67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  80,  80,  80,  80,  80,  80,
+		 80,  80,  80,  80,  80,  80,  80,  80,  94,  94,  94,  94,  94,  94,  94,  94,
+		 94,  94,  94,  94,  94, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
+		107, 107, 107, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
+		134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 148, 148, 148,
+		148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 161, 161, 161, 161, 161,
+		161, 161, 161, 161, 161, 161, 161, 161, 175, 175, 175, 175, 175, 175, 175, 175,
+		175, 175, 175, 175, 175, 175, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188,
+		188, 188, 188, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201,
+		215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 228, 228,
+		228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 242, 242, 242, 242, 242,
+		242, 242, 242, 242, 242, 242, 242, 242, 242, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,
+		 11,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  33,  33,  33,  33,
+		 33,  33,  33,  33,  33,  33,  33,  44,  44,  44,  44,  44,  44,  44,  44,  44,
+		 44,  44,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  66,  66,  66,
+		 66,  66,  66,  66,  66,  66,  66,  66,  77,  77,  77,  77,  77,  77,  77,  77,
+		 77,  77,  77,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  99,  99,
+		 99,  99,  99,  99,  99,  99,  99,  99,  99, 110, 110, 110, 110, 110, 110, 110,
+		110, 110, 110, 110, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
+		134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 145, 145, 145, 145,
+		145, 145, 145, 145, 145, 145, 145, 156, 156, 156, 156, 156, 156, 156, 156, 156,
+		156, 156, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 178, 178, 178,
+		178, 178, 178, 178, 178, 178, 178, 178, 189, 189, 189, 189, 189, 189, 189, 189,
+		189, 189, 189, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 211, 211,
+		211, 211, 211, 211, 211, 211, 211, 211, 211, 222, 222, 222, 222, 222, 222, 222,
+		222, 222, 222, 222, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 244,
+		244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   8,   8,   8,   8,   8,   8,   8,   8,  16,  16,  16,
+		 16,  16,  16,  16,  16,  24,  24,  24,  24,  24,  24,  24,  24,  33,  33,  33,
+		 33,  33,  33,  33,  33,  33,  41,  41,  41,  41,  41,  41,  41,  41,  49,  49,
+		 49,  49,  49,  49,  49,  49,  57,  57,  57,  57,  57,  57,  57,  57,  66,  66,
+		 66,  66,  66,  66,  66,  66,  66,  74,  74,  74,  74,  74,  74,  74,  74,  82,
+		 82,  82,  82,  82,  82,  82,  82,  90,  90,  90,  90,  90,  90,  90,  90,  99,
+		 99,  99,  99,  99,  99,  99,  99,  99, 107, 107, 107, 107, 107, 107, 107, 107,
+		115, 115, 115, 115, 115, 115, 115, 115, 123, 123, 123, 123, 123, 123, 123, 123,
+		132, 132, 132, 132, 132, 132, 132, 132, 140, 140, 140, 140, 140, 140, 140, 140,
+		148, 148, 148, 148, 148, 148, 148, 148, 156, 156, 156, 156, 156, 156, 156, 156,
+		156, 165, 165, 165, 165, 165, 165, 165, 165, 173, 173, 173, 173, 173, 173, 173,
+		173, 181, 181, 181, 181, 181, 181, 181, 181, 189, 189, 189, 189, 189, 189, 189,
+		189, 189, 198, 198, 198, 198, 198, 198, 198, 198, 206, 206, 206, 206, 206, 206,
+		206, 206, 214, 214, 214, 214, 214, 214, 214, 214, 222, 222, 222, 222, 222, 222,
+		222, 222, 222, 231, 231, 231, 231, 231, 231, 231, 231, 239, 239, 239, 239, 239,
+		239, 239, 239, 247, 247, 247, 247, 247, 247, 247, 247, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   6,   6,   6,   6,   6,   6,  13,  13,  13,  13,  13,  13,
+		 13,  19,  19,  19,  19,  19,  19,  26,  26,  26,  26,  26,  26,  26,  32,  32,
+		 32,  32,  32,  32,  39,  39,  39,  39,  39,  39,  39,  45,  45,  45,  45,  45,
+		 45,  52,  52,  52,  52,  52,  52,  52,  58,  58,  58,  58,  58,  58,  65,  65,
+		 65,  65,  65,  65,  65,  71,  71,  71,  71,  71,  71,  78,  78,  78,  78,  78,
+		 78,  78,  84,  84,  84,  84,  84,  84,  91,  91,  91,  91,  91,  91,  91,  97,
+		 97,  97,  97,  97,  97, 104, 104, 104, 104, 104, 104, 104, 110, 110, 110, 110,
+		110, 110, 117, 117, 117, 117, 117, 117, 117, 123, 123, 123, 123, 123, 123, 123,
+		132, 132, 132, 132, 132, 132, 132, 138, 138, 138, 138, 138, 138, 138, 145, 145,
+		145, 145, 145, 145, 151, 151, 151, 151, 151, 151, 151, 158, 158, 158, 158, 158,
+		158, 164, 164, 164, 164, 164, 164, 164, 171, 171, 171, 171, 171, 171, 177, 177,
+		177, 177, 177, 177, 177, 184, 184, 184, 184, 184, 184, 190, 190, 190, 190, 190,
+		190, 190, 197, 197, 197, 197, 197, 197, 203, 203, 203, 203, 203, 203, 203, 210,
+		210, 210, 210, 210, 210, 216, 216, 216, 216, 216, 216, 216, 223, 223, 223, 223,
+		223, 223, 229, 229, 229, 229, 229, 229, 229, 236, 236, 236, 236, 236, 236, 242,
+		242, 242, 242, 242, 242, 242, 249, 249, 249, 249, 249, 249, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   5,   5,   5,   5,   5,   5,  11,  11,  11,  11,  11,  16,  16,
+		 16,  16,  16,  21,  21,  21,  21,  21,  21,  27,  27,  27,  27,  27,  32,  32,
+		 32,  32,  32,  32,  38,  38,  38,  38,  38,  43,  43,  43,  43,  43,  48,  48,
+		 48,  48,  48,  48,  54,  54,  54,  54,  54,  59,  59,  59,  59,  59,  59,  65,
+		 65,  65,  65,  65,  70,  70,  70,  70,  70,  70,  76,  76,  76,  76,  76,  81,
+		 81,  81,  81,  81,  86,  86,  86,  86,  86,  86,  92,  92,  92,  92,  92,  97,
+		 97,  97,  97,  97,  97, 103, 103, 103, 103, 103, 108, 108, 108, 108, 108, 113,
+		113, 113, 113, 113, 113, 119, 119, 119, 119, 119, 124, 124, 124, 124, 124, 124,
+		131, 131, 131, 131, 131, 131, 136, 136, 136, 136, 136, 142, 142, 142, 142, 142,
+		142, 147, 147, 147, 147, 147, 152, 152, 152, 152, 152, 158, 158, 158, 158, 158,
+		158, 163, 163, 163, 163, 163, 169, 169, 169, 169, 169, 169, 174, 174, 174, 174,
+		174, 179, 179, 179, 179, 179, 185, 185, 185, 185, 185, 185, 190, 190, 190, 190,
+		190, 196, 196, 196, 196, 196, 196, 201, 201, 201, 201, 201, 207, 207, 207, 207,
+		207, 207, 212, 212, 212, 212, 212, 217, 217, 217, 217, 217, 223, 223, 223, 223,
+		223, 223, 228, 228, 228, 228, 228, 234, 234, 234, 234, 234, 234, 239, 239, 239,
+		239, 239, 244, 244, 244, 244, 244, 250, 250, 250, 250, 250, 250, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   4,   4,   4,   4,   8,   8,   8,   8,  12,  12,  12,  12,  16,
+		 16,  16,  16,  20,  20,  20,  20,  24,  24,  24,  24,  28,  28,  28,  28,  32,
+		 32,  32,  32,  36,  36,  36,  36,  40,  40,  40,  40,  44,  44,  44,  44,  48,
+		 48,  48,  48,  52,  52,  52,  52,  56,  56,  56,  56,  60,  60,  60,  60,  65,
+		 65,  65,  65,  65,  69,  69,  69,  69,  73,  73,  73,  73,  77,  77,  77,  77,
+		 81,  81,  81,  81,  85,  85,  85,  85,  89,  89,  89,  89,  93,  93,  93,  93,
+		 97,  97,  97,  97, 101, 101, 101, 101, 105, 105, 105, 105, 109, 109, 109, 109,
+		113, 113, 113, 113, 117, 117, 117, 117, 121, 121, 121, 121, 125, 125, 125, 125,
+		130, 130, 130, 130, 134, 134, 134, 134, 138, 138, 138, 138, 142, 142, 142, 142,
+		146, 146, 146, 146, 150, 150, 150, 150, 154, 154, 154, 154, 158, 158, 158, 158,
+		162, 162, 162, 162, 166, 166, 166, 166, 170, 170, 170, 170, 174, 174, 174, 174,
+		178, 178, 178, 178, 182, 182, 182, 182, 186, 186, 186, 186, 190, 190, 190, 190,
+		190, 195, 195, 195, 195, 199, 199, 199, 199, 203, 203, 203, 203, 207, 207, 207,
+		207, 211, 211, 211, 211, 215, 215, 215, 215, 219, 219, 219, 219, 223, 223, 223,
+		223, 227, 227, 227, 227, 231, 231, 231, 231, 235, 235, 235, 235, 239, 239, 239,
+		239, 243, 243, 243, 243, 247, 247, 247, 247, 251, 251, 251, 251, 255, 255, 255
+	},
+	{
+		  0,   0,   3,   3,   3,   6,   6,   6,   9,   9,   9,   9,  13,  13,  13,  16,
+		 16,  16,  19,  19,  19,  22,  22,  22,  25,  25,  25,  25,  29,  29,  29,  32,
+		 32,  32,  35,  35,  35,  38,  38,  38,  38,  42,  42,  42,  45,  45,  45,  48,
+		 48,  48,  51,  51,  51,  54,  54,  54,  54,  58,  58,  58,  61,  61,  61,  64,
+		 64,  64,  67,  67,  67,  67,  71,  71,  71,  74,  74,  74,  77,  77,  77,  80,
+		 80,  80,  83,  83,  83,  83,  87,  87,  87,  90,  90,  90,  93,  93,  93,  96,
+		 96,  96,  96, 100, 100, 100, 103, 103, 103, 106, 106, 106, 109, 109, 109, 112,
+		112, 112, 112, 116, 116, 116, 119, 119, 119, 122, 122, 122, 125, 125, 125, 125,
+		130, 130, 130, 130, 133, 133, 133, 136, 136, 136, 139, 139, 139, 143, 143, 143,
+		143, 146, 146, 146, 149, 149, 149, 152, 152, 152, 155, 155, 155, 159, 159, 159,
+		159, 162, 162, 162, 165, 165, 165, 168, 168, 168, 172, 172, 172, 172, 175, 175,
+		175, 178, 178, 178, 181, 181, 181, 184, 184, 184, 188, 188, 188, 188, 191, 191,
+		191, 194, 194, 194, 197, 197, 197, 201, 201, 201, 201, 204, 204, 204, 207, 207,
+		207, 210, 210, 210, 213, 213, 213, 217, 217, 217, 217, 220, 220, 220, 223, 223,
+		223, 226, 226, 226, 230, 230, 230, 230, 233, 233, 233, 236, 236, 236, 239, 239,
+		239, 242, 242, 242, 246, 246, 246, 246, 249, 249, 249, 252, 252, 252, 255, 255
+	},
+	{
+		  0,   0,   2,   2,   5,   5,   5,   8,   8,   8,  10,  10,  13,  13,  13,  16,
+		 16,  16,  18,  18,  21,  21,  21,  24,  24,  24,  26,  26,  29,  29,  29,  32,
+		 32,  32,  35,  35,  35,  37,  37,  40,  40,  40,  43,  43,  43,  45,  45,  48,
+		 48,  48,  51,  51,  51,  53,  53,  56,  56,  56,  59,  59,  59,  61,  61,  64,
+		 64,  64,  67,  67,  67,  70,  70,  70,  72,  72,  75,  75,  75,  78,  78,  78,
+		 80,  80,  83,  83,  83,  86,  86,  86,  88,  88,  91,  91,  91,  94,  94,  94,
+		 96,  96,  99,  99,  99, 102, 102, 102, 104, 104, 107, 107, 107, 110, 110, 110,
+		112, 112, 115, 115, 115, 118, 118, 118, 120, 120, 123, 123, 123, 126, 126, 126,
+		129, 129, 129, 132, 132, 132, 135, 135, 137, 137, 137, 140, 140, 140, 143, 143,
+		145, 145, 145, 148, 148, 148, 151, 151, 153, 153, 153, 156, 156, 156, 159, 159,
+		161, 161, 161, 164, 164, 164, 167, 167, 169, 169, 169, 172, 172, 172, 175, 175,
+		177, 177, 177, 180, 180, 180, 183, 183, 185, 185, 185, 188, 188, 188, 191, 191,
+		191, 194, 194, 196, 196, 196, 199, 199, 199, 202, 202, 204, 204, 204, 207, 207,
+		207, 210, 210, 212, 212, 212, 215, 215, 215, 218, 218, 220, 220, 220, 223, 223,
+		223, 226, 226, 226, 229, 229, 231, 231, 231, 234, 234, 234, 237, 237, 239, 239,
+		239, 242, 242, 242, 245, 245, 247, 247, 247, 250, 250, 250, 253, 253, 255, 255
+	},
+	{
+		  0,   0,   2,   2,   4,   4,   6,   6,   8,   8,  10,  10,  12,  12,  14,  14,
+		 16,  16,  18,  18,  20,  20,  22,  22,  24,  24,  26,  26,  28,  28,  30,  30,
+		 32,  32,  34,  34,  36,  36,  38,  38,  40,  40,  42,  42,  44,  44,  46,  46,
+		 48,  48,  50,  50,  52,  52,  54,  54,  56,  56,  58,  58,  60,  60,  62,  62,
+		 64,  64,  66,  66,  68,  68,  70,  70,  72,  72,  74,  74,  76,  76,  78,  78,
+		 80,  80,  82,  82,  84,  84,  86,  86,  88,  88,  90,  90,  92,  92,  94,  94,
+		 96,  96,  98,  98, 100, 100, 102, 102, 104, 104, 106, 106, 108, 108, 110, 110,
+		112, 112, 114, 114, 116, 116, 118, 118, 120, 120, 122, 122, 124, 124, 126, 126,
+		129, 129, 131, 131, 133, 133, 135, 135, 137, 137, 139, 139, 141, 141, 143, 143,
+		145, 145, 147, 147, 149, 149, 151, 151, 153, 153, 155, 155, 157, 157, 159, 159,
+		161, 161, 163, 163, 165, 165, 167, 167, 169, 169, 171, 171, 173, 173, 175, 175,
+		177, 177, 179, 179, 181, 181, 183, 183, 185, 185, 187, 187, 189, 189, 191, 191,
+		193, 193, 195, 195, 197, 197, 199, 199, 201, 201, 203, 203, 205, 205, 207, 207,
+		209, 209, 211, 211, 213, 213, 215, 215, 217, 217, 219, 219, 221, 221, 223, 223,
+		225, 225, 227, 227, 229, 229, 231, 231, 233, 233, 235, 235, 237, 237, 239, 239,
+		241, 241, 243, 243, 245, 245, 247, 247, 249, 249, 251, 251, 253, 253, 255, 255
+	},
+	{
+		  0,   1,   1,   3,   4,   4,   6,   6,   8,   9,   9,  11,  12,  12,  14,  14,
+		 16,  17,  17,  19,  20,  20,  22,  22,  24,  25,  25,  27,  28,  28,  30,  30,
+		 32,  33,  33,  35,  36,  36,  38,  38,  40,  41,  41,  43,  44,  44,  46,  46,
+		 48,  49,  49,  51,  52,  52,  54,  54,  56,  57,  57,  59,  60,  60,  62,  62,
+		 64,  65,  65,  67,  68,  68,  70,  70,  72,  73,  73,  75,  76,  76,  78,  78,
+		 80,  81,  81,  83,  84,  84,  86,  86,  88,  89,  89,  91,  92,  92,  94,  94,
+		 96,  97,  97,  99, 100, 100, 102, 102, 104, 105, 105, 107, 108, 108, 110, 110,
+		112, 113, 113, 115, 116, 116, 118, 118, 120, 121, 121, 123, 124, 124, 126, 126,
+		129, 129, 131, 131, 132, 134, 134, 135, 137, 137, 139, 139, 140, 142, 142, 143,
+		145, 145, 147, 147, 148, 150, 150, 151, 153, 153, 155, 155, 156, 158, 158, 159,
+		161, 161, 163, 163, 164, 166, 166, 167, 169, 169, 171, 171, 172, 174, 174, 175,
+		177, 177, 179, 179, 180, 182, 182, 183, 185, 185, 187, 187, 188, 190, 190, 191,
+		193, 193, 195, 195, 196, 198, 198, 199, 201, 201, 203, 203, 204, 206, 206, 207,
+		209, 209, 211, 211, 212, 214, 214, 215, 217, 217, 219, 219, 220, 222, 222, 223,
+		225, 225, 227, 227, 228, 230, 230, 231, 233, 233, 235, 235, 236, 238, 238, 239,
+		241, 241, 243, 243, 244, 246, 246, 247, 249, 249, 251, 251, 252, 254, 254, 255
+	},
+	{
+		  0,   1,   2,   2,   4,   5,   6,   6,   8,   9,  10,  10,  12,  13,  14,  14,
+		 16,  17,  18,  18,  20,  21,  22,  22,  24,  25,  26,  26,  28,  29,  30,  30,
+		 32,  33,  34,  34,  36,  37,  38,  38,  40,  41,  42,  42,  44,  45,  46,  46,
+		 48,  49,  50,  50,  52,  53,  54,  54,  56,  57,  58,  58,  60,  61,  62,  62,
+		 64,  65,  66,  66,  68,  69,  70,  70,  72,  73,  74,  74,  76,  77,  78,  78,
+		 80,  81,  82,  82,  84,  85,  86,  86,  88,  89,  90,  90,  92,  93,  94,  94,
+		 96,  97,  98,  98, 100, 101, 102, 102, 104, 105, 106, 106, 108, 109, 110, 110,
+		112, 113, 114, 114, 116, 117, 118, 118, 120, 121, 122, 122, 124, 125, 126, 126,
+		129, 129, 130, 131, 133, 133, 134, 135, 137, 137, 138, 139, 141, 141, 142, 143,
+		145, 145, 146, 147, 149, 149, 150, 151, 153, 153, 154, 155, 157, 157, 158, 159,
+		161, 161, 162, 163, 165, 165, 166, 167, 169, 169, 170, 171, 173, 173, 174, 175,
+		177, 177, 178, 179, 181, 181, 182, 183, 185, 185, 186, 187, 189, 189, 190, 191,
+		193, 193, 194, 195, 197, 197, 198, 199, 201, 201, 202, 203, 205, 205, 206, 207,
+		209, 209, 210, 211, 213, 213, 214, 215, 217, 217, 218, 219, 221, 221, 222, 223,
+		225, 225, 226, 227, 229, 229, 230, 231, 233, 233, 234, 235, 237, 237, 238, 239,
+		241, 241, 242, 243, 245, 245, 246, 247, 249, 249, 250, 251, 253, 253, 254, 255
+	},
+	{
+		  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+		 16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+		 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+		 48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+		 64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+		 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+		 96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+		112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+		128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+		144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+		160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+		176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+		192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+		208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+		224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+		240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
+	}
+};
+
+// Starts from QUANT_6
+// Scrambled
+const uint8_t color_uquant_to_scrambled_pquant_tables[17][256] {
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,
+		 11,  11,  11,  11,  11,  11,  11,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,
+		  9,   9,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  10,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,
+		 11,  11,  11,  11,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,
+		 12,  12,  12,  12,  12,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
+		 13,  13,  13,  13,  13,  13,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,
+		 14,  14,  14,  14,  14,  14,  14,  15,  15,  15,  15,  15,  15,  15,  15,  15
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   8,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,
+		 16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  10,  10,  10,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,
+		 14,  14,  14,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,
+		 19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  15,  15,  15,
+		 15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  11,  11,  11,  11,  11,
+		 11,  11,  11,  11,  11,  11,  11,  11,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,
+		 13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,   9,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   1,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,  12,  12,  12,  12,  12,  12,  12,  12,
+		 12,  12,  12,  20,  20,  20,  20,  20,  20,  20,  20,  20,  20,  20,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,  14,  14,  14,  14,  14,  14,  14,
+		 14,  14,  14,  14,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,
+		 23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  15,  15,  15,  15,
+		 15,  15,  15,  15,  15,  15,  15,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,  21,  21,  21,  21,  21,  21,  21,  21,  21,  21,  21,  13,  13,  13,
+		 13,  13,  13,  13,  13,  13,  13,  13,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  11,  11,
+		 11,  11,  11,  11,  11,  11,  11,  11,  11,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1,   1,   1,   2,   2,   2,
+		  2,   2,   2,   2,   2,   3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   5,   5,   5,   5,   5,   5,   5,   5,   6,   6,
+		  6,   6,   6,   6,   6,   6,   7,   7,   7,   7,   7,   7,   7,   7,   8,   8,
+		  8,   8,   8,   8,   8,   8,   8,   9,   9,   9,   9,   9,   9,   9,   9,  10,
+		 10,  10,  10,  10,  10,  10,  10,  11,  11,  11,  11,  11,  11,  11,  11,  12,
+		 12,  12,  12,  12,  12,  12,  12,  12,  13,  13,  13,  13,  13,  13,  13,  13,
+		 14,  14,  14,  14,  14,  14,  14,  14,  15,  15,  15,  15,  15,  15,  15,  15,
+		 16,  16,  16,  16,  16,  16,  16,  16,  17,  17,  17,  17,  17,  17,  17,  17,
+		 18,  18,  18,  18,  18,  18,  18,  18,  19,  19,  19,  19,  19,  19,  19,  19,
+		 19,  20,  20,  20,  20,  20,  20,  20,  20,  21,  21,  21,  21,  21,  21,  21,
+		 21,  22,  22,  22,  22,  22,  22,  22,  22,  23,  23,  23,  23,  23,  23,  23,
+		 23,  23,  24,  24,  24,  24,  24,  24,  24,  24,  25,  25,  25,  25,  25,  25,
+		 25,  25,  26,  26,  26,  26,  26,  26,  26,  26,  27,  27,  27,  27,  27,  27,
+		 27,  27,  27,  28,  28,  28,  28,  28,  28,  28,  28,  29,  29,  29,  29,  29,
+		 29,  29,  29,  30,  30,  30,  30,  30,  30,  30,  30,  31,  31,  31,  31,  31
+	},
+	{
+		  0,   0,   0,   0,   8,   8,   8,   8,   8,   8,  16,  16,  16,  16,  16,  16,
+		 16,  24,  24,  24,  24,  24,  24,  32,  32,  32,  32,  32,  32,  32,   2,   2,
+		  2,   2,   2,   2,  10,  10,  10,  10,  10,  10,  10,  18,  18,  18,  18,  18,
+		 18,  26,  26,  26,  26,  26,  26,  26,  34,  34,  34,  34,  34,  34,   4,   4,
+		  4,   4,   4,   4,   4,  12,  12,  12,  12,  12,  12,  20,  20,  20,  20,  20,
+		 20,  20,  28,  28,  28,  28,  28,  28,  36,  36,  36,  36,  36,  36,  36,   6,
+		  6,   6,   6,   6,   6,  14,  14,  14,  14,  14,  14,  14,  22,  22,  22,  22,
+		 22,  22,  30,  30,  30,  30,  30,  30,  30,  38,  38,  38,  38,  38,  38,  38,
+		 39,  39,  39,  39,  39,  39,  39,  31,  31,  31,  31,  31,  31,  31,  23,  23,
+		 23,  23,  23,  23,  15,  15,  15,  15,  15,  15,  15,   7,   7,   7,   7,   7,
+		  7,  37,  37,  37,  37,  37,  37,  37,  29,  29,  29,  29,  29,  29,  21,  21,
+		 21,  21,  21,  21,  21,  13,  13,  13,  13,  13,  13,   5,   5,   5,   5,   5,
+		  5,   5,  35,  35,  35,  35,  35,  35,  27,  27,  27,  27,  27,  27,  27,  19,
+		 19,  19,  19,  19,  19,  11,  11,  11,  11,  11,  11,  11,   3,   3,   3,   3,
+		  3,   3,  33,  33,  33,  33,  33,  33,  33,  25,  25,  25,  25,  25,  25,  17,
+		 17,  17,  17,  17,  17,  17,   9,   9,   9,   9,   9,   9,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,  16,  16,  16,  16,  16,  16,  32,  32,  32,  32,  32,   2,   2,
+		  2,   2,   2,  18,  18,  18,  18,  18,  18,  34,  34,  34,  34,  34,   4,   4,
+		  4,   4,   4,   4,  20,  20,  20,  20,  20,  36,  36,  36,  36,  36,   6,   6,
+		  6,   6,   6,   6,  22,  22,  22,  22,  22,  38,  38,  38,  38,  38,  38,   8,
+		  8,   8,   8,   8,  24,  24,  24,  24,  24,  24,  40,  40,  40,  40,  40,  10,
+		 10,  10,  10,  10,  26,  26,  26,  26,  26,  26,  42,  42,  42,  42,  42,  12,
+		 12,  12,  12,  12,  12,  28,  28,  28,  28,  28,  44,  44,  44,  44,  44,  14,
+		 14,  14,  14,  14,  14,  30,  30,  30,  30,  30,  46,  46,  46,  46,  46,  46,
+		 47,  47,  47,  47,  47,  47,  31,  31,  31,  31,  31,  15,  15,  15,  15,  15,
+		 15,  45,  45,  45,  45,  45,  29,  29,  29,  29,  29,  13,  13,  13,  13,  13,
+		 13,  43,  43,  43,  43,  43,  27,  27,  27,  27,  27,  27,  11,  11,  11,  11,
+		 11,  41,  41,  41,  41,  41,  25,  25,  25,  25,  25,  25,   9,   9,   9,   9,
+		  9,  39,  39,  39,  39,  39,  39,  23,  23,  23,  23,  23,   7,   7,   7,   7,
+		  7,   7,  37,  37,  37,  37,  37,  21,  21,  21,  21,  21,   5,   5,   5,   5,
+		  5,   5,  35,  35,  35,  35,  35,  19,  19,  19,  19,  19,  19,   3,   3,   3,
+		  3,   3,  33,  33,  33,  33,  33,  17,  17,  17,  17,  17,  17,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   1,   1,   1,   1,   2,   2,   2,   2,   3,   3,   3,   3,   4,
+		  4,   4,   4,   5,   5,   5,   5,   6,   6,   6,   6,   7,   7,   7,   7,   8,
+		  8,   8,   8,   9,   9,   9,   9,  10,  10,  10,  10,  11,  11,  11,  11,  12,
+		 12,  12,  12,  13,  13,  13,  13,  14,  14,  14,  14,  15,  15,  15,  15,  16,
+		 16,  16,  16,  16,  17,  17,  17,  17,  18,  18,  18,  18,  19,  19,  19,  19,
+		 20,  20,  20,  20,  21,  21,  21,  21,  22,  22,  22,  22,  23,  23,  23,  23,
+		 24,  24,  24,  24,  25,  25,  25,  25,  26,  26,  26,  26,  27,  27,  27,  27,
+		 28,  28,  28,  28,  29,  29,  29,  29,  30,  30,  30,  30,  31,  31,  31,  31,
+		 32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  35,
+		 36,  36,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,  39,  39,  39,
+		 40,  40,  40,  40,  41,  41,  41,  41,  42,  42,  42,  42,  43,  43,  43,  43,
+		 44,  44,  44,  44,  45,  45,  45,  45,  46,  46,  46,  46,  47,  47,  47,  47,
+		 47,  48,  48,  48,  48,  49,  49,  49,  49,  50,  50,  50,  50,  51,  51,  51,
+		 51,  52,  52,  52,  52,  53,  53,  53,  53,  54,  54,  54,  54,  55,  55,  55,
+		 55,  56,  56,  56,  56,  57,  57,  57,  57,  58,  58,  58,  58,  59,  59,  59,
+		 59,  60,  60,  60,  60,  61,  61,  61,  61,  62,  62,  62,  62,  63,  63,  63
+	},
+	{
+		  0,   0,  16,  16,  16,  32,  32,  32,  48,  48,  48,  48,  64,  64,  64,   2,
+		  2,   2,  18,  18,  18,  34,  34,  34,  50,  50,  50,  50,  66,  66,  66,   4,
+		  4,   4,  20,  20,  20,  36,  36,  36,  36,  52,  52,  52,  68,  68,  68,   6,
+		  6,   6,  22,  22,  22,  38,  38,  38,  38,  54,  54,  54,  70,  70,  70,   8,
+		  8,   8,  24,  24,  24,  24,  40,  40,  40,  56,  56,  56,  72,  72,  72,  10,
+		 10,  10,  26,  26,  26,  26,  42,  42,  42,  58,  58,  58,  74,  74,  74,  12,
+		 12,  12,  12,  28,  28,  28,  44,  44,  44,  60,  60,  60,  76,  76,  76,  14,
+		 14,  14,  14,  30,  30,  30,  46,  46,  46,  62,  62,  62,  78,  78,  78,  78,
+		 79,  79,  79,  79,  63,  63,  63,  47,  47,  47,  31,  31,  31,  15,  15,  15,
+		 15,  77,  77,  77,  61,  61,  61,  45,  45,  45,  29,  29,  29,  13,  13,  13,
+		 13,  75,  75,  75,  59,  59,  59,  43,  43,  43,  27,  27,  27,  27,  11,  11,
+		 11,  73,  73,  73,  57,  57,  57,  41,  41,  41,  25,  25,  25,  25,   9,   9,
+		  9,  71,  71,  71,  55,  55,  55,  39,  39,  39,  39,  23,  23,  23,   7,   7,
+		  7,  69,  69,  69,  53,  53,  53,  37,  37,  37,  37,  21,  21,  21,   5,   5,
+		  5,  67,  67,  67,  51,  51,  51,  51,  35,  35,  35,  19,  19,  19,   3,   3,
+		  3,  65,  65,  65,  49,  49,  49,  49,  33,  33,  33,  17,  17,  17,   1,   1
+	},
+	{
+		  0,   0,  32,  32,  64,  64,  64,   2,   2,   2,  34,  34,  66,  66,  66,   4,
+		  4,   4,  36,  36,  68,  68,  68,   6,   6,   6,  38,  38,  70,  70,  70,   8,
+		  8,   8,  40,  40,  40,  72,  72,  10,  10,  10,  42,  42,  42,  74,  74,  12,
+		 12,  12,  44,  44,  44,  76,  76,  14,  14,  14,  46,  46,  46,  78,  78,  16,
+		 16,  16,  48,  48,  48,  80,  80,  80,  18,  18,  50,  50,  50,  82,  82,  82,
+		 20,  20,  52,  52,  52,  84,  84,  84,  22,  22,  54,  54,  54,  86,  86,  86,
+		 24,  24,  56,  56,  56,  88,  88,  88,  26,  26,  58,  58,  58,  90,  90,  90,
+		 28,  28,  60,  60,  60,  92,  92,  92,  30,  30,  62,  62,  62,  94,  94,  94,
+		 95,  95,  95,  63,  63,  63,  31,  31,  93,  93,  93,  61,  61,  61,  29,  29,
+		 91,  91,  91,  59,  59,  59,  27,  27,  89,  89,  89,  57,  57,  57,  25,  25,
+		 87,  87,  87,  55,  55,  55,  23,  23,  85,  85,  85,  53,  53,  53,  21,  21,
+		 83,  83,  83,  51,  51,  51,  19,  19,  81,  81,  81,  49,  49,  49,  17,  17,
+		 17,  79,  79,  47,  47,  47,  15,  15,  15,  77,  77,  45,  45,  45,  13,  13,
+		 13,  75,  75,  43,  43,  43,  11,  11,  11,  73,  73,  41,  41,  41,   9,   9,
+		  9,  71,  71,  71,  39,  39,   7,   7,   7,  69,  69,  69,  37,  37,   5,   5,
+		  5,  67,  67,  67,  35,  35,   3,   3,   3,  65,  65,  65,  33,  33,   1,   1
+	},
+	{
+		  0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,   6,   7,   7,
+		  8,   8,   9,   9,  10,  10,  11,  11,  12,  12,  13,  13,  14,  14,  15,  15,
+		 16,  16,  17,  17,  18,  18,  19,  19,  20,  20,  21,  21,  22,  22,  23,  23,
+		 24,  24,  25,  25,  26,  26,  27,  27,  28,  28,  29,  29,  30,  30,  31,  31,
+		 32,  32,  33,  33,  34,  34,  35,  35,  36,  36,  37,  37,  38,  38,  39,  39,
+		 40,  40,  41,  41,  42,  42,  43,  43,  44,  44,  45,  45,  46,  46,  47,  47,
+		 48,  48,  49,  49,  50,  50,  51,  51,  52,  52,  53,  53,  54,  54,  55,  55,
+		 56,  56,  57,  57,  58,  58,  59,  59,  60,  60,  61,  61,  62,  62,  63,  63,
+		 64,  64,  65,  65,  66,  66,  67,  67,  68,  68,  69,  69,  70,  70,  71,  71,
+		 72,  72,  73,  73,  74,  74,  75,  75,  76,  76,  77,  77,  78,  78,  79,  79,
+		 80,  80,  81,  81,  82,  82,  83,  83,  84,  84,  85,  85,  86,  86,  87,  87,
+		 88,  88,  89,  89,  90,  90,  91,  91,  92,  92,  93,  93,  94,  94,  95,  95,
+		 96,  96,  97,  97,  98,  98,  99,  99, 100, 100, 101, 101, 102, 102, 103, 103,
+		104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
+		112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119,
+		120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127
+	},
+	{
+		  0,  32,  32,  64,  96,  96, 128, 128,   2,  34,  34,  66,  98,  98, 130, 130,
+		  4,  36,  36,  68, 100, 100, 132, 132,   6,  38,  38,  70, 102, 102, 134, 134,
+		  8,  40,  40,  72, 104, 104, 136, 136,  10,  42,  42,  74, 106, 106, 138, 138,
+		 12,  44,  44,  76, 108, 108, 140, 140,  14,  46,  46,  78, 110, 110, 142, 142,
+		 16,  48,  48,  80, 112, 112, 144, 144,  18,  50,  50,  82, 114, 114, 146, 146,
+		 20,  52,  52,  84, 116, 116, 148, 148,  22,  54,  54,  86, 118, 118, 150, 150,
+		 24,  56,  56,  88, 120, 120, 152, 152,  26,  58,  58,  90, 122, 122, 154, 154,
+		 28,  60,  60,  92, 124, 124, 156, 156,  30,  62,  62,  94, 126, 126, 158, 158,
+		159, 159, 127, 127,  95,  63,  63,  31, 157, 157, 125, 125,  93,  61,  61,  29,
+		155, 155, 123, 123,  91,  59,  59,  27, 153, 153, 121, 121,  89,  57,  57,  25,
+		151, 151, 119, 119,  87,  55,  55,  23, 149, 149, 117, 117,  85,  53,  53,  21,
+		147, 147, 115, 115,  83,  51,  51,  19, 145, 145, 113, 113,  81,  49,  49,  17,
+		143, 143, 111, 111,  79,  47,  47,  15, 141, 141, 109, 109,  77,  45,  45,  13,
+		139, 139, 107, 107,  75,  43,  43,  11, 137, 137, 105, 105,  73,  41,  41,   9,
+		135, 135, 103, 103,  71,  39,  39,   7, 133, 133, 101, 101,  69,  37,  37,   5,
+		131, 131,  99,  99,  67,  35,  35,   3, 129, 129,  97,  97,  65,  33,  33,   1
+	},
+	{
+		  0,  64, 128, 128,   2,  66, 130, 130,   4,  68, 132, 132,   6,  70, 134, 134,
+		  8,  72, 136, 136,  10,  74, 138, 138,  12,  76, 140, 140,  14,  78, 142, 142,
+		 16,  80, 144, 144,  18,  82, 146, 146,  20,  84, 148, 148,  22,  86, 150, 150,
+		 24,  88, 152, 152,  26,  90, 154, 154,  28,  92, 156, 156,  30,  94, 158, 158,
+		 32,  96, 160, 160,  34,  98, 162, 162,  36, 100, 164, 164,  38, 102, 166, 166,
+		 40, 104, 168, 168,  42, 106, 170, 170,  44, 108, 172, 172,  46, 110, 174, 174,
+		 48, 112, 176, 176,  50, 114, 178, 178,  52, 116, 180, 180,  54, 118, 182, 182,
+		 56, 120, 184, 184,  58, 122, 186, 186,  60, 124, 188, 188,  62, 126, 190, 190,
+		191, 191, 127,  63, 189, 189, 125,  61, 187, 187, 123,  59, 185, 185, 121,  57,
+		183, 183, 119,  55, 181, 181, 117,  53, 179, 179, 115,  51, 177, 177, 113,  49,
+		175, 175, 111,  47, 173, 173, 109,  45, 171, 171, 107,  43, 169, 169, 105,  41,
+		167, 167, 103,  39, 165, 165, 101,  37, 163, 163,  99,  35, 161, 161,  97,  33,
+		159, 159,  95,  31, 157, 157,  93,  29, 155, 155,  91,  27, 153, 153,  89,  25,
+		151, 151,  87,  23, 149, 149,  85,  21, 147, 147,  83,  19, 145, 145,  81,  17,
+		143, 143,  79,  15, 141, 141,  77,  13, 139, 139,  75,  11, 137, 137,  73,   9,
+		135, 135,  71,   7, 133, 133,  69,   5, 131, 131,  67,   3, 129, 129,  65,   1
+	},
+	{
+		  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+		 16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+		 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+		 48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+		 64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+		 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+		 96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+		112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+		128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+		144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+		160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+		176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+		192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+		208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+		224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+		240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
+	}
+};
+
+#endif
+
+// Starts from QUANT_6
+// Scrambled
+static const uint8_t color_scrambled_pquant_to_uquant_q6[6] {
+	  0, 255,  51, 204, 102, 153
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q8[8] {
+	  0,  36,  73, 109, 146, 182, 219, 255
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q10[10] {
+	  0, 255,  28, 227,  56, 199,  84, 171, 113, 142
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q12[12] {
+	  0, 255,  69, 186,  23, 232,  92, 163,  46, 209, 116, 139
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q16[16] {
+	  0,  17,  34,  51,  68,  85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q20[20] {
+	  0, 255,  67, 188,  13, 242,  80, 175,  27, 228,  94, 161,  40, 215, 107, 148,
+	 54, 201, 121, 134
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q24[24] {
+	  0, 255,  33, 222,  66, 189,  99, 156,  11, 244,  44, 211,  77, 178, 110, 145,
+	 22, 233,  55, 200,  88, 167, 121, 134
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q32[32] {
+	  0,   8,  16,  24,  33,  41,  49,  57,  66,  74,  82,  90,  99, 107, 115, 123,
+	132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q40[40] {
+	  0, 255,  32, 223,  65, 190,  97, 158,   6, 249,  39, 216,  71, 184, 104, 151,
+	 13, 242,  45, 210,  78, 177, 110, 145,  19, 236,  52, 203,  84, 171, 117, 138,
+	 26, 229,  58, 197,  91, 164, 123, 132
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q48[48] {
+	  0, 255,  16, 239,  32, 223,  48, 207,  65, 190,  81, 174,  97, 158, 113, 142,
+	  5, 250,  21, 234,  38, 217,  54, 201,  70, 185,  86, 169, 103, 152, 119, 136,
+	 11, 244,  27, 228,  43, 212,  59, 196,  76, 179,  92, 163, 108, 147, 124, 131
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q64[64] {
+	  0,   4,   8,  12,  16,  20,  24,  28,  32,  36,  40,  44,  48,  52,  56,  60,
+	 65,  69,  73,  77,  81,  85,  89,  93,  97, 101, 105, 109, 113, 117, 121, 125,
+	130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190,
+	195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255,
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q80[80] {
+	  0, 255,  16, 239,  32, 223,  48, 207,  64, 191,  80, 175,  96, 159, 112, 143,
+	  3, 252,  19, 236,  35, 220,  51, 204,  67, 188,  83, 172, 100, 155, 116, 139,
+	  6, 249,  22, 233,  38, 217,  54, 201,  71, 184,  87, 168, 103, 152, 119, 136,
+	  9, 246,  25, 230,  42, 213,  58, 197,  74, 181,  90, 165, 106, 149, 122, 133,
+	 13, 242,  29, 226,  45, 210,  61, 194,  77, 178,  93, 162, 109, 146, 125, 130
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q96[96] {
+	  0, 255,   8, 247,  16, 239,  24, 231,  32, 223,  40, 215,  48, 207,  56, 199,
+	 64, 191,  72, 183,  80, 175,  88, 167,  96, 159, 104, 151, 112, 143, 120, 135,
+	  2, 253,  10, 245,  18, 237,  26, 229,  35, 220,  43, 212,  51, 204,  59, 196,
+	 67, 188,  75, 180,  83, 172,  91, 164,  99, 156, 107, 148, 115, 140, 123, 132,
+	  5, 250,  13, 242,  21, 234,  29, 226,  37, 218,  45, 210,  53, 202,  61, 194,
+	 70, 185,  78, 177,  86, 169,  94, 161, 102, 153, 110, 145, 118, 137, 126, 129
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q128[128] {
+	  0,   2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24,  26,  28,  30,
+	 32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,  60,  62,
+	 64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,  86,  88,  90,  92,  94,
+	 96,  98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
+	129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
+	161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
+	193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
+	225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q160[160] {
+	  0, 255,   8, 247,  16, 239,  24, 231,  32, 223,  40, 215,  48, 207,  56, 199,
+	 64, 191,  72, 183,  80, 175,  88, 167,  96, 159, 104, 151, 112, 143, 120, 135,
+	  1, 254,   9, 246,  17, 238,  25, 230,  33, 222,  41, 214,  49, 206,  57, 198,
+	 65, 190,  73, 182,  81, 174,  89, 166,  97, 158, 105, 150, 113, 142, 121, 134,
+	  3, 252,  11, 244,  19, 236,  27, 228,  35, 220,  43, 212,  51, 204,  59, 196,
+	 67, 188,  75, 180,  83, 172,  91, 164,  99, 156, 107, 148, 115, 140, 123, 132,
+	  4, 251,  12, 243,  20, 235,  28, 227,  36, 219,  44, 211,  52, 203,  60, 195,
+	 68, 187,  76, 179,  84, 171,  92, 163, 100, 155, 108, 147, 116, 139, 124, 131,
+	  6, 249,  14, 241,  22, 233,  30, 225,  38, 217,  46, 209,  54, 201,  62, 193,
+	 70, 185,  78, 177,  86, 169,  94, 161, 102, 153, 110, 145, 118, 137, 126, 129
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q192[192] {
+	  0, 255,   4, 251,   8, 247,  12, 243,  16, 239,  20, 235,  24, 231,  28, 227,
+	 32, 223,  36, 219,  40, 215,  44, 211,  48, 207,  52, 203,  56, 199,  60, 195,
+	 64, 191,  68, 187,  72, 183,  76, 179,  80, 175,  84, 171,  88, 167,  92, 163,
+	 96, 159, 100, 155, 104, 151, 108, 147, 112, 143, 116, 139, 120, 135, 124, 131,
+	  1, 254,   5, 250,   9, 246,  13, 242,  17, 238,  21, 234,  25, 230,  29, 226,
+	 33, 222,  37, 218,  41, 214,  45, 210,  49, 206,  53, 202,  57, 198,  61, 194,
+	 65, 190,  69, 186,  73, 182,  77, 178,  81, 174,  85, 170,  89, 166,  93, 162,
+	 97, 158, 101, 154, 105, 150, 109, 146, 113, 142, 117, 138, 121, 134, 125, 130,
+	  2, 253,   6, 249,  10, 245,  14, 241,  18, 237,  22, 233,  26, 229,  30, 225,
+	 34, 221,  38, 217,  42, 213,  46, 209,  50, 205,  54, 201,  58, 197,  62, 193,
+	 66, 189,  70, 185,  74, 181,  78, 177,  82, 173,  86, 169,  90, 165,  94, 161,
+	 98, 157, 102, 153, 106, 149, 110, 145, 114, 141, 118, 137, 122, 133, 126, 129
+};
+
+static const uint8_t color_scrambled_pquant_to_uquant_q256[256] {
+	  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+	 16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+	 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+	 48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+	 64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+	 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+	 96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+	112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+	128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+	144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+	160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+	176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+	208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+	224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+	240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
+};
+
+const uint8_t* color_scrambled_pquant_to_uquant_tables[17] {
+	color_scrambled_pquant_to_uquant_q6,
+	color_scrambled_pquant_to_uquant_q8,
+	color_scrambled_pquant_to_uquant_q10,
+	color_scrambled_pquant_to_uquant_q12,
+	color_scrambled_pquant_to_uquant_q16,
+	color_scrambled_pquant_to_uquant_q20,
+	color_scrambled_pquant_to_uquant_q24,
+	color_scrambled_pquant_to_uquant_q32,
+	color_scrambled_pquant_to_uquant_q40,
+	color_scrambled_pquant_to_uquant_q48,
+	color_scrambled_pquant_to_uquant_q64,
+	color_scrambled_pquant_to_uquant_q80,
+	color_scrambled_pquant_to_uquant_q96,
+	color_scrambled_pquant_to_uquant_q128,
+	color_scrambled_pquant_to_uquant_q160,
+	color_scrambled_pquant_to_uquant_q192,
+	color_scrambled_pquant_to_uquant_q256
+};
+
+// The quant_mode_table[integer_count/2][bits] gives us the quantization level for a given integer
+// count and number of bits that the integer may fit into.
+const int8_t quant_mode_table[10][128] {
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+    },
+    {
+         -1, -1,  0,  0,  2,  3,  5,  6,  8,  9, 11, 12, 14, 15, 17, 18,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1,  0,  0,  0,  1,  2,  2,  3,  4,  5,  5,  6,  7,
+          8,  8,  9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  1,  1,  2,  2,  3,  3,
+          4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11,
+         12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  1,  1,  1,
+          2,  2,  2,  3,  3,  4,  4,  4,  5,  5,  5,  6,  6,  7,  7,  7,
+          8,  8,  8,  9,  9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13,
+         14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,
+          1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  4,  4,  4,  4,  5,  5,
+          5,  5,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9, 10, 10,
+         10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14,
+         15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,
+          0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+          4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
+          8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11,
+         12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
+         16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,
+          0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,
+          2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,
+          6,  6,  6,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,
+          9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13,
+         13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
+         16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
+          2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,
+          5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,
+          8,  8,  8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
+         11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13,
+         14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+         17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,
+          1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  4,
+          4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,
+          6,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  9,  9,
+          9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+         12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
+         14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17
+    }
+};
--- a/thirdparty/astcenc/astcenc_symbolic_physical.cpp
+++ b/thirdparty/astcenc/astcenc_symbolic_physical.cpp
@ -0,0 +1,534 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for converting between symbolic and physical encodings.
+ */
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Write up to 8 bits at an arbitrary bit offset.
+ *
+ * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so
+ * may span two separate bytes in memory.
+ *
+ * @param         value       The value to write.
+ * @param         bitcount    The number of bits to write, starting from LSB.
+ * @param         bitoffset   The bit offset to store at, between 0 and 7.
+ * @param[in,out] ptr         The data pointer to write to.
+ */
+static inline void write_bits(
+	int value,
+	int bitcount,
+	int bitoffset,
+	uint8_t* ptr
+) {
+	int mask = (1 << bitcount) - 1;
+	value &= mask;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+	value <<= bitoffset;
+	mask <<= bitoffset;
+	mask = ~mask;
+
+	ptr[0] &= mask;
+	ptr[0] |= value;
+	ptr[1] &= mask >> 8;
+	ptr[1] |= value >> 8;
+}
+
+/**
+ * @brief Read up to 8 bits at an arbitrary bit offset.
+ *
+ * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
+ * span two separate bytes in memory.
+ *
+ * @param         bitcount    The number of bits to read.
+ * @param         bitoffset   The bit offset to read from, between 0 and 7.
+ * @param[in,out] ptr         The data pointer to read from.
+ *
+ * @return The read value.
+ */
+static inline int read_bits(
+	int bitcount,
+	int bitoffset,
+	const uint8_t* ptr
+) {
+	int mask = (1 << bitcount) - 1;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+	int value = ptr[0] | (ptr[1] << 8);
+	value >>= bitoffset;
+	value &= mask;
+	return value;
+}
+
+/**
+ * @brief Reverse bits in a byte.
+ *
+ * @param p   The value to reverse.
+  *
+ * @return The reversed result.
+ */
+static inline int bitrev8(int p)
+{
+	p = ((p & 0x0F) << 4) | ((p >> 4) & 0x0F);
+	p = ((p & 0x33) << 2) | ((p >> 2) & 0x33);
+	p = ((p & 0x55) << 1) | ((p >> 1) & 0x55);
+	return p;
+}
+
+/* See header for documentation. */
+void symbolic_to_physical(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	physical_compressed_block& pcb
+) {
+	assert(scb.block_type != SYM_BTYPE_ERROR);
+
+	// Constant color block using UNORM16 colors
+	if (scb.block_type == SYM_BTYPE_CONST_U16)
+	{
+		// There is currently no attempt to coalesce larger void-extents
+		static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+		for (unsigned int i = 0; i < 8; i++)
+		{
+			pcb.data[i] = cbytes[i];
+		}
+
+		for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
+		{
+			pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
+			pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
+		}
+
+		return;
+	}
+
+	// Constant color block using FP16 colors
+	if (scb.block_type == SYM_BTYPE_CONST_F16)
+	{
+		// There is currently no attempt to coalesce larger void-extents
+		static const uint8_t cbytes[8]  { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+		for (unsigned int i = 0; i < 8; i++)
+		{
+			pcb.data[i] = cbytes[i];
+		}
+
+		for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
+		{
+			pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
+			pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
+		}
+
+		return;
+	}
+
+	unsigned int partition_count = scb.partition_count;
+
+	// Compress the weights.
+	// They are encoded as an ordinary integer-sequence, then bit-reversed
+	uint8_t weightbuf[16] { 0 };
+
+	const auto& bm = bsd.get_block_mode(scb.block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+	int weight_count = di.weight_count;
+	quant_method weight_quant_method = bm.get_weight_quant_mode();
+	float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method));
+	int is_dual_plane = bm.is_dual_plane;
+
+	const auto& qat = quant_and_xfer_tables[weight_quant_method];
+
+	int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
+
+	int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
+
+	uint8_t weights[64];
+	if (is_dual_plane)
+	{
+		for (int i = 0; i < weight_count; i++)
+		{
+			float uqw = static_cast<float>(scb.weights[i]);
+			float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
+			int qwi = static_cast<int>(qw + 0.5f);
+			weights[2 * i] = qat.scramble_map[qwi];
+
+			uqw = static_cast<float>(scb.weights[i + WEIGHTS_PLANE2_OFFSET]);
+			qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
+			qwi = static_cast<int>(qw + 0.5f);
+			weights[2 * i + 1] = qat.scramble_map[qwi];
+		}
+	}
+	else
+	{
+		for (int i = 0; i < weight_count; i++)
+		{
+			float uqw = static_cast<float>(scb.weights[i]);
+			float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
+			int qwi = static_cast<int>(qw + 0.5f);
+			weights[i] = qat.scramble_map[qwi];
+		}
+	}
+
+	encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
+
+	for (int i = 0; i < 16; i++)
+	{
+		pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
+	}
+
+	write_bits(scb.block_mode, 11, 0, pcb.data);
+	write_bits(partition_count - 1, 2, 11, pcb.data);
+
+	int below_weights_pos = 128 - bits_for_weights;
+
+	// Encode partition index and color endpoint types for blocks with 2+ partitions
+	if (partition_count > 1)
+	{
+		write_bits(scb.partition_index, 6, 13, pcb.data);
+		write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb.data);
+
+		if (scb.color_formats_matched)
+		{
+			write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
+		}
+		else
+		{
+			// Check endpoint types for each partition to determine the lowest class present
+			int low_class = 4;
+
+			for (unsigned int i = 0; i < partition_count; i++)
+			{
+				int class_of_format = scb.color_formats[i] >> 2;
+				low_class = astc::min(class_of_format, low_class);
+			}
+
+			if (low_class == 3)
+			{
+				low_class = 2;
+			}
+
+			int encoded_type = low_class + 1;
+			int bitpos = 2;
+
+			for (unsigned int i = 0; i < partition_count; i++)
+			{
+				int classbit_of_format = (scb.color_formats[i] >> 2) - low_class;
+				encoded_type |= classbit_of_format << bitpos;
+				bitpos++;
+			}
+
+			for (unsigned int i = 0; i < partition_count; i++)
+			{
+				int lowbits_of_format = scb.color_formats[i] & 3;
+				encoded_type |= lowbits_of_format << bitpos;
+				bitpos += 2;
+			}
+
+			int encoded_type_lowpart = encoded_type & 0x3F;
+			int encoded_type_highpart = encoded_type >> 6;
+			int encoded_type_highpart_size = (3 * partition_count) - 4;
+			int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
+			write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
+			write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb.data);
+			below_weights_pos -= encoded_type_highpart_size;
+		}
+	}
+	else
+	{
+		write_bits(scb.color_formats[0], 4, 13, pcb.data);
+	}
+
+	// In dual-plane mode, encode the color component of the second plane of weights
+	if (is_dual_plane)
+	{
+		write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb.data);
+	}
+
+	// Encode the color components
+	uint8_t values_to_encode[32];
+	int valuecount_to_encode = 0;
+
+	const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
+	for (unsigned int i = 0; i < scb.partition_count; i++)
+	{
+		int vals = 2 * (scb.color_formats[i] >> 2) + 2;
+		assert(vals <= 8);
+		for (int j = 0; j < vals; j++)
+		{
+			values_to_encode[j + valuecount_to_encode] = pack_table[scb.color_values[i][j]];
+		}
+		valuecount_to_encode += vals;
+	}
+
+	encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb.data,
+	           scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
+}
+
+/* See header for documentation. */
+void physical_to_symbolic(
+	const block_size_descriptor& bsd,
+	const physical_compressed_block& pcb,
+	symbolic_compressed_block& scb
+) {
+	uint8_t bswapped[16];
+
+	scb.block_type = SYM_BTYPE_NONCONST;
+
+	// Extract header fields
+	int block_mode = read_bits(11, 0, pcb.data);
+	if ((block_mode & 0x1FF) == 0x1FC)
+	{
+		// Constant color block
+
+		// Check what format the data has
+		if (block_mode & 0x200)
+		{
+			scb.block_type = SYM_BTYPE_CONST_F16;
+		}
+		else
+		{
+			scb.block_type = SYM_BTYPE_CONST_U16;
+		}
+
+		scb.partition_count = 0;
+		for (int i = 0; i < 4; i++)
+		{
+			scb.constant_color[i] = pcb.data[2 * i + 8] | (pcb.data[2 * i + 9] << 8);
+		}
+
+		// Additionally, check that the void-extent
+		if (bsd.zdim == 1)
+		{
+			// 2D void-extent
+			int rsvbits = read_bits(2, 10, pcb.data);
+			if (rsvbits != 3)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+
+			int vx_low_s = read_bits(8, 12, pcb.data) | (read_bits(5, 12 + 8, pcb.data) << 8);
+			int vx_high_s = read_bits(8, 25, pcb.data) | (read_bits(5, 25 + 8, pcb.data) << 8);
+			int vx_low_t = read_bits(8, 38, pcb.data) | (read_bits(5, 38 + 8, pcb.data) << 8);
+			int vx_high_t = read_bits(8, 51, pcb.data) | (read_bits(5, 51 + 8, pcb.data) << 8);
+
+			int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+		}
+		else
+		{
+			// 3D void-extent
+			int vx_low_s = read_bits(9, 10, pcb.data);
+			int vx_high_s = read_bits(9, 19, pcb.data);
+			int vx_low_t = read_bits(9, 28, pcb.data);
+			int vx_high_t = read_bits(9, 37, pcb.data);
+			int vx_low_p = read_bits(9, 46, pcb.data);
+			int vx_high_p = read_bits(9, 55, pcb.data);
+
+			int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+		}
+
+		return;
+	}
+
+	unsigned int packed_index = bsd.block_mode_packed_index[block_mode];
+	if (packed_index == BLOCK_BAD_BLOCK_MODE)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	const auto& bm = bsd.get_block_mode(block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	int weight_count = di.weight_count;
+	promise(weight_count > 0);
+
+	quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
+	int is_dual_plane = bm.is_dual_plane;
+
+	int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
+
+	int partition_count = read_bits(2, 11, pcb.data) + 1;
+	promise(partition_count > 0);
+
+	scb.block_mode = static_cast<uint16_t>(block_mode);
+	scb.partition_count = static_cast<uint8_t>(partition_count);
+
+	for (int i = 0; i < 16; i++)
+	{
+		bswapped[i] = static_cast<uint8_t>(bitrev8(pcb.data[15 - i]));
+	}
+
+	int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
+
+	int below_weights_pos = 128 - bits_for_weights;
+
+	uint8_t indices[64];
+	const auto& qat = quant_and_xfer_tables[weight_quant_method];
+
+	decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
+
+	if (is_dual_plane)
+	{
+		for (int i = 0; i < weight_count; i++)
+		{
+			scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]];
+			scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]];
+		}
+	}
+	else
+	{
+		for (int i = 0; i < weight_count; i++)
+		{
+			scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]];
+		}
+	}
+
+	if (is_dual_plane && partition_count == 4)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	scb.color_formats_matched = 0;
+
+	// Determine the format of each endpoint pair
+	int color_formats[BLOCK_MAX_PARTITIONS];
+	int encoded_type_highpart_size = 0;
+	if (partition_count == 1)
+	{
+		color_formats[0] = read_bits(4, 13, pcb.data);
+		scb.partition_index = 0;
+	}
+	else
+	{
+		encoded_type_highpart_size = (3 * partition_count) - 4;
+		below_weights_pos -= encoded_type_highpart_size;
+		int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pcb.data) << 6);
+		int baseclass = encoded_type & 0x3;
+		if (baseclass == 0)
+		{
+			for (int i = 0; i < partition_count; i++)
+			{
+				color_formats[i] = (encoded_type >> 2) & 0xF;
+			}
+
+			below_weights_pos += encoded_type_highpart_size;
+			scb.color_formats_matched = 1;
+			encoded_type_highpart_size = 0;
+		}
+		else
+		{
+			int bitpos = 2;
+			baseclass--;
+
+			for (int i = 0; i < partition_count; i++)
+			{
+				color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2;
+				bitpos++;
+			}
+
+			for (int i = 0; i < partition_count; i++)
+			{
+				color_formats[i] |= (encoded_type >> bitpos) & 3;
+				bitpos += 2;
+			}
+		}
+		scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb.data) | (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb.data) << 6));
+	}
+
+	for (int i = 0; i < partition_count; i++)
+	{
+		scb.color_formats[i] = static_cast<uint8_t>(color_formats[i]);
+	}
+
+	// Determine number of color endpoint integers
+	int color_integer_count = 0;
+	for (int i = 0; i < partition_count; i++)
+	{
+		int endpoint_class = color_formats[i] >> 2;
+		color_integer_count += (endpoint_class + 1) * 2;
+	}
+
+	if (color_integer_count > 18)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	// Determine the color endpoint format to use
+	static const int color_bits_arr[5] { -1, 115 - 4, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS };
+	int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size;
+	if (is_dual_plane)
+	{
+		color_bits -= 2;
+	}
+
+	if (color_bits < 0)
+	{
+		color_bits = 0;
+	}
+
+	int color_quant_level = quant_mode_table[color_integer_count >> 1][color_bits];
+	if (color_quant_level < QUANT_6)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	// Unpack the integer color values and assign to endpoints
+	scb.quant_mode = static_cast<quant_method>(color_quant_level);
+
+	uint8_t values_to_decode[32];
+	decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb.data,
+	           values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
+
+	int valuecount_to_decode = 0;
+	const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
+	for (int i = 0; i < partition_count; i++)
+	{
+		int vals = 2 * (color_formats[i] >> 2) + 2;
+		for (int j = 0; j < vals; j++)
+		{
+			scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
+		}
+		valuecount_to_decode += vals;
+	}
+
+	// Fetch component for second-plane in the case of dual plane of weights.
+	scb.plane2_component = -1;
+	if (is_dual_plane)
+	{
+		scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data));
+	}
+}
--- a/thirdparty/astcenc/astcenc_vecmathlib.h
+++ b/thirdparty/astcenc/astcenc_vecmathlib.h
@ -0,0 +1,570 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2022 Arm Limited
+// Copyright 2008 Jose Fonseca
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/*
+ * This module implements vector support for floats, ints, and vector lane
+ * control masks. It provides access to both explicit vector width types, and
+ * flexible N-wide types where N can be determined at compile time.
+ *
+ * The design of this module encourages use of vector length agnostic code, via
+ * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
+ * with that is available at compile time. The current vector width is
+ * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
+ *
+ * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
+ * These are provided primarily for prototyping and algorithm debug of VLA
+ * implementations.
+ *
+ * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
+ * types. These are provided for use by VLA code, but are also expected to be
+ * used as a fixed-width type and will supported a reference C++ fallback for
+ * use on platforms without SIMD intrinsics.
+ *
+ * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
+ * types. These are provide for use by VLA code, and are not expected to be
+ * used as a fixed-width type in normal code. No reference C implementation is
+ * provided on platforms without underlying SIMD intrinsics.
+ *
+ * With the current implementation ISA support is provided for:
+ *
+ *     * 1-wide for scalar reference.
+ *     * 4-wide for Armv8-A NEON.
+ *     * 4-wide for x86-64 SSE2.
+ *     * 4-wide for x86-64 SSE4.1.
+ *     * 8-wide for x86-64 AVX2.
+ */
+
+#ifndef ASTC_VECMATHLIB_H_INCLUDED
+#define ASTC_VECMATHLIB_H_INCLUDED
+
+#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
+	#include <immintrin.h>
+#elif ASTCENC_NEON != 0
+	#include <arm_neon.h>
+#endif
+
+#if !defined(__clang__) && defined(_MSC_VER)
+	#define ASTCENC_SIMD_INLINE __forceinline
+	#define ASTCENC_NO_INLINE
+#elif defined(__GNUC__) && !defined(__clang__)
+	#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
+	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
+#else
+	#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
+	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
+#endif
+
+#if ASTCENC_AVX >= 2
+	/* If we have AVX2 expose 8-wide VLA. */
+	#include "astcenc_vecmathlib_sse_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+	#include "astcenc_vecmathlib_avx2_8.h"
+
+	#define ASTCENC_SIMD_WIDTH 8
+
+	using vfloat = vfloat8;
+
+	#if defined(ASTCENC_NO_INVARIANCE)
+		using vfloatacc = vfloat8;
+	#else
+		using vfloatacc = vfloat4;
+	#endif
+
+	using vint = vint8;
+	using vmask = vmask8;
+
+	constexpr auto loada = vfloat8::loada;
+	constexpr auto load1 = vfloat8::load1;
+
+#elif ASTCENC_SSE >= 20
+	/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
+	#include "astcenc_vecmathlib_sse_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vfloatacc = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+
+#elif ASTCENC_NEON > 0
+	/* If we have NEON expose 4-wide VLA. */
+	#include "astcenc_vecmathlib_neon_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vfloatacc = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+
+#else
+	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
+
+	// Note: We no longer expose the 1-wide scalar fallback because it is not
+	// invariant with the 4-wide path due to algorithms that use horizontal
+	// operations that accumulate a local vector sum before accumulating into
+	// a running sum.
+	//
+	// For 4 items adding into an accumulator using 1-wide vectors the sum is:
+	//
+	//     result = ((((sum + l0) + l1) + l2) + l3)
+	//
+    // ... whereas the accumulator for a 4-wide vector sum is:
+	//
+	//     result = sum + ((l0 + l2) + (l1 + l3))
+	//
+	// In "normal maths" this is the same, but the floating point reassociation
+	// differences mean that these will not produce the same result.
+
+	#include "astcenc_vecmathlib_none_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vfloatacc = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+#endif
+
+/**
+ * @brief Round a count down to the largest multiple of 8.
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
+{
+	return count & static_cast<unsigned int>(~(8 - 1));
+}
+
+/**
+ * @brief Round a count down to the largest multiple of 4.
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
+{
+	return count & static_cast<unsigned int>(~(4 - 1));
+}
+
+/**
+ * @brief Round a count down to the largest multiple of the SIMD width.
+ *
+ * Assumption that the vector width is a power of two ...
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
+{
+	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
+}
+
+/**
+ * @brief Round a count up to the largest multiple of the SIMD width.
+ *
+ * Assumption that the vector width is a power of two ...
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
+{
+	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
+	return multiples * ASTCENC_SIMD_WIDTH;
+}
+
+/**
+ * @brief Return @c a with lanes negated if the @c b lane is negative.
+ */
+ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
+{
+	vint ia = float_as_int(a);
+	vint ib = float_as_int(b);
+	vint sign_mask(static_cast<int>(0x80000000));
+	vint r = ia ^ (ib & sign_mask);
+	return int_as_float(r);
+}
+
+/**
+ * @brief Return fast, but approximate, vector atan(x).
+ *
+ * Max error of this implementation is 0.004883.
+ */
+ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
+{
+	vmask c = abs(x) > vfloat(1.0f);
+	vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
+	vfloat y = select(x, vfloat(1.0f) / x, c);
+	y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
+	return select(y, z - y, c);
+}
+
+/**
+ * @brief Return fast, but approximate, vector atan2(x, y).
+ */
+ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
+{
+	vfloat z = atan(abs(y / x));
+	vmask xmask = vmask(float_as_int(x).m);
+	return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
+}
+
+/*
+ * @brief Factory that returns a unit length 4 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit4()
+{
+	return vfloat4(0.5f);
+}
+
+/**
+ * @brief Factory that returns a unit length 3 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit3()
+{
+	float val = 0.577350258827209473f;
+	return vfloat4(val, val, val, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a unit length 2 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit2()
+{
+	float val = 0.707106769084930420f;
+	return vfloat4(val, val, 0.0f, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a 3 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
+{
+	return vfloat4(a, b, c, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a 2 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
+{
+	return vfloat4(a, b, 0.0f, 0.0f);
+}
+
+/**
+ * @brief Normalize a non-zero length vector to unit length.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
+{
+	vfloat4 length = dot(a, a);
+	return a / sqrt(length);
+}
+
+/**
+ * @brief Normalize a vector, returning @c safe if len is zero.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
+{
+	vfloat4 length = dot(a, a);
+	if (length.lane<0>() != 0.0f)
+	{
+		return a / sqrt(length);
+	}
+
+	return safe;
+}
+
+
+
+#define POLY0(x, c0)                     (                                     c0)
+#define POLY1(x, c0, c1)                 ((POLY0(x, c1) * x)                 + c0)
+#define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * x)             + c0)
+#define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * x)         + c0)
+#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * x)     + c0)
+#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
+
+/**
+ * @brief Compute an approximate exp2(x) for each lane in the vector.
+ *
+ * Based on 5th degree minimax polynomials, ported from this blog
+ * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+ */
+static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
+{
+	x = clamp(-126.99999f, 129.0f, x);
+
+	vint4 ipart = float_to_int(x - 0.5f);
+	vfloat4 fpart = x - int_to_float(ipart);
+
+	// Integer contrib, using 1 << ipart
+	vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
+
+	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
+	vfloat4 fexp = POLY5(fpart,
+	                     9.9999994e-1f,
+	                     6.9315308e-1f,
+	                     2.4015361e-1f,
+	                     5.5826318e-2f,
+	                     8.9893397e-3f,
+	                     1.8775767e-3f);
+
+	return iexp * fexp;
+}
+
+/**
+ * @brief Compute an approximate log2(x) for each lane in the vector.
+ *
+ * Based on 5th degree minimax polynomials, ported from this blog
+ * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+ */
+static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
+{
+	vint4 exp(0x7F800000);
+	vint4 mant(0x007FFFFF);
+	vint4 one(0x3F800000);
+
+	vint4 i = float_as_int(x);
+
+	vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
+
+	vfloat4 m = int_as_float((i & mant) | one);
+
+	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
+	vfloat4 p = POLY4(m,
+	                  2.8882704548164776201f,
+	                 -2.52074962577807006663f,
+	                  1.48116647521213171641f,
+	                 -0.465725644288844778798f,
+	                  0.0596515482674574969533f);
+
+	// Increases the polynomial degree, but ensures that log2(1) == 0
+	p = p * (m - 1.0f);
+
+	return p + e;
+}
+
+/**
+ * @brief Compute an approximate pow(x, y) for each lane in the vector.
+ *
+ * Power function based on the exp2(log2(x) * y) transform.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
+{
+	vmask4 zero_mask = y == vfloat4(0.0f);
+	vfloat4 estimate = exp2(log2(x) * y);
+
+	// Guarantee that y == 0 returns exactly 1.0f
+	return select(estimate, vfloat4(1.0f), zero_mask);
+}
+
+/**
+ * @brief Count the leading zeros for each lane in @c a.
+ *
+ * Valid for all data values of @c a; will return a per-lane value [0, 32].
+ */
+static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
+{
+	// This function is a horrible abuse of floating point exponents to convert
+	// the original integer value into a 2^N encoding we can recover easily.
+
+	// Convert to float without risk of rounding up by keeping only top 8 bits.
+	// This trick is is guaranteed to keep top 8 bits and clear the 9th.
+	a = (~lsr<8>(a)) & a;
+	a = float_as_int(int_to_float(a));
+
+	// Extract and unbias exponent
+	a = vint4(127 + 31) - lsr<23>(a);
+
+	// Clamp result to a valid 32-bit range
+	return clamp(0, 32, a);
+}
+
+/**
+ * @brief Return lanewise 2^a for each lane in @c a.
+ *
+ * Use of signed int means that this is only valid for values in range [0, 31].
+ */
+static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
+{
+	// 2^30 is the largest signed number than can be represented
+	assert(all(a < vint4(31)));
+
+	// This function is a horrible abuse of floating point to use the exponent
+	// and float conversion to generate a 2^N multiple.
+
+	// Bias the exponent
+	vint4 exp = a + 127;
+	exp = lsl<23>(exp);
+
+	// Reinterpret the bits as a float, and then convert to an int
+	vfloat4 f = int_as_float(exp);
+	return float_to_int(f);
+}
+
+/**
+ * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
+ */
+static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
+{
+	vint4 fp16_one = vint4(0x3C00);
+	vint4 fp16_small = lsl<8>(p);
+
+	vmask4 is_one = p == vint4(0xFFFF);
+	vmask4 is_small = p < vint4(4);
+
+	// Manually inline clz() on Visual Studio to avoid release build codegen bug
+	// see https://github.com/ARM-software/astc-encoder/issues/259
+#if !defined(__clang__) && defined(_MSC_VER)
+	vint4 a = (~lsr<8>(p)) & p;
+	a = float_as_int(int_to_float(a));
+	a = vint4(127 + 31) - lsr<23>(a);
+	vint4 lz = clamp(0, 32, a) - 16;
+#else
+	vint4 lz = clz(p) - 16;
+#endif
+
+	p = p * two_to_the_n(lz + 1);
+	p = p & vint4(0xFFFF);
+
+	p = lsr<6>(p);
+
+	p = p | lsl<10>(vint4(14) - lz);
+
+	vint4 r = select(p, fp16_one, is_one);
+	r = select(r, fp16_small, is_small);
+	return r;
+}
+
+/**
+ * @brief Convert 16-bit LNS to float16.
+ */
+static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
+{
+	vint4 mc = p & 0x7FF;
+	vint4 ec = lsr<11>(p);
+
+	vint4 mc_512 = mc * 3;
+	vmask4 mask_512 = mc < vint4(512);
+
+	vint4 mc_1536 = mc * 4 - 512;
+	vmask4 mask_1536 = mc < vint4(1536);
+
+	vint4 mc_else = mc * 5 - 2048;
+
+	vint4 mt = mc_else;
+	mt = select(mt, mc_1536, mask_1536);
+	mt = select(mt, mc_512, mask_512);
+
+	vint4 res = lsl<10>(ec) | lsr<3>(mt);
+	return min(res, vint4(0x7BFF));
+}
+
+/**
+ * @brief Extract mantissa and exponent of a float value.
+ *
+ * @param      a      The input value.
+ * @param[out] exp    The output exponent.
+ *
+ * @return The mantissa.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
+{
+	// Interpret the bits as an integer
+	vint4 ai = float_as_int(a);
+
+	// Extract and unbias the exponent
+	exp = (lsr<23>(ai) & 0xFF) - 126;
+
+	// Extract and unbias the mantissa
+	vint4 manti = (ai &  static_cast<int>(0x807FFFFF)) | 0x3F000000;
+	return int_as_float(manti);
+}
+
+/**
+ * @brief Convert float to 16-bit LNS.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
+{
+	vint4 exp;
+	vfloat4 mant = frexp(a, exp);
+
+	// Do these early before we start messing about ...
+	vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
+	vmask4 mask_infinity = a >= vfloat4(65536.0f);
+
+	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
+	vmask4 exp_lt_m13 = exp < vint4(-13);
+
+	vfloat4 a1a = a * 33554432.0f;
+	vint4 expa = vint4::zero();
+
+	vfloat4 a1b = (mant - 0.5f) * 4096;
+	vint4 expb = exp + 14;
+
+	a = select(a1b, a1a, exp_lt_m13);
+	exp = select(expb, expa, exp_lt_m13);
+
+	vmask4 a_lt_384 = a < vfloat4(384.0f);
+	vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
+
+	vfloat4 a2a = a * (4.0f / 3.0f);
+	vfloat4 a2b = a + 128.0f;
+	vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
+
+	a = a2c;
+	a = select(a, a2b, a_lt_1408);
+	a = select(a, a2a, a_lt_384);
+
+	a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
+
+	a = select(a, vfloat4(65535.0f), mask_infinity);
+	a = select(a, vfloat4::zero(), mask_underflow_nan);
+
+	return a;
+}
+
+namespace astc
+{
+
+static ASTCENC_SIMD_INLINE float pow(float x, float y)
+{
+	return pow(vfloat4(x), vfloat4(y)).lane<0>();
+}
+
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
--- a/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h
+++ b/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h
--- a/thirdparty/astcenc/astcenc_vecmathlib_common_4.h
+++ b/thirdparty/astcenc/astcenc_vecmathlib_common_4.h
@ -0,0 +1,423 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Generic 4x32-bit vector functions.
+ *
+ * This module implements generic 4-wide vector functions that are valid for
+ * all instruction sets, typically implemented using lower level 4-wide
+ * operations that are ISA-specific.
+ */
+
+#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
+#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+// ============================================================================
+// vmask4 operators and functions
+// ============================================================================
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
+}
+
+// ============================================================================
+// vint4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by scalar addition.
+ */
+ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
+{
+	return a + vint4(b);
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by scalar subtraction.
+ */
+ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
+{
+	return a - vint4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
+{
+	return a * vint4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
+{
+	return a | vint4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
+{
+	return a & vint4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
+{
+	return a ^ vint4(b);
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ */
+ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
+{
+	return min(max(a, vint4(minv)), vint4(maxv));
+}
+
+/**
+ * @brief Return the horizontal sum of RGB vector lanes as a scalar.
+ */
+ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
+{
+	return a.lane<0>() + a.lane<1>() + a.lane<2>();
+}
+
+// ============================================================================
+// vfloat4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by scalar addition.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
+{
+	return a + vfloat4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
+{
+	return a - vfloat4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
+{
+	return a * vfloat4(b);
+}
+
+/**
+ * @brief Overload: scalar by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
+{
+	return vfloat4(a) * b;
+}
+
+/**
+ * @brief Overload: vector by scalar division.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
+{
+	return a / vfloat4(b);
+}
+
+/**
+ * @brief Overload: scalar by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
+{
+	return vfloat4(a) / b;
+}
+
+/**
+ * @brief Return the min vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
+{
+	return min(a, vfloat4(b));
+}
+
+/**
+ * @brief Return the max vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
+{
+	return max(a, vfloat4(b));
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ *
+ * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
+ * then @c min will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return min(max(a, minv), maxv);
+}
+
+/**
+ * @brief Return the clamped value between 0.0f and max.
+ *
+ * It is assumed that  @c max is not a NaN value. If @c a is NaN then zero will
+ * be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return min(max(a, vfloat4::zero()), maxv);
+}
+
+/**
+ * @brief Return the clamped value between 0.0f and 1.0f.
+ *
+ * If @c a is NaN then zero will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return min(max(a, vfloat4::zero()), 1.0f);
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
+{
+	return hmin(a).lane<0>();
+}
+
+/**
+ * @brief Return the horizontal min of RGB vector lanes as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
+{
+	a.set_lane<3>(a.lane<0>());
+	return hmin_s(a);
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
+{
+	return hmax(a).lane<0>();
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a vector.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
+{
+	accum = accum + a;
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a masked vector.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
+{
+	a = select(vfloat4::zero(), a, m);
+	haccumulate(accum, a);
+}
+
+/**
+ * @brief Return the horizontal sum of RGB vector lanes as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
+{
+	return a.lane<0>() + a.lane<1>() + a.lane<2>();
+}
+
+#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning scalar.
+ */
+ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
+{
+	vfloat4 m = a * b;
+	return hadd_s(m);
+}
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
+{
+	vfloat4 m = a * b;
+	return vfloat4(hadd_s(m));
+}
+
+/**
+ * @brief Return the dot product for the bottom 3 lanes, returning scalar.
+ */
+ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
+{
+	vfloat4 m = a * b;
+	return hadd_rgb_s(m);
+}
+
+/**
+ * @brief Return the dot product for the bottom 3 lanes, returning vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
+{
+	vfloat4 m = a * b;
+	float d3 = hadd_rgb_s(m);
+	return vfloat4(d3, d3, d3, 0.0f);
+}
+
+#endif
+
+#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
+
+/**
+ * @brief Population bit count.
+ *
+ * @param v   The value to population count.
+ *
+ * @return The number of 1 bits.
+ */
+static inline int popcount(uint64_t v)
+{
+	uint64_t mask1 = 0x5555555555555555ULL;
+	uint64_t mask2 = 0x3333333333333333ULL;
+	uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;
+	v -= (v >> 1) & mask1;
+	v = (v & mask2) + ((v >> 2) & mask2);
+	v += v >> 4;
+	v &= mask3;
+	v *= 0x0101010101010101ULL;
+	v >>= 56;
+	return static_cast<int>(v);
+}
+
+#endif
+
+/**
+ * @brief Apply signed bit transfer.
+ *
+ * @param input0   The first encoded endpoint.
+ * @param input1   The second encoded endpoint.
+ */
+static ASTCENC_SIMD_INLINE void bit_transfer_signed(
+	vint4& input0,
+	vint4& input1
+) {
+	input1 = lsr<1>(input1) | (input0 & 0x80);
+	input0 = lsr<1>(input0) & 0x3F;
+
+	vmask4 mask = (input0 & 0x20) != vint4::zero();
+	input0 = select(input0, input0 - 0x40, mask);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void print(vint4 a)
+{
+	alignas(16) int v[4];
+	storea(a, v);
+	printf("v4_i32:\n  %8d %8d %8d %8d\n",
+	       v[0], v[1], v[2], v[3]);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void printx(vint4 a)
+{
+	alignas(16) int v[4];
+	storea(a, v);
+	printf("v4_i32:\n  %08x %08x %08x %08x\n",
+	       v[0], v[1], v[2], v[3]);
+}
+
+/**
+ * @brief Debug function to print a vector of floats.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat4 a)
+{
+	alignas(16) float v[4];
+	storea(a, v);
+	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
+	       static_cast<double>(v[0]), static_cast<double>(v[1]),
+	       static_cast<double>(v[2]), static_cast<double>(v[3]));
+}
+
+/**
+ * @brief Debug function to print a vector of masks.
+ */
+ASTCENC_SIMD_INLINE void print(vmask4 a)
+{
+	print(select(vint4(0), vint4(1), a));
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
--- a/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h
+++ b/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h
--- a/thirdparty/astcenc/astcenc_vecmathlib_none_4.h
+++ b/thirdparty/astcenc/astcenc_vecmathlib_none_4.h
--- a/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h
+++ b/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h
--- a/thirdparty/astcenc/astcenc_weight_align.cpp
+++ b/thirdparty/astcenc/astcenc_weight_align.cpp
@ -0,0 +1,479 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for angular-sum algorithm for weight alignment.
+ *
+ * This algorithm works as follows:
+ * - we compute a complex number P as (cos s*i, sin s*i) for each weight,
+ *   where i is the input value and s is a scaling factor based on the spacing between the weights.
+ * - we then add together complex numbers for all the weights.
+ * - we then compute the length and angle of the resulting sum.
+ *
+ * This should produce the following results:
+ * - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
+ * - even distribution results in a vector of length 0.
+ * - all samples identical results in perfect alignment for every scaling.
+ *
+ * For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
+ * should then result in some scalings standing out as having particularly good alignment factors;
+ * we can use this to produce a set of candidate scale/shift values for various quantization levels;
+ * we should then actually try them and see what happens.
+ */
+
+#include "astcenc_internal.h"
+#include "astcenc_vecmathlib.h"
+
+#include <stdio.h>
+#include <cassert>
+#include <cstring>
+
+static constexpr unsigned int ANGULAR_STEPS { 32 };
+
+static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
+              "ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
+
+static_assert(ANGULAR_STEPS >= 32,
+              "ANGULAR_STEPS must be at least max(steps_for_quant_level)");
+
+// Store a reduced sin/cos table for 64 possible weight values; this causes
+// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
+static constexpr unsigned int SINCOS_STEPS { 64 };
+
+static const uint8_t steps_for_quant_level[12] {
+	2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
+};
+
+alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
+alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	static bool print_once { true };
+#endif
+
+/* See header for documentation. */
+void prepare_angular_tables()
+{
+	for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
+	{
+		float angle_step = static_cast<float>(i + 1);
+
+		for (unsigned int j = 0; j < SINCOS_STEPS; j++)
+		{
+			sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
+			cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
+		}
+	}
+}
+
+/**
+ * @brief Compute the angular alignment factors and offsets.
+ *
+ * @param      weight_count              The number of (decimated) weights.
+ * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
+ * @param      max_angular_steps         The maximum number of steps to be tested.
+ * @param[out] offsets                   The output angular offsets array.
+ */
+static void compute_angular_offsets(
+	unsigned int weight_count,
+	const float* dec_weight_ideal_value,
+	unsigned int max_angular_steps,
+	float* offsets
+) {
+	promise(weight_count > 0);
+	promise(max_angular_steps > 0);
+
+	alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
+
+	// Precompute isample; arrays are always allocated 64 elements long
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		// Add 2^23 and interpreting bits extracts round-to-nearest int
+		vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f);
+		vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1));
+		storea(isample, isamplev + i);
+	}
+
+	// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
+	vfloat mult = vfloat(1.0f / (2.0f * astc::PI));
+
+	for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat anglesum_x = vfloat::zero();
+		vfloat anglesum_y = vfloat::zero();
+
+		for (unsigned int j = 0; j < weight_count; j++)
+		{
+			int isample = isamplev[j];
+			anglesum_x += loada(cos_table[isample] + i);
+			anglesum_y += loada(sin_table[isample] + i);
+		}
+
+		vfloat angle = atan2(anglesum_y, anglesum_x);
+		vfloat ofs = angle * mult;
+		storea(ofs, offsets + i);
+	}
+}
+
+/**
+ * @brief For a given step size compute the lowest and highest weight.
+ *
+ * Compute the lowest and highest weight that results from quantizing using the given stepsize and
+ * offset, and then compute the resulting error. The cut errors indicate the error that results from
+ * forcing samples that should have had one weight value one step up or down.
+ *
+ * @param      weight_count              The number of (decimated) weights.
+ * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
+ * @param      max_angular_steps         The maximum number of steps to be tested.
+ * @param      max_quant_steps           The maximum quantization level to be tested.
+ * @param      offsets                   The angular offsets array.
+ * @param[out] lowest_weight             Per angular step, the lowest weight.
+ * @param[out] weight_span               Per angular step, the span between lowest and highest weight.
+ * @param[out] error                     Per angular step, the error.
+ * @param[out] cut_low_weight_error      Per angular step, the low weight cut error.
+ * @param[out] cut_high_weight_error     Per angular step, the high weight cut error.
+ */
+static void compute_lowest_and_highest_weight(
+	unsigned int weight_count,
+	const float* dec_weight_ideal_value,
+	unsigned int max_angular_steps,
+	unsigned int max_quant_steps,
+	const float* offsets,
+	float* lowest_weight,
+	int* weight_span,
+	float* error,
+	float* cut_low_weight_error,
+	float* cut_high_weight_error
+) {
+	promise(weight_count > 0);
+	promise(max_angular_steps > 0);
+
+	vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
+
+	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
+	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat minidx(128.0f);
+		vfloat maxidx(-128.0f);
+		vfloat errval = vfloat::zero();
+		vfloat cut_low_weight_err = vfloat::zero();
+		vfloat cut_high_weight_err = vfloat::zero();
+		vfloat offset = loada(offsets + sp);
+
+		for (unsigned int j = 0; j < weight_count; j++)
+		{
+			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
+			vfloat svalrte = round(sval);
+			vfloat diff = sval - svalrte;
+			errval += diff * diff;
+
+			// Reset tracker on min hit
+			vmask mask = svalrte < minidx;
+			minidx = select(minidx, svalrte, mask);
+			cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
+
+			// Accumulate on min hit
+			mask = svalrte == minidx;
+			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
+			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
+
+			// Reset tracker on max hit
+			mask = svalrte > maxidx;
+			maxidx = select(maxidx, svalrte, mask);
+			cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
+
+			// Accumulate on max hit
+			mask = svalrte == maxidx;
+			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
+			cut_high_weight_err = select(cut_high_weight_err, accum, mask);
+		}
+
+		// Write out min weight and weight span; clamp span to a usable range
+		vint span = float_to_int(maxidx - minidx + vfloat(1));
+		span = min(span, vint(max_quant_steps + 3));
+		span = max(span, vint(2));
+		storea(minidx, lowest_weight + sp);
+		storea(span, weight_span + sp);
+
+		// The cut_(lowest/highest)_weight_error indicate the error that results from  forcing
+		// samples that should have had the weight value one step (up/down).
+		vfloat ssize = 1.0f / rcp_stepsize;
+		vfloat errscale = ssize * ssize;
+		storea(errval * errscale, error + sp);
+		storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
+		storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
+
+		rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
+	}
+}
+
+/**
+ * @brief The main function for the angular algorithm.
+ *
+ * @param      weight_count              The number of (decimated) weights.
+ * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
+ * @param      max_quant_level           The maximum quantization level to be tested.
+ * @param[out] low_value                 Per angular step, the lowest weight value.
+ * @param[out] high_value                Per angular step, the highest weight value.
+ */
+static void compute_angular_endpoints_for_quant_levels(
+	unsigned int weight_count,
+	const float* dec_weight_ideal_value,
+	unsigned int max_quant_level,
+	float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
+	float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
+) {
+	unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
+	unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
+
+	alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
+
+	compute_angular_offsets(weight_count, dec_weight_ideal_value,
+	                        max_angular_steps, angular_offsets);
+
+	alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
+	alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
+	alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
+	alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS];
+	alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS];
+
+	compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
+	                                  max_angular_steps, max_quant_steps,
+	                                  angular_offsets, lowest_weight, weight_span, error,
+	                                  cut_low_weight_error, cut_high_weight_error);
+
+	// For each quantization level, find the best error terms. Use packed vectors so data-dependent
+	// branches can become selects. This involves some integer to float casts, but the values are
+	// small enough so they never round the wrong way.
+	vfloat4 best_results[36];
+
+	// Initialize the array to some safe defaults
+	promise(max_quant_steps > 0);
+	for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
+	{
+		// Lane<0> = Best error
+		// Lane<1> = Best scale; -1 indicates no solution found
+		// Lane<2> = Cut low weight
+		best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
+	}
+
+	promise(max_angular_steps > 0);
+	for (unsigned int i = 0; i < max_angular_steps; i++)
+	{
+		float i_flt = static_cast<float>(i);
+
+		int idx_span = weight_span[i];
+
+		float error_cut_low = error[i] + cut_low_weight_error[i];
+		float error_cut_high = error[i] + cut_high_weight_error[i];
+		float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
+
+		// Check best error against record N
+		vfloat4 best_result = best_results[idx_span];
+		vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
+		vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
+		best_results[idx_span] = select(best_result, new_result, mask);
+
+		// Check best error against record N-1 with either cut low or cut high
+		best_result = best_results[idx_span - 1];
+
+		new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
+		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
+		best_result = select(best_result, new_result, mask);
+
+		new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
+		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
+		best_results[idx_span - 1] = select(best_result, new_result, mask);
+
+		// Check best error against record N-2 with both cut low and high
+		best_result = best_results[idx_span - 2];
+		new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
+		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
+		best_results[idx_span - 2] = select(best_result, new_result, mask);
+	}
+
+	for (unsigned int i = 0; i <= max_quant_level; i++)
+	{
+		unsigned int q = steps_for_quant_level[i];
+		int bsi = static_cast<int>(best_results[q].lane<1>());
+
+		// Did we find anything?
+#if defined(ASTCENC_DIAGNOSTICS)
+		if ((bsi < 0) && print_once)
+		{
+			print_once = false;
+			printf("INFO: Unable to find full encoding within search error limit.\n\n");
+		}
+#endif
+
+		bsi = astc::max(0, bsi);
+
+		float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
+		float hwi = lwi + static_cast<float>(q) - 1.0f;
+
+		float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
+		low_value[i]  = (angular_offsets[bsi] + lwi) * stepsize;
+		high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
+	}
+}
+
+/* See header for documentation. */
+void compute_angular_endpoints_1plane(
+	bool only_always,
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf
+) {
+	float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
+	float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
+
+	float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
+	float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
+
+	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
+	                                                : bsd.decimation_mode_count_selected;
+	promise(max_decimation_modes > 0);
+	for (unsigned int i = 0; i < max_decimation_modes; i++)
+	{
+		const decimation_mode& dm = bsd.decimation_modes[i];
+		if (!dm.is_ref_1_plane(static_cast<quant_method>(max_weight_quant)))
+		{
+			continue;
+		}
+
+		unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
+
+		unsigned int max_precision = dm.maxprec_1plane;
+		if (max_precision > TUNE_MAX_ANGULAR_QUANT)
+		{
+			max_precision = TUNE_MAX_ANGULAR_QUANT;
+		}
+
+		if (max_precision > max_weight_quant)
+		{
+			max_precision = max_weight_quant;
+		}
+
+		compute_angular_endpoints_for_quant_levels(
+		    weight_count,
+		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
+		    max_precision, low_values[i], high_values[i]);
+	}
+
+	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
+	                                           : bsd.block_mode_count_1plane_selected;
+	promise(max_block_modes > 0);
+	for (unsigned int i = 0; i < max_block_modes; i++)
+	{
+		const block_mode& bm = bsd.block_modes[i];
+		assert(!bm.is_dual_plane);
+
+		unsigned int quant_mode = bm.quant_mode;
+		unsigned int decim_mode = bm.decimation_mode;
+
+		if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
+		{
+			low_value[i] = low_values[decim_mode][quant_mode];
+			high_value[i] = high_values[decim_mode][quant_mode];
+		}
+		else
+		{
+			low_value[i] = 0.0f;
+			high_value[i] = 1.0f;
+		}
+	}
+}
+
+/* See header for documentation. */
+void compute_angular_endpoints_2planes(
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf
+) {
+	float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
+	float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
+	float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
+	float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
+
+	float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
+	float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
+	float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
+	float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
+
+	promise(bsd.decimation_mode_count_selected > 0);
+	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
+	{
+		const decimation_mode& dm = bsd.decimation_modes[i];
+		if (!dm.is_ref_2_plane(static_cast<quant_method>(max_weight_quant)))
+		{
+			continue;
+		}
+
+		unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
+
+		unsigned int max_precision = dm.maxprec_2planes;
+		if (max_precision > TUNE_MAX_ANGULAR_QUANT)
+		{
+			max_precision = TUNE_MAX_ANGULAR_QUANT;
+		}
+
+		if (max_precision > max_weight_quant)
+		{
+			max_precision = max_weight_quant;
+		}
+
+		compute_angular_endpoints_for_quant_levels(
+		    weight_count,
+		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
+		    max_precision, low_values1[i], high_values1[i]);
+
+		compute_angular_endpoints_for_quant_levels(
+		    weight_count,
+		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
+		    max_precision, low_values2[i], high_values2[i]);
+	}
+
+	unsigned int start = bsd.block_mode_count_1plane_selected;
+	unsigned int end = bsd.block_mode_count_1plane_2plane_selected;
+	for (unsigned int i = start; i < end; i++)
+	{
+		const block_mode& bm = bsd.block_modes[i];
+		unsigned int quant_mode = bm.quant_mode;
+		unsigned int decim_mode = bm.decimation_mode;
+
+		if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
+		{
+			low_value1[i] = low_values1[decim_mode][quant_mode];
+			high_value1[i] = high_values1[decim_mode][quant_mode];
+			low_value2[i] = low_values2[decim_mode][quant_mode];
+			high_value2[i] = high_values2[decim_mode][quant_mode];
+		}
+		else
+		{
+			low_value1[i] = 0.0f;
+			high_value1[i] = 1.0f;
+			low_value2[i] = 0.0f;
+			high_value2[i] = 1.0f;
+		}
+	}
+}
+
+#endif
--- a/thirdparty/astcenc/astcenc_weight_quant_xfer_tables.cpp
+++ b/thirdparty/astcenc/astcenc_weight_quant_xfer_tables.cpp
@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Data tables for quantization transfer.
+ */
+
+#include "astcenc_internal.h"
+
+#define _ 0 // Using _ to indicate an entry that will not be used.
+
+const quant_and_transfer_table quant_and_xfer_tables[12] {
+	// QUANT2, range 0..1
+	{
+		{0, 64},
+		{0, 1},
+		{0, 64},
+		{0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 0x4000}
+	},
+	// QUANT_3, range 0..2
+	{
+		{0, 32, 64},
+		{0, 1, 2},
+		{0, 32, 64},
+		{0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,0x4020}
+	},
+	// QUANT_4, range 0..3
+	{
+		{0, 21, 43, 64},
+		{0, 1, 2, 3},
+		{0, 21, 43, 64},
+		{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
+		 _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,_,_,_,_,0x402b}
+	},
+	//QUANT_5, range 0..4
+	{
+		{0, 16, 32, 48, 64},
+		{0, 1, 2, 3, 4},
+		{0, 16, 32, 48, 64},
+		{0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
+		 _,_,_,_,_,_,_,_,_,_,_,_,0x4030}
+	},
+	// QUANT_6, range 0..5
+	{
+		{0, 12, 25, 39, 52, 64},
+		{0, 2, 4, 5, 3, 1},
+		{0, 64, 12, 52, 25, 39},
+		{0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_,
+		 0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
+		 _,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
+	},
+	// QUANT_8, range 0..7
+	{
+		{0, 9, 18, 27, 37, 46, 55, 64},
+		{0, 1, 2, 3, 4, 5, 6, 7},
+		{0, 9, 18, 27, 37, 46, 55, 64},
+		{0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_,
+		 _,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
+		 0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
+	},
+	// QUANT_10, range 0..9
+	{
+		{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
+		{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
+		{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
+		{0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_,
+		 0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_,
+		 _,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
+		 _,0x4039}
+	},
+	// QUANT_12, range 0..11
+	{
+		{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
+		{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
+		{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
+		{0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_,
+		 0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_,
+		 0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
+		 0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
+	},
+	// QUANT_16, range 0..15
+	{
+		{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
+		{0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_,
+		 0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_,
+		 _,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
+		 _,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
+	},
+	// QUANT_20, range 0..19
+	{
+		{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
+		{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
+		{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
+		{0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_,
+		 0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_,
+		 0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_,
+		 0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
+		 0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
+	},
+	// QUANT_24, range 0..23
+	{
+		{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
+		{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
+		{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
+		{0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08,
+		 _,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_,
+		 0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_,
+		 0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d,
+		 _,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
+		 0x403b,_,0x403e}
+	},
+	// QUANT_32, range 0..31
+	{
+		{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
+		{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
+		{0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_,
+		 0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_,
+		 0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_,
+		 0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_,
+		 0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_,
+		 0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_,
+		 0x403c,_,0x403e}
+	}
+};