scripts/mkutf8data is used only when regenerating utf8data.h, which never happens in the normal kernel build. However, it is irrespectively built if CONFIG_UNICODE is enabled. Moreover, there is no good reason for it to reside in the scripts/ directory since it is only used in fs/unicode/. Hence, move it from scripts/ to fs/unicode/. In some cases, we bypass build artifacts in the normal build. The conventional way to do so is to surround the code with ifdef REGENERATE_*. For example, -7373f4f83c("kbuild: add implicit rules for parser generation") -6aaf49b495("crypto: arm,arm64 - Fix random regeneration of S_shipped") I rewrote the rule in a more kbuild'ish style. In the normal build, utf8data.h is just shipped from the check-in file. $ make [ snip ] SHIPPED fs/unicode/utf8data.h CC fs/unicode/utf8-norm.o CC fs/unicode/utf8-core.o CC fs/unicode/utf8-selftest.o AR fs/unicode/built-in.a If you want to generate utf8data.h based on UCD, put *.txt files into fs/unicode/, then pass REGENERATE_UTF8DATA=1 from the command line. The mkutf8data tool will be automatically compiled to generate the utf8data.h from the *.txt files. $ make REGENERATE_UTF8DATA=1 [ snip ] HOSTCC fs/unicode/mkutf8data GEN fs/unicode/utf8data.h CC fs/unicode/utf8-norm.o CC fs/unicode/utf8-core.o CC fs/unicode/utf8-selftest.o AR fs/unicode/built-in.a I renamed the check-in utf8data.h to utf8data.h_shipped so that this will work for the out-of-tree build. You can update it based on the latest UCD like this: $ make REGENERATE_UTF8DATA=1 fs/unicode/ $ cp fs/unicode/utf8data.h fs/unicode/utf8data.h_shipped Also, I added entries to .gitignore and dontdiff. Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
		
			
				
	
	
		
			3420 lines
		
	
	
		
			79 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			3420 lines
		
	
	
		
			79 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2014 SGI.
 | |
|  * All rights reserved.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU General Public License as
 | |
|  * published by the Free Software Foundation.
 | |
|  *
 | |
|  * This program is distributed in the hope that it would be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|  * GNU General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU General Public License
 | |
|  * along with this program; if not, write the Free Software Foundation,
 | |
|  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 | |
|  */
 | |
| 
 | |
| /* Generator for a compact trie for unicode normalization */
 | |
| 
 | |
| #include <sys/types.h>
 | |
| #include <stddef.h>
 | |
| #include <stdlib.h>
 | |
| #include <stdio.h>
 | |
| #include <assert.h>
 | |
| #include <string.h>
 | |
| #include <unistd.h>
 | |
| #include <errno.h>
 | |
| 
 | |
| /* Default names of the in- and output files. */
 | |
| 
 | |
| #define AGE_NAME	"DerivedAge.txt"
 | |
| #define CCC_NAME	"DerivedCombiningClass.txt"
 | |
| #define PROP_NAME	"DerivedCoreProperties.txt"
 | |
| #define DATA_NAME	"UnicodeData.txt"
 | |
| #define FOLD_NAME	"CaseFolding.txt"
 | |
| #define NORM_NAME	"NormalizationCorrections.txt"
 | |
| #define TEST_NAME	"NormalizationTest.txt"
 | |
| #define UTF8_NAME	"utf8data.h"
 | |
| 
 | |
| const char	*age_name  = AGE_NAME;
 | |
| const char	*ccc_name  = CCC_NAME;
 | |
| const char	*prop_name = PROP_NAME;
 | |
| const char	*data_name = DATA_NAME;
 | |
| const char	*fold_name = FOLD_NAME;
 | |
| const char	*norm_name = NORM_NAME;
 | |
| const char	*test_name = TEST_NAME;
 | |
| const char	*utf8_name = UTF8_NAME;
 | |
| 
 | |
| int verbose = 0;
 | |
| 
 | |
| /* An arbitrary line size limit on input lines. */
 | |
| 
 | |
| #define LINESIZE	1024
 | |
| char line[LINESIZE];
 | |
| char buf0[LINESIZE];
 | |
| char buf1[LINESIZE];
 | |
| char buf2[LINESIZE];
 | |
| char buf3[LINESIZE];
 | |
| 
 | |
| const char *argv0;
 | |
| 
 | |
| #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| /*
 | |
|  * Unicode version numbers consist of three parts: major, minor, and a
 | |
|  * revision.  These numbers are packed into an unsigned int to obtain
 | |
|  * a single version number.
 | |
|  *
 | |
|  * To save space in the generated trie, the unicode version is not
 | |
|  * stored directly, instead we calculate a generation number from the
 | |
|  * unicode versions seen in the DerivedAge file, and use that as an
 | |
|  * index into a table of unicode versions.
 | |
|  */
 | |
| #define UNICODE_MAJ_SHIFT		(16)
 | |
| #define UNICODE_MIN_SHIFT		(8)
 | |
| 
 | |
| #define UNICODE_MAJ_MAX			((unsigned short)-1)
 | |
| #define UNICODE_MIN_MAX			((unsigned char)-1)
 | |
| #define UNICODE_REV_MAX			((unsigned char)-1)
 | |
| 
 | |
| #define UNICODE_AGE(MAJ,MIN,REV)			\
 | |
| 	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
 | |
| 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
 | |
| 	 ((unsigned int)(REV)))
 | |
| 
 | |
| unsigned int *ages;
 | |
| int ages_count;
 | |
| 
 | |
| unsigned int unicode_maxage;
 | |
| 
 | |
| static int age_valid(unsigned int major, unsigned int minor,
 | |
| 		     unsigned int revision)
 | |
| {
 | |
| 	if (major > UNICODE_MAJ_MAX)
 | |
| 		return 0;
 | |
| 	if (minor > UNICODE_MIN_MAX)
 | |
| 		return 0;
 | |
| 	if (revision > UNICODE_REV_MAX)
 | |
| 		return 0;
 | |
| 	return 1;
 | |
| }
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| /*
 | |
|  * utf8trie_t
 | |
|  *
 | |
|  * A compact binary tree, used to decode UTF-8 characters.
 | |
|  *
 | |
|  * Internal nodes are one byte for the node itself, and up to three
 | |
|  * bytes for an offset into the tree.  The first byte contains the
 | |
|  * following information:
 | |
|  *  NEXTBYTE  - flag        - advance to next byte if set
 | |
|  *  BITNUM    - 3 bit field - the bit number to be tested
 | |
|  *  OFFLEN    - 2 bit field - number of bytes in the offset
 | |
|  * if offlen == 0 (non-branching node)
 | |
|  *  RIGHTPATH - 1 bit field - set if the following node is for the
 | |
|  *                            right-hand path (tested bit is set)
 | |
|  *  TRIENODE  - 1 bit field - set if the following node is an internal
 | |
|  *                            node, otherwise it is a leaf node
 | |
|  * if offlen != 0 (branching node)
 | |
|  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 | |
|  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 | |
|  *
 | |
|  * Due to the way utf8 works, there cannot be branching nodes with
 | |
|  * NEXTBYTE set, and moreover those nodes always have a righthand
 | |
|  * descendant.
 | |
|  */
 | |
| typedef unsigned char utf8trie_t;
 | |
| #define BITNUM		0x07
 | |
| #define NEXTBYTE	0x08
 | |
| #define OFFLEN		0x30
 | |
| #define OFFLEN_SHIFT	4
 | |
| #define RIGHTPATH	0x40
 | |
| #define TRIENODE	0x80
 | |
| #define RIGHTNODE	0x40
 | |
| #define LEFTNODE	0x80
 | |
| 
 | |
| /*
 | |
|  * utf8leaf_t
 | |
|  *
 | |
|  * The leaves of the trie are embedded in the trie, and so the same
 | |
|  * underlying datatype, unsigned char.
 | |
|  *
 | |
|  * leaf[0]: The unicode version, stored as a generation number that is
 | |
|  *          an index into utf8agetab[].  With this we can filter code
 | |
|  *          points based on the unicode version in which they were
 | |
|  *          defined.  The CCC of a non-defined code point is 0.
 | |
|  * leaf[1]: Canonical Combining Class. During normalization, we need
 | |
|  *          to do a stable sort into ascending order of all characters
 | |
|  *          with a non-zero CCC that occur between two characters with
 | |
|  *          a CCC of 0, or at the begin or end of a string.
 | |
|  *          The unicode standard guarantees that all CCC values are
 | |
|  *          between 0 and 254 inclusive, which leaves 255 available as
 | |
|  *          a special value.
 | |
|  *          Code points with CCC 0 are known as stoppers.
 | |
|  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 | |
|  *          start of a NUL-terminated string that is the decomposition
 | |
|  *          of the character.
 | |
|  *          The CCC of a decomposable character is the same as the CCC
 | |
|  *          of the first character of its decomposition.
 | |
|  *          Some characters decompose as the empty string: these are
 | |
|  *          characters with the Default_Ignorable_Code_Point property.
 | |
|  *          These do affect normalization, as they all have CCC 0.
 | |
|  *
 | |
|  * The decompositions in the trie have been fully expanded.
 | |
|  *
 | |
|  * Casefolding, if applicable, is also done using decompositions.
 | |
|  */
 | |
| typedef unsigned char utf8leaf_t;
 | |
| 
 | |
| #define LEAF_GEN(LEAF)	((LEAF)[0])
 | |
| #define LEAF_CCC(LEAF)	((LEAF)[1])
 | |
| #define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
 | |
| 
 | |
| #define MAXGEN		(255)
 | |
| 
 | |
| #define MINCCC		(0)
 | |
| #define MAXCCC		(254)
 | |
| #define STOPPER		(0)
 | |
| #define DECOMPOSE	(255)
 | |
| #define HANGUL		((char)(255))
 | |
| 
 | |
| #define UTF8HANGULLEAF	(12)
 | |
| 
 | |
| struct tree;
 | |
| static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
 | |
| 			       const char *, size_t);
 | |
| static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
 | |
| 
 | |
| unsigned char *utf8data;
 | |
| size_t utf8data_size;
 | |
| 
 | |
| utf8trie_t *nfdi;
 | |
| utf8trie_t *nfdicf;
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| /*
 | |
|  * UTF8 valid ranges.
 | |
|  *
 | |
|  * The UTF-8 encoding spreads the bits of a 32bit word over several
 | |
|  * bytes. This table gives the ranges that can be held and how they'd
 | |
|  * be represented.
 | |
|  *
 | |
|  * 0x00000000 0x0000007F: 0xxxxxxx
 | |
|  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 | |
|  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 | |
|  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 | |
|  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 | |
|  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 | |
|  *
 | |
|  * There is an additional requirement on UTF-8, in that only the
 | |
|  * shortest representation of a 32bit value is to be used.  A decoder
 | |
|  * must not decode sequences that do not satisfy this requirement.
 | |
|  * Thus the allowed ranges have a lower bound.
 | |
|  *
 | |
|  * 0x00000000 0x0000007F: 0xxxxxxx
 | |
|  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 | |
|  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 | |
|  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 | |
|  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 | |
|  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 | |
|  *
 | |
|  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 | |
|  * 17 planes of 65536 values.  This limits the sequences actually seen
 | |
|  * even more, to just the following.
 | |
|  *
 | |
|  *          0 -     0x7f: 0                     0x7f
 | |
|  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
 | |
|  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
 | |
|  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
 | |
|  *
 | |
|  * Even within those ranges not all values are allowed: the surrogates
 | |
|  * 0xd800 - 0xdfff should never be seen.
 | |
|  *
 | |
|  * Note that the longest sequence seen with valid usage is 4 bytes,
 | |
|  * the same as a single UTF-32 character.  This makes the UTF-8
 | |
|  * representation of Unicode strictly smaller than UTF-32.
 | |
|  *
 | |
|  * The shortest sequence requirement was introduced by:
 | |
|  *    Corrigendum #1: UTF-8 Shortest Form
 | |
|  * It can be found here:
 | |
|  *    http://www.unicode.org/versions/corrigendum1.html
 | |
|  *
 | |
|  */
 | |
| 
 | |
| #define UTF8_2_BITS     0xC0
 | |
| #define UTF8_3_BITS     0xE0
 | |
| #define UTF8_4_BITS     0xF0
 | |
| #define UTF8_N_BITS     0x80
 | |
| #define UTF8_2_MASK     0xE0
 | |
| #define UTF8_3_MASK     0xF0
 | |
| #define UTF8_4_MASK     0xF8
 | |
| #define UTF8_N_MASK     0xC0
 | |
| #define UTF8_V_MASK     0x3F
 | |
| #define UTF8_V_SHIFT    6
 | |
| 
 | |
| static int utf8encode(char *str, unsigned int val)
 | |
| {
 | |
| 	int len;
 | |
| 
 | |
| 	if (val < 0x80) {
 | |
| 		str[0] = val;
 | |
| 		len = 1;
 | |
| 	} else if (val < 0x800) {
 | |
| 		str[1] = val & UTF8_V_MASK;
 | |
| 		str[1] |= UTF8_N_BITS;
 | |
| 		val >>= UTF8_V_SHIFT;
 | |
| 		str[0] = val;
 | |
| 		str[0] |= UTF8_2_BITS;
 | |
| 		len = 2;
 | |
| 	} else if (val < 0x10000) {
 | |
| 		str[2] = val & UTF8_V_MASK;
 | |
| 		str[2] |= UTF8_N_BITS;
 | |
| 		val >>= UTF8_V_SHIFT;
 | |
| 		str[1] = val & UTF8_V_MASK;
 | |
| 		str[1] |= UTF8_N_BITS;
 | |
| 		val >>= UTF8_V_SHIFT;
 | |
| 		str[0] = val;
 | |
| 		str[0] |= UTF8_3_BITS;
 | |
| 		len = 3;
 | |
| 	} else if (val < 0x110000) {
 | |
| 		str[3] = val & UTF8_V_MASK;
 | |
| 		str[3] |= UTF8_N_BITS;
 | |
| 		val >>= UTF8_V_SHIFT;
 | |
| 		str[2] = val & UTF8_V_MASK;
 | |
| 		str[2] |= UTF8_N_BITS;
 | |
| 		val >>= UTF8_V_SHIFT;
 | |
| 		str[1] = val & UTF8_V_MASK;
 | |
| 		str[1] |= UTF8_N_BITS;
 | |
| 		val >>= UTF8_V_SHIFT;
 | |
| 		str[0] = val;
 | |
| 		str[0] |= UTF8_4_BITS;
 | |
| 		len = 4;
 | |
| 	} else {
 | |
| 		printf("%#x: illegal val\n", val);
 | |
| 		len = 0;
 | |
| 	}
 | |
| 	return len;
 | |
| }
 | |
| 
 | |
| static unsigned int utf8decode(const char *str)
 | |
| {
 | |
| 	const unsigned char *s = (const unsigned char*)str;
 | |
| 	unsigned int unichar = 0;
 | |
| 
 | |
| 	if (*s < 0x80) {
 | |
| 		unichar = *s;
 | |
| 	} else if (*s < UTF8_3_BITS) {
 | |
| 		unichar = *s++ & 0x1F;
 | |
| 		unichar <<= UTF8_V_SHIFT;
 | |
| 		unichar |= *s & 0x3F;
 | |
| 	} else if (*s < UTF8_4_BITS) {
 | |
| 		unichar = *s++ & 0x0F;
 | |
| 		unichar <<= UTF8_V_SHIFT;
 | |
| 		unichar |= *s++ & 0x3F;
 | |
| 		unichar <<= UTF8_V_SHIFT;
 | |
| 		unichar |= *s & 0x3F;
 | |
| 	} else {
 | |
| 		unichar = *s++ & 0x0F;
 | |
| 		unichar <<= UTF8_V_SHIFT;
 | |
| 		unichar |= *s++ & 0x3F;
 | |
| 		unichar <<= UTF8_V_SHIFT;
 | |
| 		unichar |= *s++ & 0x3F;
 | |
| 		unichar <<= UTF8_V_SHIFT;
 | |
| 		unichar |= *s & 0x3F;
 | |
| 	}
 | |
| 	return unichar;
 | |
| }
 | |
| 
 | |
/*
 * Return non-zero when unichar lies in the Unicode code space
 * 0x0 - 0x10FFFF.  Surrogates are not excluded by this check.
 */
static int utf32valid(unsigned int unichar)
{
	return unichar <= 0x10FFFF;
}
 | |
| 
 | |
| #define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
 | |
| 
 | |
| #define NODE 1
 | |
| #define LEAF 0
 | |
| 
 | |
/*
 * One trie under construction, together with the callbacks that know
 * how to handle its leaves.
 */
struct tree {
	/* Root struct node, or the single leaf when childnode == LEAF. */
	void *root;
	/* NODE or LEAF: says what root points at. */
	int childnode;
	/* type and maxage identify the tree; printed as "<type>_<maxage>". */
	const char *type;
	unsigned int maxage;
	/* NOTE(review): presumably links related trees - confirm at the users. */
	struct tree *next;
	/* Non-zero when the two leaves compare equal. */
	int (*leaf_equal)(void *lhs, void *rhs);
	/* Print a leaf at the given indentation. */
	void (*leaf_print)(void *leaf, int indent);
	/* Non-zero when the leaf has to be emitted. */
	int (*leaf_mark)(void *leaf);
	/* Amount of space the leaf takes up in the emitted trie. */
	int (*leaf_size)(void *leaf);
	/* Location where the index of the leaf is recorded. */
	int *(*leaf_index)(struct tree *tree, void *leaf);
	/* NOTE(review): presumably writes the leaf at data and returns the
	 * position following it - confirm at the implementation. */
	unsigned char *(*leaf_emit)(void *leaf, unsigned char *data);
	/* NOTE(review): one slot per code point, presumably the leaf index. */
	int leafindex[0x110000];
	/* Start offset assigned to this tree by index_nodes(). */
	int index;
};
 | |
| 
 | |
/*
 * Internal node of a trie under construction.
 */
struct node {
	/*
	 * Bookkeeping for emission: index is the position assigned by
	 * index_nodes(), mark is set by mark_nodes() on nodes leading to
	 * leaves that must be emitted, size is added to the running index
	 * for every marked node.  alloc_node() starts offset at -1 and
	 * size at 4.
	 */
	int index, offset, mark, size;
	struct node *parent;
	/* Children; leftnode/rightnode tell whether each is a node or a leaf. */
	void *left, *right;
	/*
	 * bitnum: bit of the key tested here (low three bits select the
	 * bit within the current key byte).
	 * nextbyte: step to the next key byte before testing.
	 * leftnode/rightnode: NODE or LEAF tag for left/right.
	 */
	unsigned char bitnum, nextbyte, leftnode, rightnode;
	/*
	 * Summary of the choices made along a chain of single-child nodes:
	 * keymask holds the bit positions involved, keybits the value
	 * chosen at each of them.  Both are 0 when no such summary applies.
	 */
	unsigned int keybits, keymask;
};
 | |
| 
 | |
| /*
 | |
|  * Example lookup function for a tree.
 | |
|  */
 | |
| static void *lookup(struct tree *tree, const char *key)
 | |
| {
 | |
| 	struct node *node;
 | |
| 	void *leaf = NULL;
 | |
| 
 | |
| 	node = tree->root;
 | |
| 	while (!leaf && node) {
 | |
| 		if (node->nextbyte)
 | |
| 			key++;
 | |
| 		if (*key & (1 << (node->bitnum & 7))) {
 | |
| 			/* Right leg */
 | |
| 			if (node->rightnode == NODE) {
 | |
| 				node = node->right;
 | |
| 			} else if (node->rightnode == LEAF) {
 | |
| 				leaf = node->right;
 | |
| 			} else {
 | |
| 				node = NULL;
 | |
| 			}
 | |
| 		} else {
 | |
| 			/* Left leg */
 | |
| 			if (node->leftnode == NODE) {
 | |
| 				node = node->left;
 | |
| 			} else if (node->leftnode == LEAF) {
 | |
| 				leaf = node->left;
 | |
| 			} else {
 | |
| 				node = NULL;
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return leaf;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * A simple non-recursive tree walker: keep track of visits to the
 | |
|  * left and right branches in the leftmask and rightmask.
 | |
|  */
 | |
| static void tree_walk(struct tree *tree)
 | |
| {
 | |
| 	struct node *node;
 | |
| 	unsigned int leftmask;
 | |
| 	unsigned int rightmask;
 | |
| 	unsigned int bitmask;
 | |
| 	int indent = 1;
 | |
| 	int nodes, singletons, leaves;
 | |
| 
 | |
| 	nodes = singletons = leaves = 0;
 | |
| 
 | |
| 	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
 | |
| 	if (tree->childnode == LEAF) {
 | |
| 		assert(tree->root);
 | |
| 		tree->leaf_print(tree->root, indent);
 | |
| 		leaves = 1;
 | |
| 	} else {
 | |
| 		assert(tree->childnode == NODE);
 | |
| 		node = tree->root;
 | |
| 		leftmask = rightmask = 0;
 | |
| 		while (node) {
 | |
| 			printf("%*snode @ %p bitnum %d nextbyte %d"
 | |
| 			       " left %p right %p mask %x bits %x\n",
 | |
| 				indent, "", node,
 | |
| 				node->bitnum, node->nextbyte,
 | |
| 				node->left, node->right,
 | |
| 				node->keymask, node->keybits);
 | |
| 			nodes += 1;
 | |
| 			if (!(node->left && node->right))
 | |
| 				singletons += 1;
 | |
| 
 | |
| 			while (node) {
 | |
| 				bitmask = 1 << node->bitnum;
 | |
| 				if ((leftmask & bitmask) == 0) {
 | |
| 					leftmask |= bitmask;
 | |
| 					if (node->leftnode == LEAF) {
 | |
| 						assert(node->left);
 | |
| 						tree->leaf_print(node->left,
 | |
| 								 indent+1);
 | |
| 						leaves += 1;
 | |
| 					} else if (node->left) {
 | |
| 						assert(node->leftnode == NODE);
 | |
| 						indent += 1;
 | |
| 						node = node->left;
 | |
| 						break;
 | |
| 					}
 | |
| 				}
 | |
| 				if ((rightmask & bitmask) == 0) {
 | |
| 					rightmask |= bitmask;
 | |
| 					if (node->rightnode == LEAF) {
 | |
| 						assert(node->right);
 | |
| 						tree->leaf_print(node->right,
 | |
| 								 indent+1);
 | |
| 						leaves += 1;
 | |
| 					} else if (node->right) {
 | |
| 						assert(node->rightnode == NODE);
 | |
| 						indent += 1;
 | |
| 						node = node->right;
 | |
| 						break;
 | |
| 					}
 | |
| 				}
 | |
| 				leftmask &= ~bitmask;
 | |
| 				rightmask &= ~bitmask;
 | |
| 				node = node->parent;
 | |
| 				indent -= 1;
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	printf("nodes %d leaves %d singletons %d\n",
 | |
| 	       nodes, leaves, singletons);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Allocate an initialize a new internal node.
 | |
|  */
 | |
| static struct node *alloc_node(struct node *parent)
 | |
| {
 | |
| 	struct node *node;
 | |
| 	int bitnum;
 | |
| 
 | |
| 	node = malloc(sizeof(*node));
 | |
| 	node->left = node->right = NULL;
 | |
| 	node->parent = parent;
 | |
| 	node->leftnode = NODE;
 | |
| 	node->rightnode = NODE;
 | |
| 	node->keybits = 0;
 | |
| 	node->keymask = 0;
 | |
| 	node->mark = 0;
 | |
| 	node->index = 0;
 | |
| 	node->offset = -1;
 | |
| 	node->size = 4;
 | |
| 
 | |
| 	if (node->parent) {
 | |
| 		bitnum = parent->bitnum;
 | |
| 		if ((bitnum & 7) == 0) {
 | |
| 			node->bitnum = bitnum + 7 + 8;
 | |
| 			node->nextbyte = 1;
 | |
| 		} else {
 | |
| 			node->bitnum = bitnum - 1;
 | |
| 			node->nextbyte = 0;
 | |
| 		}
 | |
| 	} else {
 | |
| 		node->bitnum = 7;
 | |
| 		node->nextbyte = 0;
 | |
| 	}
 | |
| 
 | |
| 	return node;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Insert a new leaf into the tree, and collapse any subtrees that are
 | |
|  * fully populated and end in identical leaves. A nextbyte tagged
 | |
|  * internal node will not be removed to preserve the tree's integrity.
 | |
|  * Note that due to the structure of utf8, no nextbyte tagged node
 | |
|  * will be a candidate for removal.
 | |
|  */
 | |
| static int insert(struct tree *tree, char *key, int keylen, void *leaf)
 | |
| {
 | |
| 	struct node *node;
 | |
| 	struct node *parent;
 | |
| 	void **cursor;
 | |
| 	int keybits;
 | |
| 
 | |
| 	assert(keylen >= 1 && keylen <= 4);
 | |
| 
 | |
| 	node = NULL;
 | |
| 	cursor = &tree->root;
 | |
| 	keybits = 8 * keylen;
 | |
| 
 | |
| 	/* Insert, creating path along the way. */
 | |
| 	while (keybits) {
 | |
| 		if (!*cursor)
 | |
| 			*cursor = alloc_node(node);
 | |
| 		node = *cursor;
 | |
| 		if (node->nextbyte)
 | |
| 			key++;
 | |
| 		if (*key & (1 << (node->bitnum & 7)))
 | |
| 			cursor = &node->right;
 | |
| 		else
 | |
| 			cursor = &node->left;
 | |
| 		keybits--;
 | |
| 	}
 | |
| 	*cursor = leaf;
 | |
| 
 | |
| 	/* Merge subtrees if possible. */
 | |
| 	while (node) {
 | |
| 		if (*key & (1 << (node->bitnum & 7)))
 | |
| 			node->rightnode = LEAF;
 | |
| 		else
 | |
| 			node->leftnode = LEAF;
 | |
| 		if (node->nextbyte)
 | |
| 			break;
 | |
| 		if (node->leftnode == NODE || node->rightnode == NODE)
 | |
| 			break;
 | |
| 		assert(node->left);
 | |
| 		assert(node->right);
 | |
| 		/* Compare */
 | |
| 		if (! tree->leaf_equal(node->left, node->right))
 | |
| 			break;
 | |
| 		/* Keep left, drop right leaf. */
 | |
| 		leaf = node->left;
 | |
| 		/* Check in parent */
 | |
| 		parent = node->parent;
 | |
| 		if (!parent) {
 | |
| 			/* root of tree! */
 | |
| 			tree->root = leaf;
 | |
| 			tree->childnode = LEAF;
 | |
| 		} else if (parent->left == node) {
 | |
| 			parent->left = leaf;
 | |
| 			parent->leftnode = LEAF;
 | |
| 			if (parent->right) {
 | |
| 				parent->keymask = 0;
 | |
| 				parent->keybits = 0;
 | |
| 			} else {
 | |
| 				parent->keymask |= (1 << node->bitnum);
 | |
| 			}
 | |
| 		} else if (parent->right == node) {
 | |
| 			parent->right = leaf;
 | |
| 			parent->rightnode = LEAF;
 | |
| 			if (parent->left) {
 | |
| 				parent->keymask = 0;
 | |
| 				parent->keybits = 0;
 | |
| 			} else {
 | |
| 				parent->keymask |= (1 << node->bitnum);
 | |
| 				parent->keybits |= (1 << node->bitnum);
 | |
| 			}
 | |
| 		} else {
 | |
| 			/* internal tree error */
 | |
| 			assert(0);
 | |
| 		}
 | |
| 		free(node);
 | |
| 		node = parent;
 | |
| 	}
 | |
| 
 | |
| 	/* Propagate keymasks up along singleton chains. */
 | |
| 	while (node) {
 | |
| 		parent = node->parent;
 | |
| 		if (!parent)
 | |
| 			break;
 | |
| 		/* Nix the mask for parents with two children. */
 | |
| 		if (node->keymask == 0) {
 | |
| 			parent->keymask = 0;
 | |
| 			parent->keybits = 0;
 | |
| 		} else if (parent->left && parent->right) {
 | |
| 			parent->keymask = 0;
 | |
| 			parent->keybits = 0;
 | |
| 		} else {
 | |
| 			assert((parent->keymask & node->keymask) == 0);
 | |
| 			parent->keymask |= node->keymask;
 | |
| 			parent->keymask |= (1 << parent->bitnum);
 | |
| 			parent->keybits |= node->keybits;
 | |
| 			if (parent->right)
 | |
| 				parent->keybits |= (1 << parent->bitnum);
 | |
| 		}
 | |
| 		node = parent;
 | |
| 	}
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Prune internal nodes.
 | |
|  *
 | |
|  * Fully populated subtrees that end at the same leaf have already
 | |
|  * been collapsed.  There are still internal nodes that have for both
 | |
|  * their left and right branches a sequence of singletons that make
 | |
|  * identical choices and end in identical leaves.  The keymask and
 | |
|  * keybits collected in the nodes describe the choices made in these
 | |
|  * singleton chains.  When they are identical for the left and right
 | |
|  * branch of a node, and the two leaves comare identical, the node in
 | |
|  * question can be removed.
 | |
|  *
 | |
|  * Note that nodes with the nextbyte tag set will not be removed by
 | |
|  * this to ensure tree integrity.  Note as well that the structure of
 | |
|  * utf8 ensures that these nodes would not have been candidates for
 | |
|  * removal in any case.
 | |
|  */
 | |
| static void prune(struct tree *tree)
 | |
| {
 | |
| 	struct node *node;
 | |
| 	struct node *left;
 | |
| 	struct node *right;
 | |
| 	struct node *parent;
 | |
| 	void *leftleaf;
 | |
| 	void *rightleaf;
 | |
| 	unsigned int leftmask;
 | |
| 	unsigned int rightmask;
 | |
| 	unsigned int bitmask;
 | |
| 	int count;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Pruning %s_%x\n", tree->type, tree->maxage);
 | |
| 
 | |
| 	count = 0;
 | |
| 	if (tree->childnode == LEAF)
 | |
| 		return;
 | |
| 	if (!tree->root)
 | |
| 		return;
 | |
| 
 | |
| 	leftmask = rightmask = 0;
 | |
| 	node = tree->root;
 | |
| 	while (node) {
 | |
| 		if (node->nextbyte)
 | |
| 			goto advance;
 | |
| 		if (node->leftnode == LEAF)
 | |
| 			goto advance;
 | |
| 		if (node->rightnode == LEAF)
 | |
| 			goto advance;
 | |
| 		if (!node->left)
 | |
| 			goto advance;
 | |
| 		if (!node->right)
 | |
| 			goto advance;
 | |
| 		left = node->left;
 | |
| 		right = node->right;
 | |
| 		if (left->keymask == 0)
 | |
| 			goto advance;
 | |
| 		if (right->keymask == 0)
 | |
| 			goto advance;
 | |
| 		if (left->keymask != right->keymask)
 | |
| 			goto advance;
 | |
| 		if (left->keybits != right->keybits)
 | |
| 			goto advance;
 | |
| 		leftleaf = NULL;
 | |
| 		while (!leftleaf) {
 | |
| 			assert(left->left || left->right);
 | |
| 			if (left->leftnode == LEAF)
 | |
| 				leftleaf = left->left;
 | |
| 			else if (left->rightnode == LEAF)
 | |
| 				leftleaf = left->right;
 | |
| 			else if (left->left)
 | |
| 				left = left->left;
 | |
| 			else if (left->right)
 | |
| 				left = left->right;
 | |
| 			else
 | |
| 				assert(0);
 | |
| 		}
 | |
| 		rightleaf = NULL;
 | |
| 		while (!rightleaf) {
 | |
| 			assert(right->left || right->right);
 | |
| 			if (right->leftnode == LEAF)
 | |
| 				rightleaf = right->left;
 | |
| 			else if (right->rightnode == LEAF)
 | |
| 				rightleaf = right->right;
 | |
| 			else if (right->left)
 | |
| 				right = right->left;
 | |
| 			else if (right->right)
 | |
| 				right = right->right;
 | |
| 			else
 | |
| 				assert(0);
 | |
| 		}
 | |
| 		if (! tree->leaf_equal(leftleaf, rightleaf))
 | |
| 			goto advance;
 | |
| 		/*
 | |
| 		 * This node has identical singleton-only subtrees.
 | |
| 		 * Remove it.
 | |
| 		 */
 | |
| 		parent = node->parent;
 | |
| 		left = node->left;
 | |
| 		right = node->right;
 | |
| 		if (parent->left == node)
 | |
| 			parent->left = left;
 | |
| 		else if (parent->right == node)
 | |
| 			parent->right = left;
 | |
| 		else
 | |
| 			assert(0);
 | |
| 		left->parent = parent;
 | |
| 		left->keymask |= (1 << node->bitnum);
 | |
| 		node->left = NULL;
 | |
| 		while (node) {
 | |
| 			bitmask = 1 << node->bitnum;
 | |
| 			leftmask &= ~bitmask;
 | |
| 			rightmask &= ~bitmask;
 | |
| 			if (node->leftnode == NODE && node->left) {
 | |
| 				left = node->left;
 | |
| 				free(node);
 | |
| 				count++;
 | |
| 				node = left;
 | |
| 			} else if (node->rightnode == NODE && node->right) {
 | |
| 				right = node->right;
 | |
| 				free(node);
 | |
| 				count++;
 | |
| 				node = right;
 | |
| 			} else {
 | |
| 				node = NULL;
 | |
| 			}
 | |
| 		}
 | |
| 		/* Propagate keymasks up along singleton chains. */
 | |
| 		node = parent;
 | |
| 		/* Force re-check */
 | |
| 		bitmask = 1 << node->bitnum;
 | |
| 		leftmask &= ~bitmask;
 | |
| 		rightmask &= ~bitmask;
 | |
| 		for (;;) {
 | |
| 			if (node->left && node->right)
 | |
| 				break;
 | |
| 			if (node->left) {
 | |
| 				left = node->left;
 | |
| 				node->keymask |= left->keymask;
 | |
| 				node->keybits |= left->keybits;
 | |
| 			}
 | |
| 			if (node->right) {
 | |
| 				right = node->right;
 | |
| 				node->keymask |= right->keymask;
 | |
| 				node->keybits |= right->keybits;
 | |
| 			}
 | |
| 			node->keymask |= (1 << node->bitnum);
 | |
| 			node = node->parent;
 | |
| 			/* Force re-check */
 | |
| 			bitmask = 1 << node->bitnum;
 | |
| 			leftmask &= ~bitmask;
 | |
| 			rightmask &= ~bitmask;
 | |
| 		}
 | |
| 	advance:
 | |
| 		bitmask = 1 << node->bitnum;
 | |
| 		if ((leftmask & bitmask) == 0 &&
 | |
| 		    node->leftnode == NODE &&
 | |
| 		    node->left) {
 | |
| 			leftmask |= bitmask;
 | |
| 			node = node->left;
 | |
| 		} else if ((rightmask & bitmask) == 0 &&
 | |
| 			   node->rightnode == NODE &&
 | |
| 			   node->right) {
 | |
| 			rightmask |= bitmask;
 | |
| 			node = node->right;
 | |
| 		} else {
 | |
| 			leftmask &= ~bitmask;
 | |
| 			rightmask &= ~bitmask;
 | |
| 			node = node->parent;
 | |
| 		}
 | |
| 	}
 | |
| 	if (verbose > 0)
 | |
| 		printf("Pruned %d nodes\n", count);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Mark the nodes in the tree that lead to leaves that must be
 | |
|  * emitted.
 | |
|  */
 | |
| static void mark_nodes(struct tree *tree)
 | |
| {
 | |
| 	struct node *node;
 | |
| 	struct node *n;
 | |
| 	unsigned int leftmask;
 | |
| 	unsigned int rightmask;
 | |
| 	unsigned int bitmask;
 | |
| 	int marked;
 | |
| 
 | |
| 	marked = 0;
 | |
| 	if (verbose > 0)
 | |
| 		printf("Marking %s_%x\n", tree->type, tree->maxage);
 | |
| 	if (tree->childnode == LEAF)
 | |
| 		goto done;
 | |
| 
 | |
| 	assert(tree->childnode == NODE);
 | |
| 	node = tree->root;
 | |
| 	leftmask = rightmask = 0;
 | |
| 	while (node) {
 | |
| 		bitmask = 1 << node->bitnum;
 | |
| 		if ((leftmask & bitmask) == 0) {
 | |
| 			leftmask |= bitmask;
 | |
| 			if (node->leftnode == LEAF) {
 | |
| 				assert(node->left);
 | |
| 				if (tree->leaf_mark(node->left)) {
 | |
| 					n = node;
 | |
| 					while (n && !n->mark) {
 | |
| 						marked++;
 | |
| 						n->mark = 1;
 | |
| 						n = n->parent;
 | |
| 					}
 | |
| 				}
 | |
| 			} else if (node->left) {
 | |
| 				assert(node->leftnode == NODE);
 | |
| 				node = node->left;
 | |
| 				continue;
 | |
| 			}
 | |
| 		}
 | |
| 		if ((rightmask & bitmask) == 0) {
 | |
| 			rightmask |= bitmask;
 | |
| 			if (node->rightnode == LEAF) {
 | |
| 				assert(node->right);
 | |
| 				if (tree->leaf_mark(node->right)) {
 | |
| 					n = node;
 | |
| 					while (n && !n->mark) {
 | |
| 						marked++;
 | |
| 						n->mark = 1;
 | |
| 						n = n->parent;
 | |
| 					}
 | |
| 				}
 | |
| 			} else if (node->right) {
 | |
| 				assert(node->rightnode == NODE);
 | |
| 				node = node->right;
 | |
| 				continue;
 | |
| 			}
 | |
| 		}
 | |
| 		leftmask &= ~bitmask;
 | |
| 		rightmask &= ~bitmask;
 | |
| 		node = node->parent;
 | |
| 	}
 | |
| 
 | |
| 	/* second pass: left siblings and singletons */
 | |
| 
 | |
| 	assert(tree->childnode == NODE);
 | |
| 	node = tree->root;
 | |
| 	leftmask = rightmask = 0;
 | |
| 	while (node) {
 | |
| 		bitmask = 1 << node->bitnum;
 | |
| 		if ((leftmask & bitmask) == 0) {
 | |
| 			leftmask |= bitmask;
 | |
| 			if (node->leftnode == LEAF) {
 | |
| 				assert(node->left);
 | |
| 				if (tree->leaf_mark(node->left)) {
 | |
| 					n = node;
 | |
| 					while (n && !n->mark) {
 | |
| 						marked++;
 | |
| 						n->mark = 1;
 | |
| 						n = n->parent;
 | |
| 					}
 | |
| 				}
 | |
| 			} else if (node->left) {
 | |
| 				assert(node->leftnode == NODE);
 | |
| 				node = node->left;
 | |
| 				if (!node->mark && node->parent->mark) {
 | |
| 					marked++;
 | |
| 					node->mark = 1;
 | |
| 				}
 | |
| 				continue;
 | |
| 			}
 | |
| 		}
 | |
| 		if ((rightmask & bitmask) == 0) {
 | |
| 			rightmask |= bitmask;
 | |
| 			if (node->rightnode == LEAF) {
 | |
| 				assert(node->right);
 | |
| 				if (tree->leaf_mark(node->right)) {
 | |
| 					n = node;
 | |
| 					while (n && !n->mark) {
 | |
| 						marked++;
 | |
| 						n->mark = 1;
 | |
| 						n = n->parent;
 | |
| 					}
 | |
| 				}
 | |
| 			} else if (node->right) {
 | |
| 				assert(node->rightnode == NODE);
 | |
| 				node = node->right;
 | |
| 				if (!node->mark && node->parent->mark &&
 | |
| 				    !node->parent->left) {
 | |
| 					marked++;
 | |
| 					node->mark = 1;
 | |
| 				}
 | |
| 				continue;
 | |
| 			}
 | |
| 		}
 | |
| 		leftmask &= ~bitmask;
 | |
| 		rightmask &= ~bitmask;
 | |
| 		node = node->parent;
 | |
| 	}
 | |
| done:
 | |
| 	if (verbose > 0)
 | |
| 		printf("Marked %d nodes\n", marked);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Compute the index of each node and leaf, which is the offset in the
 | |
|  * emitted trie.  These values must be pre-computed because relative
 | |
|  * offsets between nodes are used to navigate the tree.
 | |
|  */
 | |
| static int index_nodes(struct tree *tree, int index)
 | |
| {
 | |
| 	struct node *node;
 | |
| 	unsigned int leftmask;
 | |
| 	unsigned int rightmask;
 | |
| 	unsigned int bitmask;
 | |
| 	int count;
 | |
| 	int indent;
 | |
| 
 | |
| 	/* Align to a cache line (or half a cache line?). */
 | |
| 	while (index % 64)
 | |
| 		index++;
 | |
| 	tree->index = index;
 | |
| 	indent = 1;
 | |
| 	count = 0;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
 | |
| 	if (tree->childnode == LEAF) {
 | |
| 		index += tree->leaf_size(tree->root);
 | |
| 		goto done;
 | |
| 	}
 | |
| 
 | |
| 	assert(tree->childnode == NODE);
 | |
| 	node = tree->root;
 | |
| 	leftmask = rightmask = 0;
 | |
| 	while (node) {
 | |
| 		if (!node->mark)
 | |
| 			goto skip;
 | |
| 		count++;
 | |
| 		if (node->index != index)
 | |
| 			node->index = index;
 | |
| 		index += node->size;
 | |
| skip:
 | |
| 		while (node) {
 | |
| 			bitmask = 1 << node->bitnum;
 | |
| 			if (node->mark && (leftmask & bitmask) == 0) {
 | |
| 				leftmask |= bitmask;
 | |
| 				if (node->leftnode == LEAF) {
 | |
| 					assert(node->left);
 | |
| 					*tree->leaf_index(tree, node->left) =
 | |
| 									index;
 | |
| 					index += tree->leaf_size(node->left);
 | |
| 					count++;
 | |
| 				} else if (node->left) {
 | |
| 					assert(node->leftnode == NODE);
 | |
| 					indent += 1;
 | |
| 					node = node->left;
 | |
| 					break;
 | |
| 				}
 | |
| 			}
 | |
| 			if (node->mark && (rightmask & bitmask) == 0) {
 | |
| 				rightmask |= bitmask;
 | |
| 				if (node->rightnode == LEAF) {
 | |
| 					assert(node->right);
 | |
| 					*tree->leaf_index(tree, node->right) = index;
 | |
| 					index += tree->leaf_size(node->right);
 | |
| 					count++;
 | |
| 				} else if (node->right) {
 | |
| 					assert(node->rightnode == NODE);
 | |
| 					indent += 1;
 | |
| 					node = node->right;
 | |
| 					break;
 | |
| 				}
 | |
| 			}
 | |
| 			leftmask &= ~bitmask;
 | |
| 			rightmask &= ~bitmask;
 | |
| 			node = node->parent;
 | |
| 			indent -= 1;
 | |
| 		}
 | |
| 	}
 | |
| done:
 | |
| 	/* Round up to a multiple of 16 */
 | |
| 	while (index % 16)
 | |
| 		index++;
 | |
| 	if (verbose > 0)
 | |
| 		printf("Final index %d\n", index);
 | |
| 	return index;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Mark the nodes in a subtree, helper for size_nodes().
 | |
|  */
 | |
| static int mark_subtree(struct node *node)
 | |
| {
 | |
| 	int changed;
 | |
| 
 | |
| 	if (!node || node->mark)
 | |
| 		return 0;
 | |
| 	node->mark = 1;
 | |
| 	node->index = node->parent->index;
 | |
| 	changed = 1;
 | |
| 	if (node->leftnode == NODE)
 | |
| 		changed += mark_subtree(node->left);
 | |
| 	if (node->rightnode == NODE)
 | |
| 		changed += mark_subtree(node->right);
 | |
| 	return changed;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Compute the size of nodes and leaves. We start by assuming that
 | |
|  * each node needs to store a three-byte offset. The indexes of the
 | |
|  * nodes are calculated based on that, and then this function is
 | |
|  * called to see if the sizes of some nodes can be reduced.  This is
 | |
|  * repeated until no more changes are seen.
 | |
|  */
 | |
| static int size_nodes(struct tree *tree)
 | |
| {
 | |
| 	struct tree *next;
 | |
| 	struct node *node;
 | |
| 	struct node *right;
 | |
| 	struct node *n;
 | |
| 	unsigned int leftmask;
 | |
| 	unsigned int rightmask;
 | |
| 	unsigned int bitmask;
 | |
| 	unsigned int pathbits;
 | |
| 	unsigned int pathmask;
 | |
| 	unsigned int nbit;
 | |
| 	int changed;
 | |
| 	int offset;
 | |
| 	int size;
 | |
| 	int indent;
 | |
| 
 | |
| 	indent = 1;
 | |
| 	changed = 0;
 | |
| 	size = 0;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Sizing %s_%x\n", tree->type, tree->maxage);
 | |
| 	if (tree->childnode == LEAF)
 | |
| 		goto done;
 | |
| 
 | |
| 	assert(tree->childnode == NODE);
 | |
| 	pathbits = 0;
 | |
| 	pathmask = 0;
 | |
| 	node = tree->root;
 | |
| 	leftmask = rightmask = 0;
 | |
| 	while (node) {
 | |
| 		if (!node->mark)
 | |
| 			goto skip;
 | |
| 		offset = 0;
 | |
| 		if (!node->left || !node->right) {
 | |
| 			size = 1;
 | |
| 		} else {
 | |
| 			if (node->rightnode == NODE) {
 | |
| 				/*
 | |
| 				 * If the right node is not marked,
 | |
| 				 * look for a corresponding node in
 | |
| 				 * the next tree.  Such a node need
 | |
| 				 * not exist.
 | |
| 				 */
 | |
| 				right = node->right;
 | |
| 				next = tree->next;
 | |
| 				while (!right->mark) {
 | |
| 					assert(next);
 | |
| 					n = next->root;
 | |
| 					while (n->bitnum != node->bitnum) {
 | |
| 						nbit = 1 << n->bitnum;
 | |
| 						if (!(pathmask & nbit))
 | |
| 							break;
 | |
| 						if (pathbits & nbit) {
 | |
| 							if (n->rightnode == LEAF)
 | |
| 								break;
 | |
| 							n = n->right;
 | |
| 						} else {
 | |
| 							if (n->leftnode == LEAF)
 | |
| 								break;
 | |
| 							n = n->left;
 | |
| 						}
 | |
| 					}
 | |
| 					if (n->bitnum != node->bitnum)
 | |
| 						break;
 | |
| 					n = n->right;
 | |
| 					right = n;
 | |
| 					next = next->next;
 | |
| 				}
 | |
| 				/* Make sure the right node is marked. */
 | |
| 				if (!right->mark)
 | |
| 					changed += mark_subtree(right);
 | |
| 				offset = right->index - node->index;
 | |
| 			} else {
 | |
| 				offset = *tree->leaf_index(tree, node->right);
 | |
| 				offset -= node->index;
 | |
| 			}
 | |
| 			assert(offset >= 0);
 | |
| 			assert(offset <= 0xffffff);
 | |
| 			if (offset <= 0xff) {
 | |
| 				size = 2;
 | |
| 			} else if (offset <= 0xffff) {
 | |
| 				size = 3;
 | |
| 			} else { /* offset <= 0xffffff */
 | |
| 				size = 4;
 | |
| 			}
 | |
| 		}
 | |
| 		if (node->size != size || node->offset != offset) {
 | |
| 			node->size = size;
 | |
| 			node->offset = offset;
 | |
| 			changed++;
 | |
| 		}
 | |
| skip:
 | |
| 		while (node) {
 | |
| 			bitmask = 1 << node->bitnum;
 | |
| 			pathmask |= bitmask;
 | |
| 			if (node->mark && (leftmask & bitmask) == 0) {
 | |
| 				leftmask |= bitmask;
 | |
| 				if (node->leftnode == LEAF) {
 | |
| 					assert(node->left);
 | |
| 				} else if (node->left) {
 | |
| 					assert(node->leftnode == NODE);
 | |
| 					indent += 1;
 | |
| 					node = node->left;
 | |
| 					break;
 | |
| 				}
 | |
| 			}
 | |
| 			if (node->mark && (rightmask & bitmask) == 0) {
 | |
| 				rightmask |= bitmask;
 | |
| 				pathbits |= bitmask;
 | |
| 				if (node->rightnode == LEAF) {
 | |
| 					assert(node->right);
 | |
| 				} else if (node->right) {
 | |
| 					assert(node->rightnode == NODE);
 | |
| 					indent += 1;
 | |
| 					node = node->right;
 | |
| 					break;
 | |
| 				}
 | |
| 			}
 | |
| 			leftmask &= ~bitmask;
 | |
| 			rightmask &= ~bitmask;
 | |
| 			pathmask &= ~bitmask;
 | |
| 			pathbits &= ~bitmask;
 | |
| 			node = node->parent;
 | |
| 			indent -= 1;
 | |
| 		}
 | |
| 	}
 | |
| done:
 | |
| 	if (verbose > 0)
 | |
| 		printf("Found %d changes\n", changed);
 | |
| 	return changed;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Emit a trie for the given tree into the data array.
 | |
|  */
 | |
| static void emit(struct tree *tree, unsigned char *data)
 | |
| {
 | |
| 	struct node *node;
 | |
| 	unsigned int leftmask;
 | |
| 	unsigned int rightmask;
 | |
| 	unsigned int bitmask;
 | |
| 	int offlen;
 | |
| 	int offset;
 | |
| 	int index;
 | |
| 	int indent;
 | |
| 	int size;
 | |
| 	int bytes;
 | |
| 	int leaves;
 | |
| 	int nodes[4];
 | |
| 	unsigned char byte;
 | |
| 
 | |
| 	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
 | |
| 	leaves = 0;
 | |
| 	bytes = 0;
 | |
| 	index = tree->index;
 | |
| 	data += index;
 | |
| 	indent = 1;
 | |
| 	if (verbose > 0)
 | |
| 		printf("Emitting %s_%x\n", tree->type, tree->maxage);
 | |
| 	if (tree->childnode == LEAF) {
 | |
| 		assert(tree->root);
 | |
| 		tree->leaf_emit(tree->root, data);
 | |
| 		size = tree->leaf_size(tree->root);
 | |
| 		index += size;
 | |
| 		leaves++;
 | |
| 		goto done;
 | |
| 	}
 | |
| 
 | |
| 	assert(tree->childnode == NODE);
 | |
| 	node = tree->root;
 | |
| 	leftmask = rightmask = 0;
 | |
| 	while (node) {
 | |
| 		if (!node->mark)
 | |
| 			goto skip;
 | |
| 		assert(node->offset != -1);
 | |
| 		assert(node->index == index);
 | |
| 
 | |
| 		byte = 0;
 | |
| 		if (node->nextbyte)
 | |
| 			byte |= NEXTBYTE;
 | |
| 		byte |= (node->bitnum & BITNUM);
 | |
| 		if (node->left && node->right) {
 | |
| 			if (node->leftnode == NODE)
 | |
| 				byte |= LEFTNODE;
 | |
| 			if (node->rightnode == NODE)
 | |
| 				byte |= RIGHTNODE;
 | |
| 			if (node->offset <= 0xff)
 | |
| 				offlen = 1;
 | |
| 			else if (node->offset <= 0xffff)
 | |
| 				offlen = 2;
 | |
| 			else
 | |
| 				offlen = 3;
 | |
| 			nodes[offlen]++;
 | |
| 			offset = node->offset;
 | |
| 			byte |= offlen << OFFLEN_SHIFT;
 | |
| 			*data++ = byte;
 | |
| 			index++;
 | |
| 			while (offlen--) {
 | |
| 				*data++ = offset & 0xff;
 | |
| 				index++;
 | |
| 				offset >>= 8;
 | |
| 			}
 | |
| 		} else if (node->left) {
 | |
| 			if (node->leftnode == NODE)
 | |
| 				byte |= TRIENODE;
 | |
| 			nodes[0]++;
 | |
| 			*data++ = byte;
 | |
| 			index++;
 | |
| 		} else if (node->right) {
 | |
| 			byte |= RIGHTNODE;
 | |
| 			if (node->rightnode == NODE)
 | |
| 				byte |= TRIENODE;
 | |
| 			nodes[0]++;
 | |
| 			*data++ = byte;
 | |
| 			index++;
 | |
| 		} else {
 | |
| 			assert(0);
 | |
| 		}
 | |
| skip:
 | |
| 		while (node) {
 | |
| 			bitmask = 1 << node->bitnum;
 | |
| 			if (node->mark && (leftmask & bitmask) == 0) {
 | |
| 				leftmask |= bitmask;
 | |
| 				if (node->leftnode == LEAF) {
 | |
| 					assert(node->left);
 | |
| 					data = tree->leaf_emit(node->left,
 | |
| 							       data);
 | |
| 					size = tree->leaf_size(node->left);
 | |
| 					index += size;
 | |
| 					bytes += size;
 | |
| 					leaves++;
 | |
| 				} else if (node->left) {
 | |
| 					assert(node->leftnode == NODE);
 | |
| 					indent += 1;
 | |
| 					node = node->left;
 | |
| 					break;
 | |
| 				}
 | |
| 			}
 | |
| 			if (node->mark && (rightmask & bitmask) == 0) {
 | |
| 				rightmask |= bitmask;
 | |
| 				if (node->rightnode == LEAF) {
 | |
| 					assert(node->right);
 | |
| 					data = tree->leaf_emit(node->right,
 | |
| 							       data);
 | |
| 					size = tree->leaf_size(node->right);
 | |
| 					index += size;
 | |
| 					bytes += size;
 | |
| 					leaves++;
 | |
| 				} else if (node->right) {
 | |
| 					assert(node->rightnode == NODE);
 | |
| 					indent += 1;
 | |
| 					node = node->right;
 | |
| 					break;
 | |
| 				}
 | |
| 			}
 | |
| 			leftmask &= ~bitmask;
 | |
| 			rightmask &= ~bitmask;
 | |
| 			node = node->parent;
 | |
| 			indent -= 1;
 | |
| 		}
 | |
| 	}
 | |
| done:
 | |
| 	if (verbose > 0) {
 | |
| 		printf("Emitted %d (%d) leaves",
 | |
| 			leaves, bytes);
 | |
| 		printf(" %d (%d+%d+%d+%d) nodes",
 | |
| 			nodes[0] + nodes[1] + nodes[2] + nodes[3],
 | |
| 			nodes[0], nodes[1], nodes[2], nodes[3]);
 | |
| 		printf(" %d total\n", index - tree->index);
 | |
| 	}
 | |
| }
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| /*
 | |
|  * Unicode data.
 | |
|  *
 | |
|  * We need to keep track of the Canonical Combining Class, the Age,
 | |
|  * and decompositions for a code point.
 | |
|  *
 | |
|  * For the Age, we store the index into the ages table.  Effectively
 | |
|  * this is a generation number that the table maps to a unicode
 | |
|  * version.
 | |
|  *
 | |
|  * The correction field is used to indicate that this entry is in the
 | |
|  * corrections array, which contains decompositions that were
 | |
|  * corrected in later revisions.  The value of the correction field is
 | |
|  * the Unicode version in which the mapping was corrected.
 | |
|  *
 | |
|  * The utf32* fields are zero-terminated arrays of code points; the
 | |
|  * utf8* fields are the NUL-terminated UTF-8 strings that
 | |
|  * utf8_create() derives from them.
 | |
|  */
 | |
| struct unicode_data {
 | |
| 	unsigned int code;		/* the code point described */
 | |
| 	int ccc;			/* Canonical Combining Class */
 | |
| 	int gen;			/* index into ages[]; negative: not in tries */
 | |
| 	int correction;			/* version of the correction, see above */
 | |
| 	unsigned int *utf32nfdi;	/* nfdi decomposition, or NULL */
 | |
| 	unsigned int *utf32nfdicf;	/* nfdicf decomposition, or NULL */
 | |
| 	char *utf8nfdi;			/* UTF-8 nfdi string, or NULL */
 | |
| 	char *utf8nfdicf;		/* set only when it differs from utf8nfdi */
 | |
| };
 | |
| 
 | |
| struct unicode_data unicode_data[0x110000];	/* indexed by code point */
 | |
| struct unicode_data *corrections;	/* corrections_count entries */
 | |
| int    corrections_count;
 | |
| 
 | |
| struct tree *nfdi_tree;		/* set by trees_reduce() to the last tree */
 | |
| struct tree *nfdicf_tree;	/* set by trees_reduce() to the one before it */
 | |
| 
 | |
| struct tree *trees;		/* trees_count entries, see trees_init() */
 | |
| int          trees_count;
 | |
| 
 | |
| /*
 | |
|  * Check the corrections array to see if this entry was corrected at
 | |
|  * some point.
 | |
|  */
 | |
| static struct unicode_data *corrections_lookup(struct unicode_data *u)
 | |
| {
 | |
| 	int i;
 | |
| 
 | |
| 	for (i = 0; i != corrections_count; i++)
 | |
| 		if (u->code == corrections[i].code)
 | |
| 			return &corrections[i];
 | |
| 	return u;
 | |
| }
 | |
| 
 | |
| static int nfdi_equal(void *l, void *r)
 | |
| {
 | |
| 	struct unicode_data *left = l;
 | |
| 	struct unicode_data *right = r;
 | |
| 
 | |
| 	if (left->gen != right->gen)
 | |
| 		return 0;
 | |
| 	if (left->ccc != right->ccc)
 | |
| 		return 0;
 | |
| 	if (left->utf8nfdi && right->utf8nfdi &&
 | |
| 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
 | |
| 		return 1;
 | |
| 	if (left->utf8nfdi || right->utf8nfdi)
 | |
| 		return 0;
 | |
| 	return 1;
 | |
| }
 | |
| 
 | |
| static int nfdicf_equal(void *l, void *r)
 | |
| {
 | |
| 	struct unicode_data *left = l;
 | |
| 	struct unicode_data *right = r;
 | |
| 
 | |
| 	if (left->gen != right->gen)
 | |
| 		return 0;
 | |
| 	if (left->ccc != right->ccc)
 | |
| 		return 0;
 | |
| 	if (left->utf8nfdicf && right->utf8nfdicf &&
 | |
| 	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
 | |
| 		return 1;
 | |
| 	if (left->utf8nfdicf && right->utf8nfdicf)
 | |
| 		return 0;
 | |
| 	if (left->utf8nfdicf || right->utf8nfdicf)
 | |
| 		return 0;
 | |
| 	if (left->utf8nfdi && right->utf8nfdi &&
 | |
| 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
 | |
| 		return 1;
 | |
| 	if (left->utf8nfdi || right->utf8nfdi)
 | |
| 		return 0;
 | |
| 	return 1;
 | |
| }
 | |
| 
 | |
| static void nfdi_print(void *l, int indent)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 
 | |
| 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
 | |
| 		leaf->code, leaf->ccc, leaf->gen);
 | |
| 
 | |
| 	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
 | |
| 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
 | |
| 	else if (leaf->utf8nfdi)
 | |
| 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
 | |
| 
 | |
| 	printf("\n");
 | |
| }
 | |
| 
 | |
| static void nfdicf_print(void *l, int indent)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 
 | |
| 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
 | |
| 		leaf->code, leaf->ccc, leaf->gen);
 | |
| 
 | |
| 	if (leaf->utf8nfdicf)
 | |
| 		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
 | |
| 	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
 | |
| 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
 | |
| 	else if (leaf->utf8nfdi)
 | |
| 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
 | |
| 	printf("\n");
 | |
| }
 | |
| 
 | |
/*
 * leaf_mark callback for the final nfdi tree: every leaf is marked,
 * so the leaf itself is never inspected.
 */
static int nfdi_mark(void *l)
{
	(void)l;
	return 1;
}
 | |
| 
 | |
| static int nfdicf_mark(void *l)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 
 | |
| 	if (leaf->utf8nfdicf)
 | |
| 		return 1;
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| static int correction_mark(void *l)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 
 | |
| 	return leaf->correction;
 | |
| }
 | |
| 
 | |
| static int nfdi_size(void *l)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 	int size = 2;
 | |
| 
 | |
| 	if (HANGUL_SYLLABLE(leaf->code))
 | |
| 		size += 1;
 | |
| 	else if (leaf->utf8nfdi)
 | |
| 		size += strlen(leaf->utf8nfdi) + 1;
 | |
| 	return size;
 | |
| }
 | |
| 
 | |
| static int nfdicf_size(void *l)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 	int size = 2;
 | |
| 
 | |
| 	if (HANGUL_SYLLABLE(leaf->code))
 | |
| 		size += 1;
 | |
| 	else if (leaf->utf8nfdicf)
 | |
| 		size += strlen(leaf->utf8nfdicf) + 1;
 | |
| 	else if (leaf->utf8nfdi)
 | |
| 		size += strlen(leaf->utf8nfdi) + 1;
 | |
| 	return size;
 | |
| }
 | |
| 
 | |
| static int *nfdi_index(struct tree *tree, void *l)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 
 | |
| 	return &tree->leafindex[leaf->code];
 | |
| }
 | |
| 
 | |
| static int *nfdicf_index(struct tree *tree, void *l)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 
 | |
| 	return &tree->leafindex[leaf->code];
 | |
| }
 | |
| 
 | |
| static unsigned char *nfdi_emit(void *l, unsigned char *data)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 	unsigned char *s;
 | |
| 
 | |
| 	*data++ = leaf->gen;
 | |
| 
 | |
| 	if (HANGUL_SYLLABLE(leaf->code)) {
 | |
| 		*data++ = DECOMPOSE;
 | |
| 		*data++ = HANGUL;
 | |
| 	} else if (leaf->utf8nfdi) {
 | |
| 		*data++ = DECOMPOSE;
 | |
| 		s = (unsigned char*)leaf->utf8nfdi;
 | |
| 		while ((*data++ = *s++) != 0)
 | |
| 			;
 | |
| 	} else {
 | |
| 		*data++ = leaf->ccc;
 | |
| 	}
 | |
| 	return data;
 | |
| }
 | |
| 
 | |
| static unsigned char *nfdicf_emit(void *l, unsigned char *data)
 | |
| {
 | |
| 	struct unicode_data *leaf = l;
 | |
| 	unsigned char *s;
 | |
| 
 | |
| 	*data++ = leaf->gen;
 | |
| 
 | |
| 	if (HANGUL_SYLLABLE(leaf->code)) {
 | |
| 		*data++ = DECOMPOSE;
 | |
| 		*data++ = HANGUL;
 | |
| 	} else if (leaf->utf8nfdicf) {
 | |
| 		*data++ = DECOMPOSE;
 | |
| 		s = (unsigned char*)leaf->utf8nfdicf;
 | |
| 		while ((*data++ = *s++) != 0)
 | |
| 			;
 | |
| 	} else if (leaf->utf8nfdi) {
 | |
| 		*data++ = DECOMPOSE;
 | |
| 		s = (unsigned char*)leaf->utf8nfdi;
 | |
| 		while ((*data++ = *s++) != 0)
 | |
| 			;
 | |
| 	} else {
 | |
| 		*data++ = leaf->ccc;
 | |
| 	}
 | |
| 	return data;
 | |
| }
 | |
| 
 | |
| static void utf8_create(struct unicode_data *data)
 | |
| {
 | |
| 	char utf[18*4+1];
 | |
| 	char *u;
 | |
| 	unsigned int *um;
 | |
| 	int i;
 | |
| 
 | |
| 	if (data->utf8nfdi) {
 | |
| 		assert(data->utf8nfdi[0] == HANGUL);
 | |
| 		return;
 | |
| 	}
 | |
| 
 | |
| 	u = utf;
 | |
| 	um = data->utf32nfdi;
 | |
| 	if (um) {
 | |
| 		for (i = 0; um[i]; i++)
 | |
| 			u += utf8encode(u, um[i]);
 | |
| 		*u = '\0';
 | |
| 		data->utf8nfdi = strdup(utf);
 | |
| 	}
 | |
| 	u = utf;
 | |
| 	um = data->utf32nfdicf;
 | |
| 	if (um) {
 | |
| 		for (i = 0; um[i]; i++)
 | |
| 			u += utf8encode(u, um[i]);
 | |
| 		*u = '\0';
 | |
| 		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
 | |
| 			data->utf8nfdicf = strdup(utf);
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void utf8_init(void)
 | |
| {
 | |
| 	unsigned int unichar;
 | |
| 	int i;
 | |
| 
 | |
| 	for (unichar = 0; unichar != 0x110000; unichar++)
 | |
| 		utf8_create(&unicode_data[unichar]);
 | |
| 
 | |
| 	for (i = 0; i != corrections_count; i++)
 | |
| 		utf8_create(&corrections[i]);
 | |
| }
 | |
| 
 | |
| static void trees_init(void)
 | |
| {
 | |
| 	struct unicode_data *data;
 | |
| 	unsigned int maxage;
 | |
| 	unsigned int nextage;
 | |
| 	int count;
 | |
| 	int i;
 | |
| 	int j;
 | |
| 
 | |
| 	/* Count the number of different ages. */
 | |
| 	count = 0;
 | |
| 	nextage = (unsigned int)-1;
 | |
| 	do {
 | |
| 		maxage = nextage;
 | |
| 		nextage = 0;
 | |
| 		for (i = 0; i <= corrections_count; i++) {
 | |
| 			data = &corrections[i];
 | |
| 			if (nextage < data->correction &&
 | |
| 			    data->correction < maxage)
 | |
| 				nextage = data->correction;
 | |
| 		}
 | |
| 		count++;
 | |
| 	} while (nextage);
 | |
| 
 | |
| 	/* Two trees per age: nfdi and nfdicf */
 | |
| 	trees_count = count * 2;
 | |
| 	trees = calloc(trees_count, sizeof(struct tree));
 | |
| 
 | |
| 	/* Assign ages to the trees. */
 | |
| 	count = trees_count;
 | |
| 	nextage = (unsigned int)-1;
 | |
| 	do {
 | |
| 		maxage = nextage;
 | |
| 		trees[--count].maxage = maxage;
 | |
| 		trees[--count].maxage = maxage;
 | |
| 		nextage = 0;
 | |
| 		for (i = 0; i <= corrections_count; i++) {
 | |
| 			data = &corrections[i];
 | |
| 			if (nextage < data->correction &&
 | |
| 			    data->correction < maxage)
 | |
| 				nextage = data->correction;
 | |
| 		}
 | |
| 	} while (nextage);
 | |
| 
 | |
| 	/* The ages assigned above are off by one. */
 | |
| 	for (i = 0; i != trees_count; i++) {
 | |
| 		j = 0;
 | |
| 		while (ages[j] < trees[i].maxage)
 | |
| 			j++;
 | |
| 		trees[i].maxage = ages[j-1];
 | |
| 	}
 | |
| 
 | |
| 	/* Set up the forwarding between trees. */
 | |
| 	trees[trees_count-2].next = &trees[trees_count-1];
 | |
| 	trees[trees_count-1].leaf_mark = nfdi_mark;
 | |
| 	trees[trees_count-2].leaf_mark = nfdicf_mark;
 | |
| 	for (i = 0; i != trees_count-2; i += 2) {
 | |
| 		trees[i].next = &trees[trees_count-2];
 | |
| 		trees[i].leaf_mark = correction_mark;
 | |
| 		trees[i+1].next = &trees[trees_count-1];
 | |
| 		trees[i+1].leaf_mark = correction_mark;
 | |
| 	}
 | |
| 
 | |
| 	/* Assign the callouts. */
 | |
| 	for (i = 0; i != trees_count; i += 2) {
 | |
| 		trees[i].type = "nfdicf";
 | |
| 		trees[i].leaf_equal = nfdicf_equal;
 | |
| 		trees[i].leaf_print = nfdicf_print;
 | |
| 		trees[i].leaf_size = nfdicf_size;
 | |
| 		trees[i].leaf_index = nfdicf_index;
 | |
| 		trees[i].leaf_emit = nfdicf_emit;
 | |
| 
 | |
| 		trees[i+1].type = "nfdi";
 | |
| 		trees[i+1].leaf_equal = nfdi_equal;
 | |
| 		trees[i+1].leaf_print = nfdi_print;
 | |
| 		trees[i+1].leaf_size = nfdi_size;
 | |
| 		trees[i+1].leaf_index = nfdi_index;
 | |
| 		trees[i+1].leaf_emit = nfdi_emit;
 | |
| 	}
 | |
| 
 | |
| 	/* Finish init. */
 | |
| 	for (i = 0; i != trees_count; i++)
 | |
| 		trees[i].childnode = NODE;
 | |
| }
 | |
| 
 | |
| static void trees_populate(void)
 | |
| {
 | |
| 	struct unicode_data *data;
 | |
| 	unsigned int unichar;
 | |
| 	char keyval[4];
 | |
| 	int keylen;
 | |
| 	int i;
 | |
| 
 | |
| 	for (i = 0; i != trees_count; i++) {
 | |
| 		if (verbose > 0) {
 | |
| 			printf("Populating %s_%x\n",
 | |
| 				trees[i].type, trees[i].maxage);
 | |
| 		}
 | |
| 		for (unichar = 0; unichar != 0x110000; unichar++) {
 | |
| 			if (unicode_data[unichar].gen < 0)
 | |
| 				continue;
 | |
| 			keylen = utf8encode(keyval, unichar);
 | |
| 			data = corrections_lookup(&unicode_data[unichar]);
 | |
| 			if (data->correction <= trees[i].maxage)
 | |
| 				data = &unicode_data[unichar];
 | |
| 			insert(&trees[i], keyval, keylen, data);
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void trees_reduce(void)
 | |
| {
 | |
| 	int i;
 | |
| 	int size;
 | |
| 	int changed;
 | |
| 
 | |
| 	for (i = 0; i != trees_count; i++)
 | |
| 		prune(&trees[i]);
 | |
| 	for (i = 0; i != trees_count; i++)
 | |
| 		mark_nodes(&trees[i]);
 | |
| 	do {
 | |
| 		size = 0;
 | |
| 		for (i = 0; i != trees_count; i++)
 | |
| 			size = index_nodes(&trees[i], size);
 | |
| 		changed = 0;
 | |
| 		for (i = 0; i != trees_count; i++)
 | |
| 			changed += size_nodes(&trees[i]);
 | |
| 	} while (changed);
 | |
| 
 | |
| 	utf8data = calloc(size, 1);
 | |
| 	utf8data_size = size;
 | |
| 	for (i = 0; i != trees_count; i++)
 | |
| 		emit(&trees[i], utf8data);
 | |
| 
 | |
| 	if (verbose > 0) {
 | |
| 		for (i = 0; i != trees_count; i++) {
 | |
| 			printf("%s_%x idx %d\n",
 | |
| 				trees[i].type, trees[i].maxage, trees[i].index);
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	nfdi = utf8data + trees[trees_count-1].index;
 | |
| 	nfdicf = utf8data + trees[trees_count-2].index;
 | |
| 
 | |
| 	nfdi_tree = &trees[trees_count-1];
 | |
| 	nfdicf_tree = &trees[trees_count-2];
 | |
| }
 | |
| 
 | |
| static void verify(struct tree *tree)
 | |
| {
 | |
| 	struct unicode_data *data;
 | |
| 	utf8leaf_t	*leaf;
 | |
| 	unsigned int	unichar;
 | |
| 	char		key[4];
 | |
| 	unsigned char	hangul[UTF8HANGULLEAF];
 | |
| 	int		report;
 | |
| 	int		nocf;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Verifying %s_%x\n", tree->type, tree->maxage);
 | |
| 	nocf = strcmp(tree->type, "nfdicf");
 | |
| 
 | |
| 	for (unichar = 0; unichar != 0x110000; unichar++) {
 | |
| 		report = 0;
 | |
| 		data = corrections_lookup(&unicode_data[unichar]);
 | |
| 		if (data->correction <= tree->maxage)
 | |
| 			data = &unicode_data[unichar];
 | |
| 		utf8encode(key,unichar);
 | |
| 		leaf = utf8lookup(tree, hangul, key);
 | |
| 
 | |
| 		if (!leaf) {
 | |
| 			if (data->gen != -1)
 | |
| 				report++;
 | |
| 			if (unichar < 0xd800 || unichar > 0xdfff)
 | |
| 				report++;
 | |
| 		} else {
 | |
| 			if (unichar >= 0xd800 && unichar <= 0xdfff)
 | |
| 				report++;
 | |
| 			if (data->gen == -1)
 | |
| 				report++;
 | |
| 			if (data->gen != LEAF_GEN(leaf))
 | |
| 				report++;
 | |
| 			if (LEAF_CCC(leaf) == DECOMPOSE) {
 | |
| 				if (HANGUL_SYLLABLE(data->code)) {
 | |
| 					if (data->utf8nfdi[0] != HANGUL)
 | |
| 						report++;
 | |
| 				} else if (nocf) {
 | |
| 					if (!data->utf8nfdi) {
 | |
| 						report++;
 | |
| 					} else if (strcmp(data->utf8nfdi,
 | |
| 							  LEAF_STR(leaf))) {
 | |
| 						report++;
 | |
| 					}
 | |
| 				} else {
 | |
| 					if (!data->utf8nfdicf &&
 | |
| 					    !data->utf8nfdi) {
 | |
| 						report++;
 | |
| 					} else if (data->utf8nfdicf) {
 | |
| 						if (strcmp(data->utf8nfdicf,
 | |
| 							   LEAF_STR(leaf)))
 | |
| 							report++;
 | |
| 					} else if (strcmp(data->utf8nfdi,
 | |
| 							  LEAF_STR(leaf))) {
 | |
| 						report++;
 | |
| 					}
 | |
| 				}
 | |
| 			} else if (data->ccc != LEAF_CCC(leaf)) {
 | |
| 				report++;
 | |
| 			}
 | |
| 		}
 | |
| 		if (report) {
 | |
| 			printf("%X code %X gen %d ccc %d"
 | |
| 				" nfdi -> \"%s\"",
 | |
| 				unichar, data->code, data->gen,
 | |
| 				data->ccc,
 | |
| 				data->utf8nfdi);
 | |
| 			if (leaf) {
 | |
| 				printf(" gen %d ccc %d"
 | |
| 					" nfdi -> \"%s\"",
 | |
| 					LEAF_GEN(leaf),
 | |
| 					LEAF_CCC(leaf),
 | |
| 					LEAF_CCC(leaf) == DECOMPOSE ?
 | |
| 						LEAF_STR(leaf) : "");
 | |
| 			}
 | |
| 			printf("\n");
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void trees_verify(void)
 | |
| {
 | |
| 	int i;
 | |
| 
 | |
| 	for (i = 0; i != trees_count; i++)
 | |
| 		verify(&trees[i]);
 | |
| }
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| static void help(void)
 | |
| {
 | |
| 	printf("Usage: %s [options]\n", argv0);
 | |
| 	printf("\n");
 | |
| 	printf("This program creates an a data trie used for parsing and\n");
 | |
| 	printf("normalization of UTF-8 strings. The trie is derived from\n");
 | |
| 	printf("a set of input files from the Unicode character database\n");
 | |
| 	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
 | |
| 	printf("\n");
 | |
| 	printf("The generated tree supports two normalization forms:\n");
 | |
| 	printf("\n");
 | |
| 	printf("\tnfdi:\n");
 | |
| 	printf("\t- Apply unicode normalization form NFD.\n");
 | |
| 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
 | |
| 	printf("\n");
 | |
| 	printf("\tnfdicf:\n");
 | |
| 	printf("\t- Apply unicode normalization form NFD.\n");
 | |
| 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
 | |
| 	printf("\t- Apply a full casefold (C + F).\n");
 | |
| 	printf("\n");
 | |
| 	printf("These forms were chosen as being most useful when dealing\n");
 | |
| 	printf("with file names: NFD catches most cases where characters\n");
 | |
| 	printf("should be considered equivalent. The ignorables are mostly\n");
 | |
| 	printf("invisible, making names hard to type.\n");
 | |
| 	printf("\n");
 | |
| 	printf("The options to specify the files to be used are listed\n");
 | |
| 	printf("below with their default values, which are the names used\n");
 | |
| 	printf("by version 11.0.0 of the Unicode Character Database.\n");
 | |
| 	printf("\n");
 | |
| 	printf("The input files:\n");
 | |
| 	printf("\t-a %s\n", AGE_NAME);
 | |
| 	printf("\t-c %s\n", CCC_NAME);
 | |
| 	printf("\t-p %s\n", PROP_NAME);
 | |
| 	printf("\t-d %s\n", DATA_NAME);
 | |
| 	printf("\t-f %s\n", FOLD_NAME);
 | |
| 	printf("\t-n %s\n", NORM_NAME);
 | |
| 	printf("\n");
 | |
| 	printf("Additionally, the generated tables are tested using:\n");
 | |
| 	printf("\t-t %s\n", TEST_NAME);
 | |
| 	printf("\n");
 | |
| 	printf("Finally, the output file:\n");
 | |
| 	printf("\t-o %s\n", UTF8_NAME);
 | |
| 	printf("\n");
 | |
| }
 | |
| 
 | |
/*
 * Print the help text, then terminate with exit status 1.
 * Does not return.
 */
static void usage(void)
{
	help();

	exit(1);
}
 | |
| 
 | |
/*
 * Report that @name could not be opened, giving the errno value
 * @error and its strerror() text, then exit with status 1.
 */
static void open_fail(const char *name, int error)
{
	const char *reason = strerror(error);

	printf("Error %d opening %s: %s\n", error, name, reason);
	exit(1);
}
 | |
| 
 | |
/*
 * Report that @filename as a whole could not be parsed, then exit
 * with status 1.
 */
static void file_fail(const char *filename)
{
	printf("Error parsing %s\n", filename);

	exit(1);
}
 | |
| 
 | |
/*
 * Report a parse error in @filename, echoing the offending @line,
 * then exit with status 1.
 */
static void line_fail(const char *filename, const char *line)
{
	printf("Error parsing %s:%s\n", filename, line);

	exit(1);
}
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
/*
 * Print each code point of a zero-terminated UTF-32 string as
 * " %X" (a space followed by upper-case hex), with no newline.
 */
static void print_utf32(unsigned int *utf32str)
{
	const unsigned int *cp;

	for (cp = utf32str; *cp; cp++)
		printf(" %X", *cp);
}
 | |
| 
 | |
| static void print_utf32nfdi(unsigned int unichar)
 | |
| {
 | |
| 	printf(" %X ->", unichar);
 | |
| 	print_utf32(unicode_data[unichar].utf32nfdi);
 | |
| 	printf("\n");
 | |
| }
 | |
| 
 | |
| static void print_utf32nfdicf(unsigned int unichar)
 | |
| {
 | |
| 	printf(" %X ->", unichar);
 | |
| 	print_utf32(unicode_data[unichar].utf32nfdicf);
 | |
| 	printf("\n");
 | |
| }
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| static void age_init(void)
 | |
| {
 | |
| 	FILE *file;
 | |
| 	unsigned int first;
 | |
| 	unsigned int last;
 | |
| 	unsigned int unichar;
 | |
| 	unsigned int major;
 | |
| 	unsigned int minor;
 | |
| 	unsigned int revision;
 | |
| 	int gen;
 | |
| 	int count;
 | |
| 	int ret;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Parsing %s\n", age_name);
 | |
| 
 | |
| 	file = fopen(age_name, "r");
 | |
| 	if (!file)
 | |
| 		open_fail(age_name, errno);
 | |
| 	count = 0;
 | |
| 
 | |
| 	gen = 0;
 | |
| 	while (fgets(line, LINESIZE, file)) {
 | |
| 		ret = sscanf(line, "# Age=V%d_%d_%d",
 | |
| 				&major, &minor, &revision);
 | |
| 		if (ret == 3) {
 | |
| 			ages_count++;
 | |
| 			if (verbose > 1)
 | |
| 				printf(" Age V%d_%d_%d\n",
 | |
| 					major, minor, revision);
 | |
| 			if (!age_valid(major, minor, revision))
 | |
| 				line_fail(age_name, line);
 | |
| 			continue;
 | |
| 		}
 | |
| 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
 | |
| 		if (ret == 2) {
 | |
| 			ages_count++;
 | |
| 			if (verbose > 1)
 | |
| 				printf(" Age V%d_%d\n", major, minor);
 | |
| 			if (!age_valid(major, minor, 0))
 | |
| 				line_fail(age_name, line);
 | |
| 			continue;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	/* We must have found something above. */
 | |
| 	if (verbose > 1)
 | |
| 		printf("%d age entries\n", ages_count);
 | |
| 	if (ages_count == 0 || ages_count > MAXGEN)
 | |
| 		file_fail(age_name);
 | |
| 
 | |
| 	/* There is a 0 entry. */
 | |
| 	ages_count++;
 | |
| 	ages = calloc(ages_count + 1, sizeof(*ages));
 | |
| 	/* And a guard entry. */
 | |
| 	ages[ages_count] = (unsigned int)-1;
 | |
| 
 | |
| 	rewind(file);
 | |
| 	count = 0;
 | |
| 	gen = 0;
 | |
| 	while (fgets(line, LINESIZE, file)) {
 | |
| 		ret = sscanf(line, "# Age=V%d_%d_%d",
 | |
| 				&major, &minor, &revision);
 | |
| 		if (ret == 3) {
 | |
| 			ages[++gen] =
 | |
| 				UNICODE_AGE(major, minor, revision);
 | |
| 			if (verbose > 1)
 | |
| 				printf(" Age V%d_%d_%d = gen %d\n",
 | |
| 					major, minor, revision, gen);
 | |
| 			if (!age_valid(major, minor, revision))
 | |
| 				line_fail(age_name, line);
 | |
| 			continue;
 | |
| 		}
 | |
| 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
 | |
| 		if (ret == 2) {
 | |
| 			ages[++gen] = UNICODE_AGE(major, minor, 0);
 | |
| 			if (verbose > 1)
 | |
| 				printf(" Age V%d_%d = %d\n",
 | |
| 					major, minor, gen);
 | |
| 			if (!age_valid(major, minor, 0))
 | |
| 				line_fail(age_name, line);
 | |
| 			continue;
 | |
| 		}
 | |
| 		ret = sscanf(line, "%X..%X ; %d.%d #",
 | |
| 			     &first, &last, &major, &minor);
 | |
| 		if (ret == 4) {
 | |
| 			for (unichar = first; unichar <= last; unichar++)
 | |
| 				unicode_data[unichar].gen = gen;
 | |
| 			count += 1 + last - first;
 | |
| 			if (verbose > 1)
 | |
| 				printf("  %X..%X gen %d\n", first, last, gen);
 | |
| 			if (!utf32valid(first) || !utf32valid(last))
 | |
| 				line_fail(age_name, line);
 | |
| 			continue;
 | |
| 		}
 | |
| 		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
 | |
| 		if (ret == 3) {
 | |
| 			unicode_data[unichar].gen = gen;
 | |
| 			count++;
 | |
| 			if (verbose > 1)
 | |
| 				printf("  %X gen %d\n", unichar, gen);
 | |
| 			if (!utf32valid(unichar))
 | |
| 				line_fail(age_name, line);
 | |
| 			continue;
 | |
| 		}
 | |
| 	}
 | |
| 	unicode_maxage = ages[gen];
 | |
| 	fclose(file);
 | |
| 
 | |
| 	/* Nix surrogate block */
 | |
| 	if (verbose > 1)
 | |
| 		printf(" Removing surrogate block D800..DFFF\n");
 | |
| 	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
 | |
| 		unicode_data[unichar].gen = -1;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 	        printf("Found %d entries\n", count);
 | |
| 	if (count == 0)
 | |
| 		file_fail(age_name);
 | |
| }
 | |
| 
 | |
| static void ccc_init(void)
 | |
| {
 | |
| 	FILE *file;
 | |
| 	unsigned int first;
 | |
| 	unsigned int last;
 | |
| 	unsigned int unichar;
 | |
| 	unsigned int value;
 | |
| 	int count;
 | |
| 	int ret;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Parsing %s\n", ccc_name);
 | |
| 
 | |
| 	file = fopen(ccc_name, "r");
 | |
| 	if (!file)
 | |
| 		open_fail(ccc_name, errno);
 | |
| 
 | |
| 	count = 0;
 | |
| 	while (fgets(line, LINESIZE, file)) {
 | |
| 		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
 | |
| 		if (ret == 3) {
 | |
| 			for (unichar = first; unichar <= last; unichar++) {
 | |
| 				unicode_data[unichar].ccc = value;
 | |
|                                 count++;
 | |
| 			}
 | |
| 			if (verbose > 1)
 | |
| 				printf(" %X..%X ccc %d\n", first, last, value);
 | |
| 			if (!utf32valid(first) || !utf32valid(last))
 | |
| 				line_fail(ccc_name, line);
 | |
| 			continue;
 | |
| 		}
 | |
| 		ret = sscanf(line, "%X ; %d #", &unichar, &value);
 | |
| 		if (ret == 2) {
 | |
| 			unicode_data[unichar].ccc = value;
 | |
|                         count++;
 | |
| 			if (verbose > 1)
 | |
| 				printf(" %X ccc %d\n", unichar, value);
 | |
| 			if (!utf32valid(unichar))
 | |
| 				line_fail(ccc_name, line);
 | |
| 			continue;
 | |
| 		}
 | |
| 	}
 | |
| 	fclose(file);
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Found %d entries\n", count);
 | |
| 	if (count == 0)
 | |
| 		file_fail(ccc_name);
 | |
| }
 | |
| 
 | |
/*
 * Tell whether a decomposition tag names a compatibility form that is
 * to be skipped.
 *
 *   type : tag text without the surrounding '<' and '>'.
 *
 * Returns 1 if type is one of the listed tags, 0 otherwise.
 */
static int ignore_compatibility_form(char *type)
{
	/* NULL-terminated so the walk below needs no element count. */
	static const char *const skipped[] = {
		"font", "noBreak", "initial", "medial",
		"final", "isolated", "circle", "super",
		"sub", "vertical", "wide", "narrow",
		"small", "square", "fraction", "compat",
		NULL
	};
	const char *const *entry;

	for (entry = skipped; *entry; entry++) {
		if (!strcmp(type, *entry))
			return 1;
	}
	return 0;
}
 | |
| 
 | |
| static void nfdi_init(void)
 | |
| {
 | |
| 	FILE *file;
 | |
| 	unsigned int unichar;
 | |
| 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
 | |
| 	char *s;
 | |
| 	char *type;
 | |
| 	unsigned int *um;
 | |
| 	int count;
 | |
| 	int i;
 | |
| 	int ret;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Parsing %s\n", data_name);
 | |
| 	file = fopen(data_name, "r");
 | |
| 	if (!file)
 | |
| 		open_fail(data_name, errno);
 | |
| 
 | |
| 	count = 0;
 | |
| 	while (fgets(line, LINESIZE, file)) {
 | |
| 		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
 | |
| 			     &unichar, buf0);
 | |
| 		if (ret != 2)
 | |
| 			continue;
 | |
| 		if (!utf32valid(unichar))
 | |
| 			line_fail(data_name, line);
 | |
| 
 | |
| 		s = buf0;
 | |
| 		/* skip over <tag> */
 | |
| 		if (*s == '<') {
 | |
| 			type = ++s;
 | |
| 			while (*++s != '>');
 | |
| 			*s++ = '\0';
 | |
| 			if(ignore_compatibility_form(type))
 | |
| 				continue;
 | |
| 		}
 | |
| 		/* decode the decomposition into UTF-32 */
 | |
| 		i = 0;
 | |
| 		while (*s) {
 | |
| 			mapping[i] = strtoul(s, &s, 16);
 | |
| 			if (!utf32valid(mapping[i]))
 | |
| 				line_fail(data_name, line);
 | |
| 			i++;
 | |
| 		}
 | |
| 		mapping[i++] = 0;
 | |
| 
 | |
| 		um = malloc(i * sizeof(unsigned int));
 | |
| 		memcpy(um, mapping, i * sizeof(unsigned int));
 | |
| 		unicode_data[unichar].utf32nfdi = um;
 | |
| 
 | |
| 		if (verbose > 1)
 | |
| 			print_utf32nfdi(unichar);
 | |
| 		count++;
 | |
| 	}
 | |
| 	fclose(file);
 | |
| 	if (verbose > 0)
 | |
| 		printf("Found %d entries\n", count);
 | |
| 	if (count == 0)
 | |
| 		file_fail(data_name);
 | |
| }
 | |
| 
 | |
| static void nfdicf_init(void)
 | |
| {
 | |
| 	FILE *file;
 | |
| 	unsigned int unichar;
 | |
| 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
 | |
| 	char status;
 | |
| 	char *s;
 | |
| 	unsigned int *um;
 | |
| 	int i;
 | |
| 	int count;
 | |
| 	int ret;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Parsing %s\n", fold_name);
 | |
| 	file = fopen(fold_name, "r");
 | |
| 	if (!file)
 | |
| 		open_fail(fold_name, errno);
 | |
| 
 | |
| 	count = 0;
 | |
| 	while (fgets(line, LINESIZE, file)) {
 | |
| 		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
 | |
| 		if (ret != 3)
 | |
| 			continue;
 | |
| 		if (!utf32valid(unichar))
 | |
| 			line_fail(fold_name, line);
 | |
| 		/* Use the C+F casefold. */
 | |
| 		if (status != 'C' && status != 'F')
 | |
| 			continue;
 | |
| 		s = buf0;
 | |
| 		if (*s == '<')
 | |
| 			while (*s++ != ' ')
 | |
| 				;
 | |
| 		i = 0;
 | |
| 		while (*s) {
 | |
| 			mapping[i] = strtoul(s, &s, 16);
 | |
| 			if (!utf32valid(mapping[i]))
 | |
| 				line_fail(fold_name, line);
 | |
| 			i++;
 | |
| 		}
 | |
| 		mapping[i++] = 0;
 | |
| 
 | |
| 		um = malloc(i * sizeof(unsigned int));
 | |
| 		memcpy(um, mapping, i * sizeof(unsigned int));
 | |
| 		unicode_data[unichar].utf32nfdicf = um;
 | |
| 
 | |
| 		if (verbose > 1)
 | |
| 			print_utf32nfdicf(unichar);
 | |
| 		count++;
 | |
| 	}
 | |
| 	fclose(file);
 | |
| 	if (verbose > 0)
 | |
| 		printf("Found %d entries\n", count);
 | |
| 	if (count == 0)
 | |
| 		file_fail(fold_name);
 | |
| }
 | |
| 
 | |
| static void ignore_init(void)
 | |
| {
 | |
| 	FILE *file;
 | |
| 	unsigned int unichar;
 | |
| 	unsigned int first;
 | |
| 	unsigned int last;
 | |
| 	unsigned int *um;
 | |
| 	int count;
 | |
| 	int ret;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Parsing %s\n", prop_name);
 | |
| 	file = fopen(prop_name, "r");
 | |
| 	if (!file)
 | |
| 		open_fail(prop_name, errno);
 | |
| 	assert(file);
 | |
| 	count = 0;
 | |
| 	while (fgets(line, LINESIZE, file)) {
 | |
| 		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
 | |
| 		if (ret == 3) {
 | |
| 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
 | |
| 				continue;
 | |
| 			if (!utf32valid(first) || !utf32valid(last))
 | |
| 				line_fail(prop_name, line);
 | |
| 			for (unichar = first; unichar <= last; unichar++) {
 | |
| 				free(unicode_data[unichar].utf32nfdi);
 | |
| 				um = malloc(sizeof(unsigned int));
 | |
| 				*um = 0;
 | |
| 				unicode_data[unichar].utf32nfdi = um;
 | |
| 				free(unicode_data[unichar].utf32nfdicf);
 | |
| 				um = malloc(sizeof(unsigned int));
 | |
| 				*um = 0;
 | |
| 				unicode_data[unichar].utf32nfdicf = um;
 | |
| 				count++;
 | |
| 			}
 | |
| 			if (verbose > 1)
 | |
| 				printf(" %X..%X Default_Ignorable_Code_Point\n",
 | |
| 					first, last);
 | |
| 			continue;
 | |
| 		}
 | |
| 		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
 | |
| 		if (ret == 2) {
 | |
| 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
 | |
| 				continue;
 | |
| 			if (!utf32valid(unichar))
 | |
| 				line_fail(prop_name, line);
 | |
| 			free(unicode_data[unichar].utf32nfdi);
 | |
| 			um = malloc(sizeof(unsigned int));
 | |
| 			*um = 0;
 | |
| 			unicode_data[unichar].utf32nfdi = um;
 | |
| 			free(unicode_data[unichar].utf32nfdicf);
 | |
| 			um = malloc(sizeof(unsigned int));
 | |
| 			*um = 0;
 | |
| 			unicode_data[unichar].utf32nfdicf = um;
 | |
| 			if (verbose > 1)
 | |
| 				printf(" %X Default_Ignorable_Code_Point\n",
 | |
| 					unichar);
 | |
| 			count++;
 | |
| 			continue;
 | |
| 		}
 | |
| 	}
 | |
| 	fclose(file);
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Found %d entries\n", count);
 | |
| 	if (count == 0)
 | |
| 		file_fail(prop_name);
 | |
| }
 | |
| 
 | |
| static void corrections_init(void)
 | |
| {
 | |
| 	FILE *file;
 | |
| 	unsigned int unichar;
 | |
| 	unsigned int major;
 | |
| 	unsigned int minor;
 | |
| 	unsigned int revision;
 | |
| 	unsigned int age;
 | |
| 	unsigned int *um;
 | |
| 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
 | |
| 	char *s;
 | |
| 	int i;
 | |
| 	int count;
 | |
| 	int ret;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Parsing %s\n", norm_name);
 | |
| 	file = fopen(norm_name, "r");
 | |
| 	if (!file)
 | |
| 		open_fail(norm_name, errno);
 | |
| 
 | |
| 	count = 0;
 | |
| 	while (fgets(line, LINESIZE, file)) {
 | |
| 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
 | |
| 				&unichar, buf0, buf1,
 | |
| 				&major, &minor, &revision);
 | |
| 		if (ret != 6)
 | |
| 			continue;
 | |
| 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
 | |
| 			line_fail(norm_name, line);
 | |
| 		count++;
 | |
| 	}
 | |
| 	corrections = calloc(count, sizeof(struct unicode_data));
 | |
| 	corrections_count = count;
 | |
| 	rewind(file);
 | |
| 
 | |
| 	count = 0;
 | |
| 	while (fgets(line, LINESIZE, file)) {
 | |
| 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
 | |
| 				&unichar, buf0, buf1,
 | |
| 				&major, &minor, &revision);
 | |
| 		if (ret != 6)
 | |
| 			continue;
 | |
| 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
 | |
| 			line_fail(norm_name, line);
 | |
| 		corrections[count] = unicode_data[unichar];
 | |
| 		assert(corrections[count].code == unichar);
 | |
| 		age = UNICODE_AGE(major, minor, revision);
 | |
| 		corrections[count].correction = age;
 | |
| 
 | |
| 		i = 0;
 | |
| 		s = buf0;
 | |
| 		while (*s) {
 | |
| 			mapping[i] = strtoul(s, &s, 16);
 | |
| 			if (!utf32valid(mapping[i]))
 | |
| 				line_fail(norm_name, line);
 | |
| 			i++;
 | |
| 		}
 | |
| 		mapping[i++] = 0;
 | |
| 
 | |
| 		um = malloc(i * sizeof(unsigned int));
 | |
| 		memcpy(um, mapping, i * sizeof(unsigned int));
 | |
| 		corrections[count].utf32nfdi = um;
 | |
| 
 | |
| 		if (verbose > 1)
 | |
| 			printf(" %X -> %s -> %s V%d_%d_%d\n",
 | |
| 				unichar, buf0, buf1, major, minor, revision);
 | |
| 		count++;
 | |
| 	}
 | |
| 	fclose(file);
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 	        printf("Found %d entries\n", count);
 | |
| 	if (count == 0)
 | |
| 		file_fail(norm_name);
 | |
| }
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| /*
 | |
|  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 | |
|  *
 | |
|  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 | |
|  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 | |
|  *
 | |
|  * SBase = 0xAC00
 | |
|  * LBase = 0x1100
 | |
|  * VBase = 0x1161
 | |
|  * TBase = 0x11A7
 | |
|  * LCount = 19
 | |
|  * VCount = 21
 | |
|  * TCount = 28
 | |
|  * NCount = 588 (VCount * TCount)
 | |
|  * SCount = 11172 (LCount * NCount)
 | |
|  *
 | |
|  * Decomposition:
 | |
|  *   SIndex = s - SBase
 | |
|  *
 | |
|  * LV (Canonical/Full)
 | |
|  *   LIndex = SIndex / NCount
 | |
|  *   VIndex = (Sindex % NCount) / TCount
 | |
|  *   LPart = LBase + LIndex
 | |
|  *   VPart = VBase + VIndex
 | |
|  *
 | |
|  * LVT (Canonical)
 | |
|  *   LVIndex = (SIndex / TCount) * TCount
 | |
|  *   TIndex = (Sindex % TCount)
 | |
|  *   LVPart = SBase + LVIndex
 | |
|  *   TPart = TBase + TIndex
 | |
|  *
 | |
|  * LVT (Full)
 | |
|  *   LIndex = SIndex / NCount
 | |
|  *   VIndex = (Sindex % NCount) / TCount
 | |
|  *   TIndex = (Sindex % TCount)
 | |
|  *   LPart = LBase + LIndex
 | |
|  *   VPart = VBase + VIndex
 | |
|  *   if (TIndex == 0) {
 | |
|  *          d = <LPart, VPart>
 | |
|  *   } else {
 | |
|  *          TPart = TBase + TIndex
 | |
|  *          d = <LPart, VPart, TPart>
 | |
|  *   }
 | |
|  *
 | |
|  */
 | |
| 
 | |
| static void hangul_decompose(void)
 | |
| {
 | |
| 	unsigned int sb = 0xAC00;
 | |
| 	unsigned int lb = 0x1100;
 | |
| 	unsigned int vb = 0x1161;
 | |
| 	unsigned int tb = 0x11a7;
 | |
| 	/* unsigned int lc = 19; */
 | |
| 	unsigned int vc = 21;
 | |
| 	unsigned int tc = 28;
 | |
| 	unsigned int nc = (vc * tc);
 | |
| 	/* unsigned int sc = (lc * nc); */
 | |
| 	unsigned int unichar;
 | |
| 	unsigned int mapping[4];
 | |
| 	unsigned int *um;
 | |
|         int count;
 | |
| 	int i;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Decomposing hangul\n");
 | |
| 	/* Hangul */
 | |
| 	count = 0;
 | |
| 	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
 | |
| 		unsigned int si = unichar - sb;
 | |
| 		unsigned int li = si / nc;
 | |
| 		unsigned int vi = (si % nc) / tc;
 | |
| 		unsigned int ti = si % tc;
 | |
| 
 | |
| 		i = 0;
 | |
| 		mapping[i++] = lb + li;
 | |
| 		mapping[i++] = vb + vi;
 | |
| 		if (ti)
 | |
| 			mapping[i++] = tb + ti;
 | |
| 		mapping[i++] = 0;
 | |
| 
 | |
| 		assert(!unicode_data[unichar].utf32nfdi);
 | |
| 		um = malloc(i * sizeof(unsigned int));
 | |
| 		memcpy(um, mapping, i * sizeof(unsigned int));
 | |
| 		unicode_data[unichar].utf32nfdi = um;
 | |
| 
 | |
| 		assert(!unicode_data[unichar].utf32nfdicf);
 | |
| 		um = malloc(i * sizeof(unsigned int));
 | |
| 		memcpy(um, mapping, i * sizeof(unsigned int));
 | |
| 		unicode_data[unichar].utf32nfdicf = um;
 | |
| 
 | |
| 		/*
 | |
| 		 * Add a cookie as a reminder that the hangul syllable
 | |
| 		 * decompositions must not be stored in the generated
 | |
| 		 * trie.
 | |
| 		 */
 | |
| 		unicode_data[unichar].utf8nfdi = malloc(2);
 | |
| 		unicode_data[unichar].utf8nfdi[0] = HANGUL;
 | |
| 		unicode_data[unichar].utf8nfdi[1] = '\0';
 | |
| 
 | |
| 		if (verbose > 1)
 | |
| 			print_utf32nfdi(unichar);
 | |
| 
 | |
| 		count++;
 | |
| 	}
 | |
| 	if (verbose > 0)
 | |
| 		printf("Created %d entries\n", count);
 | |
| }
 | |
| 
 | |
| static void nfdi_decompose(void)
 | |
| {
 | |
| 	unsigned int unichar;
 | |
| 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
 | |
| 	unsigned int *um;
 | |
| 	unsigned int *dc;
 | |
| 	int count;
 | |
| 	int i;
 | |
| 	int j;
 | |
| 	int ret;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Decomposing nfdi\n");
 | |
| 
 | |
| 	count = 0;
 | |
| 	for (unichar = 0; unichar != 0x110000; unichar++) {
 | |
| 		if (!unicode_data[unichar].utf32nfdi)
 | |
| 			continue;
 | |
| 		for (;;) {
 | |
| 			ret = 1;
 | |
| 			i = 0;
 | |
| 			um = unicode_data[unichar].utf32nfdi;
 | |
| 			while (*um) {
 | |
| 				dc = unicode_data[*um].utf32nfdi;
 | |
| 				if (dc) {
 | |
| 					for (j = 0; dc[j]; j++)
 | |
| 						mapping[i++] = dc[j];
 | |
| 					ret = 0;
 | |
| 				} else {
 | |
| 					mapping[i++] = *um;
 | |
| 				}
 | |
| 				um++;
 | |
| 			}
 | |
| 			mapping[i++] = 0;
 | |
| 			if (ret)
 | |
| 				break;
 | |
| 			free(unicode_data[unichar].utf32nfdi);
 | |
| 			um = malloc(i * sizeof(unsigned int));
 | |
| 			memcpy(um, mapping, i * sizeof(unsigned int));
 | |
| 			unicode_data[unichar].utf32nfdi = um;
 | |
| 		}
 | |
| 		/* Add this decomposition to nfdicf if there is no entry. */
 | |
| 		if (!unicode_data[unichar].utf32nfdicf) {
 | |
| 			um = malloc(i * sizeof(unsigned int));
 | |
| 			memcpy(um, mapping, i * sizeof(unsigned int));
 | |
| 			unicode_data[unichar].utf32nfdicf = um;
 | |
| 		}
 | |
| 		if (verbose > 1)
 | |
| 			print_utf32nfdi(unichar);
 | |
| 		count++;
 | |
| 	}
 | |
| 	if (verbose > 0)
 | |
| 		printf("Processed %d entries\n", count);
 | |
| }
 | |
| 
 | |
| static void nfdicf_decompose(void)
 | |
| {
 | |
| 	unsigned int unichar;
 | |
| 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
 | |
| 	unsigned int *um;
 | |
| 	unsigned int *dc;
 | |
| 	int count;
 | |
| 	int i;
 | |
| 	int j;
 | |
| 	int ret;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Decomposing nfdicf\n");
 | |
| 	count = 0;
 | |
| 	for (unichar = 0; unichar != 0x110000; unichar++) {
 | |
| 		if (!unicode_data[unichar].utf32nfdicf)
 | |
| 			continue;
 | |
| 		for (;;) {
 | |
| 			ret = 1;
 | |
| 			i = 0;
 | |
| 			um = unicode_data[unichar].utf32nfdicf;
 | |
| 			while (*um) {
 | |
| 				dc = unicode_data[*um].utf32nfdicf;
 | |
| 				if (dc) {
 | |
| 					for (j = 0; dc[j]; j++)
 | |
| 						mapping[i++] = dc[j];
 | |
| 					ret = 0;
 | |
| 				} else {
 | |
| 					mapping[i++] = *um;
 | |
| 				}
 | |
| 				um++;
 | |
| 			}
 | |
| 			mapping[i++] = 0;
 | |
| 			if (ret)
 | |
| 				break;
 | |
| 			free(unicode_data[unichar].utf32nfdicf);
 | |
| 			um = malloc(i * sizeof(unsigned int));
 | |
| 			memcpy(um, mapping, i * sizeof(unsigned int));
 | |
| 			unicode_data[unichar].utf32nfdicf = um;
 | |
| 		}
 | |
| 		if (verbose > 1)
 | |
| 			print_utf32nfdicf(unichar);
 | |
| 		count++;
 | |
| 	}
 | |
| 	if (verbose > 0)
 | |
| 		printf("Processed %d entries\n", count);
 | |
| }
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
/* Forward declarations of the trie query and normalization routines. */
int utf8agemax(struct tree *tree, const char *s);
int utf8nagemax(struct tree *tree, const char *s, size_t len);
int utf8agemin(struct tree *tree, const char *s);
int utf8nagemin(struct tree *tree, const char *s, size_t len);
ssize_t utf8len(struct tree *tree, const char *s);
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len);
struct utf8cursor;
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s);
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
		size_t len);
int utf8byte(struct utf8cursor *u8c);
 | |
| 
 | |
| /*
 | |
|  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 | |
|  *
 | |
|  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 | |
|  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 | |
|  *
 | |
|  * SBase = 0xAC00
 | |
|  * LBase = 0x1100
 | |
|  * VBase = 0x1161
 | |
|  * TBase = 0x11A7
 | |
|  * LCount = 19
 | |
|  * VCount = 21
 | |
|  * TCount = 28
 | |
|  * NCount = 588 (VCount * TCount)
 | |
|  * SCount = 11172 (LCount * NCount)
 | |
|  *
 | |
|  * Decomposition:
 | |
|  *   SIndex = s - SBase
 | |
|  *
 | |
|  * LV (Canonical/Full)
 | |
|  *   LIndex = SIndex / NCount
 | |
|  *   VIndex = (Sindex % NCount) / TCount
 | |
|  *   LPart = LBase + LIndex
 | |
|  *   VPart = VBase + VIndex
 | |
|  *
 | |
|  * LVT (Canonical)
 | |
|  *   LVIndex = (SIndex / TCount) * TCount
 | |
|  *   TIndex = (Sindex % TCount)
 | |
|  *   LVPart = SBase + LVIndex
 | |
|  *   TPart = TBase + TIndex
 | |
|  *
 | |
|  * LVT (Full)
 | |
|  *   LIndex = SIndex / NCount
 | |
|  *   VIndex = (Sindex % NCount) / TCount
 | |
|  *   TIndex = (Sindex % TCount)
 | |
|  *   LPart = LBase + LIndex
 | |
|  *   VPart = VBase + VIndex
 | |
|  *   if (TIndex == 0) {
 | |
|  *          d = <LPart, VPart>
 | |
|  *   } else {
 | |
|  *          TPart = TBase + TIndex
 | |
|  *          d = <LPart, VPart, TPart>
 | |
|  *   }
 | |
|  */
 | |
| 
 | |
| /* Constants */
 | |
| #define SB	(0xAC00)
 | |
| #define LB	(0x1100)
 | |
| #define VB	(0x1161)
 | |
| #define TB	(0x11A7)
 | |
| #define LC	(19)
 | |
| #define VC	(21)
 | |
| #define TC	(28)
 | |
| #define NC	(VC * TC)
 | |
| #define SC	(LC * NC)
 | |
| 
 | |
| /* Algorithmic decomposition of hangul syllable. */
 | |
| static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
 | |
| {
 | |
| 	unsigned int	si;
 | |
| 	unsigned int	li;
 | |
| 	unsigned int	vi;
 | |
| 	unsigned int	ti;
 | |
| 	unsigned char	*h;
 | |
| 
 | |
| 	/* Calculate the SI, LI, VI, and TI values. */
 | |
| 	si = utf8decode(str) - SB;
 | |
| 	li = si / NC;
 | |
| 	vi = (si % NC) / TC;
 | |
| 	ti = si % TC;
 | |
| 
 | |
| 	/* Fill in base of leaf. */
 | |
| 	h = hangul;
 | |
| 	LEAF_GEN(h) = 2;
 | |
| 	LEAF_CCC(h) = DECOMPOSE;
 | |
| 	h += 2;
 | |
| 
 | |
| 	/* Add LPart, a 3-byte UTF-8 sequence. */
 | |
| 	h += utf8encode((char *)h, li + LB);
 | |
| 
 | |
| 	/* Add VPart, a 3-byte UTF-8 sequence. */
 | |
| 	h += utf8encode((char *)h, vi + VB);
 | |
| 
 | |
| 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
 | |
| 	if (ti)
 | |
| 		h += utf8encode((char *)h, ti + TB);
 | |
| 
 | |
| 	/* Terminate string. */
 | |
| 	h[0] = '\0';
 | |
| 
 | |
| 	return hangul;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Use trie to scan s, touching at most len bytes.
 | |
|  * Returns the leaf if one exists, NULL otherwise.
 | |
|  *
 | |
|  * A non-NULL return guarantees that the UTF-8 sequence starting at s
 | |
|  * is well-formed and corresponds to a known unicode code point.  The
 | |
|  * shorthand for this will be "is valid UTF-8 unicode".
 | |
|  */
 | |
| static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
 | |
| 			       const char *s, size_t len)
 | |
| {
 | |
| 	utf8trie_t	*trie;
 | |
| 	int		offlen;
 | |
| 	int		offset;
 | |
| 	int		mask;
 | |
| 	int		node;
 | |
| 
 | |
| 	if (!tree)
 | |
| 		return NULL;
 | |
| 	if (len == 0)
 | |
| 		return NULL;
 | |
| 	node = 1;
 | |
| 	trie = utf8data + tree->index;
 | |
| 	while (node) {
 | |
| 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
 | |
| 		if (*trie & NEXTBYTE) {
 | |
| 			if (--len == 0)
 | |
| 				return NULL;
 | |
| 			s++;
 | |
| 		}
 | |
| 		mask = 1 << (*trie & BITNUM);
 | |
| 		if (*s & mask) {
 | |
| 			/* Right leg */
 | |
| 			if (offlen) {
 | |
| 				/* Right node at offset of trie */
 | |
| 				node = (*trie & RIGHTNODE);
 | |
| 				offset = trie[offlen];
 | |
| 				while (--offlen) {
 | |
| 					offset <<= 8;
 | |
| 					offset |= trie[offlen];
 | |
| 				}
 | |
| 				trie += offset;
 | |
| 			} else if (*trie & RIGHTPATH) {
 | |
| 				/* Right node after this node */
 | |
| 				node = (*trie & TRIENODE);
 | |
| 				trie++;
 | |
| 			} else {
 | |
| 				/* No right node. */
 | |
| 				return NULL;
 | |
| 			}
 | |
| 		} else {
 | |
| 			/* Left leg */
 | |
| 			if (offlen) {
 | |
| 				/* Left node after this node. */
 | |
| 				node = (*trie & LEFTNODE);
 | |
| 				trie += offlen + 1;
 | |
| 			} else if (*trie & RIGHTPATH) {
 | |
| 				/* No left node. */
 | |
| 				return NULL;
 | |
| 			} else {
 | |
| 				/* Left node after this node */
 | |
| 				node = (*trie & TRIENODE);
 | |
| 				trie++;
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	/*
 | |
| 	 * Hangul decomposition is done algorithmically. These are the
 | |
| 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
 | |
| 	 * always 3 bytes long, so s has been advanced twice, and the
 | |
| 	 * start of the sequence is at s-2.
 | |
| 	 */
 | |
| 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
 | |
| 		trie = utf8hangul(s - 2, hangul);
 | |
| 	return trie;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Use trie to scan s.
 | |
|  * Returns the leaf if one exists, NULL otherwise.
 | |
|  *
 | |
|  * Forwards to trie_nlookup().
 | |
|  */
 | |
| static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
 | |
| 			      const char *s)
 | |
| {
 | |
| 	return utf8nlookup(tree, hangul, s, (size_t)-1);
 | |
| }
 | |
| 
 | |
/*
 * Number of bytes in the UTF-8 sequence that starts at s, derived
 * from the lead byte alone.  The caller must pass a pointer to the
 * first byte of a valid UTF-8 sequence.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = *s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
 | |
| 
 | |
| /*
 | |
|  * Maximum age of any character in s.
 | |
|  * Return -1 if s is not valid UTF-8 unicode.
 | |
|  * Return 0 if only non-assigned code points are used.
 | |
|  */
 | |
| int utf8agemax(struct tree *tree, const char *s)
 | |
| {
 | |
| 	utf8leaf_t	*leaf;
 | |
| 	int		age = 0;
 | |
| 	int		leaf_age;
 | |
| 	unsigned char	hangul[UTF8HANGULLEAF];
 | |
| 
 | |
| 	if (!tree)
 | |
| 		return -1;
 | |
| 
 | |
| 	while (*s) {
 | |
| 		leaf = utf8lookup(tree, hangul, s);
 | |
| 		if (!leaf)
 | |
| 			return -1;
 | |
| 		leaf_age = ages[LEAF_GEN(leaf)];
 | |
| 		if (leaf_age <= tree->maxage && leaf_age > age)
 | |
| 			age = leaf_age;
 | |
| 		s += utf8clen(s);
 | |
| 	}
 | |
| 	return age;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Minimum age of any character in s.
 | |
|  * Return -1 if s is not valid UTF-8 unicode.
 | |
|  * Return 0 if non-assigned code points are used.
 | |
|  */
 | |
| int utf8agemin(struct tree *tree, const char *s)
 | |
| {
 | |
| 	utf8leaf_t	*leaf;
 | |
| 	int		age;
 | |
| 	int		leaf_age;
 | |
| 	unsigned char	hangul[UTF8HANGULLEAF];
 | |
| 
 | |
| 	if (!tree)
 | |
| 		return -1;
 | |
| 	age = tree->maxage;
 | |
| 	while (*s) {
 | |
| 		leaf = utf8lookup(tree, hangul, s);
 | |
| 		if (!leaf)
 | |
| 			return -1;
 | |
| 		leaf_age = ages[LEAF_GEN(leaf)];
 | |
| 		if (leaf_age <= tree->maxage && leaf_age < age)
 | |
| 			age = leaf_age;
 | |
| 		s += utf8clen(s);
 | |
| 	}
 | |
| 	return age;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Maximum age of any character in s, touch at most len bytes.
 | |
|  * Return -1 if s is not valid UTF-8 unicode.
 | |
|  */
 | |
| int utf8nagemax(struct tree *tree, const char *s, size_t len)
 | |
| {
 | |
| 	utf8leaf_t	*leaf;
 | |
| 	int		age = 0;
 | |
| 	int		leaf_age;
 | |
| 	unsigned char	hangul[UTF8HANGULLEAF];
 | |
| 
 | |
| 	if (!tree)
 | |
| 		return -1;
 | |
| 
 | |
|         while (len && *s) {
 | |
| 		leaf = utf8nlookup(tree, hangul, s, len);
 | |
| 		if (!leaf)
 | |
| 			return -1;
 | |
| 		leaf_age = ages[LEAF_GEN(leaf)];
 | |
| 		if (leaf_age <= tree->maxage && leaf_age > age)
 | |
| 			age = leaf_age;
 | |
| 		len -= utf8clen(s);
 | |
| 		s += utf8clen(s);
 | |
| 	}
 | |
| 	return age;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Maximum age of any character in s, touch at most len bytes.
 | |
|  * Return -1 if s is not valid UTF-8 unicode.
 | |
|  */
 | |
| int utf8nagemin(struct tree *tree, const char *s, size_t len)
 | |
| {
 | |
| 	utf8leaf_t	*leaf;
 | |
| 	int		leaf_age;
 | |
| 	int		age;
 | |
| 	unsigned char	hangul[UTF8HANGULLEAF];
 | |
| 
 | |
| 	if (!tree)
 | |
| 		return -1;
 | |
| 	age = tree->maxage;
 | |
|         while (len && *s) {
 | |
| 		leaf = utf8nlookup(tree, hangul, s, len);
 | |
| 		if (!leaf)
 | |
| 			return -1;
 | |
| 		leaf_age = ages[LEAF_GEN(leaf)];
 | |
| 		if (leaf_age <= tree->maxage && leaf_age < age)
 | |
| 			age = leaf_age;
 | |
| 		len -= utf8clen(s);
 | |
| 		s += utf8clen(s);
 | |
| 	}
 | |
| 	return age;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Length of the normalization of s.
 | |
|  * Return -1 if s is not valid UTF-8 unicode.
 | |
|  *
 | |
|  * A string of Default_Ignorable_Code_Point has length 0.
 | |
|  */
 | |
| ssize_t utf8len(struct tree *tree, const char *s)
 | |
| {
 | |
| 	utf8leaf_t	*leaf;
 | |
| 	size_t		ret = 0;
 | |
| 	unsigned char	hangul[UTF8HANGULLEAF];
 | |
| 
 | |
| 	if (!tree)
 | |
| 		return -1;
 | |
| 	while (*s) {
 | |
| 		leaf = utf8lookup(tree, hangul, s);
 | |
| 		if (!leaf)
 | |
| 			return -1;
 | |
| 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
 | |
| 			ret += utf8clen(s);
 | |
| 		else if (LEAF_CCC(leaf) == DECOMPOSE)
 | |
| 			ret += strlen(LEAF_STR(leaf));
 | |
| 		else
 | |
| 			ret += utf8clen(s);
 | |
| 		s += utf8clen(s);
 | |
| 	}
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Length of the normalization of s, touch at most len bytes.
 | |
|  * Return -1 if s is not valid UTF-8 unicode.
 | |
|  */
 | |
| ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
 | |
| {
 | |
| 	utf8leaf_t	*leaf;
 | |
| 	size_t		ret = 0;
 | |
| 	unsigned char	hangul[UTF8HANGULLEAF];
 | |
| 
 | |
| 	if (!tree)
 | |
| 		return -1;
 | |
| 	while (len && *s) {
 | |
| 		leaf = utf8nlookup(tree, hangul, s, len);
 | |
| 		if (!leaf)
 | |
| 			return -1;
 | |
| 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
 | |
| 			ret += utf8clen(s);
 | |
| 		else if (LEAF_CCC(leaf) == DECOMPOSE)
 | |
| 			ret += strlen(LEAF_STR(leaf));
 | |
| 		else
 | |
| 			ret += utf8clen(s);
 | |
| 		len -= utf8clen(s);
 | |
| 		s += utf8clen(s);
 | |
| 	}
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Cursor structure used by the normalizer.
 | |
|  */
 | |
| struct utf8cursor {
 | |
| 	struct tree	*tree;
 | |
| 	const char	*s;
 | |
| 	const char	*p;
 | |
| 	const char	*ss;
 | |
| 	const char	*sp;
 | |
| 	unsigned int	len;
 | |
| 	unsigned int	slen;
 | |
| 	short int	ccc;
 | |
| 	short int	nccc;
 | |
| 	unsigned int	unichar;
 | |
| 	unsigned char	hangul[UTF8HANGULLEAF];
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * Set up an utf8cursor for use by utf8byte().
 | |
|  *
 | |
|  *   s      : string.
 | |
|  *   len    : length of s.
 | |
|  *   u8c    : pointer to cursor.
 | |
|  *   trie   : utf8trie_t to use for normalization.
 | |
|  *
 | |
|  * Returns -1 on error, 0 on success.
 | |
|  */
 | |
| int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
 | |
| 		size_t len)
 | |
| {
 | |
| 	if (!tree)
 | |
| 		return -1;
 | |
| 	if (!s)
 | |
| 		return -1;
 | |
| 	u8c->tree = tree;
 | |
| 	u8c->s = s;
 | |
| 	u8c->p = NULL;
 | |
| 	u8c->ss = NULL;
 | |
| 	u8c->sp = NULL;
 | |
| 	u8c->len = len;
 | |
| 	u8c->slen = 0;
 | |
| 	u8c->ccc = STOPPER;
 | |
| 	u8c->nccc = STOPPER;
 | |
| 	u8c->unichar = 0;
 | |
| 	/* Check we didn't clobber the maximum length. */
 | |
| 	if (u8c->len != len)
 | |
| 		return -1;
 | |
| 	/* The first byte of s may not be an utf8 continuation. */
 | |
| 	if (len > 0 && (*s & 0xC0) == 0x80)
 | |
| 		return -1;
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
/*
 * Prepare an utf8cursor for utf8byte() over a NUL-terminated string.
 *
 *   u8c    : cursor to initialise.
 *   tree   : trie to use for normalization.
 *   s      : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with the largest unsigned int as length.
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
	const unsigned int unbounded = (unsigned int)-1;

	return utf8ncursor(u8c, tree, s, unbounded);
}
 | |
| 
 | |
| /*
 | |
|  * Get one byte from the normalized form of the string described by u8c.
 | |
|  *
 | |
|  * Returns the byte cast to an unsigned char on success, and -1 on failure.
 | |
|  *
 | |
|  * The cursor keeps track of the location in the string in u8c->s.
 | |
|  * When a character is decomposed, the current location is stored in
 | |
|  * u8c->p, and u8c->s is set to the start of the decomposition. Note
 | |
|  * that bytes from a decomposition do not count against u8c->len.
 | |
|  *
 | |
|  * Characters are emitted if they match the current CCC in u8c->ccc.
 | |
|  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
 | |
|  * and the function returns 0 in that case.
 | |
|  *
 | |
|  * Sorting by CCC is done by repeatedly scanning the string.  The
 | |
|  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
 | |
|  * the start of the scan.  The first pass finds the lowest CCC to be
 | |
|  * emitted and stores it in u8c->nccc, the second pass emits the
 | |
|  * characters with this CCC and finds the next lowest CCC. This limits
 | |
|  * the number of passes to 1 + the number of different CCCs in the
 | |
|  * sequence being scanned.
 | |
|  *
 | |
|  * Therefore:
 | |
|  *  u8c->p  != NULL -> a decomposition is being scanned.
 | |
|  *  u8c->ss != NULL -> this is a repeating scan.
 | |
|  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
 | |
|  */
 | |
| int utf8byte(struct utf8cursor *u8c)
 | |
| {
 | |
| 	utf8leaf_t *leaf;
 | |
| 	int ccc;
 | |
| 
 | |
| 	for (;;) {
 | |
| 		/* Check for the end of a decomposed character. */
 | |
| 		if (u8c->p && *u8c->s == '\0') {
 | |
| 			u8c->s = u8c->p;
 | |
| 			u8c->p = NULL;
 | |
| 		}
 | |
| 
 | |
| 		/* Check for end-of-string. */
 | |
| 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
 | |
| 			/* There is no next byte. */
 | |
| 			if (u8c->ccc == STOPPER)
 | |
| 				return 0;
 | |
| 			/* End-of-string during a scan counts as a stopper. */
 | |
| 			ccc = STOPPER;
 | |
| 			goto ccc_mismatch;
 | |
| 		} else if ((*u8c->s & 0xC0) == 0x80) {
 | |
| 			/* This is a continuation of the current character. */
 | |
| 			if (!u8c->p)
 | |
| 				u8c->len--;
 | |
| 			return (unsigned char)*u8c->s++;
 | |
| 		}
 | |
| 
 | |
| 		/* Look up the data for the current character. */
 | |
| 		if (u8c->p) {
 | |
| 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
 | |
| 		} else {
 | |
| 			leaf = utf8nlookup(u8c->tree, u8c->hangul,
 | |
| 					   u8c->s, u8c->len);
 | |
| 		}
 | |
| 
 | |
| 		/* No leaf found implies that the input is a binary blob. */
 | |
| 		if (!leaf)
 | |
| 			return -1;
 | |
| 
 | |
| 		/* Characters that are too new have CCC 0. */
 | |
| 		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
 | |
| 			ccc = STOPPER;
 | |
| 		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
 | |
| 			u8c->len -= utf8clen(u8c->s);
 | |
| 			u8c->p = u8c->s + utf8clen(u8c->s);
 | |
| 			u8c->s = LEAF_STR(leaf);
 | |
| 			/* Empty decomposition implies CCC 0. */
 | |
| 			if (*u8c->s == '\0') {
 | |
| 				if (u8c->ccc == STOPPER)
 | |
| 					continue;
 | |
| 				ccc = STOPPER;
 | |
| 				goto ccc_mismatch;
 | |
| 			}
 | |
| 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
 | |
| 			ccc = LEAF_CCC(leaf);
 | |
| 		}
 | |
| 		u8c->unichar = utf8decode(u8c->s);
 | |
| 
 | |
| 		/*
 | |
| 		 * If this is not a stopper, then see if it updates
 | |
| 		 * the next canonical class to be emitted.
 | |
| 		 */
 | |
| 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
 | |
| 			u8c->nccc = ccc;
 | |
| 
 | |
| 		/*
 | |
| 		 * Return the current byte if this is the current
 | |
| 		 * combining class.
 | |
| 		 */
 | |
| 		if (ccc == u8c->ccc) {
 | |
| 			if (!u8c->p)
 | |
| 				u8c->len--;
 | |
| 			return (unsigned char)*u8c->s++;
 | |
| 		}
 | |
| 
 | |
| 		/* Current combining class mismatch. */
 | |
| 	ccc_mismatch:
 | |
| 		if (u8c->nccc == STOPPER) {
 | |
| 			/*
 | |
| 			 * Scan forward for the first canonical class
 | |
| 			 * to be emitted.  Save the position from
 | |
| 			 * which to restart.
 | |
| 			 */
 | |
| 			assert(u8c->ccc == STOPPER);
 | |
| 			u8c->ccc = MINCCC - 1;
 | |
| 			u8c->nccc = ccc;
 | |
| 			u8c->sp = u8c->p;
 | |
| 			u8c->ss = u8c->s;
 | |
| 			u8c->slen = u8c->len;
 | |
| 			if (!u8c->p)
 | |
| 				u8c->len -= utf8clen(u8c->s);
 | |
| 			u8c->s += utf8clen(u8c->s);
 | |
| 		} else if (ccc != STOPPER) {
 | |
| 			/* Not a stopper, and not the ccc we're emitting. */
 | |
| 			if (!u8c->p)
 | |
| 				u8c->len -= utf8clen(u8c->s);
 | |
| 			u8c->s += utf8clen(u8c->s);
 | |
| 		} else if (u8c->nccc != MAXCCC + 1) {
 | |
| 			/* At a stopper, restart for next ccc. */
 | |
| 			u8c->ccc = u8c->nccc;
 | |
| 			u8c->nccc = MAXCCC + 1;
 | |
| 			u8c->s = u8c->ss;
 | |
| 			u8c->p = u8c->sp;
 | |
| 			u8c->len = u8c->slen;
 | |
| 		} else {
 | |
| 			/* All done, proceed from here. */
 | |
| 			u8c->ccc = STOPPER;
 | |
| 			u8c->nccc = STOPPER;
 | |
| 			u8c->sp = NULL;
 | |
| 			u8c->ss = NULL;
 | |
| 			u8c->slen = 0;
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| static int normalize_line(struct tree *tree)
 | |
| {
 | |
| 	char *s;
 | |
| 	char *t;
 | |
| 	int c;
 | |
| 	struct utf8cursor u8c;
 | |
| 
 | |
| 	/* First test: null-terminated string. */
 | |
| 	s = buf2;
 | |
| 	t = buf3;
 | |
| 	if (utf8cursor(&u8c, tree, s))
 | |
| 		return -1;
 | |
| 	while ((c = utf8byte(&u8c)) > 0)
 | |
| 		if (c != (unsigned char)*t++)
 | |
| 			return -1;
 | |
| 	if (c < 0)
 | |
| 		return -1;
 | |
| 	if (*t != 0)
 | |
| 		return -1;
 | |
| 
 | |
| 	/* Second test: length-limited string. */
 | |
| 	s = buf2;
 | |
| 	/* Replace NUL with a value that will cause an error if seen. */
 | |
| 	s[strlen(s) + 1] = -1;
 | |
| 	t = buf3;
 | |
| 	if (utf8cursor(&u8c, tree, s))
 | |
| 		return -1;
 | |
| 	while ((c = utf8byte(&u8c)) > 0)
 | |
| 		if (c != (unsigned char)*t++)
 | |
| 			return -1;
 | |
| 	if (c < 0)
 | |
| 		return -1;
 | |
| 	if (*t != 0)
 | |
| 		return -1;
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| static void normalization_test(void)
 | |
| {
 | |
| 	FILE *file;
 | |
| 	unsigned int unichar;
 | |
| 	struct unicode_data *data;
 | |
| 	char *s;
 | |
| 	char *t;
 | |
| 	int ret;
 | |
| 	int ignorables;
 | |
| 	int tests = 0;
 | |
| 	int failures = 0;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Parsing %s\n", test_name);
 | |
| 	/* Step one, read data from file. */
 | |
| 	file = fopen(test_name, "r");
 | |
| 	if (!file)
 | |
| 		open_fail(test_name, errno);
 | |
| 
 | |
| 	while (fgets(line, LINESIZE, file)) {
 | |
| 		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
 | |
| 			     buf0, buf1);
 | |
| 		if (ret != 2 || *line == '#')
 | |
| 			continue;
 | |
| 		s = buf0;
 | |
| 		t = buf2;
 | |
| 		while (*s) {
 | |
| 			unichar = strtoul(s, &s, 16);
 | |
| 			t += utf8encode(t, unichar);
 | |
| 		}
 | |
| 		*t = '\0';
 | |
| 
 | |
| 		ignorables = 0;
 | |
| 		s = buf1;
 | |
| 		t = buf3;
 | |
| 		while (*s) {
 | |
| 			unichar = strtoul(s, &s, 16);
 | |
| 			data = &unicode_data[unichar];
 | |
| 			if (data->utf8nfdi && !*data->utf8nfdi)
 | |
| 				ignorables = 1;
 | |
| 			else
 | |
| 				t += utf8encode(t, unichar);
 | |
| 		}
 | |
| 		*t = '\0';
 | |
| 
 | |
| 		tests++;
 | |
| 		if (normalize_line(nfdi_tree) < 0) {
 | |
| 			printf("Line %s -> %s", buf0, buf1);
 | |
| 			if (ignorables)
 | |
| 				printf(" (ignorables removed)");
 | |
| 			printf(" failure\n");
 | |
| 			failures++;
 | |
| 		}
 | |
| 	}
 | |
| 	fclose(file);
 | |
| 	if (verbose > 0)
 | |
| 		printf("Ran %d tests with %d failures\n", tests, failures);
 | |
| 	if (failures)
 | |
| 		file_fail(test_name);
 | |
| }
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| static void write_file(void)
 | |
| {
 | |
| 	FILE *file;
 | |
| 	int i;
 | |
| 	int j;
 | |
| 	int t;
 | |
| 	int gen;
 | |
| 
 | |
| 	if (verbose > 0)
 | |
| 		printf("Writing %s\n", utf8_name);
 | |
| 	file = fopen(utf8_name, "w");
 | |
| 	if (!file)
 | |
| 		open_fail(utf8_name, errno);
 | |
| 
 | |
| 	fprintf(file, "/* This file is generated code, do not edit. */\n");
 | |
| 	fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
 | |
| 	fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
 | |
| 	fprintf(file, "#endif\n");
 | |
| 	fprintf(file, "\n");
 | |
| 	fprintf(file, "static const unsigned int utf8vers = %#x;\n",
 | |
| 		unicode_maxage);
 | |
| 	fprintf(file, "\n");
 | |
| 	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
 | |
| 	for (i = 0; i != ages_count; i++)
 | |
| 		fprintf(file, "\t%#x%s\n", ages[i],
 | |
| 			ages[i] == unicode_maxage ? "" : ",");
 | |
| 	fprintf(file, "};\n");
 | |
| 	fprintf(file, "\n");
 | |
| 	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
 | |
| 	t = 0;
 | |
| 	for (gen = 0; gen < ages_count; gen++) {
 | |
| 		fprintf(file, "\t{ %#x, %d }%s\n",
 | |
| 			ages[gen], trees[t].index,
 | |
| 			ages[gen] == unicode_maxage ? "" : ",");
 | |
| 		if (trees[t].maxage == ages[gen])
 | |
| 			t += 2;
 | |
| 	}
 | |
| 	fprintf(file, "};\n");
 | |
| 	fprintf(file, "\n");
 | |
| 	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
 | |
| 	t = 1;
 | |
| 	for (gen = 0; gen < ages_count; gen++) {
 | |
| 		fprintf(file, "\t{ %#x, %d }%s\n",
 | |
| 			ages[gen], trees[t].index,
 | |
| 			ages[gen] == unicode_maxage ? "" : ",");
 | |
| 		if (trees[t].maxage == ages[gen])
 | |
| 			t += 2;
 | |
| 	}
 | |
| 	fprintf(file, "};\n");
 | |
| 	fprintf(file, "\n");
 | |
| 	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
 | |
| 		utf8data_size);
 | |
| 	t = 0;
 | |
| 	for (i = 0; i != utf8data_size; i += 16) {
 | |
| 		if (i == trees[t].index) {
 | |
| 			fprintf(file, "\t/* %s_%x */\n",
 | |
| 				trees[t].type, trees[t].maxage);
 | |
| 			if (t < trees_count-1)
 | |
| 				t++;
 | |
| 		}
 | |
| 		fprintf(file, "\t");
 | |
| 		for (j = i; j != i + 16; j++)
 | |
| 			fprintf(file, "0x%.2x%s", utf8data[j],
 | |
| 				(j < utf8data_size -1 ? "," : ""));
 | |
| 		fprintf(file, "\n");
 | |
| 	}
 | |
| 	fprintf(file, "};\n");
 | |
| 	fclose(file);
 | |
| }
 | |
| 
 | |
| /* ------------------------------------------------------------------ */
 | |
| 
 | |
| int main(int argc, char *argv[])
 | |
| {
 | |
| 	unsigned int unichar;
 | |
| 	int opt;
 | |
| 
 | |
| 	argv0 = argv[0];
 | |
| 
 | |
| 	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
 | |
| 		switch (opt) {
 | |
| 		case 'a':
 | |
| 			age_name = optarg;
 | |
| 			break;
 | |
| 		case 'c':
 | |
| 			ccc_name = optarg;
 | |
| 			break;
 | |
| 		case 'd':
 | |
| 			data_name = optarg;
 | |
| 			break;
 | |
| 		case 'f':
 | |
| 			fold_name = optarg;
 | |
| 			break;
 | |
| 		case 'n':
 | |
| 			norm_name = optarg;
 | |
| 			break;
 | |
| 		case 'o':
 | |
| 			utf8_name = optarg;
 | |
| 			break;
 | |
| 		case 'p':
 | |
| 			prop_name = optarg;
 | |
| 			break;
 | |
| 		case 't':
 | |
| 			test_name = optarg;
 | |
| 			break;
 | |
| 		case 'v':
 | |
| 			verbose++;
 | |
| 			break;
 | |
| 		case 'h':
 | |
| 			help();
 | |
| 			exit(0);
 | |
| 		default:
 | |
| 			usage();
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if (verbose > 1)
 | |
| 		help();
 | |
| 	for (unichar = 0; unichar != 0x110000; unichar++)
 | |
| 		unicode_data[unichar].code = unichar;
 | |
| 	age_init();
 | |
| 	ccc_init();
 | |
| 	nfdi_init();
 | |
| 	nfdicf_init();
 | |
| 	ignore_init();
 | |
| 	corrections_init();
 | |
| 	hangul_decompose();
 | |
| 	nfdi_decompose();
 | |
| 	nfdicf_decompose();
 | |
| 	utf8_init();
 | |
| 	trees_init();
 | |
| 	trees_populate();
 | |
| 	trees_reduce();
 | |
| 	trees_verify();
 | |
| 	/* Prevent "unused function" warning. */
 | |
| 	(void)lookup(nfdi_tree, " ");
 | |
| 	if (verbose > 2)
 | |
| 		tree_walk(nfdi_tree);
 | |
| 	if (verbose > 2)
 | |
| 		tree_walk(nfdicf_tree);
 | |
| 	normalization_test();
 | |
| 	write_file();
 | |
| 
 | |
| 	return 0;
 | |
| }
 |