"""Encode the Unicode character database into compact binary tables.

Reads ``unicode_data_latest.txt`` (the semicolon-separated ``UnicodeData.txt``
format published by the Unicode Consortium) and builds two buffers:

* a deduplicating string table of length-prefixed UTF-8 strings, and
* a character-data table of fixed-width 28-byte records whose string-valued
  fields are 3-byte little-endian indices into the string table.

Run as a script; it currently prints the size of the string table.
"""

from enum import Enum
from struct import pack  # noqa: F401 -- kept: planned use for table headers
from typing import Optional

# FIXME:
# Rather than creating many duplicated entries for "first / last" blocks, include a series of
# rules in the encoded file about what regions are duplicated / what ranges of codepoints need
# to be offset when using them to index into the table.


class StringTableIndex:
    """A 3-byte little-endian offset into the string table.

    The all-ones pattern ``0xffffff`` is reserved as the "no value" sentinel.
    """

    def __init__(self, bs: bytes):
        if len(bs) != 3:
            raise ValueError('string table index must be 3 bytes')
        self.__bs = bs

    @classmethod
    def from_int(cls, index: int) -> 'StringTableIndex':
        """Create an index from an integer offset, rejecting the sentinel range."""
        # Ensure that the index is less than 0xffffff, because 0xffffff is the "invalid index"
        # sentinel value.
        if index >= 0xffffff:
            raise ValueError('non-nil string table indices must be less than 0xffffff')
        return cls(index.to_bytes(length=3, byteorder='little', signed=False))

    @classmethod
    def invalid(cls) -> 'StringTableIndex':
        """Return the sentinel index meaning "this field has no value"."""
        return cls(b'\xff\xff\xff')

    def to_bytes(self) -> bytes:
        return self.__bs


class StringTable:
    """Deduplicating table of one-byte-length-prefixed UTF-8 strings."""

    def __init__(self):
        self.__buf = bytearray()
        self.__map = {}

    def push(self, s: str) -> StringTableIndex:
        """Intern *s* (appending it if new) and return its index.

        BUG FIX: the map previously stored the raw integer offset, so pushing
        a string that was already interned returned an ``int`` rather than a
        ``StringTableIndex``. The map now stores the index object itself.
        """
        if s in self.__map:
            return self.__map[s]

        insert_pos = len(self.__buf)
        s_bytes = s.encode(encoding='utf-8')
        # One-byte length prefix; int.to_bytes(length=1) raises OverflowError
        # for strings longer than 255 bytes, which is the intended limit.
        s_len_bytes = len(s_bytes).to_bytes(length=1, byteorder='little', signed=False)
        self.__buf.extend(s_len_bytes)
        self.__buf.extend(s_bytes)
        index = StringTableIndex.from_int(insert_pos)
        self.__map[s] = index
        return index

    def to_bytes(self) -> bytes:
        return bytes(self.__buf)


class Category(Enum):
    """Unicode general category (field 2 of UnicodeData.txt)."""
    LU = 0
    LL = 1
    LT = 2
    MN = 3
    MC = 4
    ME = 5
    ND = 6
    NL = 7
    NO = 8
    ZS = 9
    ZL = 10
    ZP = 11
    CC = 12
    CF = 13
    CS = 14
    CO = 15
    CN = 16
    LM = 17
    LO = 18
    PC = 19
    PD = 20
    PS = 21
    PE = 22
    PI = 23
    PF = 24
    PO = 25
    SM = 26
    SC = 27
    SK = 28
    SO = 29


class Bidi(Enum):
    """Bidirectional category (field 4 of UnicodeData.txt)."""
    L = 0
    R = 1
    AL = 2
    EN = 3
    ES = 4
    ET = 5
    AN = 6
    CS = 7
    NSM = 8
    BN = 9
    B = 10
    S = 11
    WS = 12
    ON = 13
    LRE = 14
    LRO = 15
    RLE = 16
    RLO = 17
    PDF = 18
    LRI = 19
    RLI = 20
    FSI = 21
    PDI = 22


class DecompKind(Enum):
    """Decomposition tag (field 5 of UnicodeData.txt).

    ``NONE`` means no decomposition; ``ANONYMOUS`` means a decomposition with
    no ``<tag>`` (i.e. a canonical decomposition).
    """
    NONE = 0
    ANONYMOUS = 1
    NOBREAK = 2
    COMPAT = 3
    SUPER = 4
    FRACTION = 5
    SUB = 6
    FONT = 7
    CIRCLE = 8
    WIDE = 9
    VERTICAL = 10
    SQUARE = 11
    ISOLATED = 12
    FINAL = 13
    INITIAL = 14
    MEDIAL = 15
    SMALL = 16
    NARROW = 17


class GroupKind(Enum):
    """Whether a codepoint range shares one data entry or has none at all."""
    NO_VALUE = 0
    HAS_VALUE = 1


class Group:
    """An inclusive codepoint range [start, end] treated uniformly."""

    def __init__(self, kind: GroupKind, start: int, end: int):
        self.__kind = kind
        self.__start = start
        self.__end = end

    def __str__(self):
        return 'Group({}, {:x}, {:x})'.format(self.__kind, self.__start, self.__end)

    def kind(self) -> GroupKind:
        return self.__kind

    def start(self) -> int:
        return self.__start

    def end(self) -> int:
        return self.__end


def parse_codepoint_string(cp_str: str) -> str:
    """Convert whitespace-separated hex codepoints (e.g. '0041 0301') to a str."""
    return ''.join([chr(int(cp, 16)) for cp in cp_str.split()])


def encode_char_data(
    code: int,
    name: StringTableIndex,
    category: Category,
    combining: int,
    bidi: Bidi,
    decomp_kind: DecompKind,
    decomp: StringTableIndex,
    decimal_digit: Optional[int],
    digit: Optional[int],
    numeric_value: StringTableIndex,
    mirrored: bool,
    old_name: StringTableIndex,
    comment: StringTableIndex,
    uppercase: StringTableIndex,
    lowercase: StringTableIndex,
    titlecase: StringTableIndex
) -> bytes:
    """Encode one UnicodeData row as a fixed-width 28-byte record.

    Layout (all multi-byte fields little-endian):

    ==========  =====================================================
    bytes 0-1   flags: category (bits 0-4), bidi (bits 5-9),
                decomposition kind (bits 10-14), mirrored (bit 15)
    byte  2     canonical combining class
    byte  3     decimal digit value (low nibble) and digit value
                (high nibble); nibble 0xf means "no value"
    bytes 4-6   name string-table index
    bytes 7-9   decomposition index
    bytes 10-12 numeric value index
    bytes 13-15 Unicode 1.0 name index
    bytes 16-18 comment index
    bytes 19-21 uppercase mapping index
    bytes 22-24 lowercase mapping index
    bytes 25-27 titlecase mapping index
    ==========  =====================================================

    ``code`` is currently unused: records are emitted in codepoint order, so
    the codepoint is implied by the record's position in the table.
    """
    encoded = bytearray()

    # Pack the category, bidirectional category, decomposition kind and mirrored boolean into
    # two bytes.
    flags = 0
    # BUG FIX: these used int(enum_member), which raises TypeError for plain
    # (non-Int) Enum members; use .value instead.
    flags |= category.value & 0x1f
    flags |= (bidi.value & 0x1f) << 5
    flags |= (decomp_kind.value & 0x1f) << 10
    flags |= int(mirrored) << 15
    encoded.extend(flags.to_bytes(length=2, byteorder='little'))

    # bytearray.append raises ValueError if the combining class is outside 0-255.
    encoded.append(combining)

    # 4 bits decimal digit, 4 bits digit (max real value of each is 9); the
    # nibble 0xf is the "none" bit pattern.
    decimal_nibble = 0xf if decimal_digit is None else decimal_digit & 0xf
    digit_nibble = 0xf if digit is None else digit & 0xf
    encoded.append(decimal_nibble | (digit_nibble << 4))

    # 3-byte string-table indices for every variable-length field, in the
    # order documented above.
    for index in (name, decomp, numeric_value, old_name, comment,
                  uppercase, lowercase, titlecase):
        encoded.extend(index.to_bytes())

    # BUG FIX: previously only the 2 flags bytes were written, so this assert
    # always failed; the full 28-byte layout is now emitted.
    assert len(encoded) == 28

    return bytes(encoded)


def main():
    """Parse unicode_data_latest.txt and build the binary tables."""
    with open('unicode_data_latest.txt', 'r') as fd:
        input_data = fd.read()

    char_data_table = bytearray()
    string_table = StringTable()
    # FIXME: calculate cumulative offset required for codepoints after each group
    groups = []
    in_group = False
    prev_code = None

    # Debug aid: the set of distinct numeric-value strings seen in the input.
    uniq_vals = {}

    for row in input_data.splitlines():
        [
            cell_code,
            cell_name,
            cell_category,
            cell_combining,
            cell_bidi,
            cell_decomp,
            cell_decimal_digit,
            cell_digit,
            cell_numeric,
            cell_mirrored,
            cell_old_name,
            cell_comment,
            cell_uppercase,
            cell_lowercase,
            cell_titlecase,
        ] = [cell.strip() for cell in row.split(';')]

        uniq_vals[cell_numeric] = True

        code = int(cell_code, 16)

        # The gap/group bookkeeping below relies on strictly increasing codepoints.
        # (Was an assert; raise instead so the check survives python -O.)
        if prev_code is not None and code <= prev_code:
            raise ValueError('rows must be in strictly increasing codepoint order')

        # If the previous row was the start of a group, this row should be the end of the
        # group. We now know both the start and end codepoints of the group, so we can append
        # it to the groups list.
        if in_group:
            if not (cell_name.startswith('<') and cell_name.endswith(', Last>')):
                raise ValueError('group "First" row not followed by a "Last" row')
            groups.append(Group(GroupKind.HAS_VALUE, prev_code, code))

        # If there is a gap between the previous codepoint and this codepoint, add a
        # "no value" group to the list of groups to indicate the gap.
        elif prev_code is not None and code > prev_code + 1:
            groups.append(Group(GroupKind.NO_VALUE, prev_code + 1, code - 1))

        prev_code = code

        # If we are at the end of a group, continue the loop without creating a new character
        # data entry, since the entire group uses the entry created for the start of the group.
        if in_group:
            in_group = False
            continue

        if cell_name.startswith('<') and cell_name.endswith(', First>'):
            # BUG FIX: the stripped group name was previously left as a plain str; intern it
            # so `name` is always a StringTableIndex as encode_char_data() expects.
            name = string_table.push(cell_name.removeprefix('<').removesuffix(', First>'))
            in_group = True
        else:
            name = string_table.push(cell_name)

        category = Category[cell_category.upper()]
        combining = int(cell_combining)
        bidi = Bidi[cell_bidi.upper()]

        if cell_decomp:
            # If the decomposition string starts with an angle bracket, extract the
            # decomposition kind from between the angle brackets.
            if cell_decomp.startswith('<'):
                [decomp_kind_str, decomp_str] = [
                    s.strip() for s in cell_decomp.removeprefix('<').split('>', 1)
                ]
                decomp_kind = DecompKind[decomp_kind_str.upper()]
            else:
                decomp_kind = DecompKind.ANONYMOUS
                decomp_str = cell_decomp
            # The decomposition is a series of ASCII-encoded codepoints separated by spaces,
            # so convert each of the encoded codepoints to actual characters.
            decomp = string_table.push(parse_codepoint_string(decomp_str))
        else:
            decomp_kind = DecompKind.NONE
            decomp = StringTableIndex.invalid()

        decimal_digit = int(cell_decimal_digit, 10) if cell_decimal_digit else None
        digit = int(cell_digit, 10) if cell_digit else None

        numeric_value = (
            string_table.push(cell_numeric) if cell_numeric else StringTableIndex.invalid()
        )

        mirrored = cell_mirrored == 'Y'

        old_name = (
            string_table.push(cell_old_name) if cell_old_name else StringTableIndex.invalid()
        )
        comment = (
            string_table.push(cell_comment) if cell_comment else StringTableIndex.invalid()
        )
        uppercase = (
            string_table.push(parse_codepoint_string(cell_uppercase))
            if cell_uppercase else StringTableIndex.invalid()
        )
        lowercase = (
            string_table.push(parse_codepoint_string(cell_lowercase))
            if cell_lowercase else StringTableIndex.invalid()
        )
        titlecase = (
            string_table.push(parse_codepoint_string(cell_titlecase))
            if cell_titlecase else StringTableIndex.invalid()
        )

        # BUG FIX: the encoded record was previously computed and discarded; append it to the
        # character data table.
        char_data_table.extend(encode_char_data(
            code,
            name,
            category,
            combining,
            bidi,
            decomp_kind,
            decomp,
            decimal_digit,
            digit,
            numeric_value,
            mirrored,
            old_name,
            comment,
            uppercase,
            lowercase,
            titlecase,
        ))

    print(len(string_table.to_bytes()))

    # for k in uniq_vals.keys():
    #     print(k)

    # for group in groups:
    #     print(group)


if __name__ == '__main__':
    main()