🎉 start new encoder for unicode data

3 years ago · d213c81bf6
parent ef6765e037
commit d213c81bf6
2 changed files with 348 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,5 @@
 /target
 /build
 /build.sh
+__pycache__
 unicode_data_latest.txt
--- a/data.py
+++ b/data.py
@ -0,0 +1,347 @@
+from enum import Enum
+from struct import pack
+from typing import Optional
+
+# FIXME:
+# Rather than creating many duplicated entries for "first / last" blocks, include a series of
+# rules in the encoded file about what regions are duplicated / what ranges of codepoints need
+# to be offset when using them to index into the table.
+
+class StringTableIndex:
+    def __init__(self, bs: bytes):
+        if len(bs) != 3:
+            raise ValueError('string table index must be 3 bytes')
+        self.__bs = bs
+
+    @classmethod
+    def from_int(cls, index: int):
+        # Ensure that the index is less than 0xffffff, because 0xffffff is the "invalid index"
+        # sentinel value.
+        if index >= 0xffffff:
+            raise ValueError('non-nil string table indices must be less than 0xffffff')
+        return cls(index.to_bytes(length=3, byteorder='little', signed=False))
+
+    @classmethod
+    def invalid(cls):
+        return cls(b'\xff\xff\xff')
+
+    def to_bytes(self):
+        return self.__bs
+
+class StringTable:
+    def __init__(self):
+        self.__buf = bytearray()
+        self.__map = {}
+
+    def push(self, s: str) -> StringTableIndex:
+        if s in self.__map:
+            return self.__map[s]
+        
+        insert_pos = len(self.__buf)
+        s_bytes = s.encode(encoding='utf-8')
+        s_len_bytes = len(s_bytes).to_bytes(length=1, byteorder='little', signed=False)
+        self.__buf.extend(s_len_bytes)
+        self.__buf.extend(s_bytes)
+        self.__map[s] = insert_pos
+        return StringTableIndex.from_int(insert_pos)
+
+    def to_bytes(self) -> bytes:
+        return bytes(self.__buf)
+
+class Category(Enum):
+    LU = 0
+    LL = 1
+    LT = 2
+    MN = 3
+    MC = 4
+    ME = 5
+    ND = 6
+    NL = 7
+    NO = 8
+    ZS = 9
+    ZL = 10
+    ZP = 11
+    CC = 12
+    CF = 13
+    CS = 14
+    CO = 15
+    CN = 16
+    LM = 17
+    LO = 18
+    PC = 19
+    PD = 20
+    PS = 21
+    PE = 22
+    PI = 23
+    PF = 24
+    PO = 25
+    SM = 26
+    SC = 27
+    SK = 28
+    SO = 29
+
+class Bidi(Enum):
+    L = 0
+    R = 1
+    AL = 2
+    EN = 3
+    ES = 4
+    ET = 5
+    AN = 6
+    CS = 7
+    NSM = 8
+    BN = 9
+    B = 10
+    S = 11
+    WS = 12
+    ON = 13
+    LRE = 14
+    LRO = 15
+    RLE = 16
+    RLO = 17
+    PDF = 18
+    LRI = 19
+    RLI = 20
+    FSI = 21
+    PDI = 22
+
+class DecompKind(Enum):
+    NONE = 0
+    ANONYMOUS = 1
+    NOBREAK = 2
+    COMPAT = 3
+    SUPER = 4
+    FRACTION = 5
+    SUB = 6
+    FONT = 7
+    CIRCLE = 8
+    WIDE = 9
+    VERTICAL = 10
+    SQUARE = 11
+    ISOLATED = 12
+    FINAL = 13
+    INITIAL = 14
+    MEDIAL = 15
+    SMALL = 16
+    NARROW = 17
+
+class GroupKind(Enum):
+    NO_VALUE = 0
+    HAS_VALUE = 1
+
+class Group:
+    def __init__(self, kind: GroupKind, start: int, end: int):
+        self.__kind = kind
+        self.__start = start
+        self.__end = end
+
+    def __str__(self):
+        return 'Group({}, {:x}, {:x})'.format(self.__kind, self.__start, self.__end)
+
+    def kind(self) -> GroupKind:
+        return self.__kind
+
+    def start(self) -> int:
+        return self.__start
+
+    def end(self) -> int:
+        return self.__end
+
+def parse_codepoint_string(cp_str: str) -> str:
+    return ''.join([chr(int(cp, 16)) for cp in cp_str.split()])
+
+def encode_char_data(
+    code: int,
+    name: StringTableIndex,
+    category: Category,
+    combining: int,
+    bidi: Bidi,
+    decomp_kind: DecompKind,
+    decomp: StringTableIndex,
+    decimal_digit: Optional[int],
+    digit: Optional[int],
+    numeric_value: StringTableIndex,
+    mirrored: bool,
+    old_name: StringTableIndex,
+    comment: StringTableIndex,
+    uppercase: StringTableIndex,
+    lowercase: StringTableIndex,
+    titlecase: StringTableIndex
+) -> bytes:
+    # TODO: use a single "flags" byte to store:
+    # - mirrored
+    # - decomp_kind (5 bits needed)
+
+    encoded = bytearray()
+
+    # Pack the category, bidirectional category, decomposition kind and mirrored boolean into two
+    # bytes.
+    flags = 0
+    flags |= int(category) & 0x1f
+    flags |= (int(bidi) & 0x1f) << 5
+    flags |= (int(decomp_kind) & 0x1f) << 10
+    flags |= int(mirrored) << 15
+    encoded.extend(flags.to_bytes(length=2, byteorder='little'))
+
+    # 4 bits decimal digit, 4 bits digit (max is 9). Both need bit patterns for "none"
+
+    assert len(encoded) == 28
+
+    return bytes(encoded)
+
+with open('unicode_data_latest.txt', 'r') as fd:
+    input_data = fd.read()
+
+char_data_table = bytearray()
+string_table = StringTable()
+# FIXME: calculate cumulative offset required for codepoints after each group
+groups = []
+in_group = False
+prev_code = None
+
+uniq_vals = {}
+
+for row in input_data.splitlines():    
+    is_group_start = False
+
+    [
+        cell_code,
+        cell_name,
+        cell_category,
+        cell_combining,
+        cell_bidi,
+        cell_decomp,
+        cell_decimal_digit,
+        cell_digit,
+        cell_numeric,
+        cell_mirrored,
+        cell_old_name,
+        cell_comment,
+        cell_uppercase,
+        cell_lowercase,
+        cell_titlecase
+    ] = [cell.strip() for cell in row.split(';')]
+
+    uniq_vals[cell_numeric] = True
+
+    code = int(cell_code, 16)
+
+    assert prev_code is None or prev_code < code
+
+    # If the previous row was the start of a group, this row should be the end of the group. We now
+    # know both the start and end codepoints of the group, so we can append it to the groups list.
+    if in_group:
+        assert cell_name.startswith('<') and cell_name.endswith(', Last>')
+        groups.append(Group(GroupKind.HAS_VALUE, prev_code, code))
+
+    # If there is a gap between the previous codepoint and this codepoint, add a "no value" group
+    # to the list of groups to indicate the gap.
+    elif prev_code is not None and code > prev_code + 1:
+        groups.append(Group(GroupKind.NO_VALUE, prev_code + 1, code - 1))
+    
+    prev_code = code
+
+    # If we are at the end of a group, continue the loop without creating a new character data
+    # entry, since the entire group uses the entry created for the start of the group.
+    if in_group:
+        in_group = False
+        continue
+
+    if cell_name.startswith('<') and cell_name.endswith(', First>'):
+        name = cell_name.removeprefix('<').removesuffix(', First>')
+        in_group = True
+    else:
+        name = string_table.push(cell_name)
+    
+    category = Category[cell_category.upper()]
+    combining = int(cell_combining)
+    bidi = Bidi[cell_bidi.upper()]
+
+    if cell_decomp:
+        # If the decomposition string starts with an angle bracket, extract the decomposition kind
+        # from between the angle brackets.
+        if cell_decomp.startswith('<'):
+            [
+                decomp_kind_str,
+                decomp_str
+            ] = [s.strip() for s in cell_decomp.removeprefix('<').split('>', 1)]
+            decomp_kind = DecompKind[decomp_kind_str.upper()]
+        else:
+            decomp_kind = DecompKind.ANONYMOUS
+            decomp_str = cell_decomp
+        # The decomposition is a series of ASCII-encoded codepoints separated by spaces, so split
+        # the decomposition string by whitespace and convert each of the encoded codepoints to
+        # actual characters.
+        decomp = string_table.push(parse_codepoint_string(decomp_str))
+    else:
+        decomp_kind = DecompKind.NONE
+        decomp = StringTableIndex.invalid()
+
+    if cell_decimal_digit:
+        decimal_digit = int(cell_decimal_digit, 10)
+    else:
+        decimal_digit = None
+
+    if cell_digit:
+        digit = int(cell_digit, 10)
+    else:
+        digit = None
+
+    if cell_numeric:
+        numeric_value = string_table.push(cell_numeric)
+    else:
+        numeric_value = StringTableIndex.invalid()
+
+    mirrored = cell_mirrored == 'Y'
+
+    if cell_old_name:
+        old_name = string_table.push(cell_old_name)
+    else:
+        old_name = StringTableIndex.invalid()
+
+    if cell_comment:
+        comment = string_table.push(cell_comment)
+    else:
+        comment = StringTableIndex.invalid()
+
+    if cell_uppercase:
+        uppercase = string_table.push(parse_codepoint_string(cell_uppercase))
+    else:
+        uppercase = StringTableIndex.invalid()
+
+    if cell_lowercase:
+        lowercase = string_table.push(parse_codepoint_string(cell_lowercase))
+    else:
+        lowercase = StringTableIndex.invalid()
+
+    if cell_titlecase:
+        titlecase = string_table.push(parse_codepoint_string(cell_titlecase))
+    else:
+        titlecase = StringTableIndex.invalid()
+
+    encoded = encode_char_data(
+        code,
+        name,
+        category,
+        combining,
+        bidi,
+        decomp_kind,
+        decomp,
+        decimal_digit,
+        digit,
+        numeric_value,
+        mirrored,
+        old_name,
+        comment,
+        uppercase,
+        lowercase,
+        titlecase
+    )
+
+print(len(string_table.to_bytes()))
+
+# for k in uniq_vals.keys():
+#     print(k)
+
+# for group in groups:
+#     print(group)