# utfdump/data.py

# # Encoded data format
# N.B.:
# - All integers are encoded in little-endian order.
# - Indices into the string table are 3 bytes. Each is a byte offset from the start of the string
#   table to an entry's length byte, which is immediately followed by the string itself.
# - The string table index 0xffffff indicates an invalid (absent) index.
#
# The overall layout of the encoded data is:
# - 8 byte magic number: UTFDUMP!
# - 4 byte group table length (in bytes)
# - 4 byte char table length (in bytes)
# - 4 byte string table length (in bytes)
# - Group table
# - Char table
# - String table
#
# ## Group table format
# Each entry is 13 bytes and consists of:
# - 4 byte start codepoint
# - 4 byte end codepoint (inclusive)
# - 4 byte cumulative length (in codepoints) of all previous groups
# - 1 byte group kind
#   - 0: no character data associated with group
#   - 1: group shares character data with codepoint immediately before the start of the group
#
# ## Char table format
# Each entry is 28 bytes and consists of:
# - 2 byte packed data
#   - First (least significant) 5 bits are general category
#   - Next 5 bits are bidirectional category
#   - Next 5 bits are decomposition kind
#   - Last bit is mirrored boolean
# - 3 byte string table index for name
# - 3 byte string table index for decomposition
# - 3 byte string table index for numeric value
# - 3 byte string table index for old Unicode 1.0 name
# - 3 byte string table index for comment
# - 3 byte string table index for uppercase
# - 3 byte string table index for lowercase
# - 3 byte string table index for titlecase
# - 1 byte combining class
# - 1 byte for decimal digit value and digit value
#   - First (least significant) 4 bits are decimal digit value (0xf if absent)
#   - Last (most significant) 4 bits are digit value (0xf if absent)
#
# ## String table format
# Each entry consists of:
# - 1 byte string length
# - UTF-8 encoded string

from enum import Enum
from struct import pack
from typing import Optional
from gzip import compress
from time import time
import http.client

# Host that serves the Unicode Character Database; the fetch step opens an HTTP connection to it.
unicode_data_host = 'www.unicode.org'
# Request path of UnicodeData.txt on that host ("latest" tracks the newest published version).
unicode_data_url_path = '/Public/UCD/latest/ucd/UnicodeData.txt'
# File the gzip-compressed encoded tables are written to, relative to the working directory.
out_data_path = 'lib/unicode_data_encoded.gz'

class StringTableIndex:
    """A 3-byte, little-endian offset into the encoded string table.

    The all-ones value 0xffffff is reserved as the "invalid index" sentinel, so real offsets must
    be strictly smaller than it.
    """

    def __init__(self, bs: bytes):
        """Wrap an already-encoded index.

        Raises:
            ValueError: if `bs` is not exactly 3 bytes long.
        """
        if len(bs) != 3:
            raise ValueError('string table index must be 3 bytes')
        self.__raw = bs

    @classmethod
    def invalid(cls):
        """Return the sentinel index (0xffffff) that marks an absent string."""
        return cls(b'\xff' * 3)

    @classmethod
    def from_int(cls, index: int):
        """Build an index from an integer byte offset.

        Raises:
            ValueError: if `index` is 0xffffff or larger, since 0xffffff is reserved for
                `invalid()`.
        """
        if 0xffffff <= index:
            raise ValueError('non-nil string table indices must be less than 0xffffff')
        raw = index.to_bytes(3, 'little')
        return cls(raw)

    def to_bytes(self):
        """Return the 3 encoded bytes of this index."""
        return self.__raw

class StringTable:
    """Append-only, de-duplicating table of length-prefixed UTF-8 strings.

    Each entry is one length byte followed by the UTF-8 bytes of the string. An entry is
    addressed by the byte offset of its length byte.
    """

    def __init__(self):
        # Serialized entries, in insertion order.
        self.__data = bytearray()
        # Maps each string already stored to the offset of its entry in __data.
        self.__offsets = {}

    def push(self, s: str) -> StringTableIndex:
        """Store `s` (once) and return the index of its entry.

        Pushing a string that is already present returns the index of the existing entry without
        growing the table.

        Raises:
            OverflowError: if the UTF-8 encoding of `s` is longer than 255 bytes, because the
                length prefix is a single byte.
            ValueError: if the entry's offset does not fit below the 0xffffff sentinel.
        """
        offset = self.__offsets.get(s)
        if offset is None:
            payload = s.encode('utf-8')
            # Encode the length first so an over-long string fails before the table is touched.
            length_prefix = len(payload).to_bytes(1, 'little')
            offset = len(self.__data)
            self.__data += length_prefix
            self.__data += payload
            self.__offsets[s] = offset
        return StringTableIndex.from_int(offset)

    def to_bytes(self) -> bytes:
        """Return an immutable copy of the serialized table."""
        return bytes(self.__data)

class Category(Enum):
    """Unicode general category, keyed by the upper-cased two-letter abbreviation.

    Members are looked up by name from the general category field of each UnicodeData.txt row.
    The numeric value is what gets written into the 5-bit category field of a char table entry,
    so values must stay below 32 and must not be renumbered without changing the encoded format.
    """

    LU = 0  # Letter, uppercase
    LL = 1  # Letter, lowercase
    LT = 2  # Letter, titlecase
    MN = 3  # Mark, nonspacing
    MC = 4  # Mark, spacing combining
    ME = 5  # Mark, enclosing
    ND = 6  # Number, decimal digit
    NL = 7  # Number, letter
    NO = 8  # Number, other
    ZS = 9  # Separator, space
    ZL = 10  # Separator, line
    ZP = 11  # Separator, paragraph
    CC = 12  # Other, control
    CF = 13  # Other, format
    CS = 14  # Other, surrogate
    CO = 15  # Other, private use
    CN = 16  # Other, not assigned
    LM = 17  # Letter, modifier
    LO = 18  # Letter, other
    PC = 19  # Punctuation, connector
    PD = 20  # Punctuation, dash
    PS = 21  # Punctuation, open
    PE = 22  # Punctuation, close
    PI = 23  # Punctuation, initial quote
    PF = 24  # Punctuation, final quote
    PO = 25  # Punctuation, other
    SM = 26  # Symbol, math
    SC = 27  # Symbol, currency
    SK = 28  # Symbol, modifier
    SO = 29  # Symbol, other

class Bidi(Enum):
    """Unicode bidirectional category, keyed by its upper-cased abbreviation.

    Members are looked up by name from the bidirectional category field of each UnicodeData.txt
    row. The numeric value is what gets written into the 5-bit bidi field of a char table entry,
    so values must stay below 32 and must not be renumbered without changing the encoded format.
    """

    L = 0  # Left-to-right
    R = 1  # Right-to-left
    AL = 2  # Arabic letter (right-to-left)
    EN = 3  # European number
    ES = 4  # European separator
    ET = 5  # European terminator
    AN = 6  # Arabic number
    CS = 7  # Common separator
    NSM = 8  # Nonspacing mark
    BN = 9  # Boundary neutral
    B = 10  # Paragraph separator
    S = 11  # Segment separator
    WS = 12  # White space
    ON = 13  # Other neutral
    LRE = 14  # Left-to-right embedding
    LRO = 15  # Left-to-right override
    RLE = 16  # Right-to-left embedding
    RLO = 17  # Right-to-left override
    PDF = 18  # Pop directional format
    LRI = 19  # Left-to-right isolate
    RLI = 20  # Right-to-left isolate
    FSI = 21  # First strong isolate
    PDI = 22  # Pop directional isolate

class DecompKind(Enum):
    """Kind of character decomposition, taken from the `<tag>` prefix of the decomposition field.

    Tagged kinds are looked up by the upper-cased tag text found between the angle brackets. The
    numeric value is what gets written into the 5-bit decomposition-kind field of a char table
    entry, so values must stay below 32 and must not be renumbered without changing the encoded
    format.
    """

    # The character has no decomposition at all (empty decomposition field).
    NONE = 0
    # The decomposition field is present but carries no `<tag>` prefix.
    ANONYMOUS = 1
    # The remaining members correspond to the tag of the same name (e.g. `<noBreak>`, `<compat>`).
    NOBREAK = 2
    COMPAT = 3
    SUPER = 4
    FRACTION = 5
    SUB = 6
    FONT = 7
    CIRCLE = 8
    WIDE = 9
    VERTICAL = 10
    SQUARE = 11
    ISOLATED = 12
    FINAL = 13
    INITIAL = 14
    MEDIAL = 15
    SMALL = 16
    NARROW = 17

class GroupKind(Enum):
    """How codepoints inside a group table entry obtain their character data.

    The numeric value is written as the 1-byte group kind of a group table entry.
    """

    # Gap between listed codepoints: no character data is associated with the range.
    NO_VALUE = 0
    # Rest of a `<..., First>`/`<..., Last>` range: every codepoint shares the character data of
    # the codepoint immediately before the start of the group.
    USE_PREV_VALUE = 1

class Group:
    """An inclusive codepoint range that has no char table entries of its own."""

    def __init__(self, kind: GroupKind, start: int, end: int):
        """Record the group's kind and its first and last (inclusive) codepoints."""
        self.__group_kind, self.__first, self.__last = kind, start, end

    def __str__(self):
        """Render as `Group(<kind>, <start hex>, <end hex>)` for debugging."""
        template = 'Group({}, {:x}, {:x})'
        return template.format(self.__group_kind, self.__first, self.__last)

    def kind(self) -> GroupKind:
        """Return how codepoints in this group obtain their character data."""
        return self.__group_kind

    def start(self) -> int:
        """Return the first codepoint of the group."""
        return self.__first

    def end(self) -> int:
        """Return the last codepoint of the group (inclusive)."""
        return self.__last

def parse_codepoint_string(cp_str: str) -> str:
    """Convert whitespace-separated hexadecimal codepoints into the string they spell.

    For example, '0041 0308' becomes 'A' followed by U+0308. An empty or all-whitespace input
    yields the empty string.

    Raises:
        ValueError: if a token is not valid hexadecimal or is outside the range `chr` accepts.
    """
    codepoints = (int(token, 16) for token in cp_str.split())
    return ''.join(map(chr, codepoints))

def encode_char_data(
    code: int,
    name: StringTableIndex,
    category: Category,
    combining: int,
    bidi: Bidi,
    decomp_kind: DecompKind,
    decomp: StringTableIndex,
    decimal_digit: Optional[int],
    digit: Optional[int],
    numeric_value: StringTableIndex,
    mirrored: bool,
    old_name: StringTableIndex,
    comment: StringTableIndex,
    uppercase: StringTableIndex,
    lowercase: StringTableIndex,
    titlecase: StringTableIndex
) -> bytes:
    """Encode one character's properties as a fixed-size 28-byte char table entry.

    Layout (all integers little-endian):
    - 2 bytes of packed flags: category (bits 0-4), bidi (bits 5-9), decomposition kind
      (bits 10-14), mirrored (bit 15)
    - eight 3-byte string table indices: name, decomposition, numeric value, old name, comment,
      uppercase, lowercase, titlecase
    - 1 byte combining class
    - 1 byte holding the decimal digit value (low nibble) and the digit value (high nibble)

    Args:
        code: The codepoint being encoded. It is not written into the entry and is unused here.
        combining: Combining class; must fit in one unsigned byte.
        decimal_digit: Decimal digit value, or None if absent (stored as 0xf).
        digit: Digit value, or None if absent (stored as 0xf).
        mirrored: Whether the character is mirrored.
        The remaining arguments are the enum members and string table indices described in the
        layout above.

    Returns:
        The 28 encoded bytes.

    Raises:
        OverflowError: if `combining` is negative or greater than 255.
    """
    encoded = bytearray()

    # Pack the category, bidirectional category, decomposition kind and mirrored boolean into two
    # bytes.
    flags = 0
    flags |= category.value & 0x1f
    flags |= (bidi.value & 0x1f) << 5
    flags |= (decomp_kind.value & 0x1f) << 10
    flags |= int(mirrored) << 15
    encoded.extend(flags.to_bytes(length=2, byteorder='little', signed=False))

    # The string table indices are written in exactly this order; it is part of the format.
    for index in (
        name,
        decomp,
        numeric_value,
        old_name,
        comment,
        uppercase,
        lowercase,
        titlecase,
    ):
        encoded.extend(index.to_bytes())

    encoded.extend(combining.to_bytes(length=1, byteorder='little', signed=False))

    # 0xf is the "absent" sentinel for both nibbles (real values are 0-9).
    if decimal_digit is None:
        decimal_digit = 0xf
    if digit is None:
        digit = 0xf
    # Mask each value to a nibble *before* shifting the digit value into the high nibble.
    # Masking after the shift (`(digit << 4) & 0xf`) would always yield 0 and drop the digit value.
    digit_vals = (decimal_digit & 0xf) | ((digit & 0xf) << 4)
    encoded.extend(digit_vals.to_bytes(length=1, byteorder='little', signed=False))

    # Internal invariant: the char table is indexed by fixed 28-byte entries.
    assert len(encoded) == 28

    return bytes(encoded)

# Download UnicodeData.txt and decode it to text in `input_data`.
print('Fetching Unicode data from {}...'.format(unicode_data_host))
start_time = time()

conn = http.client.HTTPConnection(unicode_data_host, timeout=30)
try:
    conn.request('GET', unicode_data_url_path)
    resp = conn.getresponse()
    # Anything other than a plain 200 (redirect, error page, ...) would hand a non-UnicodeData body
    # to the parser below, so fail here with a message that names the actual problem.
    if resp.status != 200:
        raise RuntimeError('unexpected HTTP response fetching {}{}: {} {}'.format(
            unicode_data_host, unicode_data_url_path, resp.status, resp.reason))
    resp_data = resp.read()
finally:
    # Always release the socket, including when the request or the status check fails.
    conn.close()
input_data = resp_data.decode('utf-8')

end_time = time()
print('Fetched Unicode data in {:.2f}s'.format(end_time - start_time))

# Output accumulators filled in by the row loop.
char_data_table = bytearray()
string_table = StringTable()
groups = []

# Parser state carried from one row to the next: the codepoint of the previous row, and whether
# that row opened a "<..., First>" range that the current row must close.
prev_code = None
in_group = False

# Collects every distinct numeric-value cell seen by the row loop (keys only).
uniq_vals = {}

# Non-blank input lines with surrounding whitespace removed.
rows = [stripped for stripped in map(str.strip, input_data.splitlines()) if stripped]

print('Encoding {} rows...'.format(len(rows)))

# Encode every UnicodeData.txt row: one 28-byte char table entry per row, except that a
# "<..., First>"/"<..., Last>" pair produces a single entry (for First) plus a group, and gaps
# between consecutive codepoints produce "no value" groups.
for row in rows:
    # NOTE(review): assigned but never read in the visible code.
    is_group_start = False

    # Each row must have exactly 15 semicolon-separated fields; the unpacking raises ValueError
    # otherwise.
    [
        cell_code,
        cell_name,
        cell_category,
        cell_combining,
        cell_bidi,
        cell_decomp,
        cell_decimal_digit,
        cell_digit,
        cell_numeric,
        cell_mirrored,
        cell_old_name,
        cell_comment,
        cell_uppercase,
        cell_lowercase,
        cell_titlecase
    ] = [cell.strip() for cell in row.split(';')]

    # NOTE(review): written here but never read in the visible code.
    uniq_vals[cell_numeric] = True

    code = int(cell_code, 16)

    # Rows are expected in strictly increasing codepoint order; the gap and group arithmetic below
    # relies on it.
    assert prev_code is None or prev_code < code

    # If the previous row was the start of a group, this row should be the end of the group. We now
    # know both the start and end codepoints of the group, so we can append it to the groups list.
    # The group starts one past the First codepoint because First itself gets a char table entry.
    if in_group:
        assert cell_name.startswith('<') and cell_name.endswith(', Last>')
        groups.append(Group(GroupKind.USE_PREV_VALUE, prev_code + 1, code))

    # If there is a gap between the previous codepoint and this codepoint, add a "no value" group
    # to the list of groups to indicate the gap.
    elif prev_code is not None and code > prev_code + 1:
        groups.append(Group(GroupKind.NO_VALUE, prev_code + 1, code - 1))

    prev_code = code

    # If we are at the end of a group, continue the loop without creating a new character data
    # entry, since the entire group uses the entry created for the start of the group.
    if in_group:
        in_group = False
        continue

    # NOTE: the order of the string_table.push() calls below determines each string's offset in
    # the string table, so reordering them changes the encoded output.
    if cell_name.startswith('<') and cell_name.endswith(', First>'):
        # Range start: store the bare range name (without "<" and ", First>") and remember that
        # the next row must be the matching ", Last>" row.
        name = string_table.push(cell_name.removeprefix('<').removesuffix(', First>'))
        in_group = True
    else:
        name = string_table.push(cell_name)

    # Enum members are looked up by the upper-cased abbreviation; an unknown abbreviation raises
    # KeyError.
    category = Category[cell_category.upper()]
    combining = int(cell_combining)
    bidi = Bidi[cell_bidi.upper()]

    if cell_decomp:
        # If the decomposition string starts with an angle bracket, extract the decomposition kind
        # from between the angle brackets.
        if cell_decomp.startswith('<'):
            [
                decomp_kind_str,
                decomp_str
            ] = [s.strip() for s in cell_decomp.removeprefix('<').split('>', 1)]
            decomp_kind = DecompKind[decomp_kind_str.upper()]
        else:
            decomp_kind = DecompKind.ANONYMOUS
            decomp_str = cell_decomp
        # The decomposition is a series of ASCII-encoded codepoints separated by spaces, so split
        # the decomposition string by whitespace and convert each of the encoded codepoints to
        # actual characters.
        decomp = string_table.push(parse_codepoint_string(decomp_str))
    else:
        decomp_kind = DecompKind.NONE
        decomp = StringTableIndex.invalid()

    # Absent digit values are passed as None; encode_char_data substitutes its own sentinel.
    if cell_decimal_digit:
        decimal_digit = int(cell_decimal_digit, 10)
    else:
        decimal_digit = None

    if cell_digit:
        digit = int(cell_digit, 10)
    else:
        digit = None

    # The numeric value is stored verbatim as text, not parsed into a number.
    if cell_numeric:
        numeric_value = string_table.push(cell_numeric)
    else:
        numeric_value = StringTableIndex.invalid()

    # Any value other than exactly 'Y' is treated as not mirrored.
    mirrored = cell_mirrored == 'Y'

    # For the optional text fields below, an empty cell is encoded as the invalid index rather
    # than as an empty string entry.
    if cell_old_name:
        old_name = string_table.push(cell_old_name)
    else:
        old_name = StringTableIndex.invalid()

    if cell_comment:
        comment = string_table.push(cell_comment)
    else:
        comment = StringTableIndex.invalid()

    # Case mappings are given as hexadecimal codepoints; store the actual characters instead.
    if cell_uppercase:
        uppercase = string_table.push(parse_codepoint_string(cell_uppercase))
    else:
        uppercase = StringTableIndex.invalid()

    if cell_lowercase:
        lowercase = string_table.push(parse_codepoint_string(cell_lowercase))
    else:
        lowercase = StringTableIndex.invalid()

    if cell_titlecase:
        titlecase = string_table.push(parse_codepoint_string(cell_titlecase))
    else:
        titlecase = StringTableIndex.invalid()

    # Append this character's fixed-size entry to the char table.
    char_data_table.extend(encode_char_data(
        code,
        name,
        category,
        combining,
        bidi,
        decomp_kind,
        decomp,
        decimal_digit,
        digit,
        numeric_value,
        mirrored,
        old_name,
        comment,
        uppercase,
        lowercase,
        titlecase
    ))

# Serialize the groups into 13-byte entries: start, end and the running total of codepoints
# covered by all earlier groups (4 bytes each, little-endian), followed by a 1-byte kind.
group_table = bytearray()
cumulative_offset = 0

for group in groups:
    first, last = group.start(), group.end()
    fields = (
        (first, 4),
        (last, 4),
        # Sum of the lengths of all groups before this one.
        (cumulative_offset, 4),
        (group.kind().value, 1),
    )

    group_table_entry = bytearray()
    for value, width in fields:
        group_table_entry += value.to_bytes(length=width, byteorder='little', signed=False)
    assert len(group_table_entry) == 13
    group_table += group_table_entry

    # Both ends are inclusive, hence the + 1.
    cumulative_offset += last - first + 1

# From here on `string_table` is the serialized bytes, not the StringTable builder.
string_table = string_table.to_bytes()

# File layout: magic, then the byte length of each table as a 4-byte little-endian integer, then
# the tables themselves in the same order.
tables = (group_table, char_data_table, string_table)

encoded_data = bytearray(b'UTFDUMP!')
for table in tables:
    encoded_data += len(table).to_bytes(length=4, byteorder='little', signed=False)
for table in tables:
    encoded_data += table

compressed_data = compress(encoded_data)

print('Writing encoded data to {}...'.format(out_data_path))

with open(out_data_path, 'wb') as fd:
    fd.write(compressed_data)

print('Done!')