You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
452 lines
13 KiB
Python
452 lines
13 KiB
Python
# # Encoded data format
|
|
# N.B:
|
|
# - All integers are encoded in little-endian order
|
|
# - Indices into the string table are 3 bytes wide. Each one is the byte offset of an entry's
#   length byte within the string table; the string itself immediately follows that length byte.
|
|
# - The string table index 0xffffff indicates an invalid index.
|
|
#
|
|
# The overall layout of the encoded data is:
|
|
# - 8 byte magic number: UTFDUMP!
|
|
# - 4 byte group table length (in bytes)
|
|
# - 4 byte char table length (in bytes)
|
|
# - 4 byte string table length (in bytes)
|
|
# - Group table
|
|
# - Char table
|
|
# - String table
|
|
#
|
|
# ## Group table format
|
|
# Each entry is 13 bytes and consists of:
|
|
# - 4 byte start codepoint
|
|
# - 4 byte end codepoint (inclusive)
|
|
# - 4 byte cumulative length of all previous groups (the total number of codepoints they cover)
|
|
# - 1 byte group kind
|
|
# - 0: no character data associated with group
|
|
# - 1: group shares character data with codepoint immediately before the start of the group
|
|
#
|
|
# ## Char table format
|
|
# Each entry is 28 bytes and consists of:
|
|
# - 2 byte packed data
|
|
# - First (least significant) 5 bits are general category
|
|
# - Next 5 bits are bidirectional category
|
|
# - Next 5 bits are decomposition kind
|
|
# - Last bit is mirrored boolean
|
|
# - 3 byte string table index for name
|
|
# - 3 byte string table index for decomposition
|
|
# - 3 byte string table index for numeric value
|
|
# - 3 byte string table index for old Unicode 1.0 name
|
|
# - 3 byte string table index for comment
|
|
# - 3 byte string table index for uppercase
|
|
# - 3 byte string table index for lowercase
|
|
# - 3 byte string table index for titlecase
|
|
# - 1 byte combining class
|
|
# - 1 byte for decimal digit value and digit value
|
|
# - First (least significant) 4 bits are decimal digit value
|
|
# - Last 4 bits are digit value
|
|
#
|
|
# ## String table format
|
|
# Each entry consists of:
|
|
# - 1 byte string length
|
|
# - UTF-8 encoded string
|
|
|
|
from enum import Enum
|
|
from struct import pack
|
|
from typing import Optional
|
|
from gzip import compress
|
|
from time import time
|
|
import http.client
|
|
|
|
# Destination of the gzip-compressed encoded data. The path is relative, so it is resolved against
# the directory the script is run from.
out_data_path = 'lib/unicode_data_encoded.gz'

# Host name and request path from which UnicodeData.txt is downloaded.
unicode_data_host = 'www.unicode.org'
unicode_data_url_path = '/Public/UCD/latest/ucd/UnicodeData.txt'
|
|
|
|
class StringTableIndex:
    """A 3-byte reference to an entry in the string table.

    The bytes are held exactly as they will be written to the output. The byte pattern
    ``ff ff ff`` is reserved as the "invalid index" sentinel; use :meth:`invalid` to obtain it.
    """

    def __init__(self, bs: bytes):
        """Wrap the already-encoded index bytes ``bs``.

        Raises:
            ValueError: if ``bs`` is not exactly 3 bytes long.
        """
        if len(bs) != 3:
            raise ValueError('string table index must be 3 bytes')
        self.__bs = bs

    @classmethod
    def invalid(cls):
        """Return the sentinel index that marks "no string"."""
        return cls(b'\xff' * 3)

    @classmethod
    def from_int(cls, index: int):
        """Build an index from an integer offset, encoded as 3 little-endian bytes.

        Raises:
            ValueError: if ``index`` is 0xffffff or larger, since 0xffffff itself is the
                "invalid index" sentinel and anything larger does not fit in 3 bytes.
        """
        if index >= 0xffffff:
            raise ValueError('non-nil string table indices must be less than 0xffffff')
        raw = index.to_bytes(3, 'little')
        return cls(raw)

    def to_bytes(self):
        """Return the 3 raw bytes of this index."""
        return self.__bs
|
|
|
|
class StringTable:
    """An append-only, de-duplicating table of length-prefixed UTF-8 strings.

    Each distinct string is stored once as a single length byte followed by its UTF-8 bytes.
    Pushing a string that is already present returns the index of the existing entry.
    """

    def __init__(self):
        # Serialized entries, in insertion order.
        self.__buf = bytearray()
        # Maps each stored string to the offset of its length byte within the buffer.
        self.__map = {}

    def push(self, s: str) -> StringTableIndex:
        """Store ``s`` (if not already stored) and return the index of its entry.

        The returned index is the byte offset of the entry's length byte. Because the length is
        written as a single byte, a string whose UTF-8 encoding is longer than 255 bytes makes
        ``int.to_bytes`` raise ``OverflowError`` before anything is stored.
        """
        offset = self.__map.get(s)
        if offset is None:
            payload = s.encode('utf-8')
            length_prefix = len(payload).to_bytes(1, 'little')
            offset = len(self.__buf)
            self.__buf += length_prefix
            self.__buf += payload
            self.__map[s] = offset
        return StringTableIndex.from_int(offset)

    def to_bytes(self) -> bytes:
        """Return an immutable copy of the serialized table."""
        return bytes(self.__buf)
|
|
|
|
class Category(Enum):
    """General category of a character.

    Member names are the upper-cased category abbreviations, so a member can be looked up by
    name from the data file. The numeric values are what gets packed into the 5-bit category
    field of a char table entry, so they must not be renumbered.
    """

    # Cased letters.
    LU, LL, LT = 0, 1, 2
    # Marks.
    MN, MC, ME = 3, 4, 5
    # Numbers.
    ND, NL, NO = 6, 7, 8
    # Separators.
    ZS, ZL, ZP = 9, 10, 11
    # Other (control, format, surrogate, private use, unassigned).
    CC, CF, CS, CO, CN = 12, 13, 14, 15, 16
    # Remaining letters.
    LM, LO = 17, 18
    # Punctuation.
    PC, PD, PS, PE, PI, PF, PO = 19, 20, 21, 22, 23, 24, 25
    # Symbols.
    SM, SC, SK, SO = 26, 27, 28, 29
|
|
|
|
class Bidi(Enum):
    """Bidirectional category of a character.

    Member names are the upper-cased abbreviations, so a member can be looked up by name from
    the data file. The numeric values are what gets packed into the 5-bit bidirectional category
    field of a char table entry, so they must not be renumbered.
    """

    # Strong types.
    L, R, AL = 0, 1, 2
    # Weak types.
    EN, ES, ET, AN, CS, NSM, BN = 3, 4, 5, 6, 7, 8, 9
    # Neutral types.
    B, S, WS, ON = 10, 11, 12, 13
    # Explicit embedding / override controls.
    LRE, LRO, RLE, RLO, PDF = 14, 15, 16, 17, 18
    # Isolate controls.
    LRI, RLI, FSI, PDI = 19, 20, 21, 22
|
|
|
|
class DecompKind(Enum):
    """Kind of decomposition recorded for a character.

    Apart from ``NONE`` and ``ANONYMOUS``, member names are the upper-cased decomposition tags
    so that a member can be looked up by name. The numeric values are what gets packed into the
    5-bit decomposition kind field of a char table entry, so they must not be renumbered.
    """

    # NONE: the character has no decomposition. ANONYMOUS: it has one without a <tag>.
    NONE, ANONYMOUS = 0, 1
    # Tagged decompositions.
    NOBREAK, COMPAT, SUPER, FRACTION = 2, 3, 4, 5
    SUB, FONT, CIRCLE, WIDE = 6, 7, 8, 9
    VERTICAL, SQUARE, ISOLATED, FINAL = 10, 11, 12, 13
    INITIAL, MEDIAL, SMALL, NARROW = 14, 15, 16, 17
|
|
|
|
class GroupKind(Enum):
    """How the codepoints of a group relate to the char table.

    The numeric value is written as the 1-byte group kind of a group table entry.
    """

    # NO_VALUE: no character data is associated with the group.
    # USE_PREV_VALUE: the group shares the character data of the codepoint just before its start.
    NO_VALUE, USE_PREV_VALUE = 0, 1
|
|
|
|
class Group:
    """An inclusive range of codepoints tagged with a :class:`GroupKind`."""

    def __init__(self, kind: GroupKind, start: int, end: int):
        """Record the kind plus the first (``start``) and last (``end``) codepoint of the range."""
        self.__kind, self.__start, self.__end = kind, start, end

    def kind(self) -> GroupKind:
        """Return the kind of this group."""
        return self.__kind

    def start(self) -> int:
        """Return the first codepoint of the group."""
        return self.__start

    def end(self) -> int:
        """Return the last codepoint of the group (inclusive)."""
        return self.__end

    def __str__(self):
        # Codepoints are shown in lowercase hexadecimal without a prefix.
        return f'Group({self.__kind}, {self.__start:x}, {self.__end:x})'
|
|
|
|
def parse_codepoint_string(cp_str: str) -> str:
    """Convert whitespace-separated hexadecimal codepoints into the string they spell.

    For example ``'0048 0069'`` becomes ``'Hi'``. An empty or all-whitespace input yields an
    empty string; a token that is not valid hexadecimal makes ``int`` raise ``ValueError``.
    """
    codepoints = (int(token, 16) for token in cp_str.split())
    return ''.join(map(chr, codepoints))
|
|
|
|
def encode_char_data(
    code: int,
    name: StringTableIndex,
    category: Category,
    combining: int,
    bidi: Bidi,
    decomp_kind: DecompKind,
    decomp: StringTableIndex,
    decimal_digit: Optional[int],
    digit: Optional[int],
    numeric_value: StringTableIndex,
    mirrored: bool,
    old_name: StringTableIndex,
    comment: StringTableIndex,
    uppercase: StringTableIndex,
    lowercase: StringTableIndex,
    titlecase: StringTableIndex
) -> bytes:
    """Encode the properties of one character as a 28-byte char table entry.

    Layout of the returned bytes (all integers little-endian):

    - 2 bytes of packed flags: category (bits 0-4), bidirectional category (bits 5-9),
      decomposition kind (bits 10-14) and the mirrored flag (bit 15)
    - eight 3-byte string table indices, in this order: name, decomposition, numeric value,
      old name, comment, uppercase, lowercase, titlecase
    - 1 byte combining class
    - 1 byte holding the decimal digit value in the low nibble and the digit value in the high
      nibble; a nibble of 0xf means the value is absent

    Args:
        code: The codepoint being encoded. It is accepted for the caller's convenience but is
            not stored in the entry.
        name: String table index of the character name.
        category: General category.
        combining: Combining class; must fit in one unsigned byte.
        bidi: Bidirectional category.
        decomp_kind: Kind of decomposition.
        decomp: String table index of the decomposition.
        decimal_digit: Decimal digit value, or ``None`` if the character has none.
        digit: Digit value, or ``None`` if the character has none.
        numeric_value: String table index of the numeric value.
        mirrored: Whether the character is mirrored.
        old_name: String table index of the old Unicode 1.0 name.
        comment: String table index of the comment.
        uppercase: String table index of the uppercase mapping.
        lowercase: String table index of the lowercase mapping.
        titlecase: String table index of the titlecase mapping.

    Returns:
        The 28-byte encoded entry.

    Raises:
        OverflowError: if ``combining`` does not fit in one unsigned byte.
    """
    encoded = bytearray()

    # Pack the category, bidirectional category, decomposition kind and mirrored boolean into two
    # bytes.
    flags = (
        (category.value & 0x1f)
        | ((bidi.value & 0x1f) << 5)
        | ((decomp_kind.value & 0x1f) << 10)
        | (int(mirrored) << 15)
    )
    encoded.extend(flags.to_bytes(length=2, byteorder='little', signed=False))

    # The order of the indices here defines the on-disk field order.
    for index in (
        name,
        decomp,
        numeric_value,
        old_name,
        comment,
        uppercase,
        lowercase,
        titlecase,
    ):
        encoded.extend(index.to_bytes())

    encoded.extend(combining.to_bytes(length=1, byteorder='little', signed=False))

    # 0xf in a nibble marks "no value".
    if decimal_digit is None:
        decimal_digit = 0xf
    if digit is None:
        digit = 0xf
    # Mask each value to a nibble first and only then shift the digit value into the high nibble.
    # Masking after the shift (as `(digit << 4) & 0xf`) would always produce zero and lose the
    # digit value entirely.
    digit_vals = (decimal_digit & 0xf) | ((digit & 0xf) << 4)
    encoded.extend(digit_vals.to_bytes(length=1, byteorder='little', signed=False))

    # Internal invariant: the field widths above must add up to the documented entry size.
    assert len(encoded) == 28

    return bytes(encoded)
|
|
|
|
# Download UnicodeData.txt and decode it into `input_data`, timing the transfer.
print('Fetching Unicode data from {}...'.format(unicode_data_host))
start_time = time()

# Fetch over HTTPS so the data cannot be altered in transit.
conn = http.client.HTTPSConnection(unicode_data_host, timeout=30)
try:
    conn.request('GET', unicode_data_url_path)
    resp = conn.getresponse()
    # Anything other than 200 (an error page, a redirect, ...) has a body that is not the data
    # file. Fail here with a clear message rather than later while parsing that body.
    if resp.status != 200:
        raise RuntimeError('unexpected HTTP status {} {} while fetching https://{}{}'.format(
            resp.status,
            resp.reason,
            unicode_data_host,
            unicode_data_url_path
        ))
    resp_data = resp.read()
finally:
    # Release the socket whether or not the request succeeded.
    conn.close()
input_data = resp_data.decode('utf-8')

end_time = time()
print('Fetched Unicode data in {:.2f}s'.format(end_time - start_time))
|
|
|
|
def _push_or_invalid(table: StringTable, text: str) -> StringTableIndex:
    """Store ``text`` in ``table`` and return its index; empty ``text`` maps to the invalid index."""
    if text:
        return table.push(text)
    return StringTableIndex.invalid()


# Number of semicolon-separated fields unpacked from every row of the input.
_ROW_FIELD_COUNT = 15

# Concatenated 28-byte entries, one per row that carries its own character data.
char_data_table = bytearray()
string_table = StringTable()
# Codepoint ranges that have no entry of their own in `char_data_table`.
groups = []
# True while the previous row was a '<..., First>' row whose '<..., Last>' row is still pending.
in_group = False
prev_code = None

# Drop blank lines; every remaining line is treated as one data row.
rows = [row for row in map(str.strip, input_data.splitlines()) if row]

print('Encoding {} rows...'.format(len(rows)))

for row in rows:
    cells = [cell.strip() for cell in row.split(';')]
    if len(cells) != _ROW_FIELD_COUNT:
        raise ValueError('expected {} fields but found {} in row: {!r}'.format(
            _ROW_FIELD_COUNT,
            len(cells),
            row
        ))
    (
        cell_code,
        cell_name,
        cell_category,
        cell_combining,
        cell_bidi,
        cell_decomp,
        cell_decimal_digit,
        cell_digit,
        cell_numeric,
        cell_mirrored,
        cell_old_name,
        cell_comment,
        cell_uppercase,
        cell_lowercase,
        cell_titlecase
    ) = cells

    code = int(cell_code, 16)

    # The group bookkeeping below relies on rows arriving in strictly increasing codepoint order.
    # This is input validation, so raise instead of using an assert (asserts vanish under -O).
    if prev_code is not None and code <= prev_code:
        raise ValueError('codepoints are not strictly increasing: {:04X} follows {:04X}'.format(
            code,
            prev_code
        ))

    # If the previous row was the start of a group, this row should be the end of the group. We now
    # know both the start and end codepoints of the group, so we can append it to the groups list.
    # No new character data entry is created, since the entire group uses the entry created for
    # the start of the group.
    if in_group:
        if not (cell_name.startswith('<') and cell_name.endswith(', Last>')):
            raise ValueError('expected a "<..., Last>" row after {:04X} but found: {!r}'.format(
                prev_code,
                row
            ))
        groups.append(Group(GroupKind.USE_PREV_VALUE, prev_code + 1, code))
        prev_code = code
        in_group = False
        continue

    # If there is a gap between the previous codepoint and this codepoint, add a "no value" group
    # to the list of groups to indicate the gap.
    if prev_code is not None and code > prev_code + 1:
        groups.append(Group(GroupKind.NO_VALUE, prev_code + 1, code - 1))

    prev_code = code

    # NOTE: the order of the string table pushes from here to the end of the loop body determines
    # the layout of the string table, so keep it stable: name, decomposition, numeric value, old
    # name, comment, uppercase, lowercase, titlecase.
    if cell_name.startswith('<') and cell_name.endswith(', First>'):
        # Start of a group: store the bare name and remember that the next row must close it.
        name = string_table.push(cell_name.removeprefix('<').removesuffix(', First>'))
        in_group = True
    else:
        name = string_table.push(cell_name)

    category = Category[cell_category.upper()]
    combining = int(cell_combining)
    bidi = Bidi[cell_bidi.upper()]

    if cell_decomp:
        # If the decomposition string starts with an angle bracket, extract the decomposition kind
        # from between the angle brackets.
        if cell_decomp.startswith('<'):
            decomp_kind_str, decomp_str = [
                s.strip() for s in cell_decomp.removeprefix('<').split('>', 1)
            ]
            decomp_kind = DecompKind[decomp_kind_str.upper()]
        else:
            decomp_kind = DecompKind.ANONYMOUS
            decomp_str = cell_decomp
        # The decomposition is a series of ASCII-encoded codepoints separated by spaces, so convert
        # each of the encoded codepoints to actual characters.
        decomp = string_table.push(parse_codepoint_string(decomp_str))
    else:
        decomp_kind = DecompKind.NONE
        decomp = StringTableIndex.invalid()

    decimal_digit = int(cell_decimal_digit, 10) if cell_decimal_digit else None
    digit = int(cell_digit, 10) if cell_digit else None

    numeric_value = _push_or_invalid(string_table, cell_numeric)

    mirrored = cell_mirrored == 'Y'

    old_name = _push_or_invalid(string_table, cell_old_name)
    comment = _push_or_invalid(string_table, cell_comment)

    # The case mappings are hexadecimal codepoints; an empty cell parses to an empty string, which
    # in turn maps to the invalid index.
    uppercase = _push_or_invalid(string_table, parse_codepoint_string(cell_uppercase))
    lowercase = _push_or_invalid(string_table, parse_codepoint_string(cell_lowercase))
    titlecase = _push_or_invalid(string_table, parse_codepoint_string(cell_titlecase))

    char_data_table.extend(encode_char_data(
        code,
        name,
        category,
        combining,
        bidi,
        decomp_kind,
        decomp,
        decimal_digit,
        digit,
        numeric_value,
        mirrored,
        old_name,
        comment,
        uppercase,
        lowercase,
        titlecase
    ))

# A '<..., First>' row on the very last line would otherwise be silently encoded without its group.
if in_group:
    raise ValueError('group starting at {:04X} has no matching "<..., Last>" row'.format(prev_code))
|
|
|
|
# Serialize the groups into 13-byte group table entries.
group_table = bytearray()
# Running total of the number of codepoints covered by the groups written so far.
cumulative_offset = 0

for group in groups:
    first, last = group.start(), group.end()

    # (value, width in bytes) for every field of the entry, in on-disk order. The third field is
    # the sum of the lengths of all groups before this one.
    fields = (
        (first, 4),
        (last, 4),
        (cumulative_offset, 4),
        (group.kind().value, 1),
    )
    group_table_entry = bytearray()
    for value, width in fields:
        group_table_entry += value.to_bytes(width, 'little')
    assert len(group_table_entry) == 13
    group_table += group_table_entry

    # Both ends of a group are inclusive, hence the + 1.
    cumulative_offset += last - first + 1
|
|
|
|
# From here on `string_table` holds the serialized bytes instead of the StringTable object.
string_table = string_table.to_bytes()

# The three tables in the order they appear in the output. The header lists their lengths (in
# bytes) in this same order, right after the magic number.
sections = (group_table, char_data_table, string_table)

encoded_data = bytearray(b'UTFDUMP!')
for section in sections:
    encoded_data += len(section).to_bytes(4, 'little')
for section in sections:
    encoded_data += section

compressed_data = compress(encoded_data)

print(f'Writing encoded data to {out_data_path}...')

with open(out_data_path, 'wb') as fd:
    fd.write(compressed_data)

print('Done!')
|