🎉 start new encoder for unicode data
parent
ef6765e037
commit
d213c81bf6
@ -1,4 +1,5 @@
|
||||
/target
|
||||
/build
|
||||
/build.sh
|
||||
__pycache__
|
||||
unicode_data_latest.txt
|
||||
|
||||
@ -0,0 +1,347 @@
|
||||
from enum import Enum
|
||||
from struct import pack
|
||||
from typing import Optional
|
||||
|
||||
# FIXME:
|
||||
# Rather than creating many duplicated entries for "first / last" blocks, include a series of
|
||||
# rules in the encoded file about what regions are duplicated / what ranges of codepoints need
|
||||
# to be offset when using them to index into the table.
|
||||
|
||||
class StringTableIndex:
|
||||
def __init__(self, bs: bytes):
|
||||
if len(bs) != 3:
|
||||
raise ValueError('string table index must be 3 bytes')
|
||||
self.__bs = bs
|
||||
|
||||
@classmethod
|
||||
def from_int(cls, index: int):
|
||||
# Ensure that the index is less than 0xffffff, because 0xffffff is the "invalid index"
|
||||
# sentinel value.
|
||||
if index >= 0xffffff:
|
||||
raise ValueError('non-nil string table indices must be less than 0xffffff')
|
||||
return cls(index.to_bytes(length=3, byteorder='little', signed=False))
|
||||
|
||||
@classmethod
|
||||
def invalid(cls):
|
||||
return cls(b'\xff\xff\xff')
|
||||
|
||||
def to_bytes(self):
|
||||
return self.__bs
|
||||
|
||||
class StringTable:
|
||||
def __init__(self):
|
||||
self.__buf = bytearray()
|
||||
self.__map = {}
|
||||
|
||||
def push(self, s: str) -> StringTableIndex:
|
||||
if s in self.__map:
|
||||
return self.__map[s]
|
||||
|
||||
insert_pos = len(self.__buf)
|
||||
s_bytes = s.encode(encoding='utf-8')
|
||||
s_len_bytes = len(s_bytes).to_bytes(length=1, byteorder='little', signed=False)
|
||||
self.__buf.extend(s_len_bytes)
|
||||
self.__buf.extend(s_bytes)
|
||||
self.__map[s] = insert_pos
|
||||
return StringTableIndex.from_int(insert_pos)
|
||||
|
||||
def to_bytes(self) -> bytes:
|
||||
return bytes(self.__buf)
|
||||
|
||||
class Category(Enum):
|
||||
LU = 0
|
||||
LL = 1
|
||||
LT = 2
|
||||
MN = 3
|
||||
MC = 4
|
||||
ME = 5
|
||||
ND = 6
|
||||
NL = 7
|
||||
NO = 8
|
||||
ZS = 9
|
||||
ZL = 10
|
||||
ZP = 11
|
||||
CC = 12
|
||||
CF = 13
|
||||
CS = 14
|
||||
CO = 15
|
||||
CN = 16
|
||||
LM = 17
|
||||
LO = 18
|
||||
PC = 19
|
||||
PD = 20
|
||||
PS = 21
|
||||
PE = 22
|
||||
PI = 23
|
||||
PF = 24
|
||||
PO = 25
|
||||
SM = 26
|
||||
SC = 27
|
||||
SK = 28
|
||||
SO = 29
|
||||
|
||||
class Bidi(Enum):
|
||||
L = 0
|
||||
R = 1
|
||||
AL = 2
|
||||
EN = 3
|
||||
ES = 4
|
||||
ET = 5
|
||||
AN = 6
|
||||
CS = 7
|
||||
NSM = 8
|
||||
BN = 9
|
||||
B = 10
|
||||
S = 11
|
||||
WS = 12
|
||||
ON = 13
|
||||
LRE = 14
|
||||
LRO = 15
|
||||
RLE = 16
|
||||
RLO = 17
|
||||
PDF = 18
|
||||
LRI = 19
|
||||
RLI = 20
|
||||
FSI = 21
|
||||
PDI = 22
|
||||
|
||||
class DecompKind(Enum):
|
||||
NONE = 0
|
||||
ANONYMOUS = 1
|
||||
NOBREAK = 2
|
||||
COMPAT = 3
|
||||
SUPER = 4
|
||||
FRACTION = 5
|
||||
SUB = 6
|
||||
FONT = 7
|
||||
CIRCLE = 8
|
||||
WIDE = 9
|
||||
VERTICAL = 10
|
||||
SQUARE = 11
|
||||
ISOLATED = 12
|
||||
FINAL = 13
|
||||
INITIAL = 14
|
||||
MEDIAL = 15
|
||||
SMALL = 16
|
||||
NARROW = 17
|
||||
|
||||
class GroupKind(Enum):
|
||||
NO_VALUE = 0
|
||||
HAS_VALUE = 1
|
||||
|
||||
class Group:
|
||||
def __init__(self, kind: GroupKind, start: int, end: int):
|
||||
self.__kind = kind
|
||||
self.__start = start
|
||||
self.__end = end
|
||||
|
||||
def __str__(self):
|
||||
return 'Group({}, {:x}, {:x})'.format(self.__kind, self.__start, self.__end)
|
||||
|
||||
def kind(self) -> GroupKind:
|
||||
return self.__kind
|
||||
|
||||
def start(self) -> int:
|
||||
return self.__start
|
||||
|
||||
def end(self) -> int:
|
||||
return self.__end
|
||||
|
||||
def parse_codepoint_string(cp_str: str) -> str:
|
||||
return ''.join([chr(int(cp, 16)) for cp in cp_str.split()])
|
||||
|
||||
def encode_char_data(
|
||||
code: int,
|
||||
name: StringTableIndex,
|
||||
category: Category,
|
||||
combining: int,
|
||||
bidi: Bidi,
|
||||
decomp_kind: DecompKind,
|
||||
decomp: StringTableIndex,
|
||||
decimal_digit: Optional[int],
|
||||
digit: Optional[int],
|
||||
numeric_value: StringTableIndex,
|
||||
mirrored: bool,
|
||||
old_name: StringTableIndex,
|
||||
comment: StringTableIndex,
|
||||
uppercase: StringTableIndex,
|
||||
lowercase: StringTableIndex,
|
||||
titlecase: StringTableIndex
|
||||
) -> bytes:
|
||||
# TODO: use a single "flags" byte to store:
|
||||
# - mirrored
|
||||
# - decomp_kind (5 bits needed)
|
||||
|
||||
encoded = bytearray()
|
||||
|
||||
# Pack the category, bidirectional category, decomposition kind and mirrored boolean into two
|
||||
# bytes.
|
||||
flags = 0
|
||||
flags |= int(category) & 0x1f
|
||||
flags |= (int(bidi) & 0x1f) << 5
|
||||
flags |= (int(decomp_kind) & 0x1f) << 10
|
||||
flags |= int(mirrored) << 15
|
||||
encoded.extend(flags.to_bytes(length=2, byteorder='little'))
|
||||
|
||||
# 4 bits decimal digit, 4 bits digit (max is 9). Both need bit patterns for "none"
|
||||
|
||||
assert len(encoded) == 28
|
||||
|
||||
return bytes(encoded)
|
||||
|
||||
with open('unicode_data_latest.txt', 'r') as fd:
|
||||
input_data = fd.read()
|
||||
|
||||
char_data_table = bytearray()
|
||||
string_table = StringTable()
|
||||
# FIXME: calculate cumulative offset required for codepoints after each group
|
||||
groups = []
|
||||
in_group = False
|
||||
prev_code = None
|
||||
|
||||
uniq_vals = {}
|
||||
|
||||
for row in input_data.splitlines():
|
||||
is_group_start = False
|
||||
|
||||
[
|
||||
cell_code,
|
||||
cell_name,
|
||||
cell_category,
|
||||
cell_combining,
|
||||
cell_bidi,
|
||||
cell_decomp,
|
||||
cell_decimal_digit,
|
||||
cell_digit,
|
||||
cell_numeric,
|
||||
cell_mirrored,
|
||||
cell_old_name,
|
||||
cell_comment,
|
||||
cell_uppercase,
|
||||
cell_lowercase,
|
||||
cell_titlecase
|
||||
] = [cell.strip() for cell in row.split(';')]
|
||||
|
||||
uniq_vals[cell_numeric] = True
|
||||
|
||||
code = int(cell_code, 16)
|
||||
|
||||
assert prev_code is None or prev_code < code
|
||||
|
||||
# If the previous row was the start of a group, this row should be the end of the group. We now
|
||||
# know both the start and end codepoints of the group, so we can append it to the groups list.
|
||||
if in_group:
|
||||
assert cell_name.startswith('<') and cell_name.endswith(', Last>')
|
||||
groups.append(Group(GroupKind.HAS_VALUE, prev_code, code))
|
||||
|
||||
# If there is a gap between the previous codepoint and this codepoint, add a "no value" group
|
||||
# to the list of groups to indicate the gap.
|
||||
elif prev_code is not None and code > prev_code + 1:
|
||||
groups.append(Group(GroupKind.NO_VALUE, prev_code + 1, code - 1))
|
||||
|
||||
prev_code = code
|
||||
|
||||
# If we are at the end of a group, continue the loop without creating a new character data
|
||||
# entry, since the entire group uses the entry created for the start of the group.
|
||||
if in_group:
|
||||
in_group = False
|
||||
continue
|
||||
|
||||
if cell_name.startswith('<') and cell_name.endswith(', First>'):
|
||||
name = cell_name.removeprefix('<').removesuffix(', First>')
|
||||
in_group = True
|
||||
else:
|
||||
name = string_table.push(cell_name)
|
||||
|
||||
category = Category[cell_category.upper()]
|
||||
combining = int(cell_combining)
|
||||
bidi = Bidi[cell_bidi.upper()]
|
||||
|
||||
if cell_decomp:
|
||||
# If the decomposition string starts with an angle bracket, extract the decomposition kind
|
||||
# from between the angle brackets.
|
||||
if cell_decomp.startswith('<'):
|
||||
[
|
||||
decomp_kind_str,
|
||||
decomp_str
|
||||
] = [s.strip() for s in cell_decomp.removeprefix('<').split('>', 1)]
|
||||
decomp_kind = DecompKind[decomp_kind_str.upper()]
|
||||
else:
|
||||
decomp_kind = DecompKind.ANONYMOUS
|
||||
decomp_str = cell_decomp
|
||||
# The decomposition is a series of ASCII-encoded codepoints separated by spaces, so split
|
||||
# the decomposition string by whitespace and convert each of the encoded codepoints to
|
||||
# actual characters.
|
||||
decomp = string_table.push(parse_codepoint_string(decomp_str))
|
||||
else:
|
||||
decomp_kind = DecompKind.NONE
|
||||
decomp = StringTableIndex.invalid()
|
||||
|
||||
if cell_decimal_digit:
|
||||
decimal_digit = int(cell_decimal_digit, 10)
|
||||
else:
|
||||
decimal_digit = None
|
||||
|
||||
if cell_digit:
|
||||
digit = int(cell_digit, 10)
|
||||
else:
|
||||
digit = None
|
||||
|
||||
if cell_numeric:
|
||||
numeric_value = string_table.push(cell_numeric)
|
||||
else:
|
||||
numeric_value = StringTableIndex.invalid()
|
||||
|
||||
mirrored = cell_mirrored == 'Y'
|
||||
|
||||
if cell_old_name:
|
||||
old_name = string_table.push(cell_old_name)
|
||||
else:
|
||||
old_name = StringTableIndex.invalid()
|
||||
|
||||
if cell_comment:
|
||||
comment = string_table.push(cell_comment)
|
||||
else:
|
||||
comment = StringTableIndex.invalid()
|
||||
|
||||
if cell_uppercase:
|
||||
uppercase = string_table.push(parse_codepoint_string(cell_uppercase))
|
||||
else:
|
||||
uppercase = StringTableIndex.invalid()
|
||||
|
||||
if cell_lowercase:
|
||||
lowercase = string_table.push(parse_codepoint_string(cell_lowercase))
|
||||
else:
|
||||
lowercase = StringTableIndex.invalid()
|
||||
|
||||
if cell_titlecase:
|
||||
titlecase = string_table.push(parse_codepoint_string(cell_titlecase))
|
||||
else:
|
||||
titlecase = StringTableIndex.invalid()
|
||||
|
||||
encoded = encode_char_data(
|
||||
code,
|
||||
name,
|
||||
category,
|
||||
combining,
|
||||
bidi,
|
||||
decomp_kind,
|
||||
decomp,
|
||||
decimal_digit,
|
||||
digit,
|
||||
numeric_value,
|
||||
mirrored,
|
||||
old_name,
|
||||
comment,
|
||||
uppercase,
|
||||
lowercase,
|
||||
titlecase
|
||||
)
|
||||
|
||||
print(len(string_table.to_bytes()))
|
||||
|
||||
# for k in uniq_vals.keys():
|
||||
# print(k)
|
||||
|
||||
# for group in groups:
|
||||
# print(group)
|
||||
Loading…
Reference in New Issue