From 6791ac8d35f6d31a29b5aabbe3e9052030791bcf Mon Sep 17 00:00:00 2001
From: pantonshire <tom@tomandtally.co.uk>
Date: Thu, 1 Jun 2023 14:24:33 +0100
Subject: [PATCH] character data encoding

---
 data.py | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/data.py b/data.py
index 8ebfee7..33fcd4c 100644
--- a/data.py
+++ b/data.py
@@ -35,7 +35,7 @@ class StringTable:
 
     def push(self, s: str) -> StringTableIndex:
         if s in self.__map:
-            return self.__map[s]
+            return StringTableIndex.from_int(self.__map[s])
         
         insert_pos = len(self.__buf)
         s_bytes = s.encode(encoding='utf-8')
@@ -168,22 +168,34 @@ def encode_char_data(
     lowercase: StringTableIndex,
     titlecase: StringTableIndex
 ) -> bytes:
-    # TODO: use a single "flags" byte to store:
-    # - mirrored
-    # - decomp_kind (5 bits needed)
-
     encoded = bytearray()
 
     # Pack the category, bidirectional category, decomposition kind and mirrored boolean into two
     # bytes.
     flags = 0
-    flags |= int(category) & 0x1f
-    flags |= (int(bidi) & 0x1f) << 5
-    flags |= (int(decomp_kind) & 0x1f) << 10
+    flags |= category.value & 0x1f
+    flags |= (bidi.value & 0x1f) << 5
+    flags |= (decomp_kind.value & 0x1f) << 10
     flags |= int(mirrored) << 15
     encoded.extend(flags.to_bytes(length=2, byteorder='little'))
 
-    # 4 bits decimal digit, 4 bits digit (max is 9). Both need bit patterns for "none"
+    encoded.extend(name.to_bytes())
+    encoded.extend(decomp.to_bytes())
+    encoded.extend(numeric_value.to_bytes())
+    encoded.extend(old_name.to_bytes())
+    encoded.extend(comment.to_bytes())
+    encoded.extend(uppercase.to_bytes())
+    encoded.extend(lowercase.to_bytes())
+    encoded.extend(titlecase.to_bytes())
+
+    encoded.extend(combining.to_bytes(length=1, byteorder='little'))
+
+    if decimal_digit is None:
+        decimal_digit = 0xf  # 0xf marks "no decimal digit"
+    if digit is None:
+        digit = 0xf  # 0xf marks "no digit"
+    digit_vals = (decimal_digit & 0xf) | ((digit & 0xf) << 4)  # low nibble: decimal digit, high nibble: digit
+    encoded.extend(digit_vals.to_bytes(length=1, byteorder='little'))
 
     assert len(encoded) == 28
 
@@ -248,7 +260,7 @@ for row in input_data.splitlines():
         continue
 
     if cell_name.startswith('<') and cell_name.endswith(', First>'):
-        name = cell_name.removeprefix('<').removesuffix(', First>')
+        name = string_table.push(cell_name.removeprefix('<').removesuffix(', First>'))
         in_group = True
     else:
         name = string_table.push(cell_name)
@@ -319,7 +331,7 @@ for row in input_data.splitlines():
     else:
         titlecase = StringTableIndex.invalid()
 
-    encoded = encode_char_data(
+    char_data_table.extend(encode_char_data(
         code,
         name,
         category,
@@ -336,12 +348,7 @@ for row in input_data.splitlines():
         uppercase,
         lowercase,
         titlecase
-    )
+    ))
 
+print(len(char_data_table))
 print(len(string_table.to_bytes()))
-
-# for k in uniq_vals.keys():
-#     print(k)
-
-# for group in groups:
-#     print(group)