From a3750b57320c8de19715796f65a1d5c128d11a01 Mon Sep 17 00:00:00 2001
From: pantonshire <tom@tomandtally.co.uk>
Date: Sun, 4 Jun 2023 18:12:30 +0100
Subject: [PATCH] working rust decoder for new encoded data format

---
 lib/src/character.rs    | 315 ++++++++++++++++++++++++++++++++++++++++
 lib/src/lib.rs          |  26 +++-
 lib/src/unicode_data.rs | 176 +++++++++++++++++++++-
 3 files changed, 510 insertions(+), 7 deletions(-)
 create mode 100644 lib/src/character.rs
diff --git a/lib/src/character.rs b/lib/src/character.rs
new file mode 100644
index 0000000..4b512f5
--- /dev/null
+++ b/lib/src/character.rs
@@ -0,0 +1,339 @@
+
+/// The decoded properties of a single codepoint, as returned by `UnicodeData::get`.
+///
+/// All string properties borrow from the encoded data that the record was decoded from.
+#[derive(Debug)]
+pub struct CharData<'a> {
+    pub(crate) codepoint: u32,
+    pub(crate) name: &'a str,
+    pub(crate) category: Category,
+    pub(crate) combining: CombiningClass,
+    pub(crate) bidi: BidiCategory,
+    pub(crate) decomp: Option<DecompMapping<'a>>,
+    pub(crate) decimal_digit: Option<u8>,
+    pub(crate) digit: Option<u8>,
+    // FIXME: replace with exact fraction type?
+    pub(crate) numeric: Option<&'a str>,
+    pub(crate) mirrored: bool,
+    pub(crate) old_name: Option<&'a str>,
+    pub(crate) comment: Option<&'a str>,
+    pub(crate) uppercase: Option<&'a str>,
+    pub(crate) lowercase: Option<&'a str>,
+    pub(crate) titlecase: Option<&'a str>,
+}
+
+impl<'a> CharData<'a> {
+    /// Returns the codepoint this record describes.
+    #[inline]
+    #[must_use]
+    pub fn codepoint(&self) -> u32 {
+        self.codepoint
+    }
+
+    /// Returns the character's name.
+    #[inline]
+    #[must_use]
+    pub fn name(&self) -> &'a str {
+        self.name
+    }
+
+    /// Returns the character's category.
+    #[inline]
+    #[must_use]
+    pub fn category(&self) -> Category {
+        self.category
+    }
+
+    /// Returns the character's combining class.
+    #[inline]
+    #[must_use]
+    pub fn combining(&self) -> CombiningClass {
+        self.combining
+    }
+
+    /// Returns the character's bidi category.
+    #[inline]
+    #[must_use]
+    pub fn bidi(&self) -> BidiCategory {
+        self.bidi
+    }
+
+    /// Returns the character's decomposition mapping, if it has one.
+    #[inline]
+    #[must_use]
+    pub fn decomp(&self) -> Option<DecompMapping<'a>> {
+        self.decomp
+    }
+
+    /// Returns the character's decimal digit value, if it has one.
+    #[inline]
+    #[must_use]
+    pub fn decimal_digit(&self) -> Option<u8> {
+        self.decimal_digit
+    }
+
+    /// Returns the character's digit value, if it has one.
+    #[inline]
+    #[must_use]
+    pub fn digit(&self) -> Option<u8> {
+        self.digit
+    }
+
+    /// Returns the character's numeric value as text, if it has one.
+    #[inline]
+    #[must_use]
+    pub fn numeric(&self) -> Option<&'a str> {
+        self.numeric
+    }
+
+    /// Returns whether the character is flagged as mirrored.
+    #[inline]
+    #[must_use]
+    pub fn mirrored(&self) -> bool {
+        self.mirrored
+    }
+
+    /// Returns the character's old name, if it has one.
+    #[inline]
+    #[must_use]
+    pub fn old_name(&self) -> Option<&'a str> {
+        self.old_name
+    }
+
+    /// Returns the comment attached to the character, if there is one.
+    #[inline]
+    #[must_use]
+    pub fn comment(&self) -> Option<&'a str> {
+        self.comment
+    }
+
+    /// Returns the character's uppercase mapping, if it has one.
+    #[inline]
+    #[must_use]
+    pub fn uppercase(&self) -> Option<&'a str> {
+        self.uppercase
+    }
+
+    /// Returns the character's lowercase mapping, if it has one.
+    #[inline]
+    #[must_use]
+    pub fn lowercase(&self) -> Option<&'a str> {
+        self.lowercase
+    }
+
+    /// Returns the character's titlecase mapping, if it has one.
+    #[inline]
+    #[must_use]
+    pub fn titlecase(&self) -> Option<&'a str> {
+        self.titlecase
+    }
+}
+/// The combining class of a character, wrapping the raw value stored in the encoded data.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
+pub struct CombiningClass(pub u8);
+/// The category of a character; variants are declared in the order of their numeric codes.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub enum Category {
+    Lu,
+    Ll,
+    Lt,
+    Mn,
+    Mc,
+    Me,
+    Nd,
+    Nl,
+    No,
+    Zs,
+    Zl,
+    Zp,
+    Cc,
+    Cf,
+    Cs,
+    Co,
+    Cn,
+    Lm,
+    Lo,
+    Pc,
+    Pd,
+    Ps,
+    Pe,
+    Pi,
+    Pf,
+    Po,
+    Sm,
+    Sc,
+    Sk,
+    So,
+}
+
+impl Category {
+    /// Decodes a category code: `0..=29` index the table below, any other value is `None`.
+    pub(crate) fn decode(encoded: u8) -> Option<Self> {
+        [
+            Self::Lu, // 0
+            Self::Ll, // 1
+            Self::Lt, // 2
+            Self::Mn, // 3
+            Self::Mc, // 4
+            Self::Me, // 5
+            Self::Nd, // 6
+            Self::Nl, // 7
+            Self::No, // 8
+            Self::Zs, // 9
+            Self::Zl, // 10
+            Self::Zp, // 11
+            Self::Cc, // 12
+            Self::Cf, // 13
+            Self::Cs, // 14
+            Self::Co, // 15
+            Self::Cn, // 16
+            Self::Lm, // 17
+            Self::Lo, // 18
+            Self::Pc, // 19
+            Self::Pd, // 20
+            Self::Ps, // 21
+            Self::Pe, // 22
+            Self::Pi, // 23
+            Self::Pf, // 24
+            Self::Po, // 25
+            Self::Sm, // 26
+            Self::Sc, // 27
+            Self::Sk, // 28
+            Self::So, // 29
+        ].get(usize::from(encoded)).copied()
+    }
+}
+/// The bidi category of a character; variants are declared in the order of their codes.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub enum BidiCategory {
+    L,
+    R,
+    Al,
+    En,
+    Es,
+    Et,
+    An,
+    Cs,
+    Nsm,
+    Bn,
+    B,
+    S,
+    Ws,
+    On,
+    Lre,
+    Lro,
+    Rle,
+    Rlo,
+    Pdf,
+    Lri,
+    Rli,
+    Fsi,
+    Pdi,
+}
+
+impl BidiCategory {
+    /// Decodes a bidi category code: `0..=22` index the table below, other values are `None`.
+    pub(crate) fn decode(encoded: u8) -> Option<Self> {
+        [
+            Self::L, // 0
+            Self::R, // 1
+            Self::Al, // 2
+            Self::En, // 3
+            Self::Es, // 4
+            Self::Et, // 5
+            Self::An, // 6
+            Self::Cs, // 7
+            Self::Nsm, // 8
+            Self::Bn, // 9
+            Self::B, // 10
+            Self::S, // 11
+            Self::Ws, // 12
+            Self::On, // 13
+            Self::Lre, // 14
+            Self::Lro, // 15
+            Self::Rle, // 16
+            Self::Rlo, // 17
+            Self::Pdf, // 18
+            Self::Lri, // 19
+            Self::Rli, // 20
+            Self::Fsi, // 21
+            Self::Pdi, // 22
+        ].get(usize::from(encoded)).copied()
+    }
+}
+/// A decomposition mapping: an optional named kind together with the mapping's value text.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub struct DecompMapping<'a> {
+    kind: Option<DecompKind>,
+    value: &'a str,
+}
+
+impl<'a> DecompMapping<'a> {
+    pub(crate) fn new(kind: Option<DecompKind>, value: &'a str) -> Self {
+        Self { kind, value }
+    }
+    /// Returns the kind of the mapping, or `None` for a mapping without a named kind.
+    #[inline]
+    #[must_use]
+    pub fn kind(self) -> Option<DecompKind> {
+        self.kind
+    }
+    /// Returns the value of the mapping as text.
+    #[inline]
+    #[must_use]
+    pub fn value(self) -> &'a str {
+        self.value
+    }
+}
+/// The named kinds of decomposition mapping, declared in the order of their encoded codes.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub enum DecompKind {
+    Nobreak,
+    Compat,
+    Super,
+    Fraction,
+    Sub,
+    Font,
+    Circle,
+    Wide,
+    Vertical,
+    Square,
+    Isolated,
+    Final,
+    Initial,
+    Medial,
+    Small,
+    Narrow,
+}
+/// Decoded decomposition kind code: no mapping, an unnamed mapping, or a named kind.
+pub(crate) enum OptionalDecompKind {
+    None,
+    Anon,
+    Named(DecompKind),
+}
+
+impl OptionalDecompKind {
+    /// Decodes the decomposition kind code of a character table entry.
+    ///
+    /// `0` means no mapping, `1` an unnamed mapping and `2..=17` a named kind; any other
+    /// code yields `None`.
+    pub(crate) fn decode(encoded: u8) -> Option<Self> {
+        // Named kinds in code order; the first entry belongs to code 2.
+        const NAMED: [DecompKind; 16] = [
+            DecompKind::Nobreak, DecompKind::Compat, // 2, 3
+            DecompKind::Super, DecompKind::Fraction, // 4, 5
+            DecompKind::Sub, DecompKind::Font, // 6, 7
+            DecompKind::Circle, DecompKind::Wide, // 8, 9
+            DecompKind::Vertical, DecompKind::Square, // 10, 11
+            DecompKind::Isolated, DecompKind::Final, // 12, 13
+            DecompKind::Initial, DecompKind::Medial, // 14, 15
+            DecompKind::Small, DecompKind::Narrow, // 16, 17
+        ];
+
+        match encoded {
+            0 => Some(Self::None),
+            1 => Some(Self::Anon),
+            _ => NAMED.get(usize::from(encoded - 2)).copied().map(Self::Named),
+        }
+    }
+}
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index b988ebb..ce11caf 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -1,3 +1,4 @@
+pub mod character;
 pub mod unicode_data;
 pub mod utf8;
 
@@ -27,11 +28,30 @@ mod tests {
     use crate::{UNICODE_DATA_BYTES, unicode_data};
 
     #[test]
-    fn test_encoded_data() {
+    fn test_data_decode() {
         let data = unicode_data::UnicodeData::from_bytes(UNICODE_DATA_BYTES)
             .unwrap();
 
-        // println!("{:#?}", data.groups());
-        println!("{:#?}", data.chars());
+        let name_of = |codepoint: u32| data.get(codepoint).unwrap().name();
+        // The first control characters share one name and differ only in their old names.
+        let controls = [(0x0, "NULL"), (0x1, "START OF HEADING"), (0x2, "START OF TEXT")];
+        for (codepoint, old_name) in controls {
+            assert_eq!(name_of(codepoint), "<control>");
+            assert_eq!(data.get(codepoint).unwrap().old_name(), Some(old_name));
+        }
+
+        // Codepoints without data sit between two codepoints that have it.
+        assert_eq!(name_of(0x377), "GREEK SMALL LETTER PAMPHYLIAN DIGAMMA");
+        assert!(data.get(0x378).is_none());
+        assert!(data.get(0x379).is_none());
+        assert_eq!(name_of(0x37a), "GREEK YPOGEGRAMMENI");
+
+        // The CJK Extension A codepoints all report the same name; their neighbours do not.
+        assert_eq!(name_of(0x33ff), "SQUARE GAL");
+        for codepoint in [0x3400, 0x3401, 0x3402, 0x4dbe, 0x4dbf] {
+            assert_eq!(name_of(codepoint), "CJK Ideograph Extension A");
+        }
+        assert_eq!(name_of(0x4dc0), "HEXAGRAM FOR THE CREATIVE HEAVEN");
+        assert_eq!(name_of(0x1039f), "UGARITIC WORD DIVIDER");
     }
 }
diff --git a/lib/src/unicode_data.rs b/lib/src/unicode_data.rs
index 41d27d2..432f4cd 100644
--- a/lib/src/unicode_data.rs
+++ b/lib/src/unicode_data.rs
@@ -1,7 +1,16 @@
-use core::{fmt, mem, slice};
+use core::{fmt, mem, slice, str};
 
 use tap::Pipe;
 
+use crate::character::{
+    CharData,
+    Category,
+    BidiCategory,
+    OptionalDecompKind,
+    CombiningClass,
+    DecompMapping,
+};
+
 const MAGIC_NUMBER: [u8; 8] = *b"UTFDUMP!";
 
 #[derive(Clone, Copy)]
@@ -32,8 +41,73 @@ impl<'a> UnicodeData<'a> {
         Ok(Self { group_table, char_table, string_table })
     }
 
-    pub(crate) fn chars(self) -> CharTable<'a> {
-        self.char_table
+    /// Looks up `codepoint` and decodes its character table entry into a [`CharData`].
+    ///
+    /// Returns `None` when no entry is found for the codepoint, when the packed category,
+    /// bidi category or decomposition kind code is not a known value, or when the name
+    /// cannot be read from the string table.
+    ///
+    /// Every other string property is optional and is simply `None` when the string table
+    /// lookup for it yields nothing.
+    pub fn get(self, codepoint: u32) -> Option<CharData<'a>> {
+        let entry = self.char_entry_for(codepoint)?;
+        let strings = self.string_table;
+
+        // Packed layout of `flags_and_categories`, starting from the lowest bit:
+        //   bits 0-4 category, bits 5-9 bidi category, bits 10-14 decomposition kind,
+        //   bit 15 mirrored flag.
+        let packed = entry.flags_and_categories.to_u16();
+        let five_bits_at = |shift: u32| ((packed >> shift) & 0x1f) as u8;
+
+        let category = Category::decode(five_bits_at(0))?;
+        let bidi = BidiCategory::decode(five_bits_at(5))?;
+        let decomp_kind = OptionalDecompKind::decode(five_bits_at(10))?;
+        let mirrored = (packed >> 15) != 0;
+
+        // The name is the only string property that must be present.
+        let name = strings.get_u24_le(entry.name)?;
+
+        // A mapping exists only if the kind is not `None` and the value string can be read.
+        let decomp_value = strings.get_u24_le(entry.decomp);
+        let decomp = decomp_value.and_then(|value| match decomp_kind {
+            OptionalDecompKind::None => None,
+            OptionalDecompKind::Anon => Some(DecompMapping::new(None, value)),
+            OptionalDecompKind::Named(kind) => Some(DecompMapping::new(Some(kind), value)),
+        });
+
+        // `digit` packs two values, one per nibble; a nibble of 0xf means "no value".
+        let nibble = |bits: u8| Some(bits & 0xf).filter(|&value| value != 0xf);
+        let decimal_digit = nibble(entry.digit);
+        let digit = nibble(entry.digit >> 4);
+        let combining = CombiningClass(entry.combining);
+
+        // The remaining optional strings are looked up in place.
+        Some(CharData {
+            codepoint,
+            name,
+            category,
+            combining,
+            bidi,
+            decomp,
+            decimal_digit,
+            digit,
+            numeric: strings.get_u24_le(entry.numeric),
+            mirrored,
+            old_name: strings.get_u24_le(entry.old_name),
+            comment: strings.get_u24_le(entry.comment),
+            uppercase: strings.get_u24_le(entry.uppercase),
+            lowercase: strings.get_u24_le(entry.lowercase),
+            titlecase: strings.get_u24_le(entry.titlecase),
+        })
+    }
+
+    /// Finds the character table entry that holds the data for `codepoint`, if there is one.
+    /// The index is taken from the group table and must fit in a `usize`.
+    fn char_entry_for(self, codepoint: u32) -> Option<&'a CharTableEntry> {
+        let table_index = self.group_table.char_table_index_for(codepoint)?;
+        let table_index = usize::try_from(table_index).ok()?;
+
+        self.char_table.get(table_index)
     }
 }
 
@@ -78,8 +152,74 @@ impl<'a> GroupTable<'a> {
 
         Ok(Self { entries })
     }
+
+    // TODO: compare performance of binary search to linear search
+    // TODO: fast path for characters before the first group
+    /// Binary-searches the group table and returns the character table index for `codepoint`.
+    ///
+    /// A codepoint outside every group maps to `codepoint` minus the combined length of the
+    /// groups before it. A codepoint inside a `USE_PREV_VALUE` group maps to the index of the
+    /// codepoint just before that group; inside any other kind of group it yields `None`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if one of the group table invariants checked with `expect` below is violated,
+    /// for example a group length or cumulative length that does not fit in a `u32`.
+    fn char_table_index_for(self, codepoint: u32) -> Option<u32> {
+        let mut remaining = self.entries;
+        // Combined length of every group found so far to end before `codepoint`.
+        let mut skipped = 0;
+
+        while !remaining.is_empty() {
+            let mid = remaining.len() / 2;
+            let group = &remaining[mid];
+            let start = group.start.to_u32();
+            let end = group.end.to_u32();
+            let total_len_before = group.total_len_before.to_u32();
+
+            if codepoint > end {
+                // `end` is inclusive, hence the `+ 1` when computing the group length.
+                let group_len = end
+                    .checked_sub(start)
+                    .expect("group start should be less than or equal to the group end")
+                    .checked_add(1)
+                    .expect("group length should not overflow a u32");
+
+                // This is the furthest group seen so far that ends before the codepoint, so
+                // remember the length of everything up to and including it in case the
+                // codepoint turns out not to be inside any group.
+                skipped = total_len_before
+                    .checked_add(group_len)
+                    .expect("cumulative group length should not overflow a u32");
+
+                remaining = &remaining[(mid + 1)..];
+            } else if codepoint < start {
+                remaining = &remaining[..mid];
+            } else {
+                // `start <= codepoint <= end`: the codepoint belongs to this group.
+                return match group.kind {
+                    // The whole group reuses the data of the codepoint at `start - 1`, whose
+                    // index is found by subtracting the length of all preceding groups.
+                    GROUP_KIND_USE_PREV_VALUE => start
+                        .checked_sub(1)
+                        .expect("first codepoint for a USE_PREV_VALUE group should always be at least 1")
+                        .checked_sub(total_len_before)
+                        .expect("computed character data index should not underflow")
+                        .pipe(Some),
+
+                    // Any other kind of group carries no character data.
+                    _ => None,
+                };
+            }
+        }
+
+        codepoint.checked_sub(skipped)
+    }
 }
 
+const GROUP_KIND_NO_VALUE: u8 = 0;
+const GROUP_KIND_USE_PREV_VALUE: u8 = 1;
+
 #[derive(Debug)]
 #[repr(C, packed)]
 struct GroupTableEntry {
@@ -95,7 +235,7 @@ impl GroupTableEntry {
 
 #[derive(Debug)]
 #[derive(Clone, Copy)]
-pub(crate) struct CharTable<'a> {
+struct CharTable<'a> { // borrowed slice of character table entries
     entries: &'a [CharTableEntry],
 }
 
@@ -135,6 +275,10 @@ impl<'a> CharTable<'a> {
         
         Ok(Self { entries })
     }
+    /// Returns the entry at index `i`, or `None` if `i` is out of bounds.
+    fn get(self, i: usize) -> Option<&'a CharTableEntry> {
+        self.entries.get(i)
+    }
 }
 
 #[derive(Debug)]
@@ -166,6 +310,26 @@ impl<'a> StringTable<'a> {
     fn new(bs: &'a [u8]) -> Self {
         Self { inner: bs }
     }
+
+    /// Reads the string stored at offset `i`: one length byte followed by that many bytes.
+    ///
+    /// Returns `None` if the string is out of bounds or is not valid UTF-8.
+    fn get(self, i: usize) -> Option<&'a str> {
+        let (&len, rest) = self.inner.get(i..)?.split_first()?;
+        let bytes = rest.get(..usize::from(len))?;
+
+        str::from_utf8(bytes).ok()
+    }
+
+    /// Reads the string at the offset encoded by `i`.
+    ///
+    /// The all-`0xff` pattern stands for "no string" and yields `None`.
+    fn get_u24_le(self, i: U24Le) -> Option<&'a str> {
+        match i.0 {
+            [0xff, 0xff, 0xff] => None,
+            _ => self.get(i.to_usize()?),
+        }
+    }
 }
 
 #[derive(Clone, Copy)]
@@ -194,6 +358,10 @@ impl U24Le {
         (&mut buf[..3]).copy_from_slice(&self.0);
         u32::from_le_bytes(buf)
     }
+    /// Converts the value to a `usize`, or `None` if it does not fit.
+    fn to_usize(self) -> Option<usize> {
+        usize::try_from(self.to_u32()).ok()
+    }
 }
 
 impl fmt::Debug for U24Le {