From a3750b57320c8de19715796f65a1d5c128d11a01 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Sun, 4 Jun 2023 18:12:30 +0100 Subject: [PATCH] working rust decoder for new encoded data format --- lib/src/character.rs | 315 ++++++++++++++++++++++++++++++++++++++++ lib/src/lib.rs | 26 +++- lib/src/unicode_data.rs | 176 +++++++++++++++++++++- 3 files changed, 510 insertions(+), 7 deletions(-) create mode 100644 lib/src/character.rs diff --git a/lib/src/character.rs b/lib/src/character.rs new file mode 100644 index 0000000..4b512f5 --- /dev/null +++ b/lib/src/character.rs @@ -0,0 +1,315 @@ + +#[derive(Debug)] +pub struct CharData<'a> { + pub(crate) codepoint: u32, + pub(crate) name: &'a str, + pub(crate) category: Category, + pub(crate) combining: CombiningClass, + pub(crate) bidi: BidiCategory, + pub(crate) decomp: Option>, + pub(crate) decimal_digit: Option, + pub(crate) digit: Option, + // FIXME: replace with exact fraction type? + pub(crate) numeric: Option<&'a str>, + pub(crate) mirrored: bool, + pub(crate) old_name: Option<&'a str>, + pub(crate) comment: Option<&'a str>, + pub(crate) uppercase: Option<&'a str>, + pub(crate) lowercase: Option<&'a str>, + pub(crate) titlecase: Option<&'a str>, +} + +impl<'a> CharData<'a> { + #[inline] + #[must_use] + pub fn codepoint(&self) -> u32 { + self.codepoint + } + + #[inline] + #[must_use] + pub fn name(&self) -> &'a str { + self.name + } + + #[inline] + #[must_use] + pub fn category(&self) -> Category { + self.category + } + + #[inline] + #[must_use] + pub fn bidi(&self) -> BidiCategory { + self.bidi + } + + #[inline] + #[must_use] + pub fn decomp(&self) -> Option> { + self.decomp + } + + #[inline] + #[must_use] + pub fn decimal_digit(&self) -> Option { + self.decimal_digit + } + + #[inline] + #[must_use] + pub fn digit(&self) -> Option { + self.digit + } + + #[inline] + #[must_use] + pub fn numeric(&self) -> Option<&'a str> { + self.numeric + } + + #[inline] + #[must_use] + pub fn mirrored(&self) -> bool { + self.mirrored + } + + #[inline] + #[must_use] + pub fn old_name(&self) -> Option<&'a str> { + self.old_name + } + + #[inline] + #[must_use] + pub fn comment(&self) -> Option<&'a str> { + self.comment + } + + #[inline] + #[must_use] + pub fn uppercase(&self) -> Option<&'a str> { + self.uppercase + } + + #[inline] + #[must_use] + pub fn lowercase(&self) -> Option<&'a str> { + self.lowercase + } + + #[inline] + #[must_use] + pub fn titlecase(&self) -> Option<&'a str> { + self.titlecase + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub struct CombiningClass(pub u8); + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub enum Category { + Lu, + Ll, + Lt, + Mn, + Mc, + Me, + Nd, + Nl, + No, + Zs, + Zl, + Zp, + Cc, + Cf, + Cs, + Co, + Cn, + Lm, + Lo, + Pc, + Pd, + Ps, + Pe, + Pi, + Pf, + Po, + Sm, + Sc, + Sk, + So, +} + +impl Category { + pub(crate) fn decode(encoded: u8) -> Option { + match encoded { + 0 => Some(Self::Lu), + 1 => Some(Self::Ll), + 2 => Some(Self::Lt), + 3 => Some(Self::Mn), + 4 => Some(Self::Mc), + 5 => Some(Self::Me), + 6 => Some(Self::Nd), + 7 => Some(Self::Nl), + 8 => Some(Self::No), + 9 => Some(Self::Zs), + 10 => Some(Self::Zl), + 11 => Some(Self::Zp), + 12 => Some(Self::Cc), + 13 => Some(Self::Cf), + 14 => Some(Self::Cs), + 15 => Some(Self::Co), + 16 => Some(Self::Cn), + 17 => Some(Self::Lm), + 18 => Some(Self::Lo), + 19 => Some(Self::Pc), + 20 => Some(Self::Pd), + 21 => Some(Self::Ps), + 22 => Some(Self::Pe), + 23 => Some(Self::Pi), + 24 => Some(Self::Pf), + 25 => Some(Self::Po), + 26 => Some(Self::Sm), + 27 => Some(Self::Sc), + 28 => Some(Self::Sk), + 29 => Some(Self::So), + _ => None, + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub enum BidiCategory { + L, + R, + Al, + En, + Es, + Et, + An, + Cs, + Nsm, + Bn, + B, + S, + Ws, + On, + Lre, + Lro, + Rle, + Rlo, + Pdf, + Lri, + Rli, + Fsi, + Pdi, +} + +impl BidiCategory { + pub(crate) fn decode(encoded: u8) -> Option { + match encoded { + 0 => Some(Self::L), + 1 => Some(Self::R), + 2 => Some(Self::Al), + 3 => Some(Self::En), + 4 => Some(Self::Es), + 5 => Some(Self::Et), + 6 => Some(Self::An), + 7 => Some(Self::Cs), + 8 => Some(Self::Nsm), + 9 => Some(Self::Bn), + 10 => Some(Self::B), + 11 => Some(Self::S), + 12 => Some(Self::Ws), + 13 => Some(Self::On), + 14 => Some(Self::Lre), + 15 => Some(Self::Lro), + 16 => Some(Self::Rle), + 17 => Some(Self::Rlo), + 18 => Some(Self::Pdf), + 19 => Some(Self::Lri), + 20 => Some(Self::Rli), + 21 => Some(Self::Fsi), + 22 => Some(Self::Pdi), + _ => None, + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub struct DecompMapping<'a> { + kind: Option, + value: &'a str, +} + +impl<'a> DecompMapping<'a> { + pub(crate) fn new(kind: Option, value: &'a str) -> Self { + Self { kind, value } + } + + #[inline] + #[must_use] + pub fn kind(self) -> Option { + self.kind + } + + #[inline] + #[must_use] + pub fn value(self) -> &'a str { + self.value + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub enum DecompKind { + Nobreak, + Compat, + Super, + Fraction, + Sub, + Font, + Circle, + Wide, + Vertical, + Square, + Isolated, + Final, + Initial, + Medial, + Small, + Narrow, +} + +pub(crate) enum OptionalDecompKind { + None, + Anon, + Named(DecompKind), +} + +impl OptionalDecompKind { + pub(crate) fn decode(encoded: u8) -> Option { + match encoded { + 0 => Some(Self::None), + 1 => Some(Self::Anon), + 2 => Some(Self::Named(DecompKind::Nobreak)), + 3 => Some(Self::Named(DecompKind::Compat)), + 4 => Some(Self::Named(DecompKind::Super)), + 5 => Some(Self::Named(DecompKind::Fraction)), + 6 => Some(Self::Named(DecompKind::Sub)), + 7 => Some(Self::Named(DecompKind::Font)), + 8 => Some(Self::Named(DecompKind::Circle)), + 9 => Some(Self::Named(DecompKind::Wide)), + 10 => Some(Self::Named(DecompKind::Vertical)), + 11 => Some(Self::Named(DecompKind::Square)), + 12 => Some(Self::Named(DecompKind::Isolated)), + 13 => Some(Self::Named(DecompKind::Final)), + 14 => Some(Self::Named(DecompKind::Initial)), + 15 => Some(Self::Named(DecompKind::Medial)), + 16 => Some(Self::Named(DecompKind::Small)), + 17 => Some(Self::Named(DecompKind::Narrow)), + _ => None, + } + } +} diff --git a/lib/src/lib.rs b/lib/src/lib.rs index b988ebb..ce11caf 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -1,3 +1,4 @@ +pub mod character; pub mod unicode_data; pub mod utf8; @@ -27,11 +28,30 @@ mod tests { use crate::{UNICODE_DATA_BYTES, unicode_data}; #[test] - fn test_encoded_data() { + fn test_data_decode() { let data = unicode_data::UnicodeData::from_bytes(UNICODE_DATA_BYTES) .unwrap(); - // println!("{:#?}", data.groups()); - println!("{:#?}", data.chars()); + assert_eq!(data.get(0x0).unwrap().name(), ""); + assert_eq!(data.get(0x0).unwrap().old_name(), Some("NULL")); + assert_eq!(data.get(0x1).unwrap().name(), ""); + assert_eq!(data.get(0x1).unwrap().old_name(), Some("START OF HEADING")); + assert_eq!(data.get(0x2).unwrap().name(), ""); + assert_eq!(data.get(0x2).unwrap().old_name(), Some("START OF TEXT")); + + assert_eq!(data.get(0x377).unwrap().name(), "GREEK SMALL LETTER PAMPHYLIAN DIGAMMA"); + assert!(data.get(0x378).is_none()); + assert!(data.get(0x379).is_none()); + assert_eq!(data.get(0x37a).unwrap().name(), "GREEK YPOGEGRAMMENI"); + + assert_eq!(data.get(0x33ff).unwrap().name(), "SQUARE GAL"); + assert_eq!(data.get(0x3400).unwrap().name(), "CJK Ideograph Extension A"); + assert_eq!(data.get(0x3401).unwrap().name(), "CJK Ideograph Extension A"); + assert_eq!(data.get(0x3402).unwrap().name(), "CJK Ideograph Extension A"); + assert_eq!(data.get(0x4dbe).unwrap().name(), "CJK Ideograph Extension A"); + assert_eq!(data.get(0x4dbf).unwrap().name(), "CJK Ideograph Extension A"); + assert_eq!(data.get(0x4dc0).unwrap().name(), "HEXAGRAM FOR THE CREATIVE HEAVEN"); + + assert_eq!(data.get(0x1039f).unwrap().name(), "UGARITIC WORD DIVIDER"); } } diff --git a/lib/src/unicode_data.rs b/lib/src/unicode_data.rs index 41d27d2..432f4cd 100644 --- a/lib/src/unicode_data.rs +++ b/lib/src/unicode_data.rs @@ -1,7 +1,16 @@ -use core::{fmt, mem, slice}; +use core::{fmt, mem, slice, str}; use tap::Pipe; +use crate::character::{ + CharData, + Category, + BidiCategory, + OptionalDecompKind, + CombiningClass, + DecompMapping, +}; + const MAGIC_NUMBER: [u8; 8] = *b"UTFDUMP!"; #[derive(Clone, Copy)] @@ -32,8 +41,73 @@ impl<'a> UnicodeData<'a> { Ok(Self { group_table, char_table, string_table }) } - pub(crate) fn chars(self) -> CharTable<'a> { - self.char_table + pub fn get(self, codepoint: u32) -> Option> { + let entry = self.char_entry_for(codepoint)?; + + let flags_and_categories = entry.flags_and_categories.to_u16(); + let category = Category::decode((flags_and_categories & 0x1f) as u8)?; + let bidi = BidiCategory::decode(((flags_and_categories >> 5) & 0x1f) as u8)?; + let decomp_kind = OptionalDecompKind::decode(((flags_and_categories >> 10) & 0x1f) as u8)?; + let mirrored = (flags_and_categories >> 15) != 0; + + let name = self.string_table.get_u24_le(entry.name)?; + + let decomp_value = self.string_table.get_u24_le(entry.decomp); + let decomp = match (decomp_kind, decomp_value) { + (OptionalDecompKind::None, _) | (_, None) => None, + (OptionalDecompKind::Anon, Some(value)) => { + Some(DecompMapping::new(None, value)) + }, + (OptionalDecompKind::Named(kind), Some(value)) => { + Some(DecompMapping::new(Some(kind), value)) + }, + }; + + let numeric = self.string_table.get_u24_le(entry.numeric); + let old_name = self.string_table.get_u24_le(entry.old_name); + let comment = self.string_table.get_u24_le(entry.comment); + let uppercase = self.string_table.get_u24_le(entry.uppercase); + let lowercase = self.string_table.get_u24_le(entry.lowercase); + let titlecase = self.string_table.get_u24_le(entry.titlecase); + + let combining = CombiningClass(entry.combining); + + let decimal_digit = match entry.digit & 0xf { + 0xf => None, + n => Some(n), + }; + + let digit = match (entry.digit >> 4) & 0xf { + 0xf => None, + n => Some(n), + }; + + Some(CharData { + codepoint, + name, + category, + combining, + bidi, + decomp, + decimal_digit, + digit, + numeric, + mirrored, + old_name, + comment, + uppercase, + lowercase, + titlecase, + }) + } + + fn char_entry_for(self, codepoint: u32) -> Option<&'a CharTableEntry> { + let index = self.group_table + .char_table_index_for(codepoint)? + .pipe(usize::try_from) + .ok()?; + + self.char_table.get(index) } } @@ -78,8 +152,74 @@ impl<'a> GroupTable<'a> { Ok(Self { entries }) } + + // TODO: compare performance of binary search to linear search + // TODO: fast path for characters before the first group + fn char_table_index_for(self, codepoint: u32) -> Option { + let mut entries = self.entries; + let mut offset = 0; + + loop { + if entries.len() == 0 { + break codepoint.checked_sub(offset); + } + + let midpoint = entries.len() / 2; + let entry = &entries[midpoint]; + let start = entry.start.to_u32(); + let end = entry.end.to_u32(); + let total_len_before = entry.total_len_before.to_u32(); + + if start <= codepoint && codepoint <= end { + match entry.kind { + GROUP_KIND_USE_PREV_VALUE => { + // This group uses the same character data as the codepoint immediately + // before the group start (`start - 1`). Subtract `total_len_before`, which + // is the total length of all groups before this group, from `start - 1` to + // find the index of its character data in the character table. + break start + .checked_sub(1) + .expect("first codepoint for a USE_PREV_VALUE group should always be at least 1") + .checked_sub(total_len_before) + .expect("computed character data index should not underflow") + .pipe(Some) + }, + + // If the codepoint is in a group which is not `USE_PREV_VALUE`, we take it to + // be a codepoint with no associated character data. + _ => break None, + } + } else if codepoint > end { + // Since the `end` is inclusive, the length of the group is calculated as + // `(end - start) + 1`. + let group_len = end + .checked_sub(start) + .expect("group start should be less than or equal to the group end") + .checked_add(1) + .expect("group length should not overflow a u32"); + + // `total_len_before` is the total length of all groups before this group, so we + // can calculate the total length of all groups up to and including this group by + // adding `group_len` to it. We assign this to `offset` because this is the group + // with the largest `end` value that is less than the codepoint, and is therefore + // the offset that should be used to calculate the character table index for this + // codepoint in the event that there are no groups with a larger `end` value less + // than the codepoint and the codepoint is not contained in a group. + offset = total_len_before + .checked_add(group_len) + .expect("cumulative group length should not overflow a u32"); + + entries = &entries[(midpoint + 1)..]; + } else { + entries = &entries[..midpoint]; + } + } + } } +const GROUP_KIND_NO_VALUE: u8 = 0; +const GROUP_KIND_USE_PREV_VALUE: u8 = 1; + #[derive(Debug)] #[repr(C, packed)] struct GroupTableEntry { @@ -95,7 +235,7 @@ impl GroupTableEntry { #[derive(Debug)] #[derive(Clone, Copy)] -pub(crate) struct CharTable<'a> { +struct CharTable<'a> { entries: &'a [CharTableEntry], } @@ -135,6 +275,10 @@ impl<'a> CharTable<'a> { Ok(Self { entries }) } + + fn get(self, i: usize) -> Option<&'a CharTableEntry> { + self.entries.get(i) + } } #[derive(Debug)] @@ -166,6 +310,26 @@ impl<'a> StringTable<'a> { fn new(bs: &'a [u8]) -> Self { Self { inner: bs } } + + fn get(self, i: usize) -> Option<&'a str> { + let len = usize::from(*self.inner.get(i)?); + + let str_start = i.checked_add(1)?; + let str_end = str_start.checked_add(len)?; + + self.inner.get(str_start..str_end) + .and_then(|s| str::from_utf8(s).ok()) + } + + fn get_u24_le(self, i: U24Le) -> Option<&'a str> { + const NIL_INDEX_PATTERN: [u8; 3] = [0xff; 3]; + + if i.0 == NIL_INDEX_PATTERN { + return None; + } + + i.to_usize().and_then(|i| self.get(i)) + } } #[derive(Clone, Copy)] @@ -194,6 +358,10 @@ impl U24Le { (&mut buf[..3]).copy_from_slice(&self.0); u32::from_le_bytes(buf) } + + fn to_usize(self) -> Option { + usize::try_from(self.to_u32()).ok() + } } impl fmt::Debug for U24Le {