Working Rust decoder for the new encoded data format

main
pantonshire 3 years ago
parent dc4650262d
commit a3750b5732

@ -0,0 +1,315 @@
/// Properties of a single codepoint, as assembled by `UnicodeData::get`.
///
/// All textual properties are held as `&'a str` slices rather than owned strings.
#[derive(Debug)]
pub struct CharData<'a> {
/// The codepoint this data was looked up for.
pub(crate) codepoint: u32,
/// The character's name.
pub(crate) name: &'a str,
/// The character's category.
pub(crate) category: Category,
/// The character's combining class value.
pub(crate) combining: CombiningClass,
/// The character's bidirectional category.
pub(crate) bidi: BidiCategory,
/// The character's decomposition mapping, if it has one.
pub(crate) decomp: Option<DecompMapping<'a>>,
/// Decimal digit value, if any. Decoded from a 4-bit field in which `0xf` means "absent".
pub(crate) decimal_digit: Option<u8>,
/// Digit value, if any. Decoded from a 4-bit field in which `0xf` means "absent".
pub(crate) digit: Option<u8>,
/// Numeric value, if any, kept as text rather than parsed into a number.
// FIXME: replace with exact fraction type?
pub(crate) numeric: Option<&'a str>,
/// Whether the mirrored flag is set for the character.
pub(crate) mirrored: bool,
/// An older name for the character, if any (the test data has `NULL` for codepoint `0x0`).
pub(crate) old_name: Option<&'a str>,
/// A comment attached to the character, if any.
pub(crate) comment: Option<&'a str>,
/// Uppercase mapping, if any, as text.
/// NOTE(review): the exact textual format of the case mappings is not visible here - confirm.
pub(crate) uppercase: Option<&'a str>,
/// Lowercase mapping, if any, as text.
pub(crate) lowercase: Option<&'a str>,
/// Titlecase mapping, if any, as text.
pub(crate) titlecase: Option<&'a str>,
}
impl<'a> CharData<'a> {
#[inline]
#[must_use]
pub fn codepoint(&self) -> u32 {
self.codepoint
}
#[inline]
#[must_use]
pub fn name(&self) -> &'a str {
self.name
}
#[inline]
#[must_use]
pub fn category(&self) -> Category {
self.category
}
#[inline]
#[must_use]
pub fn bidi(&self) -> BidiCategory {
self.bidi
}
#[inline]
#[must_use]
pub fn decomp(&self) -> Option<DecompMapping<'a>> {
self.decomp
}
#[inline]
#[must_use]
pub fn decimal_digit(&self) -> Option<u8> {
self.decimal_digit
}
#[inline]
#[must_use]
pub fn digit(&self) -> Option<u8> {
self.digit
}
#[inline]
#[must_use]
pub fn numeric(&self) -> Option<&'a str> {
self.numeric
}
#[inline]
#[must_use]
pub fn mirrored(&self) -> bool {
self.mirrored
}
#[inline]
#[must_use]
pub fn old_name(&self) -> Option<&'a str> {
self.old_name
}
#[inline]
#[must_use]
pub fn comment(&self) -> Option<&'a str> {
self.comment
}
#[inline]
#[must_use]
pub fn uppercase(&self) -> Option<&'a str> {
self.uppercase
}
#[inline]
#[must_use]
pub fn lowercase(&self) -> Option<&'a str> {
self.lowercase
}
#[inline]
#[must_use]
pub fn titlecase(&self) -> Option<&'a str> {
self.titlecase
}
}
/// Newtype around the raw `u8` combining class value stored for a character.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub struct CombiningClass(pub u8);
/// Category of a character.
///
/// The encoded form of each variant is its position in this declaration, starting at 0.
///
/// NOTE(review): the variant names look like the two-letter Unicode general category
/// abbreviations - confirm against the encoder.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Category {
    Lu,
    Ll,
    Lt,
    Mn,
    Mc,
    Me,
    Nd,
    Nl,
    No,
    Zs,
    Zl,
    Zp,
    Cc,
    Cf,
    Cs,
    Co,
    Cn,
    Lm,
    Lo,
    Pc,
    Pd,
    Ps,
    Pe,
    Pi,
    Pf,
    Po,
    Sm,
    Sc,
    Sk,
    So,
}

impl Category {
    /// Decodes a category from its numeric encoding.
    ///
    /// Values `0..=29` map to the variants in declaration order; any other value yields `None`.
    pub(crate) fn decode(encoded: u8) -> Option<Self> {
        // Indexed by encoded value. `Self` cannot be named inside a nested item, hence the
        // explicit `Category::` paths.
        const BY_ENCODING: [Category; 30] = [
            Category::Lu,
            Category::Ll,
            Category::Lt,
            Category::Mn,
            Category::Mc,
            Category::Me,
            Category::Nd,
            Category::Nl,
            Category::No,
            Category::Zs,
            Category::Zl,
            Category::Zp,
            Category::Cc,
            Category::Cf,
            Category::Cs,
            Category::Co,
            Category::Cn,
            Category::Lm,
            Category::Lo,
            Category::Pc,
            Category::Pd,
            Category::Ps,
            Category::Pe,
            Category::Pi,
            Category::Pf,
            Category::Po,
            Category::Sm,
            Category::Sc,
            Category::Sk,
            Category::So,
        ];

        BY_ENCODING.get(usize::from(encoded)).copied()
    }
}
/// Bidirectional category of a character.
///
/// The encoded form of each variant is its position in this declaration, starting at 0.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum BidiCategory {
    L,
    R,
    Al,
    En,
    Es,
    Et,
    An,
    Cs,
    Nsm,
    Bn,
    B,
    S,
    Ws,
    On,
    Lre,
    Lro,
    Rle,
    Rlo,
    Pdf,
    Lri,
    Rli,
    Fsi,
    Pdi,
}

impl BidiCategory {
    /// Decodes a bidirectional category from its numeric encoding.
    ///
    /// Values `0..=22` map to the variants in declaration order; any other value yields `None`.
    pub(crate) fn decode(encoded: u8) -> Option<Self> {
        // Indexed by encoded value. `Self` cannot be named inside a nested item, hence the
        // explicit `BidiCategory::` paths.
        const BY_ENCODING: [BidiCategory; 23] = [
            BidiCategory::L,
            BidiCategory::R,
            BidiCategory::Al,
            BidiCategory::En,
            BidiCategory::Es,
            BidiCategory::Et,
            BidiCategory::An,
            BidiCategory::Cs,
            BidiCategory::Nsm,
            BidiCategory::Bn,
            BidiCategory::B,
            BidiCategory::S,
            BidiCategory::Ws,
            BidiCategory::On,
            BidiCategory::Lre,
            BidiCategory::Lro,
            BidiCategory::Rle,
            BidiCategory::Rlo,
            BidiCategory::Pdf,
            BidiCategory::Lri,
            BidiCategory::Rli,
            BidiCategory::Fsi,
            BidiCategory::Pdi,
        ];

        BY_ENCODING.get(usize::from(encoded)).copied()
    }
}
/// A character's decomposition mapping: an optional kind tag plus the mapping value as text.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct DecompMapping<'a> {
    kind: Option<DecompKind>,
    value: &'a str,
}

impl<'a> DecompMapping<'a> {
    /// Builds a mapping from its kind tag (if any) and its value.
    pub(crate) fn new(kind: Option<DecompKind>, value: &'a str) -> Self {
        DecompMapping { kind, value }
    }

    /// Returns the kind tag of the mapping, or `None` when the mapping has no named kind.
    #[inline]
    #[must_use]
    pub fn kind(self) -> Option<DecompKind> {
        let Self { kind, .. } = self;
        kind
    }

    /// Returns the mapping value as text.
    #[inline]
    #[must_use]
    pub fn value(self) -> &'a str {
        let Self { value, .. } = self;
        value
    }
}
/// The named kinds a decomposition mapping can be tagged with.
///
/// NOTE(review): the names look like the compatibility formatting tags used in the Unicode
/// character database - confirm against the encoder.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum DecompKind {
    Nobreak,
    Compat,
    Super,
    Fraction,
    Sub,
    Font,
    Circle,
    Wide,
    Vertical,
    Square,
    Isolated,
    Final,
    Initial,
    Medial,
    Small,
    Narrow,
}

/// Decoded form of the decomposition-kind field of a character entry.
pub(crate) enum OptionalDecompKind {
    /// No decomposition mapping.
    None,
    /// A decomposition mapping without a named kind.
    Anon,
    /// A decomposition mapping tagged with the given kind.
    Named(DecompKind),
}

impl OptionalDecompKind {
    /// Decodes the decomposition-kind field from its numeric encoding.
    ///
    /// `0` is [`OptionalDecompKind::None`], `1` is [`OptionalDecompKind::Anon`], and `2..=17`
    /// are the named kinds in [`DecompKind`] declaration order. Any other value yields
    /// `Option::None`.
    pub(crate) fn decode(encoded: u8) -> Option<Self> {
        // The two unnamed cases and the invalid case return early; everything else falls
        // through to be wrapped in `Named` once.
        let named = match encoded {
            0 => return Some(Self::None),
            1 => return Some(Self::Anon),
            2 => DecompKind::Nobreak,
            3 => DecompKind::Compat,
            4 => DecompKind::Super,
            5 => DecompKind::Fraction,
            6 => DecompKind::Sub,
            7 => DecompKind::Font,
            8 => DecompKind::Circle,
            9 => DecompKind::Wide,
            10 => DecompKind::Vertical,
            11 => DecompKind::Square,
            12 => DecompKind::Isolated,
            13 => DecompKind::Final,
            14 => DecompKind::Initial,
            15 => DecompKind::Medial,
            16 => DecompKind::Small,
            17 => DecompKind::Narrow,
            _ => return None,
        };

        Some(Self::Named(named))
    }
}

@ -1,3 +1,4 @@
pub mod character;
pub mod unicode_data;
pub mod utf8;
@ -27,11 +28,30 @@ mod tests {
use crate::{UNICODE_DATA_BYTES, unicode_data};
#[test]
fn test_encoded_data() {
fn test_data_decode() {
// Decodes `UNICODE_DATA_BYTES` and spot-checks individual codepoint lookups through
// `UnicodeData::get`.
let data = unicode_data::UnicodeData::from_bytes(UNICODE_DATA_BYTES)
.unwrap();
// println!("{:#?}", data.groups());
// NOTE(review): this listing looks like a rendered diff with old and new lines interleaved;
// the `data.chars()` call below may be a removed line - confirm before relying on it.
println!("{:#?}", data.chars());
// Codepoints 0x0..=0x2 are expected to share the name `<control>` and to be told apart by
// their old names.
assert_eq!(data.get(0x0).unwrap().name(), "<control>");
assert_eq!(data.get(0x0).unwrap().old_name(), Some("NULL"));
assert_eq!(data.get(0x1).unwrap().name(), "<control>");
assert_eq!(data.get(0x1).unwrap().old_name(), Some("START OF HEADING"));
assert_eq!(data.get(0x2).unwrap().name(), "<control>");
assert_eq!(data.get(0x2).unwrap().old_name(), Some("START OF TEXT"));
// 0x378 and 0x379 are expected to have no data, while their neighbours on either side do.
assert_eq!(data.get(0x377).unwrap().name(), "GREEK SMALL LETTER PAMPHYLIAN DIGAMMA");
assert!(data.get(0x378).is_none());
assert!(data.get(0x379).is_none());
assert_eq!(data.get(0x37a).unwrap().name(), "GREEK YPOGEGRAMMENI");
// Codepoints sampled from 0x3400 through 0x4dbf are all expected to report the same name;
// the codepoints just outside that span (0x33ff, 0x4dc0) have their own names.
// NOTE(review): presumably this exercises the "use previous value" group lookup - confirm.
assert_eq!(data.get(0x33ff).unwrap().name(), "SQUARE GAL");
assert_eq!(data.get(0x3400).unwrap().name(), "CJK Ideograph Extension A");
assert_eq!(data.get(0x3401).unwrap().name(), "CJK Ideograph Extension A");
assert_eq!(data.get(0x3402).unwrap().name(), "CJK Ideograph Extension A");
assert_eq!(data.get(0x4dbe).unwrap().name(), "CJK Ideograph Extension A");
assert_eq!(data.get(0x4dbf).unwrap().name(), "CJK Ideograph Extension A");
assert_eq!(data.get(0x4dc0).unwrap().name(), "HEXAGRAM FOR THE CREATIVE HEAVEN");
// A lookup above 0xffff.
assert_eq!(data.get(0x1039f).unwrap().name(), "UGARITIC WORD DIVIDER");
}
}

@ -1,7 +1,16 @@
use core::{fmt, mem, slice};
use core::{fmt, mem, slice, str};
use tap::Pipe;
use crate::character::{
CharData,
Category,
BidiCategory,
OptionalDecompKind,
CombiningClass,
DecompMapping,
};
/// The 8-byte ASCII signature `UTFDUMP!`.
///
/// NOTE(review): presumably checked against the start of the encoded data when parsing; its
/// use is not visible here - confirm.
const MAGIC_NUMBER: [u8; 8] = *b"UTFDUMP!";
#[derive(Clone, Copy)]
@ -32,8 +41,73 @@ impl<'a> UnicodeData<'a> {
Ok(Self { group_table, char_table, string_table })
}
pub(crate) fn chars(self) -> CharTable<'a> {
self.char_table
/// Looks up the character data for `codepoint`.
///
/// Returns `None` when no character table entry is found for the codepoint, when the entry's
/// name cannot be read from the string table, or when one of the packed category fields holds
/// a value its decoder rejects. The remaining string properties are individually optional.
pub fn get(self, codepoint: u32) -> Option<CharData<'a>> {
    let entry = self.char_entry_for(codepoint)?;
    let strings = self.string_table;

    // Packed field layout, lowest bits first: 5 bits of category, 5 bits of bidi category,
    // 5 bits of decomposition kind, then the mirrored flag.
    let packed = entry.flags_and_categories.to_u16();
    let five_bits_at = |shift: u32| ((packed >> shift) & 0x1f) as u8;
    let category = Category::decode(five_bits_at(0))?;
    let bidi = BidiCategory::decode(five_bits_at(5))?;
    let decomp_kind = OptionalDecompKind::decode(five_bits_at(10))?;
    let mirrored = (packed >> 15) != 0;

    // The name is the only string property that must be present.
    let name = strings.get_u24_le(entry.name)?;

    // A decomposition mapping exists only when there is both a value and a kind other than
    // `OptionalDecompKind::None`.
    let decomp = strings.get_u24_le(entry.decomp).and_then(|value| match decomp_kind {
        OptionalDecompKind::None => None,
        OptionalDecompKind::Anon => Some(DecompMapping::new(None, value)),
        OptionalDecompKind::Named(kind) => Some(DecompMapping::new(Some(kind), value)),
    });

    // Both digit values share one byte: decimal digit in the low nibble, digit in the high
    // nibble. A nibble of `0xf` means "no value".
    let nibble = |bits: u8| Some(bits & 0xf).filter(|&n| n != 0xf);
    let digits = entry.digit;

    Some(CharData {
        codepoint,
        name,
        category,
        combining: CombiningClass(entry.combining),
        bidi,
        decomp,
        decimal_digit: nibble(digits),
        digit: nibble(digits >> 4),
        numeric: strings.get_u24_le(entry.numeric),
        mirrored,
        old_name: strings.get_u24_le(entry.old_name),
        comment: strings.get_u24_le(entry.comment),
        uppercase: strings.get_u24_le(entry.uppercase),
        lowercase: strings.get_u24_le(entry.lowercase),
        titlecase: strings.get_u24_le(entry.titlecase),
    })
}
/// Finds the character table entry holding the data for `codepoint`.
///
/// Returns `None` when the group table yields no index for the codepoint, when that index
/// does not fit in a `usize`, or when it is out of bounds for the character table.
fn char_entry_for(self, codepoint: u32) -> Option<&'a CharTableEntry> {
    let raw_index = self.group_table.char_table_index_for(codepoint)?;
    let index = usize::try_from(raw_index).ok()?;
    self.char_table.get(index)
}
}
@ -78,8 +152,74 @@ impl<'a> GroupTable<'a> {
Ok(Self { entries })
}
// TODO: compare performance of binary search to linear search
// TODO: fast path for characters before the first group
/// Maps `codepoint` to its index in the character table by binary-searching the groups.
///
/// Returns `None` when the codepoint falls inside a group that carries no character data, or
/// when subtracting the accumulated group length from the codepoint would underflow.
///
/// # Panics
///
/// Panics if a visited group entry breaks one of the invariants named in the `expect`
/// messages below (for example a group whose `start` is greater than its `end`).
fn char_table_index_for(self, codepoint: u32) -> Option<u32> {
    let mut remaining = self.entries;
    // Total length of every group that lies entirely below `codepoint`, as far as the search
    // has established so far.
    let mut skipped_len = 0;

    while !remaining.is_empty() {
        let mid = remaining.len() / 2;
        let group = &remaining[mid];
        let first = group.start.to_u32();
        let last = group.end.to_u32();
        let len_before = group.total_len_before.to_u32();

        if first <= codepoint && codepoint <= last {
            return match group.kind {
                GROUP_KIND_USE_PREV_VALUE => {
                    // Every codepoint in this group shares the character data of the
                    // codepoint immediately before the group (`first - 1`). Subtracting the
                    // total length of all earlier groups from that codepoint gives its index
                    // in the character table.
                    let prev_codepoint = first
                        .checked_sub(1)
                        .expect("first codepoint for a USE_PREV_VALUE group should always be at least 1");
                    let index = prev_codepoint
                        .checked_sub(len_before)
                        .expect("computed character data index should not underflow");
                    Some(index)
                }
                // A codepoint inside any other kind of group is taken to have no associated
                // character data.
                _ => None,
            };
        }

        if codepoint > last {
            // `last` is inclusive, so the group covers `(last - first) + 1` codepoints.
            let group_len = last
                .checked_sub(first)
                .expect("group start should be less than or equal to the group end")
                .checked_add(1)
                .expect("group length should not overflow a u32");

            // This is now the group with the largest `end` known to be below the codepoint,
            // so the length of everything up to and including it is the offset to use if no
            // closer group turns up and the codepoint is not inside any group.
            skipped_len = len_before
                .checked_add(group_len)
                .expect("cumulative group length should not overflow a u32");
            remaining = &remaining[(mid + 1)..];
        } else {
            // The codepoint lies below this group; keep searching the lower half.
            remaining = &remaining[..mid];
        }
    }

    // The codepoint is not inside any group: its index is the codepoint minus the number of
    // codepoints swallowed by the groups below it.
    codepoint.checked_sub(skipped_len)
}
}
/// Group kind tag for a group whose codepoints have no associated character data.
/// NOTE(review): not referenced by the visible lookup code, which treats every kind other than
/// `GROUP_KIND_USE_PREV_VALUE` as "no data" - confirm it is used elsewhere.
const GROUP_KIND_NO_VALUE: u8 = 0;
/// Group kind tag for a group whose codepoints all share the character data of the codepoint
/// immediately before the group's start.
const GROUP_KIND_USE_PREV_VALUE: u8 = 1;
#[derive(Debug)]
#[repr(C, packed)]
struct GroupTableEntry {
@ -95,7 +235,7 @@ impl GroupTableEntry {
#[derive(Debug)]
#[derive(Clone, Copy)]
pub(crate) struct CharTable<'a> {
/// Borrowed view over the character table entries.
struct CharTable<'a> {
// The entries, indexed by character table index.
entries: &'a [CharTableEntry],
}
@ -135,6 +275,10 @@ impl<'a> CharTable<'a> {
Ok(Self { entries })
}
fn get(self, i: usize) -> Option<&'a CharTableEntry> {
self.entries.get(i)
}
}
#[derive(Debug)]
@ -166,6 +310,26 @@ impl<'a> StringTable<'a> {
/// Wraps the byte slice `bs` as a string table; the bytes are stored as-is, with no checks.
fn new(bs: &'a [u8]) -> Self {
    let inner = bs;
    Self { inner }
}
/// Reads the length-prefixed string that starts at byte offset `i`.
///
/// The byte at `i` gives the string's length in bytes and the string itself follows
/// immediately. Returns `None` if `i` is out of bounds, if the string would run past the end
/// of the table, or if its bytes are not valid UTF-8.
fn get(self, i: usize) -> Option<&'a str> {
    // Split off the length byte; this fails when `i` is at or beyond the end of the table.
    let (&len, rest) = self.inner.get(i..)?.split_first()?;
    let bytes = rest.get(..usize::from(len))?;
    str::from_utf8(bytes).ok()
}
/// Reads the string at the offset given by the 24-bit little-endian index `i`.
///
/// An index whose three bytes are all `0xff` is the "no string" marker and yields `None`
/// without touching the table. Otherwise this behaves like `get` on the converted index.
fn get_u24_le(self, i: U24Le) -> Option<&'a str> {
    const NIL_INDEX_PATTERN: [u8; 3] = [0xff; 3];

    if i.0 != NIL_INDEX_PATTERN {
        let offset = i.to_usize()?;
        self.get(offset)
    } else {
        None
    }
}
}
#[derive(Clone, Copy)]
@ -194,6 +358,10 @@ impl U24Le {
(&mut buf[..3]).copy_from_slice(&self.0);
u32::from_le_bytes(buf)
}
/// Converts the value to a `usize`, or `None` if it does not fit in a `usize` on this target.
fn to_usize(self) -> Option<usize> {
    let widened = self.to_u32();
    usize::try_from(widened).ok()
}
}
impl fmt::Debug for U24Le {

Loading…
Cancel
Save