use core::{fmt, mem, slice, str}; use tap::Pipe; use crate::character::{ CharData, Category, BidiCategory, OptionalDecompKind, CombiningClass, DecompMapping, }; const MAGIC_NUMBER: [u8; 8] = *b"UTFDUMP!"; #[derive(Clone, Copy)] pub struct UnicodeData<'a> { group_table: GroupTable<'a>, char_table: CharTable<'a>, string_table: StringTable<'a>, } impl<'a> UnicodeData<'a> { pub(crate) fn from_bytes(bs: &'a [u8]) -> Result { let mut bs = ByteStream(bs); if bs.consume(MAGIC_NUMBER.len())? != MAGIC_NUMBER { return Err(UnicodeDataError::InvalidHeader); } let group_table_len = bs.consume_4_byte_len()?; let char_table_len = bs.consume_4_byte_len()?; let string_table_len = bs.consume_4_byte_len()?; let group_table = bs.consume(group_table_len)?.pipe(GroupTable::new)?; let char_table = bs.consume(char_table_len)?.pipe(CharTable::new)?; let string_table = bs.consume(string_table_len)?.pipe(StringTable::new); bs.check_empty()?; Ok(Self { group_table, char_table, string_table }) } pub fn get(self, codepoint: u32) -> Option> { let entry = self.char_entry_for(codepoint)?; let flags_and_categories = entry.flags_and_categories.to_u16(); let category = Category::decode((flags_and_categories & 0x1f) as u8)?; let bidi = BidiCategory::decode(((flags_and_categories >> 5) & 0x1f) as u8)?; let decomp_kind = OptionalDecompKind::decode(((flags_and_categories >> 10) & 0x1f) as u8)?; let mirrored = (flags_and_categories >> 15) != 0; let name = self.string_table.get_u24_le(entry.name)?; let decomp_value = self.string_table.get_u24_le(entry.decomp); let decomp = match (decomp_kind, decomp_value) { (OptionalDecompKind::None, _) | (_, None) => None, (OptionalDecompKind::Anon, Some(value)) => { Some(DecompMapping::new(None, value)) }, (OptionalDecompKind::Named(kind), Some(value)) => { Some(DecompMapping::new(Some(kind), value)) }, }; let numeric = self.string_table.get_u24_le(entry.numeric); let old_name = self.string_table.get_u24_le(entry.old_name); let comment = self.string_table.get_u24_le(entry.comment); let uppercase = self.string_table.get_u24_le(entry.uppercase); let lowercase = self.string_table.get_u24_le(entry.lowercase); let titlecase = self.string_table.get_u24_le(entry.titlecase); let combining = CombiningClass(entry.combining); let decimal_digit = match entry.digit & 0xf { 0xf => None, n => Some(n), }; let digit = match (entry.digit >> 4) & 0xf { 0xf => None, n => Some(n), }; Some(CharData { codepoint, name, category, combining, bidi, decomp, decimal_digit, digit, numeric, mirrored, old_name, comment, uppercase, lowercase, titlecase, }) } fn char_entry_for(self, codepoint: u32) -> Option<&'a CharTableEntry> { let index = self.group_table .char_table_index_for(codepoint)? .pipe(usize::try_from) .ok()?; self.char_table.get(index) } } #[derive(Clone, Copy, Debug)] struct GroupTable<'a> { entries: &'a [GroupTableEntry], } impl<'a> GroupTable<'a> { fn new(bs: &'a [u8]) -> Result { if bs.len() % GroupTableEntry::SIZE != 0 { return Err(UnicodeDataError::InvalidTableSize); } let num_entries = bs.len() / GroupTableEntry::SIZE; // SAFETY: // - The pointer is valid for reads of `num_entries * mem::size_of::()` // bytes; `num_entries = bs.len() / mem::size_of::()`, so // `num_entries * mem::size_of::() <= bs.len()` (the inequality is due // to flooring integer division), and clearly a pointer to `bs` is valid for reads of // <= `bs.len()` bytes. // // - `u8` and `GroupTableEntry` both have an alignment of 1 (since `GroupTableEntry` is // packed), so the pointer is correctly aligned. // // - The pointer points to `num_entries` consecutive properly-initialised `GroupTableEntry` // values, as `bs` contains initialised data and `GroupTableEntry` consists only of // arrays of `u8` of varying sizes, for which any bit pattern is valid. // // - Since we obtained the pointer from an immutable reference `bs`, the data cannot be // mutated by safe code for the duration of the lifetime `'a`. // // - The total length of the slice does not exceed `isize::MAX`, since it is no larger // than `bs` which is a valid slice and therefore no larger than `isize::MAX`. let entries = unsafe { slice::from_raw_parts( bs.as_ptr() as *const GroupTableEntry, num_entries ) }; Ok(Self { entries }) } // TODO: compare performance of binary search to linear search // TODO: fast path for characters before the first group fn char_table_index_for(self, codepoint: u32) -> Option { let mut entries = self.entries; let mut offset = 0; loop { if entries.len() == 0 { break codepoint.checked_sub(offset); } let midpoint = entries.len() / 2; let entry = &entries[midpoint]; let start = entry.start.to_u32(); let end = entry.end.to_u32(); let total_len_before = entry.total_len_before.to_u32(); if start <= codepoint && codepoint <= end { match entry.kind { GROUP_KIND_USE_PREV_VALUE => { // This group uses the same character data as the codepoint immediately // before the group start (`start - 1`). Subtract `total_len_before`, which // is the total length of all groups before this group, from `start - 1` to // find the index of its character data in the character table. break start .checked_sub(1) .expect("first codepoint for a USE_PREV_VALUE group should always be at least 1") .checked_sub(total_len_before) .expect("computed character data index should not underflow") .pipe(Some) }, // If the codepoint is in a group which is not `USE_PREV_VALUE`, we take it to // be a codepoint with no associated character data. _ => break None, } } else if codepoint > end { // Since the `end` is inclusive, the length of the group is calculated as // `(end - start) + 1`. let group_len = end .checked_sub(start) .expect("group start should be less than or equal to the group end") .checked_add(1) .expect("group length should not overflow a u32"); // `total_len_before` is the total length of all groups before this group, so we // can calculate the total length of all groups up to and including this group by // adding `group_len` to it. We assign this to `offset` because this is the group // with the largest `end` value that is less than the codepoint, and is therefore // the offset that should be used to calculate the character table index for this // codepoint in the event that there are no groups with a larger `end` value less // than the codepoint and the codepoint is not contained in a group. offset = total_len_before .checked_add(group_len) .expect("cumulative group length should not overflow a u32"); entries = &entries[(midpoint + 1)..]; } else { entries = &entries[..midpoint]; } } } } const GROUP_KIND_NO_VALUE: u8 = 0; const GROUP_KIND_USE_PREV_VALUE: u8 = 1; #[derive(Debug)] #[repr(C, packed)] struct GroupTableEntry { start: U32Le, end: U32Le, total_len_before: U32Le, kind: u8, } impl GroupTableEntry { const SIZE: usize = mem::size_of::(); } #[derive(Debug)] #[derive(Clone, Copy)] struct CharTable<'a> { entries: &'a [CharTableEntry], } impl<'a> CharTable<'a> { fn new(bs: &'a [u8]) -> Result { if bs.len() % CharTableEntry::SIZE != 0 { return Err(UnicodeDataError::InvalidTableSize); } let num_entries = bs.len() / CharTableEntry::SIZE; // SAFETY: // - The pointer is valid for reads of `num_entries * mem::size_of::()` // bytes; `num_entries = bs.len() / mem::size_of::()`, so // `num_entries * mem::size_of::() <= bs.len()` (the inequality is due // to flooring integer division), and clearly a pointer to `bs` is valid for reads of // <= `bs.len()` bytes. // // - `u8` and `CharTableEntry` both have an alignment of 1 (since `CharTableEntry` is // packed), so the pointer is correctly aligned. // // - The pointer points to `num_entries` consecutive properly-initialised `CharTableEntry` // values, as `bs` contains initialised data and `CharTableEntry` consists only of // arrays of `u8` of varying sizes, for which any bit pattern is valid. // // - Since we obtained the pointer from an immutable reference `bs`, the data cannot be // mutated by safe code for the duration of the lifetime `'a`. // // - The total length of the slice does not exceed `isize::MAX`, since it is no larger // than `bs` which is a valid slice and therefore no larger than `isize::MAX`. let entries = unsafe { slice::from_raw_parts( bs.as_ptr() as *const CharTableEntry, num_entries ) }; Ok(Self { entries }) } fn get(self, i: usize) -> Option<&'a CharTableEntry> { self.entries.get(i) } } #[derive(Debug)] #[repr(C, packed)] struct CharTableEntry { flags_and_categories: U16Le, name: U24Le, decomp: U24Le, numeric: U24Le, old_name: U24Le, comment: U24Le, uppercase: U24Le, lowercase: U24Le, titlecase: U24Le, combining: u8, digit: u8, } impl CharTableEntry { const SIZE: usize = mem::size_of::(); } #[derive(Clone, Copy)] struct StringTable<'a> { inner: &'a [u8], } impl<'a> StringTable<'a> { fn new(bs: &'a [u8]) -> Self { Self { inner: bs } } fn get(self, i: usize) -> Option<&'a str> { let len = usize::from(*self.inner.get(i)?); let str_start = i.checked_add(1)?; let str_end = str_start.checked_add(len)?; self.inner.get(str_start..str_end) .and_then(|s| str::from_utf8(s).ok()) } fn get_u24_le(self, i: U24Le) -> Option<&'a str> { const NIL_INDEX_PATTERN: [u8; 3] = [0xff; 3]; if i.0 == NIL_INDEX_PATTERN { return None; } i.to_usize().and_then(|i| self.get(i)) } } #[derive(Clone, Copy)] #[repr(transparent)] struct U16Le([u8; 2]); impl U16Le { fn to_u16(self) -> u16 { u16::from_le_bytes(self.0) } } impl fmt::Debug for U16Le { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&self.to_u16(), f) } } #[derive(Clone, Copy)] #[repr(transparent)] struct U24Le([u8; 3]); impl U24Le { fn to_u32(self) -> u32 { let mut buf = [0u8; 4]; (&mut buf[..3]).copy_from_slice(&self.0); u32::from_le_bytes(buf) } fn to_usize(self) -> Option { usize::try_from(self.to_u32()).ok() } } impl fmt::Debug for U24Le { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&self.to_u32(), f) } } #[derive(Clone, Copy)] #[repr(transparent)] struct U32Le([u8; 4]); impl U32Le { fn to_u32(self) -> u32 { u32::from_le_bytes(self.0) } } impl fmt::Debug for U32Le { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&self.to_u32(), f) } } struct ByteStream<'a>(&'a [u8]); impl<'a> ByteStream<'a> { fn consume(&mut self, n: usize) -> Result<&'a [u8], UnicodeDataError> { if n > self.0.len() { return Err(UnicodeDataError::InsufficientBytes); } let consumed = &self.0[..n]; self.0 = &self.0[n..]; Ok(consumed) } fn consume_4_byte_len(&mut self) -> Result { self.consume(4)? .pipe(<[u8; 4]>::try_from) .unwrap() .pipe(u32::from_le_bytes) .pipe(usize::try_from) .map_err(|_| UnicodeDataError::OutOfBounds) } fn check_empty(&self) -> Result<(), UnicodeDataError> { self.0 .is_empty() .then_some(()) .ok_or(UnicodeDataError::LeftoverBytes) } } #[derive(Debug)] pub enum UnicodeDataError { InvalidHeader, InsufficientBytes, OutOfBounds, LeftoverBytes, InvalidTableSize, } impl fmt::Display for UnicodeDataError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::InvalidHeader => write!(f, "invalid header"), Self::InsufficientBytes => write!(f, "fewer bytes than expected"), Self::OutOfBounds => write!(f, "index out of bounds"), Self::LeftoverBytes => write!(f, "unexpected bytes found after expected end of data"), Self::InvalidTableSize => write!(f, "invalid table size"), } } }