From 24abc7ed796444c693f1f63f89ecafa883c3280e Mon Sep 17 00:00:00 2001 From: pantonshire Date: Fri, 2 Jun 2023 20:50:47 +0100 Subject: [PATCH] work on rust side of new encoded data format --- Cargo.lock | 66 ++++++++++-- lib/Cargo.toml | 5 +- lib/build.rs | 75 +++----------- lib/src/lib.rs | 34 +++++-- lib/src/unicode_data.rs | 217 ++++++++++++++++++++++++++++++++++++++++ lib/src/utf8.rs | 4 +- 6 files changed, 315 insertions(+), 86 deletions(-) create mode 100644 lib/src/unicode_data.rs diff --git a/Cargo.lock b/Cargo.lock index 85026ee..935c564 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "atty" version = "0.2.14" @@ -31,6 +37,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "clap" version = "3.2.25" @@ -70,6 +82,25 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "flate2" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -121,11 +152,20 @@ dependencies = [ "serde", ] +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + [[package]] name = "once_cell" -version = "1.17.1" +version = "1.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b" [[package]] name = "os_str_bytes" @@ -170,18 +210,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" +checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" dependencies = [ "proc-macro2", ] @@ -233,6 +273,12 @@ dependencies = [ "syn", ] +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + [[package]] name = "termcolor" version = "1.2.0" @@ -250,9 +296,9 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" [[package]] name = "unicode-width" @@ -264,8 +310,8 @@ checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" name = "utfdump" version = "0.1.0" dependencies = [ - "once_cell", - "utfdump_core", + "flate2", + "tap", ] [[package]] diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 62a5e19..6b4e213 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -4,8 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -utfdump_core = { path = "../core" } -once_cell = "1.17.1" +tap = "1.0.1" [build-dependencies] -utfdump_core = { path = "../core" } +flate2 = "1.0.26" diff --git a/lib/build.rs b/lib/build.rs index 5d0244a..c80209f 100644 --- a/lib/build.rs +++ b/lib/build.rs @@ -1,71 +1,20 @@ -use std::{env, fs::File, io::{BufReader, BufRead, Write}, path::Path}; +use std::{env, fs::File, io, path::Path}; -use utfdump_core::{CharData, DataStoreBuf}; - -const UNICODE_DATA_PATH: &str = "unicode_data_latest.txt"; +const COMPRESSED_DATA_PATH: &str = "../unicode_data_encoded.gz"; const OUT_DATA_PATH: &str = "unicode_data_encoded"; -fn main() { - println!("cargo:rerun-if-changed={}", UNICODE_DATA_PATH); - +fn main() -> io::Result<()> { + println!("cargo:rerun-if-changed={}", COMPRESSED_DATA_PATH); + let out_dir = env::var_os("OUT_DIR").unwrap(); let out_path = Path::new(&out_dir).join(OUT_DATA_PATH); + let out_data_fd = File::create(out_path)?; + let mut decoder = flate2::write::GzDecoder::new(out_data_fd); + + let mut compressed_data_fd = File::open(COMPRESSED_DATA_PATH)?; - let data_file = File::open(UNICODE_DATA_PATH) - .expect("failed to open unicode data file"); - - let buf_reader = BufReader::new(data_file); - - let mut data = DataStoreBuf::new(); - let mut start_codepoint = None; - - for line in buf_reader.lines() { - let line = line.unwrap(); - let (codepoint, char_data) = CharData::from_row(&line).unwrap(); - - match start_codepoint { - Some(start_codepoint_inner) => { - let prefix = char_data.name() - .strip_suffix(", Last>") - .expect("expected end of codepoint block"); - - let name = { - let mut buf = String::with_capacity(prefix.len() + 1); - buf.push_str(prefix); - buf.push('>'); - buf - }; - - let char_data = char_data.with_name(&name); - - data.insert(char_data, start_codepoint_inner..(codepoint + 1)) - .unwrap(); - - start_codepoint = None; - }, - - None => { - if char_data.name().ends_with(", First>") { - start_codepoint = Some(codepoint); - } else { - data.insert(char_data, codepoint..(codepoint + 1)) - .unwrap(); - } - }, - } - } - - let (strings_len, [strings, data]) = data - .as_ref_type() - .to_bytes() - .unwrap(); - - let mut out_file = File::create(&out_path) - .expect("failed to open output file"); - - out_file.write_all(&strings_len).unwrap(); - out_file.write_all(strings).unwrap(); - out_file.write_all(data).unwrap(); + io::copy(&mut compressed_data_fd, &mut decoder)?; + decoder.finish()?; - drop(out_file); + Ok(()) } diff --git a/lib/src/lib.rs b/lib/src/lib.rs index 58a181a..9ac68e8 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -1,18 +1,36 @@ +pub mod unicode_data; pub mod utf8; -pub use utfdump_core::{CharData, Category, CombiningClass}; +// pub use utfdump_core::{CharData, Category, CombiningClass}; -use once_cell::sync::Lazy; -use utfdump_core::data_store::DataStore; +// use once_cell::sync::Lazy; +// use utfdump_core::data_store::DataStore; + +// const UNICODE_DATA_BYTES: &[u8] = include_bytes!( +// concat!(env!("OUT_DIR"), "/unicode_data_encoded") +// ); + +// static UNICODE_DATA: Lazy = Lazy::new(|| { +// DataStore::from_bytes(UNICODE_DATA_BYTES).unwrap() +// }); + +// pub fn char_data(c: char) -> Option> { +// UNICODE_DATA.get(c) +// } const UNICODE_DATA_BYTES: &[u8] = include_bytes!( concat!(env!("OUT_DIR"), "/unicode_data_encoded") ); -static UNICODE_DATA: Lazy = Lazy::new(|| { - DataStore::from_bytes(UNICODE_DATA_BYTES).unwrap() -}); +#[cfg(test)] +mod tests { + use crate::{UNICODE_DATA_BYTES, unicode_data}; + + #[test] + fn test_encoded_data() { + let data = unicode_data::UnicodeData::from_bytes(UNICODE_DATA_BYTES) + .unwrap(); -pub fn char_data(c: char) -> Option> { - UNICODE_DATA.get(c) + println!("{:#?}", data.groups()); + } } diff --git a/lib/src/unicode_data.rs b/lib/src/unicode_data.rs new file mode 100644 index 0000000..1f5e666 --- /dev/null +++ b/lib/src/unicode_data.rs @@ -0,0 +1,217 @@ +use core::{fmt, mem, slice}; + +use tap::Pipe; + +const MAGIC_NUMBER: [u8; 8] = *b"UTFDUMP!"; + +#[derive(Clone, Copy)] +pub struct UnicodeData<'a> { + group_table: GroupTable<'a>, + char_table: CharTable<'a>, + string_table: StringTable<'a>, +} + +impl<'a> UnicodeData<'a> { + pub(crate) fn from_bytes(bs: &'a [u8]) -> Result { + let mut bs = ByteStream(bs); + + if bs.consume(MAGIC_NUMBER.len())? != MAGIC_NUMBER { + return Err(UnicodeDataError::InvalidHeader); + } + + let group_table_len = bs.consume_4_byte_len()?; + let char_table_len = bs.consume_4_byte_len()?; + let string_table_len = bs.consume_4_byte_len()?; + + let group_table = bs.consume(group_table_len)?.pipe(GroupTable::new)?; + let char_table = bs.consume(char_table_len)?.pipe(CharTable::new); + let string_table = bs.consume(string_table_len)?.pipe(StringTable::new); + + bs.check_empty()?; + + Ok(Self { group_table, char_table, string_table }) + } + + pub(crate) fn groups(self) -> GroupTable<'a> { + self.group_table + } +} + +#[derive(Clone, Copy, Debug)] +pub(crate) struct GroupTable<'a> { + entries: &'a [GroupTableEntry], +} + +impl<'a> GroupTable<'a> { + fn new(bs: &'a [u8]) -> Result { + if bs.len() % GroupTableEntry::SIZE != 0 { + return Err(UnicodeDataError::InvalidTableSize); + } + + let num_entries = bs.len() / GroupTableEntry::SIZE; + + // SAFETY: + // - The pointer is valid for reads of `num_entries * mem::size_of::()` + // bytes; `num_entries = bs.len() / mem::size_of::()`, so + // `num_entries * mem::size_of::() <= bs.len()` (the inequality is due + // to flooring integer division), and clearly a pointer to `bs` is valid for reads of + // <= `bs.len()` bytes. + // + // - `u8` and `GroupTableEntry` both have an alignment of 1 (since `GroupTableEntry` is + // packed), so the pointer is correctly aligned. + // + // - The pointer points to `num_entries` consecutive properly-initialised `GroupTableEntry` + // values, as `bs` contains initialised data and `GroupTableEntry` consists only of + // arrays of `u8` of varying sizes, for which any bit pattern is valid. + // + // - Since we obtained the pointer from an immutable reference `bs`, the data cannot be + // mutated by safe code for the duration of the lifetime `'a`. + // + // - The total length of the slice does not exceed `isize::MAX`, since it is no larger + // than `bs` which is a valid slice and therefore no larger than `isize::MAX`. + let entries = unsafe { + slice::from_raw_parts( + bs.as_ptr() as *const GroupTableEntry, + num_entries + ) + }; + + Ok(Self { entries }) + } +} + +#[derive(Debug)] +#[repr(C, packed)] +struct GroupTableEntry { + start: U32Le, + end: U32Le, + total_len_before: U32Le, + kind: u8, +} + +impl GroupTableEntry { + const SIZE: usize = mem::size_of::(); +} + +#[derive(Clone, Copy)] +struct CharTable<'a> { + inner: &'a [u8], +} + +impl<'a> CharTable<'a> { + fn new(bs: &'a [u8]) -> Self { + Self { inner: bs } + } +} + +#[derive(Clone, Copy)] +struct StringTable<'a> { + inner: &'a [u8], +} + +impl<'a> StringTable<'a> { + fn new(bs: &'a [u8]) -> Self { + Self { inner: bs } + } +} + +#[derive(Clone, Copy)] +#[repr(transparent)] +struct U16Le([u8; 2]); + +impl U16Le { + fn to_u16(self) -> u16 { + u16::from_le_bytes(self.0) + } +} + +impl fmt::Debug for U16Le { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.to_u16(), f) + } +} + +#[derive(Clone, Copy)] +#[repr(transparent)] +struct U24Le([u8; 3]); + +impl U24Le { + fn to_u32(self) -> u32 { + let mut buf = [0u8; 4]; + (&mut buf[..3]).copy_from_slice(&self.0); + u32::from_le_bytes(buf) + } +} + +impl fmt::Debug for U24Le { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.to_u32(), f) + } +} + +#[derive(Clone, Copy)] +#[repr(transparent)] +struct U32Le([u8; 4]); + +impl U32Le { + fn to_u32(self) -> u32 { + u32::from_le_bytes(self.0) + } +} + +impl fmt::Debug for U32Le { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.to_u32(), f) + } +} + +struct ByteStream<'a>(&'a [u8]); + +impl<'a> ByteStream<'a> { + fn consume(&mut self, n: usize) -> Result<&'a [u8], UnicodeDataError> { + if n > self.0.len() { + return Err(UnicodeDataError::InsufficientBytes); + } + + let consumed = &self.0[..n]; + self.0 = &self.0[n..]; + Ok(consumed) + } + + fn consume_4_byte_len(&mut self) -> Result { + self.consume(4)? + .pipe(<[u8; 4]>::try_from) + .unwrap() + .pipe(u32::from_le_bytes) + .pipe(usize::try_from) + .map_err(|_| UnicodeDataError::OutOfBounds) + } + + fn check_empty(&self) -> Result<(), UnicodeDataError> { + self.0 + .is_empty() + .then_some(()) + .ok_or(UnicodeDataError::LeftoverBytes) + } +} + +#[derive(Debug)] +pub enum UnicodeDataError { + InvalidHeader, + InsufficientBytes, + OutOfBounds, + LeftoverBytes, + InvalidTableSize, +} + +impl fmt::Display for UnicodeDataError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidHeader => write!(f, "invalid header"), + Self::InsufficientBytes => write!(f, "fewer bytes than expected"), + Self::OutOfBounds => write!(f, "index out of bounds"), + Self::LeftoverBytes => write!(f, "unexpected bytes found after expected end of data"), + Self::InvalidTableSize => write!(f, "invalid table size"), + } + } +} diff --git a/lib/src/utf8.rs b/lib/src/utf8.rs index 8cbee11..a15e18e 100644 --- a/lib/src/utf8.rs +++ b/lib/src/utf8.rs @@ -1,4 +1,4 @@ -use std::iter::Peekable; +use core::iter::Peekable; pub trait ToByte { fn to_byte(self) -> u8; @@ -186,7 +186,7 @@ impl Utf8Error { #[cfg(test)] mod tests { - use std::char::REPLACEMENT_CHARACTER; + use core::char::REPLACEMENT_CHARACTER; use super::Utf8Decode;