From 24abc7ed796444c693f1f63f89ecafa883c3280e Mon Sep 17 00:00:00 2001
From: pantonshire <tom@tomandtally.co.uk>
Date: Fri, 2 Jun 2023 20:50:47 +0100
Subject: [PATCH] work on rust side of new encoded data format

---
 Cargo.lock              |  66 ++++++++++--
 lib/Cargo.toml          |   5 +-
 lib/build.rs            |  75 +++-----------
 lib/src/lib.rs          |  34 +++++--
 lib/src/unicode_data.rs | 217 ++++++++++++++++++++++++++++++++++++++++
 lib/src/utf8.rs         |   4 +-
 6 files changed, 315 insertions(+), 86 deletions(-)
 create mode 100644 lib/src/unicode_data.rs

diff --git a/Cargo.lock b/Cargo.lock
index 85026ee..935c564 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
 [[package]]
 name = "atty"
 version = "0.2.14"
@@ -31,6 +37,12 @@ version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
 
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
 [[package]]
 name = "clap"
 version = "3.2.25"
@@ -70,6 +82,25 @@ dependencies = [
  "os_str_bytes",
 ]
 
+[[package]]
+name = "crc32fast"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "flate2"
+version = "1.0.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
 [[package]]
 name = "fnv"
 version = "1.0.7"
@@ -121,11 +152,20 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "miniz_oxide"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+dependencies = [
+ "adler",
+]
+
 [[package]]
 name = "once_cell"
-version = "1.17.1"
+version = "1.17.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b"
 
 [[package]]
 name = "os_str_bytes"
@@ -170,18 +210,18 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.58"
+version = "1.0.59"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
+checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
 dependencies = [
  "unicode-ident",
 ]
 
 [[package]]
 name = "quote"
-version = "1.0.27"
+version = "1.0.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
+checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
 dependencies = [
  "proc-macro2",
 ]
@@ -233,6 +273,12 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
 [[package]]
 name = "termcolor"
 version = "1.2.0"
@@ -250,9 +296,9 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.8"
+version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
+checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
 
 [[package]]
 name = "unicode-width"
@@ -264,8 +310,8 @@ checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
 name = "utfdump"
 version = "0.1.0"
 dependencies = [
- "once_cell",
- "utfdump_core",
+ "flate2",
+ "tap",
 ]
 
 [[package]]
diff --git a/lib/Cargo.toml b/lib/Cargo.toml
index 62a5e19..6b4e213 100644
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@@ -4,8 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-utfdump_core = { path = "../core" }
-once_cell = "1.17.1"
+tap = "1.0.1"
 
 [build-dependencies]
-utfdump_core = { path = "../core" }
+flate2 = "1.0.26"
diff --git a/lib/build.rs b/lib/build.rs
index 5d0244a..c80209f 100644
--- a/lib/build.rs
+++ b/lib/build.rs
@@ -1,71 +1,20 @@
-use std::{env, fs::File, io::{BufReader, BufRead, Write}, path::Path};
+use std::{env, fs::File, io, path::Path};
 
-use utfdump_core::{CharData, DataStoreBuf};
-
-const UNICODE_DATA_PATH: &str = "unicode_data_latest.txt";
+const COMPRESSED_DATA_PATH: &str = "../unicode_data_encoded.gz";
 const OUT_DATA_PATH: &str = "unicode_data_encoded";
 
-fn main() {
-    println!("cargo:rerun-if-changed={}", UNICODE_DATA_PATH);
-    
+fn main() -> io::Result<()> {
+    println!("cargo:rerun-if-changed={}", COMPRESSED_DATA_PATH);
+
     let out_dir = env::var_os("OUT_DIR").unwrap();
     let out_path = Path::new(&out_dir).join(OUT_DATA_PATH);
+    let out_data_fd = File::create(out_path)?;
+    let mut decoder = flate2::write::GzDecoder::new(out_data_fd);
+    
+    let mut compressed_data_fd = File::open(COMPRESSED_DATA_PATH)?;
 
-    let data_file = File::open(UNICODE_DATA_PATH)
-        .expect("failed to open unicode data file");
-
-    let buf_reader = BufReader::new(data_file);
- 
-    let mut data = DataStoreBuf::new();
-    let mut start_codepoint = None;
-
-    for line in buf_reader.lines() {
-        let line = line.unwrap();
-        let (codepoint, char_data) = CharData::from_row(&line).unwrap();
-        
-        match start_codepoint {
-            Some(start_codepoint_inner) => {
-                let prefix = char_data.name()
-                    .strip_suffix(", Last>")
-                    .expect("expected end of codepoint block");
-
-                let name = {
-                    let mut buf = String::with_capacity(prefix.len() + 1);
-                    buf.push_str(prefix);
-                    buf.push('>');
-                    buf
-                };
-
-                let char_data = char_data.with_name(&name);
-
-                data.insert(char_data, start_codepoint_inner..(codepoint + 1))
-                    .unwrap();
-
-                start_codepoint = None;
-            },
-
-            None => {
-                if char_data.name().ends_with(", First>") {
-                    start_codepoint = Some(codepoint);
-                } else {
-                    data.insert(char_data, codepoint..(codepoint + 1))
-                        .unwrap();
-                }
-            },
-        }
-    }
-
-    let (strings_len, [strings, data]) = data
-        .as_ref_type()
-        .to_bytes()
-        .unwrap();
-
-    let mut out_file = File::create(&out_path)
-        .expect("failed to open output file");
-
-    out_file.write_all(&strings_len).unwrap();
-    out_file.write_all(strings).unwrap();
-    out_file.write_all(data).unwrap();
+    io::copy(&mut compressed_data_fd, &mut decoder)?;
+    decoder.finish()?;
 
-    drop(out_file);
+    Ok(())
 }
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index 58a181a..9ac68e8 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -1,18 +1,36 @@
+pub mod unicode_data;
 pub mod utf8;
 
-pub use utfdump_core::{CharData, Category, CombiningClass};
+// pub use utfdump_core::{CharData, Category, CombiningClass};
 
-use once_cell::sync::Lazy;
-use utfdump_core::data_store::DataStore;
+// use once_cell::sync::Lazy;
+// use utfdump_core::data_store::DataStore;
+
+// const UNICODE_DATA_BYTES: &[u8] = include_bytes!(
+//     concat!(env!("OUT_DIR"), "/unicode_data_encoded")
+// );
+
+// static UNICODE_DATA: Lazy<DataStore> = Lazy::new(|| {
+//     DataStore::from_bytes(UNICODE_DATA_BYTES).unwrap()
+// });
+
+// pub fn char_data(c: char) -> Option<CharData<'static>> {
+//     UNICODE_DATA.get(c)
+// }
 
 const UNICODE_DATA_BYTES: &[u8] = include_bytes!(
     concat!(env!("OUT_DIR"), "/unicode_data_encoded")
 );
 
-static UNICODE_DATA: Lazy<DataStore> = Lazy::new(|| {
-    DataStore::from_bytes(UNICODE_DATA_BYTES).unwrap()
-});
+#[cfg(test)]
+mod tests {
+    use crate::{UNICODE_DATA_BYTES, unicode_data};
+
+    #[test]
+    fn test_encoded_data() {
+        let data = unicode_data::UnicodeData::from_bytes(UNICODE_DATA_BYTES)
+            .unwrap();
 
-pub fn char_data(c: char) -> Option<CharData<'static>> {
-    UNICODE_DATA.get(c)
+        println!("{:#?}", data.groups());
+    }
 }
diff --git a/lib/src/unicode_data.rs b/lib/src/unicode_data.rs
new file mode 100644
index 0000000..1f5e666
--- /dev/null
+++ b/lib/src/unicode_data.rs
@@ -0,0 +1,217 @@
+use core::{fmt, mem, slice};
+
+use tap::Pipe;
+
+const MAGIC_NUMBER: [u8; 8] = *b"UTFDUMP!";
+
+#[derive(Clone, Copy)]
+pub struct UnicodeData<'a> {
+    group_table: GroupTable<'a>,
+    char_table: CharTable<'a>,
+    string_table: StringTable<'a>,
+}
+
+impl<'a> UnicodeData<'a> {
+    /// Parses `bs` as: the `UTFDUMP!` magic number, three 4-byte little-endian table lengths
+    /// (group, char, string), then those three tables back to back with nothing after them.
+    pub(crate) fn from_bytes(bs: &'a [u8]) -> Result<Self, UnicodeDataError> {
+        let mut stream = ByteStream(bs);
+        if stream.consume(MAGIC_NUMBER.len())? != MAGIC_NUMBER {
+            return Err(UnicodeDataError::InvalidHeader);
+        }
+
+        let group_len = stream.consume_4_byte_len()?;
+        let char_len = stream.consume_4_byte_len()?;
+        let string_len = stream.consume_4_byte_len()?;
+        let parsed = Self {
+            group_table: stream.consume(group_len)?.pipe(GroupTable::new)?,
+            char_table: stream.consume(char_len)?.pipe(CharTable::new),
+            string_table: stream.consume(string_len)?.pipe(StringTable::new),
+        };
+        stream.check_empty()?;
+        Ok(parsed)
+    }
+
+    pub(crate) fn groups(self) -> GroupTable<'a> {
+        self.group_table
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct GroupTable<'a> {
+    entries: &'a [GroupTableEntry],
+}
+
+impl<'a> GroupTable<'a> {
+    fn new(bs: &'a [u8]) -> Result<Self, UnicodeDataError> {
+        if bs.len() % GroupTableEntry::SIZE != 0 {
+            return Err(UnicodeDataError::InvalidTableSize);
+        }
+
+        let num_entries = bs.len() / GroupTableEntry::SIZE;
+
+        // SAFETY:
+        // - The pointer is valid for reads of `num_entries * mem::size_of::<GroupTableEntry>()`
+        //   bytes; `num_entries = bs.len() / mem::size_of::<GroupTableEntry>()`, so
+        //   `num_entries * mem::size_of::<GroupTableEntry>() <= bs.len()` (equal, in fact, as
+        //   the length check above rejects any remainder), and clearly a pointer to `bs` is
+        //   valid for reads of <= `bs.len()` bytes.
+        //
+        // - `u8` and `GroupTableEntry` both have an alignment of 1 (since `GroupTableEntry` is
+        //    packed), so the pointer is correctly aligned.
+        //
+        // - The pointer points to `num_entries` consecutive properly-initialised `GroupTableEntry`
+        //   values, as `bs` contains initialised data and `GroupTableEntry` consists only of `u8`
+        //   and arrays of `u8` of varying sizes, for which any bit pattern is valid.
+        //
+        // - Since we obtained the pointer from an immutable reference `bs`, the data cannot be
+        //   mutated by safe code for the duration of the lifetime `'a`.
+        //
+        // - The total length of the slice does not exceed `isize::MAX`, since it is no larger
+        //   than `bs` which is a valid slice and therefore no larger than `isize::MAX`.
+        let entries = unsafe {
+            slice::from_raw_parts(
+                bs.as_ptr() as *const GroupTableEntry,
+                num_entries
+            )
+        };
+
+        Ok(Self { entries })
+    }
+}
+
+#[derive(Debug)]
+#[repr(C, packed)]
+struct GroupTableEntry {
+    start: U32Le,
+    end: U32Le,
+    total_len_before: U32Le,
+    kind: u8,
+}
+
+impl GroupTableEntry {
+    const SIZE: usize = mem::size_of::<Self>();
+}
+
+#[derive(Clone, Copy)]
+struct CharTable<'a> {
+    inner: &'a [u8],
+}
+
+impl<'a> CharTable<'a> {
+    fn new(bs: &'a [u8]) -> Self {
+        Self { inner: bs }
+    }
+}
+
+#[derive(Clone, Copy)]
+struct StringTable<'a> {
+    inner: &'a [u8],
+}
+
+impl<'a> StringTable<'a> {
+    fn new(bs: &'a [u8]) -> Self {
+        Self { inner: bs }
+    }
+}
+
+#[derive(Clone, Copy)]
+#[repr(transparent)]
+struct U16Le([u8; 2]);
+
+impl U16Le {
+    fn to_u16(self) -> u16 {
+        u16::from_le_bytes(self.0)
+    }
+}
+
+impl fmt::Debug for U16Le {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(&self.to_u16(), f)
+    }
+}
+
+#[derive(Clone, Copy)]
+#[repr(transparent)]
+struct U24Le([u8; 3]);
+
+impl U24Le {
+    /// Zero-extends the three little-endian bytes into a `u32`.
+    fn to_u32(self) -> u32 {
+        let [lo, mid, hi] = self.0;
+        u32::from_le_bytes([lo, mid, hi, 0])
+    }
+}
+
+impl fmt::Debug for U24Le {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(&self.to_u32(), f)
+    }
+}
+
+#[derive(Clone, Copy)]
+#[repr(transparent)]
+struct U32Le([u8; 4]);
+
+impl U32Le {
+    fn to_u32(self) -> u32 {
+        u32::from_le_bytes(self.0)
+    }
+}
+
+impl fmt::Debug for U32Le {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(&self.to_u32(), f)
+    }
+}
+
+struct ByteStream<'a>(&'a [u8]);
+
+impl<'a> ByteStream<'a> {
+    /// Pops the next `n` bytes off the front; errors with `InsufficientBytes` if too few remain.
+    fn consume(&mut self, n: usize) -> Result<&'a [u8], UnicodeDataError> {
+        if self.0.len() < n {
+            return Err(UnicodeDataError::InsufficientBytes);
+        }
+        let (head, tail) = self.0.split_at(n);
+        self.0 = tail;
+        Ok(head)
+    }
+
+    /// Reads a 4-byte little-endian length as `usize`; `OutOfBounds` if it does not fit.
+    fn consume_4_byte_len(&mut self) -> Result<usize, UnicodeDataError> {
+        let raw = self.consume(4)?;
+        // `consume(4)` only succeeds with exactly four bytes, so the conversion cannot fail.
+        let len = u32::from_le_bytes(raw.try_into().unwrap());
+        usize::try_from(len).map_err(|_| UnicodeDataError::OutOfBounds)
+    }
+
+    /// Succeeds only if no unconsumed bytes remain; otherwise fails with `LeftoverBytes`.
+    fn check_empty(&self) -> Result<(), UnicodeDataError> {
+        match self.0 {
+            [] => Ok(()),
+            _ => Err(UnicodeDataError::LeftoverBytes),
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum UnicodeDataError {
+    InvalidHeader, // the data does not start with the magic number
+    InsufficientBytes, // a read asked for more bytes than remain
+    OutOfBounds, // a stored length does not fit in `usize`
+    LeftoverBytes, // bytes remain after the last table
+    InvalidTableSize, // the group table length is not a multiple of the entry size
+}
+
+impl fmt::Display for UnicodeDataError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(match self {
+            Self::InvalidHeader => "invalid header",
+            Self::InsufficientBytes => "fewer bytes than expected",
+            Self::OutOfBounds => "index out of bounds",
+            Self::LeftoverBytes => "unexpected bytes found after expected end of data",
+            Self::InvalidTableSize => "invalid table size",
+        })
+    }
+}
diff --git a/lib/src/utf8.rs b/lib/src/utf8.rs
index 8cbee11..a15e18e 100644
--- a/lib/src/utf8.rs
+++ b/lib/src/utf8.rs
@@ -1,4 +1,4 @@
-use std::iter::Peekable;
+use core::iter::Peekable;
 
 pub trait ToByte {
     fn to_byte(self) -> u8;
@@ -186,7 +186,7 @@ impl Utf8Error {
 
 #[cfg(test)]
 mod tests {
-    use std::char::REPLACEMENT_CHARACTER;
+    use core::char::REPLACEMENT_CHARACTER;
 
     use super::Utf8Decode;