From 6e8d197ae481668be8e9894504a1733e195a65c3 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Mon, 5 Jun 2023 10:26:13 +0100 Subject: [PATCH] remove core --- Cargo.toml | 1 - core/Cargo.toml | 10 -- core/src/char_data.rs | 281 --------------------------------------- core/src/data_store.rs | 189 -------------------------- core/src/lib.rs | 6 - core/src/string_table.rs | 98 -------------- 6 files changed, 585 deletions(-) delete mode 100644 core/Cargo.toml delete mode 100644 core/src/char_data.rs delete mode 100644 core/src/data_store.rs delete mode 100644 core/src/lib.rs delete mode 100644 core/src/string_table.rs diff --git a/Cargo.toml b/Cargo.toml index 4c45115..a3a8149 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,5 @@ [workspace] members = [ - "core", "lib", "bin", ] diff --git a/core/Cargo.toml b/core/Cargo.toml deleted file mode 100644 index d1c6715..0000000 --- a/core/Cargo.toml +++ /dev/null @@ -1,10 +0,0 @@ -[package] -name = "utfdump_core" -version = "0.1.0" -edition = "2021" -authors = ["Tom Panton "] -license = "MIT" -repository = "https://github.com/pantonshire/utfdump" -description = "Core library for the utfdump command-line tool" - -[dependencies] diff --git a/core/src/char_data.rs b/core/src/char_data.rs deleted file mode 100644 index 4f5d2c0..0000000 --- a/core/src/char_data.rs +++ /dev/null @@ -1,281 +0,0 @@ -use std::fmt; - -#[derive(Clone, Debug)] -pub struct CharData<'a> { - name: &'a str, - category: Category, - combining_class: CombiningClass, -} - -impl<'a> CharData<'a> { - pub fn from_row(row: &'a str) -> Option<(u32, Self)> { - let mut fields = [""; 15]; - for (i, field) in row.splitn(15, ';').enumerate() { - fields[i] = field; - } - - let codepoint = u32::from_str_radix(fields[0], 16).ok()?; - let name = fields[1]; - let category = Category::from_abbr(fields[2])?; - let ccc = CombiningClass(u8::from_str_radix(fields[3], 10).ok()?); - - Some((codepoint, Self::from_parts(name, category, ccc))) - } - - pub fn from_parts(name: &'a str, category: Category, combining_class: CombiningClass) -> Self { - Self { name, category, combining_class } - } - - pub fn with_name<'b>(self, name: &'a str) -> CharData<'b> - where - 'a: 'b, - { - Self { name, ..self } - } - - pub fn name(&self) -> &'a str { - self.name - } - - pub fn category(&self) -> Category { - self.category - } - - pub fn combining_class(&self) -> CombiningClass { - self.combining_class - } -} - -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum Category { - Lu, - Ll, - Lt, - Mn, - Mc, - Me, - Nd, - Nl, - No, - Zs, - Zl, - Zp, - Cc, - Cf, - Cs, - Co, - Cn, - Lm, - Lo, - Pc, - Pd, - Ps, - Pe, - Pi, - Pf, - Po, - Sm, - Sc, - Sk, - So, -} - -impl Category { - pub fn from_byte(b: u8) -> Option { - match b { - 0 => Some(Self::Lu), - 1 => Some(Self::Ll), - 2 => Some(Self::Lt), - 3 => Some(Self::Mn), - 4 => Some(Self::Mc), - 5 => Some(Self::Me), - 6 => Some(Self::Nd), - 7 => Some(Self::Nl), - 8 => Some(Self::No), - 9 => Some(Self::Zs), - 10 => Some(Self::Zl), - 11 => Some(Self::Zp), - 12 => Some(Self::Cc), - 13 => Some(Self::Cf), - 14 => Some(Self::Cs), - 15 => Some(Self::Co), - 16 => Some(Self::Cn), - 17 => Some(Self::Lm), - 18 => Some(Self::Lo), - 19 => Some(Self::Pc), - 20 => Some(Self::Pd), - 21 => Some(Self::Ps), - 22 => Some(Self::Pe), - 23 => Some(Self::Pi), - 24 => Some(Self::Pf), - 25 => Some(Self::Po), - 26 => Some(Self::Sm), - 27 => Some(Self::Sc), - 28 => Some(Self::Sk), - 29 => Some(Self::So), - _ => None, - } - } - - pub fn byte_repr(self) -> u8 { - self as u8 - } - - pub fn from_abbr(s: &str) -> Option { - match s { - "Lu" => Some(Self::Lu), - "Ll" => Some(Self::Ll), - "Lt" => Some(Self::Lt), - "Mn" => Some(Self::Mn), - "Mc" => Some(Self::Mc), - "Me" => Some(Self::Me), - "Nd" => Some(Self::Nd), - "Nl" => Some(Self::Nl), - "No" => Some(Self::No), - "Zs" => Some(Self::Zs), - "Zl" => Some(Self::Zl), - "Zp" => Some(Self::Zp), - "Cc" => Some(Self::Cc), - "Cf" => Some(Self::Cf), - "Cs" => Some(Self::Cs), - "Co" => Some(Self::Co), - "Cn" => Some(Self::Cn), - "Lm" => Some(Self::Lm), - "Lo" => Some(Self::Lo), - "Pc" => Some(Self::Pc), - "Pd" => Some(Self::Pd), - "Ps" => Some(Self::Ps), - "Pe" => Some(Self::Pe), - "Pi" => Some(Self::Pi), - "Pf" => Some(Self::Pf), - "Po" => Some(Self::Po), - "Sm" => Some(Self::Sm), - "Sc" => Some(Self::Sc), - "Sk" => Some(Self::Sk), - "So" => Some(Self::So), - _ => None, - } - } - - pub fn abbr(self) -> &'static str { - match self { - Self::Lu => "Lu", - Self::Ll => "Ll", - Self::Lt => "Lt", - Self::Mn => "Mn", - Self::Mc => "Mc", - Self::Me => "Me", - Self::Nd => "Nd", - Self::Nl => "Nl", - Self::No => "No", - Self::Zs => "Zs", - Self::Zl => "Zl", - Self::Zp => "Zp", - Self::Cc => "Cc", - Self::Cf => "Cf", - Self::Cs => "Cs", - Self::Co => "Co", - Self::Cn => "Cn", - Self::Lm => "Lm", - Self::Lo => "Lo", - Self::Pc => "Pc", - Self::Pd => "Pd", - Self::Ps => "Ps", - Self::Pe => "Pe", - Self::Pi => "Pi", - Self::Pf => "Pf", - Self::Po => "Po", - Self::Sm => "Sm", - Self::Sc => "Sc", - Self::Sk => "Sk", - Self::So => "So", - } - } - - pub fn full_name(self) -> &'static str { - match self { - Self::Lu => "Letter, Uppercase", - Self::Ll => "Letter, Lowercase", - Self::Lt => "Letter, Titlecase", - Self::Mn => "Mark, Non-Spacing", - Self::Mc => "Mark, Spacing Combining", - Self::Me => "Mark, Enclosing", - Self::Nd => "Number, Decimal Digit", - Self::Nl => "Number, Letter", - Self::No => "Number, Other", - Self::Zs => "Separator, Space", - Self::Zl => "Separator, Line", - Self::Zp => "Separator: Paragraph", - Self::Cc => "Other, Control", - Self::Cf => "Other, Format", - Self::Cs => "Other, Surrogate", - Self::Co => "Other, Private Use", - Self::Cn => "Other, Not Assigned", - Self::Lm => "Letter, Modifier", - Self::Lo => "Letter, Other", - Self::Pc => "Punctuation, Connector", - Self::Pd => "Punctuation, Dash", - Self::Ps => "Punctuation, Open", - Self::Pe => "Punctuation, Close", - Self::Pi => "Punctuation, Initial Quote", - Self::Pf => "Punctuation, Final Quote", - Self::Po => "Punctuation, Other", - Self::Sm => "Symbol, Math", - Self::Sc => "Symbol, Currency", - Self::Sk => "Symbol, Modifier", - Self::So => "Symbol, Other", - } - } -} - -impl fmt::Display for Category { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.abbr()) - } -} - -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub struct CombiningClass(pub u8); - -impl CombiningClass { - pub fn name(self) -> Option<&'static str> { - match self.0 { - 0 => Some("Not_Reordered"), - 1 => Some("Overlay"), - 6 => Some("Han_Reading"), - 7 => Some("Nukta"), - 8 => Some("Kana_Voicing"), - 9 => Some("Virama"), - 200 => Some("Attached_Below_Left"), - 202 => Some("Attached_Below"), - 214 => Some("Attached_Above"), - 216 => Some("Attached_Above_Right"), - 218 => Some("Below_Left"), - 220 => Some("Below"), - 222 => Some("Below_Right"), - 224 => Some("Left"), - 226 => Some("Right"), - 228 => Some("Above_Left"), - 230 => Some("Above"), - 232 => Some("Above_Right"), - 233 => Some("Double_Below"), - 234 => Some("Double_Above"), - 240 => Some("Iota_Subscript"), - _ => None, - } - } - - pub fn is_combining(self) -> bool { - self.0 != 0 - } -} - -impl fmt::Display for CombiningClass { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self.name() { - Some(name) => write!(f, "{}", name), - None => write!(f, "Ccc{}", self.0), - } - } -} - diff --git a/core/src/data_store.rs b/core/src/data_store.rs deleted file mode 100644 index 3ff6b8c..0000000 --- a/core/src/data_store.rs +++ /dev/null @@ -1,189 +0,0 @@ -use std::{collections::{HashMap, hash_map}, error, fmt, ops::Range}; - -use crate::{ - char_data::{CharData, Category, CombiningClass}, - string_table::{StringTableBufError, StringTableBuf, StringTable}, -}; - -const DATA_ENTRY_SIZE: usize = 8; - -const DATA_INIT_FLAG: u8 = 1; -const DATA_REPEATED_FLAG: u8 = 2; - -fn encode_char_data( - name_index: u32, - category: Category, - combining_class: CombiningClass, - repeated: bool -) -> [u8; DATA_ENTRY_SIZE] -{ - let mut buf = [0u8; DATA_ENTRY_SIZE]; - - buf[0] |= DATA_INIT_FLAG; - - if repeated { - buf[0] |= DATA_REPEATED_FLAG; - } - - buf[1..5].copy_from_slice(&name_index.to_le_bytes()); - buf[5] = category.byte_repr(); - buf[6] = combining_class.0; - - buf -} - -fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) - -> Option<(u32, Category, CombiningClass, bool)> -{ - let flags = bytes[0]; - - if flags & DATA_INIT_FLAG == 0 { - return None; - } - - let name_index = u32::from_le_bytes(bytes[1..5].try_into().unwrap()); - let category = Category::from_byte(bytes[5])?; - let combining_class = CombiningClass(bytes[6]); - let repeated = flags & DATA_REPEATED_FLAG != 0; - - Some((name_index, category, combining_class, repeated)) -} - -pub struct DataStoreBuf { - data: Vec, - strings: StringTableBuf, - strings_map: HashMap, -} - -impl DataStoreBuf { - pub fn new() -> Self { - Self { - data: Vec::new(), - strings: StringTableBuf::new(), - strings_map: HashMap::new(), - } - } - - pub fn as_ref_type(&self) -> DataStore { - DataStore { data: &self.data, strings: &*self.strings } - } - - pub fn insert(&mut self, char_data: CharData, range: Range) -> Result<(), DataBufError> { - if range.is_empty() { - return Ok(()); - } - - let repeated = range.end - .checked_sub(range.start) - .map(|len| len > 1) - .unwrap_or(false); - - let range = { - let start = usize::try_from(range.start) - .map_err(|_| DataBufError::DataOutOfCapacity)? - .checked_mul(DATA_ENTRY_SIZE) - .ok_or(DataBufError::DataOutOfCapacity)?; - let end = usize::try_from(range.end) - .map_err(|_| DataBufError::DataOutOfCapacity)? - .checked_mul(DATA_ENTRY_SIZE) - .ok_or(DataBufError::DataOutOfCapacity)?; - start..end - }; - - if let Some(extra_capacity_needed) = range.end.checked_sub(self.data.len()) { - self.data.try_reserve(extra_capacity_needed) - .map_err(|_| DataBufError::DataOutOfCapacity)?; - } - - let name_index = self.add_string(char_data.name().to_owned())?; - - let encoded_char_data = encode_char_data( - name_index, - char_data.category(), - char_data.combining_class(), - repeated - ); - - if self.data.len() < range.end { - // Using 0 means that the DATA_INIT_FLAG won't be set, so these won't be valid entries. - self.data.resize(range.end, 0); - } - - for i in range.step_by(DATA_ENTRY_SIZE) { - self.data[i..(i + DATA_ENTRY_SIZE)].copy_from_slice(&encoded_char_data); - } - - Ok(()) - } - - fn add_string(&mut self, name: String) -> Result { - match self.strings_map.entry(name) { - hash_map::Entry::Occupied(entry) => Ok(*entry.get()), - hash_map::Entry::Vacant(entry) => { - let index = self.strings.push(entry.key())?; - entry.insert(index); - Ok(index) - }, - } - } -} - -#[derive(Clone, Copy)] -pub struct DataStore<'a> { - data: &'a [u8], - strings: &'a StringTable, -} - -impl<'a> DataStore<'a> { - pub fn get(self, codepoint: char) -> Option> { - let index = usize::try_from(u32::from(codepoint)).ok()?; - let start = index.checked_mul(DATA_ENTRY_SIZE)?; - let end = start.checked_add(DATA_ENTRY_SIZE)?; - let encoded = self.data.get(start..end)?; - let (name_index, category, ccc, _repeated) = decode_char_data(encoded.try_into().unwrap())?; - let name = self.strings.get(name_index)?; - Some(CharData::from_parts(name, category, ccc)) - } - - pub fn to_bytes(self) -> Option<([u8; 4], [&'a [u8]; 2])> { - let strings = self.strings.to_bytes(); - let strings_len = u32::try_from(strings.len()) - .ok()? - .to_le_bytes(); - Some((strings_len, [strings, self.data])) - } - - pub fn from_bytes(bytes: &'a [u8]) -> Option { - let strings_len = usize::try_from( - u32::from_le_bytes(bytes.get(..4)?.try_into().unwrap()) - ).ok()?; - let strings = StringTable::from_bytes(bytes.get(4..(4 + strings_len))?); - let data = bytes.get((4 + strings_len)..)?; - Some(Self { data, strings }) - } -} - -#[derive(Debug)] -pub enum DataBufError { - DataOutOfCapacity, - StringsMapOutOfCapacity, - StringTable(StringTableBufError), -} - -impl fmt::Display for DataBufError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::DataOutOfCapacity => write!(f, "data buf out of capacity"), - Self::StringsMapOutOfCapacity => write!(f, "strings map out of capacity"), - Self::StringTable(err) => write!(f, "string table error: {}", err), - } - } -} - -impl error::Error for DataBufError {} - -impl From for DataBufError { - fn from(err: StringTableBufError) -> Self { - Self::StringTable(err) - } -} diff --git a/core/src/lib.rs b/core/src/lib.rs deleted file mode 100644 index 2e3d42d..0000000 --- a/core/src/lib.rs +++ /dev/null @@ -1,6 +0,0 @@ -pub mod char_data; -pub mod data_store; -mod string_table; - -pub use char_data::{CharData, Category, CombiningClass}; -pub use data_store::{DataStore, DataStoreBuf, DataBufError}; diff --git a/core/src/string_table.rs b/core/src/string_table.rs deleted file mode 100644 index 5a7c493..0000000 --- a/core/src/string_table.rs +++ /dev/null @@ -1,98 +0,0 @@ -use std::{fmt, error, str, ops::Deref}; - -/// A view into a [`StringTableBuf`](StringTableBuf). The table stores a collection of strings -/// contiguously, with each string being prefixed by its length in bytes. -#[repr(transparent)] -pub struct StringTable { - bytes: [u8], -} - -impl StringTable { - pub fn from_bytes(bytes: &[u8]) -> &Self { - // SAFETY: - // `StringTable` uses `repr(transparent)`, so it has the same memory layout as `[u8]`. - unsafe { &*(bytes as *const [u8] as *const Self) } - } - - pub fn to_bytes(&self) -> &[u8] { - &self.bytes - } - - /// Attempt to retrieve the string at the given byte offset in the table. The given index must - /// be the start of a table entry; providing any other index may result in an error or an - /// unintended string. - /// - /// Note that the string table does not have a sure-fire mechanism for detecting whether the - /// given index is valid, so providing an invalid index may not always result in an error; the - /// bytes starting at the invalid index may be incorrectly interpreted as a valid table entry. - /// However, this will never result in unsoundness, and thus the function is not marked as - /// unsafe; it is checked that the resulting string is valid UTF-8. - pub fn get(&self, index: u32) -> Option<&str> { - let index = usize::try_from(index).ok()?; - let len = *self.bytes.get(index)?; - let bytes = self.bytes.get((index + 1)..(index + 1 + usize::from(len)))?; - str::from_utf8(bytes).ok() - } -} - -/// An owned [`StringTable`](StringTable). Stores a collection of strings contiguously, with each -/// string being prefixed by its length in bytes. -pub struct StringTableBuf { - buf: Vec, -} - -impl StringTableBuf { - pub fn new() -> Self { - Self { buf: Vec::new() } - } - - /// Append the given string to the table, returning the byte offset in the table at which it - /// was stored. This byte offset can then be used to retrieve the string from the table later, - /// via `StringTable::get`. - pub fn push(&mut self, s: &str) -> Result { - let len = u8::try_from(s.len()) - .map_err(|_| StringTableBufError::StringTooLong)?; - - let index = u32::try_from(self.buf.len()) - .map_err(|_| StringTableBufError::OutOfCapacity)?; - - self.buf.try_reserve(s.len() + 1) - .map_err(|_| StringTableBufError::OutOfCapacity)?; - - self.buf.push(len); - self.buf.extend(s.bytes()); - - Ok(index) - } -} - -impl AsRef for StringTableBuf { - fn as_ref(&self) -> &StringTable { - self - } -} - -impl Deref for StringTableBuf { - type Target = StringTable; - - fn deref(&self) -> &Self::Target { - StringTable::from_bytes(&self.buf) - } -} - -#[derive(Debug)] -pub enum StringTableBufError { - StringTooLong, - OutOfCapacity, -} - -impl fmt::Display for StringTableBufError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::StringTooLong => write!(f, "string too long to add to table"), - Self::OutOfCapacity => write!(f, "string table out of capacity"), - } - } -} - -impl error::Error for StringTableBufError {}