diff --git a/Cargo.toml b/Cargo.toml index e296b0c..4c45115 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] members = [ - "utfdump_core", - "utfdump", - "utfdump_bin", + "core", + "lib", + "bin", ] diff --git a/utfdump_bin/Cargo.toml b/bin/Cargo.toml similarity index 92% rename from utfdump_bin/Cargo.toml rename to bin/Cargo.toml index 0ff8ce0..be9c90e 100644 --- a/utfdump_bin/Cargo.toml +++ b/bin/Cargo.toml @@ -8,7 +8,7 @@ repository = "https://github.com/pantonshire/utfdump" description = "Command-line Unicode character info tool" [dependencies] -utfdump = { path = "../utfdump" } +utfdump = { path = "../lib" } libshire = { git = "https://github.com/pantonshire/libshire", branch = "main" } tabled = "0.8.0" clap = { version = "3.2.22", features = ["derive"] } diff --git a/utfdump_bin/src/main.rs b/bin/src/main.rs similarity index 100% rename from utfdump_bin/src/main.rs rename to bin/src/main.rs diff --git a/utfdump_core/Cargo.toml b/core/Cargo.toml similarity index 100% rename from utfdump_core/Cargo.toml rename to core/Cargo.toml diff --git a/utfdump_core/src/chardata.rs b/core/src/char_data.rs similarity index 100% rename from utfdump_core/src/chardata.rs rename to core/src/char_data.rs diff --git a/utfdump_core/src/encoded.rs b/core/src/data_store.rs similarity index 71% rename from utfdump_core/src/encoded.rs rename to core/src/data_store.rs index 044ac43..3ff6b8c 100644 --- a/utfdump_core/src/encoded.rs +++ b/core/src/data_store.rs @@ -1,6 +1,9 @@ -use std::{collections::{HashMap, hash_map}, error, fmt, str, ops::Range}; +use std::{collections::{HashMap, hash_map}, error, fmt, ops::Range}; -use crate::chardata::{CharData, Category, CombiningClass}; +use crate::{ + char_data::{CharData, Category, CombiningClass}, + string_table::{StringTableBufError, StringTableBuf, StringTable}, +}; const DATA_ENTRY_SIZE: usize = 8; @@ -46,13 +49,13 @@ fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) Some((name_index, category, combining_class, repeated)) } -pub struct DataBuf { +pub struct DataStoreBuf { data: Vec, strings: StringTableBuf, strings_map: HashMap, } -impl DataBuf { +impl DataStoreBuf { pub fn new() -> Self { Self { data: Vec::new(), @@ -61,8 +64,8 @@ impl DataBuf { } } - pub fn as_ref_type(&self) -> Data { - Data { data: &self.data, strings: self.strings.as_ref_type() } + pub fn as_ref_type(&self) -> DataStore { + DataStore { data: &self.data, strings: &*self.strings } } pub fn insert(&mut self, char_data: CharData, range: Range) -> Result<(), DataBufError> { @@ -126,12 +129,12 @@ impl DataBuf { } #[derive(Clone, Copy)] -pub struct Data<'a> { +pub struct DataStore<'a> { data: &'a [u8], - strings: StringTable<'a>, + strings: &'a StringTable, } -impl<'a> Data<'a> { +impl<'a> DataStore<'a> { pub fn get(self, codepoint: char) -> Option> { let index = usize::try_from(u32::from(codepoint)).ok()?; let start = index.checked_mul(DATA_ENTRY_SIZE)?; @@ -184,73 +187,3 @@ impl From for DataBufError { Self::StringTable(err) } } - -#[derive(Clone, Copy)] -pub struct StringTable<'a> { - bytes: &'a [u8], -} - -impl<'a> StringTable<'a> { - pub fn from_bytes(bytes: &'a [u8]) -> Self { - Self { bytes } - } - - pub fn to_bytes(self) -> &'a [u8] { - self.bytes - } - - pub fn get(self, index: u32) -> Option<&'a str> { - let index = usize::try_from(index).ok()?; - let len = *self.bytes.get(index)?; - let bytes = self.bytes.get((index + 1)..(index + 1 + usize::from(len)))?; - str::from_utf8(bytes).ok() - } -} - -pub struct StringTableBuf { - buf: Vec, -} - -impl StringTableBuf { - pub fn new() -> Self { - Self { buf: Vec::new() } - } - - pub fn as_ref_type(&self) -> StringTable { - StringTable { bytes: &self.buf } - } - - pub fn push(&mut self, s: &str) -> Result { - let len = u8::try_from(s.len()) - .map_err(|_| StringTableBufError::StringTooLong)?; - - let index = u32::try_from(self.buf.len()) - .map_err(|_| StringTableBufError::OutOfCapacity)?; - - self.buf.try_reserve(s.len() + 1) - .map_err(|_| StringTableBufError::OutOfCapacity)?; - - self.buf.push(len); - self.buf.extend(s.bytes()); - - Ok(index) - } -} - -#[derive(Debug)] -pub enum StringTableBufError { - StringTooLong, - OutOfCapacity, -} - -impl fmt::Display for StringTableBufError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::StringTooLong => write!(f, "string too long to add to table"), - Self::OutOfCapacity => write!(f, "string table out of capacity"), - } - } -} - -impl error::Error for StringTableBufError {} - diff --git a/core/src/lib.rs b/core/src/lib.rs new file mode 100644 index 0000000..2e3d42d --- /dev/null +++ b/core/src/lib.rs @@ -0,0 +1,6 @@ +pub mod char_data; +pub mod data_store; +mod string_table; + +pub use char_data::{CharData, Category, CombiningClass}; +pub use data_store::{DataStore, DataStoreBuf, DataBufError}; diff --git a/core/src/string_table.rs b/core/src/string_table.rs new file mode 100644 index 0000000..5a7c493 --- /dev/null +++ b/core/src/string_table.rs @@ -0,0 +1,98 @@ +use std::{fmt, error, str, ops::Deref}; + +/// A view into a [`StringTableBuf`](StringTableBuf). The table stores a collection of strings +/// contiguously, with each string being prefixed by its length in bytes. +#[repr(transparent)] +pub struct StringTable { + bytes: [u8], +} + +impl StringTable { + pub fn from_bytes(bytes: &[u8]) -> &Self { + // SAFETY: + // `StringTable` uses `repr(transparent)`, so it has the same memory layout as `[u8]`. + unsafe { &*(bytes as *const [u8] as *const Self) } + } + + pub fn to_bytes(&self) -> &[u8] { + &self.bytes + } + + /// Attempt to retrieve the string at the given byte offset in the table. The given index must + /// be the start of a table entry; providing any other index may result in an error or an + /// unintended string. + /// + /// Note that the string table does not have a sure-fire mechanism for detecting whether the + /// given index is valid, so providing an invalid index may not always result in an error; the + /// bytes starting at the invalid index may be incorrectly interpreted as a valid table entry. + /// However, this will never result in unsoundness, and thus the function is not marked as + /// unsafe; it is checked that the resulting string is valid UTF-8. + pub fn get(&self, index: u32) -> Option<&str> { + let index = usize::try_from(index).ok()?; + let len = *self.bytes.get(index)?; + let bytes = self.bytes.get((index + 1)..(index + 1 + usize::from(len)))?; + str::from_utf8(bytes).ok() + } +} + +/// An owned [`StringTable`](StringTable). Stores a collection of strings contiguously, with each +/// string being prefixed by its length in bytes. +pub struct StringTableBuf { + buf: Vec, +} + +impl StringTableBuf { + pub fn new() -> Self { + Self { buf: Vec::new() } + } + + /// Append the given string to the table, returning the byte offset in the table at which it + /// was stored. This byte offset can then be used to retrieve the string from the table later, + /// via `StringTable::get`. + pub fn push(&mut self, s: &str) -> Result { + let len = u8::try_from(s.len()) + .map_err(|_| StringTableBufError::StringTooLong)?; + + let index = u32::try_from(self.buf.len()) + .map_err(|_| StringTableBufError::OutOfCapacity)?; + + self.buf.try_reserve(s.len() + 1) + .map_err(|_| StringTableBufError::OutOfCapacity)?; + + self.buf.push(len); + self.buf.extend(s.bytes()); + + Ok(index) + } +} + +impl AsRef for StringTableBuf { + fn as_ref(&self) -> &StringTable { + self + } +} + +impl Deref for StringTableBuf { + type Target = StringTable; + + fn deref(&self) -> &Self::Target { + StringTable::from_bytes(&self.buf) + } +} + +#[derive(Debug)] +pub enum StringTableBufError { + StringTooLong, + OutOfCapacity, +} + +impl fmt::Display for StringTableBufError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::StringTooLong => write!(f, "string too long to add to table"), + Self::OutOfCapacity => write!(f, "string table out of capacity"), + } + } +} + +impl error::Error for StringTableBufError {} diff --git a/utfdump/Cargo.toml b/lib/Cargo.toml similarity index 57% rename from utfdump/Cargo.toml rename to lib/Cargo.toml index 2e7c677..62a5e19 100644 --- a/utfdump/Cargo.toml +++ b/lib/Cargo.toml @@ -4,8 +4,8 @@ version = "0.1.0" edition = "2021" [dependencies] -utfdump_core = { path = "../utfdump_core" } +utfdump_core = { path = "../core" } once_cell = "1.17.1" [build-dependencies] -utfdump_core = { path = "../utfdump_core" } +utfdump_core = { path = "../core" } diff --git a/utfdump/build.rs b/lib/build.rs similarity index 95% rename from utfdump/build.rs rename to lib/build.rs index cda2ff2..5d0244a 100644 --- a/utfdump/build.rs +++ b/lib/build.rs @@ -1,6 +1,6 @@ use std::{env, fs::File, io::{BufReader, BufRead, Write}, path::Path}; -use utfdump_core::{chardata::CharData, encoded::DataBuf}; +use utfdump_core::{CharData, DataStoreBuf}; const UNICODE_DATA_PATH: &str = "unicode_data_latest.txt"; const OUT_DATA_PATH: &str = "unicode_data_encoded"; @@ -16,7 +16,7 @@ fn main() { let buf_reader = BufReader::new(data_file); - let mut data = DataBuf::new(); + let mut data = DataStoreBuf::new(); let mut start_codepoint = None; for line in buf_reader.lines() { diff --git a/utfdump/src/lib.rs b/lib/src/lib.rs similarity index 53% rename from utfdump/src/lib.rs rename to lib/src/lib.rs index 01125f2..58a181a 100644 --- a/utfdump/src/lib.rs +++ b/lib/src/lib.rs @@ -1,16 +1,16 @@ pub mod utf8; -pub use utfdump_core::chardata::{CharData, Category, CombiningClass}; +pub use utfdump_core::{CharData, Category, CombiningClass}; use once_cell::sync::Lazy; -use utfdump_core::encoded::Data; +use utfdump_core::data_store::DataStore; const UNICODE_DATA_BYTES: &[u8] = include_bytes!( concat!(env!("OUT_DIR"), "/unicode_data_encoded") ); -static UNICODE_DATA: Lazy = Lazy::new(|| { - Data::from_bytes(UNICODE_DATA_BYTES).unwrap() +static UNICODE_DATA: Lazy = Lazy::new(|| { + DataStore::from_bytes(UNICODE_DATA_BYTES).unwrap() }); pub fn char_data(c: char) -> Option> { diff --git a/utfdump/src/utf8.rs b/lib/src/utf8.rs similarity index 100% rename from utfdump/src/utf8.rs rename to lib/src/utf8.rs diff --git a/utfdump_core/src/lib.rs b/utfdump_core/src/lib.rs deleted file mode 100644 index d21680b..0000000 --- a/utfdump_core/src/lib.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub mod chardata; -pub mod encoded;