commit 172d1a14fe295f01b377157997dc02292c93455e Author: pantonshire Date: Sun Sep 18 18:05:00 2022 +0100 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9469cc1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +unicode_data_latest.txt diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..f903703 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,154 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "bytecount" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "libshire" +version = "0.1.0" +source = "git+https://github.com/pantonshire/libshire?branch=main#7253d950108c729141239f4add4b3df67a54db31" +dependencies = [ + "serde", +] + +[[package]] +name = "papergrid" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "453cf71f2a37af495a1a124bf30d4d7469cfbea58e9f2479be9d222396a518a2" +dependencies = [ + "bytecount", + "fnv", + "unicode-width", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "serde" +version = "1.0.144" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860" + +[[package]] +name = "syn" +version = "1.0.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tabled" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5b2f8c37d26d87d2252187b0a45ea3cbf42baca10377c7e7eaaa2800fa9bf97" +dependencies = [ + "papergrid", + "tabled_derive", + "unicode-width", +] + +[[package]] +name = "tabled_derive" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9ee618502f497abf593e1c5c9577f34775b111480009ffccd7ad70d23fcaba8" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" + +[[package]] +name = "unicode-width" +version = "0.1.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "utfdump" +version = "0.1.0" +dependencies = [ + "libshire", + "tabled", + "utfdump_core", +] + +[[package]] +name = "utfdump_core" +version = "0.1.0" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..5170b07 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,5 @@ +[workspace] +members = [ + "utfdump_core", + "utfdump_bin", +] diff --git a/get_data.sh b/get_data.sh new file mode 100755 index 0000000..652cfdc --- /dev/null +++ b/get_data.sh @@ -0,0 +1,2 @@ +#!/bin/bash +curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump_bin/unicode_data_latest.txt diff --git a/utfdump_bin/Cargo.toml b/utfdump_bin/Cargo.toml new file mode 100644 index 0000000..ef1f19d --- /dev/null +++ b/utfdump_bin/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "utfdump" +version = "0.1.0" +edition = "2021" + +[dependencies] +utfdump_core = { path = "../utfdump_core" } +libshire = { git = "https://github.com/pantonshire/libshire", branch = "main" } +tabled = "0.8.0" + +[build-dependencies] +utfdump_core = { path = "../utfdump_core" } diff --git a/utfdump_bin/build.rs b/utfdump_bin/build.rs new file mode 100644 index 0000000..cda2ff2 --- /dev/null +++ b/utfdump_bin/build.rs @@ -0,0 +1,71 @@ +use std::{env, fs::File, io::{BufReader, BufRead, Write}, path::Path}; + +use utfdump_core::{chardata::CharData, encoded::DataBuf}; + +const UNICODE_DATA_PATH: &str = "unicode_data_latest.txt"; +const OUT_DATA_PATH: &str = "unicode_data_encoded"; + +fn main() { + println!("cargo:rerun-if-changed={}", UNICODE_DATA_PATH); + + let out_dir = env::var_os("OUT_DIR").unwrap(); + let out_path = 
Path::new(&out_dir).join(OUT_DATA_PATH); + + let data_file = File::open(UNICODE_DATA_PATH) + .expect("failed to open unicode data file"); + + let buf_reader = BufReader::new(data_file); + + let mut data = DataBuf::new(); + let mut start_codepoint = None; + + for line in buf_reader.lines() { + let line = line.unwrap(); + let (codepoint, char_data) = CharData::from_row(&line).unwrap(); + + match start_codepoint { + Some(start_codepoint_inner) => { + let prefix = char_data.name() + .strip_suffix(", Last>") + .expect("expected end of codepoint block"); + + let name = { + let mut buf = String::with_capacity(prefix.len() + 1); + buf.push_str(prefix); + buf.push('>'); + buf + }; + + let char_data = char_data.with_name(&name); + + data.insert(char_data, start_codepoint_inner..(codepoint + 1)) + .unwrap(); + + start_codepoint = None; + }, + + None => { + if char_data.name().ends_with(", First>") { + start_codepoint = Some(codepoint); + } else { + data.insert(char_data, codepoint..(codepoint + 1)) + .unwrap(); + } + }, + } + } + + let (strings_len, [strings, data]) = data + .as_ref_type() + .to_bytes() + .unwrap(); + + let mut out_file = File::create(&out_path) + .expect("failed to open output file"); + + out_file.write_all(&strings_len).unwrap(); + out_file.write_all(strings).unwrap(); + out_file.write_all(data).unwrap(); + + drop(out_file); +} diff --git a/utfdump_bin/src/main.rs b/utfdump_bin/src/main.rs new file mode 100644 index 0000000..558677d --- /dev/null +++ b/utfdump_bin/src/main.rs @@ -0,0 +1,126 @@ +use std::{fmt, io::{self, Read}}; + +use libshire::strings::CappedString; +use tabled::{Tabled, Table, Style}; + +use utfdump_core::{chardata::Category, encoded::Data}; + +const UNICODE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/unicode_data_encoded")); + +fn main() { + let data = Data::<'static>::from_bytes(UNICODE_DATA).unwrap(); + + let input = { + let mut buf = Vec::<u8>::new(); + let stdin = io::stdin(); + let mut guard = stdin.lock(); + 
guard.read_to_end(&mut buf) + .expect("failed to read stdin"); + // TODO: just skip over invalid utf-8 characters + String::from_utf8(buf) + .expect("invalid utf-8") + }; + + let rows = input + .chars() + .map(|c| { + let mut name = Optional::None; + let mut category = Optional::None; + let mut char_combining_class = Optional::None; + + let mut combining = false; + + if let Some(char_data) = data.get(c as u32) { + name = Optional::Some(char_data.name()); + category = Optional::Some(char_data.category()); + + let ccc = char_data.ccc(); + char_combining_class = Optional::Some(ccc); + combining = ccc != 0; + } + + let display_char = { + let mut buf = CappedString::empty(); + if combining { + buf.push_truncating('\u{25cc}'); + } + buf.push_truncating(c); + buf + }; + + OutRow { + display_char, + codepoint: Codepoint(c), + utf_8_bytes: Utf8Bytes(c), + name, + category, + char_combining_class, + } + }); + + let table = Table::new(rows) + .with(Style::modern()); + + println!("{}", table); +} + +#[derive(Tabled)] +struct OutRow { + #[tabled(rename = "")] + display_char: CappedString<8>, + #[tabled(rename = "Codepoint")] + codepoint: Codepoint, + #[tabled(rename = "UTF-8")] + utf_8_bytes: Utf8Bytes, + #[tabled(rename = "Name")] + name: Optional<&'static str>, + #[tabled(rename = "Category")] + category: Optional<Category>, + #[tabled(rename = "Combining")] + char_combining_class: Optional<u8>, +} + +#[derive(Debug)] +enum Optional<T> { + Some(T), + None, +} + +impl<T> fmt::Display for Optional<T> +where + T: fmt::Display, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Some(x) => fmt::Display::fmt(&x, f), + Self::None => f.write_str("??"), + } + } +} + +#[derive(Debug)] +struct Codepoint(char); + +impl fmt::Display for Codepoint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "U+{:04x}", self.0 as u32) + } +} + +#[derive(Debug)] +struct Utf8Bytes(char); + +impl fmt::Display for Utf8Bytes { + fn fmt(&self, f: &mut 
fmt::Formatter<'_>) -> fmt::Result { + let mut buf = [0u8; 4]; + let s = self.0.encode_utf8(&mut buf); + let mut bytes = s.bytes(); + if let Some(b) = bytes.next() { + write!(f, "0x{:02x}", b)?; + for b in bytes { + write!(f, " 0x{:02x}", b)?; + } + } + Ok(()) + } +} diff --git a/utfdump_core/Cargo.toml b/utfdump_core/Cargo.toml new file mode 100644 index 0000000..d0be7de --- /dev/null +++ b/utfdump_core/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "utfdump_core" +version = "0.1.0" +edition = "2021" + +[dependencies] diff --git a/utfdump_core/src/chardata.rs b/utfdump_core/src/chardata.rs new file mode 100644 index 0000000..9426bb8 --- /dev/null +++ b/utfdump_core/src/chardata.rs @@ -0,0 +1,237 @@ +use std::fmt; + +#[derive(Clone, Debug)] +pub struct CharData<'a> { + name: &'a str, + category: Category, + ccc: u8, +} + +impl<'a> CharData<'a> { + pub fn from_row(row: &'a str) -> Option<(u32, Self)> { + let mut fields = [""; 15]; + for (i, field) in row.splitn(15, ';').enumerate() { + fields[i] = field; + } + + let codepoint = u32::from_str_radix(fields[0], 16).ok()?; + let name = fields[1]; + let category = Category::from_abbr(fields[2])?; + let ccc = u8::from_str_radix(fields[3], 10).ok()?; + + Some((codepoint, Self::from_parts(name, category, ccc))) + } + + pub fn from_parts(name: &'a str, category: Category, ccc: u8) -> Self { + Self { name, category, ccc } + } + + pub fn with_name<'b>(self, name: &'a str) -> CharData<'b> + where + 'a: 'b, + { + Self { name, ..self } + } + + pub fn name(&self) -> &'a str { + self.name + } + + pub fn category(&self) -> Category { + self.category + } + + pub fn ccc(&self) -> u8 { + self.ccc + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +#[repr(u8)] +pub enum Category { + Lu = 0, + Ll = 1, + Lt = 2, + Mn = 3, + Mc = 4, + Me = 5, + Nd = 6, + Nl = 7, + No = 8, + Zs = 9, + Zl = 10, + Zp = 11, + Cc = 12, + Cf = 13, + Cs = 14, + Co = 15, + Cn = 16, + Lm = 17, + Lo = 18, + Pc = 19, + Pd = 20, + Ps = 21, + Pe = 22, + Pi = 23, + 
Pf = 24, + Po = 25, + Sm = 26, + Sc = 27, + Sk = 28, + So = 29, +} + +impl Category { + pub fn from_byte(b: u8) -> Option<Self> { + match b { + 0 => Some(Self::Lu), + 1 => Some(Self::Ll), + 2 => Some(Self::Lt), + 3 => Some(Self::Mn), + 4 => Some(Self::Mc), + 5 => Some(Self::Me), + 6 => Some(Self::Nd), + 7 => Some(Self::Nl), + 8 => Some(Self::No), + 9 => Some(Self::Zs), + 10 => Some(Self::Zl), + 11 => Some(Self::Zp), + 12 => Some(Self::Cc), + 13 => Some(Self::Cf), + 14 => Some(Self::Cs), + 15 => Some(Self::Co), + 16 => Some(Self::Cn), + 17 => Some(Self::Lm), + 18 => Some(Self::Lo), + 19 => Some(Self::Pc), + 20 => Some(Self::Pd), + 21 => Some(Self::Ps), + 22 => Some(Self::Pe), + 23 => Some(Self::Pi), + 24 => Some(Self::Pf), + 25 => Some(Self::Po), + 26 => Some(Self::Sm), + 27 => Some(Self::Sc), + 28 => Some(Self::Sk), + 29 => Some(Self::So), + _ => None, + } + } + + pub fn byte_repr(self) -> u8 { + self as u8 + } + + pub fn from_abbr(s: &str) -> Option<Self> { + match s { + "Lu" => Some(Self::Lu), + "Ll" => Some(Self::Ll), + "Lt" => Some(Self::Lt), + "Mn" => Some(Self::Mn), + "Mc" => Some(Self::Mc), + "Me" => Some(Self::Me), + "Nd" => Some(Self::Nd), + "Nl" => Some(Self::Nl), + "No" => Some(Self::No), + "Zs" => Some(Self::Zs), + "Zl" => Some(Self::Zl), + "Zp" => Some(Self::Zp), + "Cc" => Some(Self::Cc), + "Cf" => Some(Self::Cf), + "Cs" => Some(Self::Cs), + "Co" => Some(Self::Co), + "Cn" => Some(Self::Cn), + "Lm" => Some(Self::Lm), + "Lo" => Some(Self::Lo), + "Pc" => Some(Self::Pc), + "Pd" => Some(Self::Pd), + "Ps" => Some(Self::Ps), + "Pe" => Some(Self::Pe), + "Pi" => Some(Self::Pi), + "Pf" => Some(Self::Pf), + "Po" => Some(Self::Po), + "Sm" => Some(Self::Sm), + "Sc" => Some(Self::Sc), + "Sk" => Some(Self::Sk), + "So" => Some(Self::So), + _ => None, + } + } + + pub fn abbr(self) -> &'static str { + match self { + Self::Lu => "Lu", + Self::Ll => "Ll", + Self::Lt => "Lt", + Self::Mn => "Mn", + Self::Mc => "Mc", + Self::Me => "Me", + Self::Nd => "Nd", + Self::Nl => "Nl", + Self::No 
=> "No", + Self::Zs => "Zs", + Self::Zl => "Zl", + Self::Zp => "Zp", + Self::Cc => "Cc", + Self::Cf => "Cf", + Self::Cs => "Cs", + Self::Co => "Co", + Self::Cn => "Cn", + Self::Lm => "Lm", + Self::Lo => "Lo", + Self::Pc => "Pc", + Self::Pd => "Pd", + Self::Ps => "Ps", + Self::Pe => "Pe", + Self::Pi => "Pi", + Self::Pf => "Pf", + Self::Po => "Po", + Self::Sm => "Sm", + Self::Sc => "Sc", + Self::Sk => "Sk", + Self::So => "So", + } + } + + pub fn full_name(self) -> &'static str { + match self { + Self::Lu => "Letter, Uppercase", + Self::Ll => "Letter, Lowercase", + Self::Lt => "Letter, Titlecase", + Self::Mn => "Mark, Non-Spacing", + Self::Mc => "Mark, Spacing Combining", + Self::Me => "Mark, Enclosing", + Self::Nd => "Number, Decimal Digit", + Self::Nl => "Number, Letter", + Self::No => "Number, Other", + Self::Zs => "Separator, Space", + Self::Zl => "Separator, Line", + Self::Zp => "Separator, Paragraph", + Self::Cc => "Other, Control", + Self::Cf => "Other, Format", + Self::Cs => "Other, Surrogate", + Self::Co => "Other, Private Use", + Self::Cn => "Other, Not Assigned", + Self::Lm => "Letter, Modifier", + Self::Lo => "Letter, Other", + Self::Pc => "Punctuation, Connector", + Self::Pd => "Punctuation, Dash", + Self::Ps => "Punctuation, Open", + Self::Pe => "Punctuation, Close", + Self::Pi => "Punctuation, Initial Quote", + Self::Pf => "Punctuation, Final Quote", + Self::Po => "Punctuation, Other", + Self::Sm => "Symbol, Math", + Self::Sc => "Symbol, Currency", + Self::Sk => "Symbol, Modifier", + Self::So => "Symbol, Other", + } + } +} + +impl fmt::Display for Category { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}: {}", self.abbr(), self.full_name()) + } +} + diff --git a/utfdump_core/src/encoded.rs b/utfdump_core/src/encoded.rs new file mode 100644 index 0000000..430114c --- /dev/null +++ b/utfdump_core/src/encoded.rs @@ -0,0 +1,245 @@ +use std::{collections::{HashMap, hash_map}, error, fmt, str, ops::Range}; + +use 
crate::chardata::{CharData, Category}; + +const DATA_ENTRY_SIZE: usize = 8; + +const DATA_INIT_FLAG: u8 = 1; +const DATA_REPEATED_FLAG: u8 = 2; + +fn encode_char_data(name_index: u32, category: Category, ccc: u8, repeated: bool) -> [u8; DATA_ENTRY_SIZE] { + let mut buf = [0u8; DATA_ENTRY_SIZE]; + + buf[0] |= DATA_INIT_FLAG; + + if repeated { + buf[0] |= DATA_REPEATED_FLAG; + } + + buf[1..5].copy_from_slice(&name_index.to_le_bytes()); + buf[5] = category.byte_repr(); + buf[6] = ccc; + + buf +} + +fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, u8, bool)> { + let flags = bytes[0]; + + if flags & DATA_INIT_FLAG == 0 { + return None; + } + + let name_index = u32::from_le_bytes(bytes[1..5].try_into().unwrap()); + let category = Category::from_byte(bytes[5])?; + let ccc = bytes[6]; + let repeated = flags & DATA_REPEATED_FLAG != 0; + + Some((name_index, category, ccc, repeated)) +} + +pub struct DataBuf { + data: Vec<u8>, + strings: StringTableBuf, + strings_map: HashMap<String, u32>, +} + +impl DataBuf { + pub fn new() -> Self { + Self { + data: Vec::new(), + strings: StringTableBuf::new(), + strings_map: HashMap::new(), + } + } + + pub fn as_ref_type(&self) -> Data { + Data { data: &self.data, strings: self.strings.as_ref_type() } + } + + pub fn insert(&mut self, char_data: CharData, range: Range<u32>) -> Result<(), DataBufError> { + if range.is_empty() { + return Ok(()); + } + + let repeated = range.end - range.start > 1; + + let range = { + let start = usize::try_from(range.start) + .map_err(|_| DataBufError::DataOutOfCapacity)? + .checked_mul(DATA_ENTRY_SIZE) + .ok_or(DataBufError::DataOutOfCapacity)?; + let end = usize::try_from(range.end) + .map_err(|_| DataBufError::DataOutOfCapacity)? 
+ .checked_mul(DATA_ENTRY_SIZE) + .ok_or(DataBufError::DataOutOfCapacity)?; + start..end + }; + + if let Some(extra_capacity_needed) = range.end.checked_sub(self.data.len()) { + self.data.try_reserve(extra_capacity_needed) + .map_err(|_| DataBufError::DataOutOfCapacity)?; + } + + let name_index = self.add_string(char_data.name().to_owned())?; + + let encoded_char_data = encode_char_data( + name_index, + char_data.category(), + char_data.ccc(), + repeated + ); + + if self.data.len() < range.end { + // Using 0 means that the DATA_INIT_FLAG won't be set, so these won't be valid entries. + self.data.resize(range.end, 0); + } + + for i in range.step_by(DATA_ENTRY_SIZE) { + self.data[i..(i + DATA_ENTRY_SIZE)].copy_from_slice(&encoded_char_data); + } + + Ok(()) + } + + fn add_string(&mut self, name: String) -> Result<u32, DataBufError> { + match self.strings_map.entry(name) { + hash_map::Entry::Occupied(entry) => Ok(*entry.get()), + hash_map::Entry::Vacant(entry) => { + let index = self.strings.push(entry.key())?; + entry.insert(index); + Ok(index) + }, + } + } +} + +#[derive(Clone, Copy)] +pub struct Data<'a> { + data: &'a [u8], + strings: StringTable<'a>, +} + +impl<'a> Data<'a> { + pub fn get(self, codepoint: u32) -> Option<CharData<'a>> { + let index = usize::try_from(codepoint).ok()?; + let start = index.checked_mul(DATA_ENTRY_SIZE)?; + let end = start.checked_add(DATA_ENTRY_SIZE)?; + let encoded = self.data.get(start..end)?; + let (name_index, category, ccc, _repeated) = decode_char_data(encoded.try_into().unwrap())?; + let name = self.strings.get(name_index)?; + Some(CharData::from_parts(name, category, ccc)) + } + + pub fn to_bytes(self) -> Option<([u8; 4], [&'a [u8]; 2])> { + let strings = self.strings.to_bytes(); + let strings_len = u32::try_from(strings.len()) + .ok()? 
+ .to_le_bytes(); + Some((strings_len, [strings, self.data])) + } + + pub fn from_bytes(bytes: &'a [u8]) -> Option<Self> { + let strings_len = usize::try_from( + u32::from_le_bytes(bytes.get(..4)?.try_into().unwrap()) + ).ok()?; + let strings = StringTable::from_bytes(bytes.get(4..(4 + strings_len))?); + let data = bytes.get((4 + strings_len)..)?; + Some(Self { data, strings }) + } +} + +#[derive(Debug)] +pub enum DataBufError { + DataOutOfCapacity, + StringsMapOutOfCapacity, + StringTable(StringTableBufError), +} + +impl fmt::Display for DataBufError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::DataOutOfCapacity => write!(f, "data buf out of capacity"), + Self::StringsMapOutOfCapacity => write!(f, "strings map out of capacity"), + Self::StringTable(err) => write!(f, "string table error: {}", err), + } + } +} + +impl error::Error for DataBufError {} + +impl From<StringTableBufError> for DataBufError { + fn from(err: StringTableBufError) -> Self { + Self::StringTable(err) + } +} + +#[derive(Clone, Copy)] +pub struct StringTable<'a> { + bytes: &'a [u8], +} + +impl<'a> StringTable<'a> { + pub fn from_bytes(bytes: &'a [u8]) -> Self { + Self { bytes } + } + + pub fn to_bytes(self) -> &'a [u8] { + self.bytes + } + + pub fn get(self, index: u32) -> Option<&'a str> { + let index = usize::try_from(index).ok()?; + let len = *self.bytes.get(index)?; + let bytes = self.bytes.get((index + 1)..(index + 1 + usize::from(len)))?; + str::from_utf8(bytes).ok() + } +} + +pub struct StringTableBuf { + buf: Vec<u8>, +} + +impl StringTableBuf { + pub fn new() -> Self { + Self { buf: Vec::new() } + } + + pub fn as_ref_type(&self) -> StringTable { + StringTable { bytes: &self.buf } + } + + pub fn push(&mut self, s: &str) -> Result<u32, StringTableBufError> { + let len = u8::try_from(s.len()) + .map_err(|_| StringTableBufError::StringTooLong)?; + + let index = u32::try_from(self.buf.len()) + .map_err(|_| StringTableBufError::OutOfCapacity)?; + + self.buf.try_reserve(s.len() + 1) + .map_err(|_| 
StringTableBufError::OutOfCapacity)?; + + self.buf.push(len); + self.buf.extend(s.bytes()); + + Ok(index) + } +} + +#[derive(Debug)] +pub enum StringTableBufError { + StringTooLong, + OutOfCapacity, +} + +impl fmt::Display for StringTableBufError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::StringTooLong => write!(f, "string too long to add to table"), + Self::OutOfCapacity => write!(f, "string table out of capacity"), + } + } +} + +impl error::Error for StringTableBufError {} + diff --git a/utfdump_core/src/lib.rs b/utfdump_core/src/lib.rs new file mode 100644 index 0000000..d21680b --- /dev/null +++ b/utfdump_core/src/lib.rs @@ -0,0 +1,2 @@ +pub mod chardata; +pub mod encoded;