remove core

main
pantonshire 3 years ago
parent f9430db2f9
commit 6e8d197ae4

@ -1,6 +1,5 @@
[workspace]
members = [
"core",
"lib",
"bin",
]

@ -1,10 +0,0 @@
[package]
name = "utfdump_core"
version = "0.1.0"
edition = "2021"
authors = ["Tom Panton <pantonshire@gmail.com>"]
license = "MIT"
repository = "https://github.com/pantonshire/utfdump"
description = "Core library for the utfdump command-line tool"
[dependencies]

@ -1,281 +0,0 @@
use std::fmt;
#[derive(Clone, Debug)]
pub struct CharData<'a> {
name: &'a str,
category: Category,
combining_class: CombiningClass,
}
impl<'a> CharData<'a> {
pub fn from_row(row: &'a str) -> Option<(u32, Self)> {
let mut fields = [""; 15];
for (i, field) in row.splitn(15, ';').enumerate() {
fields[i] = field;
}
let codepoint = u32::from_str_radix(fields[0], 16).ok()?;
let name = fields[1];
let category = Category::from_abbr(fields[2])?;
let ccc = CombiningClass(u8::from_str_radix(fields[3], 10).ok()?);
Some((codepoint, Self::from_parts(name, category, ccc)))
}
pub fn from_parts(name: &'a str, category: Category, combining_class: CombiningClass) -> Self {
Self { name, category, combining_class }
}
pub fn with_name<'b>(self, name: &'a str) -> CharData<'b>
where
'a: 'b,
{
Self { name, ..self }
}
pub fn name(&self) -> &'a str {
self.name
}
pub fn category(&self) -> Category {
self.category
}
pub fn combining_class(&self) -> CombiningClass {
self.combining_class
}
}
/// A Unicode general category, identified by its two-letter abbreviation.
///
/// The discriminants are spelled out explicitly because they are the single-byte encoding
/// produced by [`Category::byte_repr`] and accepted by [`Category::from_byte`]; reordering
/// the variants must not silently change that encoding.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(u8)]
pub enum Category {
    Lu = 0,
    Ll = 1,
    Lt = 2,
    Mn = 3,
    Mc = 4,
    Me = 5,
    Nd = 6,
    Nl = 7,
    No = 8,
    Zs = 9,
    Zl = 10,
    Zp = 11,
    Cc = 12,
    Cf = 13,
    Cs = 14,
    Co = 15,
    Cn = 16,
    Lm = 17,
    Lo = 18,
    Pc = 19,
    Pd = 20,
    Ps = 21,
    Pe = 22,
    Pi = 23,
    Pf = 24,
    Po = 25,
    Sm = 26,
    Sc = 27,
    Sk = 28,
    So = 29,
}

impl Category {
    /// Decodes a category from the single-byte representation produced by
    /// [`Category::byte_repr`]. Returns `None` for bytes that do not name a category.
    pub fn from_byte(b: u8) -> Option<Self> {
        match b {
            0 => Some(Self::Lu),
            1 => Some(Self::Ll),
            2 => Some(Self::Lt),
            3 => Some(Self::Mn),
            4 => Some(Self::Mc),
            5 => Some(Self::Me),
            6 => Some(Self::Nd),
            7 => Some(Self::Nl),
            8 => Some(Self::No),
            9 => Some(Self::Zs),
            10 => Some(Self::Zl),
            11 => Some(Self::Zp),
            12 => Some(Self::Cc),
            13 => Some(Self::Cf),
            14 => Some(Self::Cs),
            15 => Some(Self::Co),
            16 => Some(Self::Cn),
            17 => Some(Self::Lm),
            18 => Some(Self::Lo),
            19 => Some(Self::Pc),
            20 => Some(Self::Pd),
            21 => Some(Self::Ps),
            22 => Some(Self::Pe),
            23 => Some(Self::Pi),
            24 => Some(Self::Pf),
            25 => Some(Self::Po),
            26 => Some(Self::Sm),
            27 => Some(Self::Sc),
            28 => Some(Self::Sk),
            29 => Some(Self::So),
            _ => None,
        }
    }

    /// Encodes the category as a single byte (its discriminant). The inverse of
    /// [`Category::from_byte`].
    pub fn byte_repr(self) -> u8 {
        self as u8
    }

    /// Parses a two-letter category abbreviation such as `"Lu"`. The match is case-sensitive;
    /// any other string yields `None`.
    pub fn from_abbr(s: &str) -> Option<Self> {
        match s {
            "Lu" => Some(Self::Lu),
            "Ll" => Some(Self::Ll),
            "Lt" => Some(Self::Lt),
            "Mn" => Some(Self::Mn),
            "Mc" => Some(Self::Mc),
            "Me" => Some(Self::Me),
            "Nd" => Some(Self::Nd),
            "Nl" => Some(Self::Nl),
            "No" => Some(Self::No),
            "Zs" => Some(Self::Zs),
            "Zl" => Some(Self::Zl),
            "Zp" => Some(Self::Zp),
            "Cc" => Some(Self::Cc),
            "Cf" => Some(Self::Cf),
            "Cs" => Some(Self::Cs),
            "Co" => Some(Self::Co),
            "Cn" => Some(Self::Cn),
            "Lm" => Some(Self::Lm),
            "Lo" => Some(Self::Lo),
            "Pc" => Some(Self::Pc),
            "Pd" => Some(Self::Pd),
            "Ps" => Some(Self::Ps),
            "Pe" => Some(Self::Pe),
            "Pi" => Some(Self::Pi),
            "Pf" => Some(Self::Pf),
            "Po" => Some(Self::Po),
            "Sm" => Some(Self::Sm),
            "Sc" => Some(Self::Sc),
            "Sk" => Some(Self::Sk),
            "So" => Some(Self::So),
            _ => None,
        }
    }

    /// The two-letter abbreviation of the category. The inverse of [`Category::from_abbr`].
    pub fn abbr(self) -> &'static str {
        match self {
            Self::Lu => "Lu",
            Self::Ll => "Ll",
            Self::Lt => "Lt",
            Self::Mn => "Mn",
            Self::Mc => "Mc",
            Self::Me => "Me",
            Self::Nd => "Nd",
            Self::Nl => "Nl",
            Self::No => "No",
            Self::Zs => "Zs",
            Self::Zl => "Zl",
            Self::Zp => "Zp",
            Self::Cc => "Cc",
            Self::Cf => "Cf",
            Self::Cs => "Cs",
            Self::Co => "Co",
            Self::Cn => "Cn",
            Self::Lm => "Lm",
            Self::Lo => "Lo",
            Self::Pc => "Pc",
            Self::Pd => "Pd",
            Self::Ps => "Ps",
            Self::Pe => "Pe",
            Self::Pi => "Pi",
            Self::Pf => "Pf",
            Self::Po => "Po",
            Self::Sm => "Sm",
            Self::Sc => "Sc",
            Self::Sk => "Sk",
            Self::So => "So",
        }
    }

    /// A human-readable name for the category, always in the form `"Group, Detail"`.
    pub fn full_name(self) -> &'static str {
        match self {
            Self::Lu => "Letter, Uppercase",
            Self::Ll => "Letter, Lowercase",
            Self::Lt => "Letter, Titlecase",
            Self::Mn => "Mark, Non-Spacing",
            Self::Mc => "Mark, Spacing Combining",
            Self::Me => "Mark, Enclosing",
            Self::Nd => "Number, Decimal Digit",
            Self::Nl => "Number, Letter",
            Self::No => "Number, Other",
            Self::Zs => "Separator, Space",
            Self::Zl => "Separator, Line",
            // Previously "Separator: Paragraph"; the colon was inconsistent with every other
            // entry in this table.
            Self::Zp => "Separator, Paragraph",
            Self::Cc => "Other, Control",
            Self::Cf => "Other, Format",
            Self::Cs => "Other, Surrogate",
            Self::Co => "Other, Private Use",
            Self::Cn => "Other, Not Assigned",
            Self::Lm => "Letter, Modifier",
            Self::Lo => "Letter, Other",
            Self::Pc => "Punctuation, Connector",
            Self::Pd => "Punctuation, Dash",
            Self::Ps => "Punctuation, Open",
            Self::Pe => "Punctuation, Close",
            Self::Pi => "Punctuation, Initial Quote",
            Self::Pf => "Punctuation, Final Quote",
            Self::Po => "Punctuation, Other",
            Self::Sm => "Symbol, Math",
            Self::Sc => "Symbol, Currency",
            Self::Sk => "Symbol, Modifier",
            Self::So => "Symbol, Other",
        }
    }
}

/// Displays the category as its two-letter abbreviation.
impl fmt::Display for Category {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.abbr())
    }
}
/// A combining class, stored as its raw numeric value.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct CombiningClass(pub u8);

impl CombiningClass {
    /// Returns the symbolic name of this combining class, or `None` if the numeric value has
    /// no name in the table below.
    pub fn name(self) -> Option<&'static str> {
        // Numeric value paired with its symbolic name.
        const NAMED_CLASSES: &[(u8, &str)] = &[
            (0, "Not_Reordered"),
            (1, "Overlay"),
            (6, "Han_Reading"),
            (7, "Nukta"),
            (8, "Kana_Voicing"),
            (9, "Virama"),
            (200, "Attached_Below_Left"),
            (202, "Attached_Below"),
            (214, "Attached_Above"),
            (216, "Attached_Above_Right"),
            (218, "Below_Left"),
            (220, "Below"),
            (222, "Below_Right"),
            (224, "Left"),
            (226, "Right"),
            (228, "Above_Left"),
            (230, "Above"),
            (232, "Above_Right"),
            (233, "Double_Below"),
            (234, "Double_Above"),
            (240, "Iota_Subscript"),
        ];

        NAMED_CLASSES
            .iter()
            .find(|&&(value, _)| value == self.0)
            .map(|&(_, label)| label)
    }

    /// Whether the class is anything other than zero.
    pub fn is_combining(self) -> bool {
        self.0 > 0
    }
}

/// Displays the symbolic name when there is one, and `Ccc<value>` otherwise.
impl fmt::Display for CombiningClass {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some(label) = self.name() {
            f.write_str(label)
        } else {
            write!(f, "Ccc{}", self.0)
        }
    }
}

@ -1,189 +0,0 @@
use std::{collections::{HashMap, hash_map}, error, fmt, ops::Range};
use crate::{
char_data::{CharData, Category, CombiningClass},
string_table::{StringTableBufError, StringTableBuf, StringTable},
};
/// Size in bytes of one encoded character entry in the data buffer.
const DATA_ENTRY_SIZE: usize = 8;
/// Bit of an entry's flags byte (byte 0) that marks the entry as initialised. Entries without
/// this bit are rejected by `decode_char_data`.
const DATA_INIT_FLAG: u8 = 1;
/// Bit of an entry's flags byte (byte 0) that `encode_char_data` sets when asked to mark the
/// entry as repeated.
const DATA_REPEATED_FLAG: u8 = 2;
/// Packs one character entry into its fixed-size byte representation.
///
/// Layout of the returned array:
///
/// * byte 0: flags — `DATA_INIT_FLAG` is always set, `DATA_REPEATED_FLAG` only if `repeated`;
/// * bytes 1..5: `name_index`, little-endian;
/// * byte 5: the category's byte representation;
/// * byte 6: the combining class value;
/// * any remaining bytes: zero.
fn encode_char_data(
    name_index: u32,
    category: Category,
    combining_class: CombiningClass,
    repeated: bool
) -> [u8; DATA_ENTRY_SIZE]
{
    let flags = if repeated {
        DATA_INIT_FLAG | DATA_REPEATED_FLAG
    } else {
        DATA_INIT_FLAG
    };

    let mut entry = [0u8; DATA_ENTRY_SIZE];
    entry[0] = flags;
    entry[1..5].copy_from_slice(&name_index.to_le_bytes());
    entry[5] = category.byte_repr();
    entry[6] = combining_class.0;
    entry
}
/// Unpacks an entry written by `encode_char_data`.
///
/// Returns the name index, category, combining class and "repeated" flag, in that order.
/// Returns `None` if the entry does not have `DATA_INIT_FLAG` set (i.e. it was never written)
/// or if its category byte is not a valid category.
fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE])
    -> Option<(u32, Category, CombiningClass, bool)>
{
    let flags = bytes[0];
    // `DATA_INIT_FLAG` is a single bit, so "not all of its bits set" means "not set".
    if flags & DATA_INIT_FLAG != DATA_INIT_FLAG {
        return None;
    }

    let mut name_bytes = [0u8; 4];
    name_bytes.copy_from_slice(&bytes[1..5]);
    let name_index = u32::from_le_bytes(name_bytes);

    let category = Category::from_byte(bytes[5])?;
    let combining_class = CombiningClass(bytes[6]);
    let repeated = flags & DATA_REPEATED_FLAG != 0;

    Some((name_index, category, combining_class, repeated))
}
/// An owned, growable store of character data.
///
/// `data` holds one `DATA_ENTRY_SIZE`-byte entry per codepoint, indexed by codepoint value;
/// `strings` holds the character names; `strings_map` remembers the table index of every name
/// already stored so that each distinct name is only pushed once.
pub struct DataStoreBuf {
    data: Vec<u8>,
    strings: StringTableBuf,
    strings_map: HashMap<String, u32>,
}

impl DataStoreBuf {
    /// Creates an empty store.
    pub fn new() -> Self {
        Self {
            data: Vec::new(),
            strings: StringTableBuf::new(),
            strings_map: HashMap::new(),
        }
    }

    /// Borrows this buffer as a read-only `DataStore` view.
    pub fn as_ref_type(&self) -> DataStore {
        DataStore {
            data: self.data.as_slice(),
            strings: &*self.strings,
        }
    }

    /// Records `char_data` for every codepoint in `range`.
    ///
    /// An empty range is a no-op. Entries are flagged as "repeated" when the range covers more
    /// than one codepoint. The data buffer is grown as needed; any entries that the growth
    /// creates outside `range` are zero-filled and therefore not valid entries.
    ///
    /// # Errors
    ///
    /// * `DataBufError::DataOutOfCapacity` if a byte offset does not fit in `usize` or the
    ///   buffer cannot reserve enough memory.
    /// * `DataBufError::StringTable` if the name cannot be added to the string table.
    pub fn insert(&mut self, char_data: CharData, range: Range<u32>) -> Result<(), DataBufError> {
        let Range { start: first_codepoint, end: end_codepoint } = range;
        if first_codepoint >= end_codepoint {
            return Ok(());
        }
        // Cannot underflow: the range was just checked to be non-empty.
        let repeated = end_codepoint - first_codepoint > 1;

        // Converts a codepoint into the byte offset of its entry in `data`.
        let to_byte_offset = |codepoint: u32| {
            usize::try_from(codepoint)
                .ok()
                .and_then(|entry_index| entry_index.checked_mul(DATA_ENTRY_SIZE))
                .ok_or(DataBufError::DataOutOfCapacity)
        };
        let bytes_start = to_byte_offset(first_codepoint)?;
        let bytes_end = to_byte_offset(end_codepoint)?;

        // Reserve fallibly up front so that the `resize` below cannot abort on allocation
        // failure.
        if bytes_end >= self.data.len() {
            self.data
                .try_reserve(bytes_end - self.data.len())
                .map_err(|_| DataBufError::DataOutOfCapacity)?;
        }

        let name_index = self.add_string(char_data.name().to_owned())?;
        let entry = encode_char_data(
            name_index,
            char_data.category(),
            char_data.combining_class(),
            repeated
        );

        if bytes_end > self.data.len() {
            // Zero-filled entries do not have DATA_INIT_FLAG set, so they are not valid entries.
            self.data.resize(bytes_end, 0);
        }

        for slot in self.data[bytes_start..bytes_end].chunks_exact_mut(DATA_ENTRY_SIZE) {
            slot.copy_from_slice(&entry);
        }

        Ok(())
    }

    /// Returns the string-table index of `name`, pushing it to the table first if it has not
    /// been stored before.
    fn add_string(&mut self, name: String) -> Result<u32, DataBufError> {
        let vacant = match self.strings_map.entry(name) {
            hash_map::Entry::Occupied(existing) => return Ok(*existing.get()),
            hash_map::Entry::Vacant(vacant) => vacant,
        };

        // Only remember the index once the push has succeeded.
        let index = self.strings.push(vacant.key())?;
        vacant.insert(index);
        Ok(index)
    }
}
/// A borrowed, read-only view of character data: a buffer of fixed-size entries indexed by
/// codepoint, plus the string table that holds the character names.
#[derive(Clone, Copy)]
pub struct DataStore<'a> {
    data: &'a [u8],
    strings: &'a StringTable,
}

impl<'a> DataStore<'a> {
    /// Looks up the data stored for `codepoint`.
    ///
    /// Returns `None` if the codepoint lies beyond the end of the data buffer, if its entry
    /// cannot be decoded (for example because it was never initialised), or if its name cannot
    /// be read from the string table.
    pub fn get(self, codepoint: char) -> Option<CharData<'a>> {
        let index = usize::try_from(u32::from(codepoint)).ok()?;
        let start = index.checked_mul(DATA_ENTRY_SIZE)?;
        let end = start.checked_add(DATA_ENTRY_SIZE)?;
        let encoded = self.data.get(start..end)?;
        // The slice is exactly DATA_ENTRY_SIZE bytes long, so the conversion cannot fail; `?` is
        // used rather than `unwrap` simply to keep this function free of panic paths.
        let (name_index, category, ccc, _repeated) = decode_char_data(encoded.try_into().ok()?)?;
        let name = self.strings.get(name_index)?;
        Some(CharData::from_parts(name, category, ccc))
    }

    /// Returns the pieces of the serialised form, to be written out in order: the byte length
    /// of the string table as a little-endian `u32`, then the string table bytes, then the
    /// entry data.
    ///
    /// Returns `None` if the string table is too large for its length to fit in a `u32`.
    pub fn to_bytes(self) -> Option<([u8; 4], [&'a [u8]; 2])> {
        let strings = self.strings.to_bytes();
        let strings_len = u32::try_from(strings.len())
            .ok()?
            .to_le_bytes();
        Some((strings_len, [strings, self.data]))
    }

    /// Reconstructs a view from bytes laid out as described in [`DataStore::to_bytes`]: a
    /// 4-byte little-endian string table length, the string table itself, then the entry data.
    ///
    /// Returns `None` if `bytes` is shorter than the length prefix plus the string table length
    /// it announces.
    pub fn from_bytes(bytes: &'a [u8]) -> Option<Self> {
        let len_prefix = bytes.get(..4)?;
        let body = bytes.get(4..)?;
        let strings_len = usize::try_from(
            u32::from_le_bytes(len_prefix.try_into().ok()?)
        ).ok()?;
        // Slice relative to `body` rather than computing `4 + strings_len`: the length comes
        // from the input and that sum can overflow where `usize` is 32 bits wide.
        let strings = StringTable::from_bytes(body.get(..strings_len)?);
        let data = body.get(strings_len..)?;
        Some(Self { data, strings })
    }
}
/// Errors that can occur while adding data to a `DataStoreBuf`.
#[derive(Debug)]
pub enum DataBufError {
    /// The data buffer could not grow to hold the requested entries.
    DataOutOfCapacity,
    // NOTE(review): no code visible here constructs this variant — confirm whether it is still
    // needed.
    /// The map of already-stored strings could not grow.
    StringsMapOutOfCapacity,
    /// Adding a name to the string table failed.
    StringTable(StringTableBufError),
}

impl fmt::Display for DataBufError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::StringTable(err) => write!(f, "string table error: {}", err),
            Self::DataOutOfCapacity => f.write_str("data buf out of capacity"),
            Self::StringsMapOutOfCapacity => f.write_str("strings map out of capacity"),
        }
    }
}

impl error::Error for DataBufError {}

/// Wraps a string table error so that `?` can be used on string table operations.
impl From<StringTableBufError> for DataBufError {
    fn from(err: StringTableBufError) -> Self {
        DataBufError::StringTable(err)
    }
}

@ -1,6 +0,0 @@
pub mod char_data;
pub mod data_store;
mod string_table;
pub use char_data::{CharData, Category, CombiningClass};
pub use data_store::{DataStore, DataStoreBuf, DataBufError};

@ -1,98 +0,0 @@
use std::{fmt, error, str, ops::Deref};
/// A borrowed view of a string table, such as the contents of a
/// [`StringTableBuf`](StringTableBuf). The table is a contiguous run of entries, each made of
/// one length byte followed by that many bytes of string data.
#[repr(transparent)]
pub struct StringTable {
    bytes: [u8],
}

impl StringTable {
    /// Reinterprets a byte slice as a string table without copying or validating it.
    pub fn from_bytes(bytes: &[u8]) -> &Self {
        let table = bytes as *const [u8] as *const Self;
        // SAFETY:
        // `StringTable` is `repr(transparent)` over `[u8]`, so both types share one memory
        // layout, and the pointer was derived from a valid shared reference whose lifetime the
        // returned reference inherits.
        unsafe { &*table }
    }

    /// The raw bytes backing the table.
    pub fn to_bytes(&self) -> &[u8] {
        &self.bytes
    }

    /// Reads the string whose entry starts at byte offset `index`.
    ///
    /// `index` must be the offset of an entry's length byte. The table cannot tell whether
    /// that is really the case: an offset into the middle of an entry may yield `None`, or it
    /// may yield an unintended string if the bytes found there happen to look like an entry.
    /// Either way the call is sound, because the bytes are always bounds-checked and validated
    /// as UTF-8 before being returned.
    ///
    /// Returns `None` if `index` is out of bounds, if the entry runs past the end of the
    /// table, or if its bytes are not valid UTF-8.
    pub fn get(&self, index: u32) -> Option<&str> {
        let start = usize::try_from(index).ok()?;
        // Peel the length byte off the front of whatever remains from `start` onwards.
        let (&len, rest) = self.bytes.get(start..)?.split_first()?;
        let payload = rest.get(..usize::from(len))?;
        str::from_utf8(payload).ok()
    }
}
/// The owned counterpart of [`StringTable`](StringTable): a growable buffer of entries, each
/// made of one length byte followed by that many bytes of string data.
pub struct StringTableBuf {
    buf: Vec<u8>,
}

impl StringTableBuf {
    /// Creates an empty table.
    pub fn new() -> Self {
        Self { buf: Vec::new() }
    }

    /// Appends `s` to the table and returns the byte offset at which its entry starts. Pass
    /// that offset to `StringTable::get` to read the string back.
    ///
    /// # Errors
    ///
    /// * `StringTableBufError::StringTooLong` if the length of `s` in bytes does not fit in
    ///   the single length byte.
    /// * `StringTableBufError::OutOfCapacity` if the entry's offset does not fit in a `u32` or
    ///   the buffer cannot reserve enough memory.
    pub fn push(&mut self, s: &str) -> Result<u32, StringTableBufError> {
        let payload = s.as_bytes();

        let len = match u8::try_from(payload.len()) {
            Ok(len) => len,
            Err(_) => return Err(StringTableBufError::StringTooLong),
        };
        let index = match u32::try_from(self.buf.len()) {
            Ok(index) => index,
            Err(_) => return Err(StringTableBufError::OutOfCapacity),
        };

        // One extra byte for the length prefix; reserving fallibly means the writes below
        // cannot abort on allocation failure.
        if self.buf.try_reserve(payload.len() + 1).is_err() {
            return Err(StringTableBufError::OutOfCapacity);
        }

        self.buf.push(len);
        self.buf.extend_from_slice(payload);
        Ok(index)
    }
}

impl AsRef<StringTable> for StringTableBuf {
    fn as_ref(&self) -> &StringTable {
        &**self
    }
}

/// Lets a `StringTableBuf` be used wherever a `&StringTable` is expected.
impl Deref for StringTableBuf {
    type Target = StringTable;

    fn deref(&self) -> &Self::Target {
        StringTable::from_bytes(self.buf.as_slice())
    }
}
/// Errors that can occur while appending a string to a `StringTableBuf`.
#[derive(Debug)]
pub enum StringTableBufError {
    /// The string's length in bytes does not fit in an entry's length prefix.
    StringTooLong,
    /// The table cannot grow any further.
    OutOfCapacity,
}

impl fmt::Display for StringTableBufError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            Self::StringTooLong => "string too long to add to table",
            Self::OutOfCapacity => "string table out of capacity",
        };
        f.write_str(message)
    }
}

impl error::Error for StringTableBufError {}
Loading…
Cancel
Save