remove core
parent
f9430db2f9
commit
6e8d197ae4
@ -1,6 +1,5 @@
|
||||
[workspace]
|
||||
members = [
|
||||
"core",
|
||||
"lib",
|
||||
"bin",
|
||||
]
|
||||
|
||||
@ -1,10 +0,0 @@
|
||||
[package]
|
||||
name = "utfdump_core"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
authors = ["Tom Panton <pantonshire@gmail.com>"]
|
||||
license = "MIT"
|
||||
repository = "https://github.com/pantonshire/utfdump"
|
||||
description = "Core library for the utfdump command-line tool"
|
||||
|
||||
[dependencies]
|
||||
@ -1,281 +0,0 @@
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CharData<'a> {
|
||||
name: &'a str,
|
||||
category: Category,
|
||||
combining_class: CombiningClass,
|
||||
}
|
||||
|
||||
impl<'a> CharData<'a> {
|
||||
pub fn from_row(row: &'a str) -> Option<(u32, Self)> {
|
||||
let mut fields = [""; 15];
|
||||
for (i, field) in row.splitn(15, ';').enumerate() {
|
||||
fields[i] = field;
|
||||
}
|
||||
|
||||
let codepoint = u32::from_str_radix(fields[0], 16).ok()?;
|
||||
let name = fields[1];
|
||||
let category = Category::from_abbr(fields[2])?;
|
||||
let ccc = CombiningClass(u8::from_str_radix(fields[3], 10).ok()?);
|
||||
|
||||
Some((codepoint, Self::from_parts(name, category, ccc)))
|
||||
}
|
||||
|
||||
pub fn from_parts(name: &'a str, category: Category, combining_class: CombiningClass) -> Self {
|
||||
Self { name, category, combining_class }
|
||||
}
|
||||
|
||||
pub fn with_name<'b>(self, name: &'a str) -> CharData<'b>
|
||||
where
|
||||
'a: 'b,
|
||||
{
|
||||
Self { name, ..self }
|
||||
}
|
||||
|
||||
pub fn name(&self) -> &'a str {
|
||||
self.name
|
||||
}
|
||||
|
||||
pub fn category(&self) -> Category {
|
||||
self.category
|
||||
}
|
||||
|
||||
pub fn combining_class(&self) -> CombiningClass {
|
||||
self.combining_class
|
||||
}
|
||||
}
|
||||
|
||||
/// A Unicode general category, named by its two-letter abbreviation.
///
/// The declaration order is part of the serialized format: [`Category::byte_repr`] returns the
/// variant's discriminant and [`Category::from_byte`] inverts it, so new variants must only be
/// appended and existing ones must never be reordered.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(u8)]
pub enum Category {
    Lu,
    Ll,
    Lt,
    Mn,
    Mc,
    Me,
    Nd,
    Nl,
    No,
    Zs,
    Zl,
    Zp,
    Cc,
    Cf,
    Cs,
    Co,
    Cn,
    Lm,
    Lo,
    Pc,
    Pd,
    Ps,
    Pe,
    Pi,
    Pf,
    Po,
    Sm,
    Sc,
    Sk,
    So,
}

impl Category {
    /// Every category in declaration order, so that `ALL[c as usize] == c`.
    const ALL: [Self; 30] = [
        Self::Lu,
        Self::Ll,
        Self::Lt,
        Self::Mn,
        Self::Mc,
        Self::Me,
        Self::Nd,
        Self::Nl,
        Self::No,
        Self::Zs,
        Self::Zl,
        Self::Zp,
        Self::Cc,
        Self::Cf,
        Self::Cs,
        Self::Co,
        Self::Cn,
        Self::Lm,
        Self::Lo,
        Self::Pc,
        Self::Pd,
        Self::Ps,
        Self::Pe,
        Self::Pi,
        Self::Pf,
        Self::Po,
        Self::Sm,
        Self::Sc,
        Self::Sk,
        Self::So,
    ];

    /// Decodes a category from the byte produced by [`Category::byte_repr`].
    ///
    /// Returns `None` for bytes that do not correspond to any category (30 and above).
    pub fn from_byte(b: u8) -> Option<Self> {
        Self::ALL.get(usize::from(b)).copied()
    }

    /// Encodes the category as a single byte: its position in the declaration order.
    pub fn byte_repr(self) -> u8 {
        self as u8
    }

    /// Parses a two-letter, case-sensitive abbreviation such as `"Lu"`.
    ///
    /// Returns `None` if the abbreviation is not recognised.
    pub fn from_abbr(s: &str) -> Option<Self> {
        match s {
            "Lu" => Some(Self::Lu),
            "Ll" => Some(Self::Ll),
            "Lt" => Some(Self::Lt),
            "Mn" => Some(Self::Mn),
            "Mc" => Some(Self::Mc),
            "Me" => Some(Self::Me),
            "Nd" => Some(Self::Nd),
            "Nl" => Some(Self::Nl),
            "No" => Some(Self::No),
            "Zs" => Some(Self::Zs),
            "Zl" => Some(Self::Zl),
            "Zp" => Some(Self::Zp),
            "Cc" => Some(Self::Cc),
            "Cf" => Some(Self::Cf),
            "Cs" => Some(Self::Cs),
            "Co" => Some(Self::Co),
            "Cn" => Some(Self::Cn),
            "Lm" => Some(Self::Lm),
            "Lo" => Some(Self::Lo),
            "Pc" => Some(Self::Pc),
            "Pd" => Some(Self::Pd),
            "Ps" => Some(Self::Ps),
            "Pe" => Some(Self::Pe),
            "Pi" => Some(Self::Pi),
            "Pf" => Some(Self::Pf),
            "Po" => Some(Self::Po),
            "Sm" => Some(Self::Sm),
            "Sc" => Some(Self::Sc),
            "Sk" => Some(Self::Sk),
            "So" => Some(Self::So),
            _ => None,
        }
    }

    /// The two-letter abbreviation of the category; the inverse of [`Category::from_abbr`].
    pub fn abbr(self) -> &'static str {
        match self {
            Self::Lu => "Lu",
            Self::Ll => "Ll",
            Self::Lt => "Lt",
            Self::Mn => "Mn",
            Self::Mc => "Mc",
            Self::Me => "Me",
            Self::Nd => "Nd",
            Self::Nl => "Nl",
            Self::No => "No",
            Self::Zs => "Zs",
            Self::Zl => "Zl",
            Self::Zp => "Zp",
            Self::Cc => "Cc",
            Self::Cf => "Cf",
            Self::Cs => "Cs",
            Self::Co => "Co",
            Self::Cn => "Cn",
            Self::Lm => "Lm",
            Self::Lo => "Lo",
            Self::Pc => "Pc",
            Self::Pd => "Pd",
            Self::Ps => "Ps",
            Self::Pe => "Pe",
            Self::Pi => "Pi",
            Self::Pf => "Pf",
            Self::Po => "Po",
            Self::Sm => "Sm",
            Self::Sc => "Sc",
            Self::Sk => "Sk",
            Self::So => "So",
        }
    }

    /// A human-readable description of the category, in the form `"Major, Minor"`.
    pub fn full_name(self) -> &'static str {
        match self {
            Self::Lu => "Letter, Uppercase",
            Self::Ll => "Letter, Lowercase",
            Self::Lt => "Letter, Titlecase",
            Self::Mn => "Mark, Non-Spacing",
            Self::Mc => "Mark, Spacing Combining",
            Self::Me => "Mark, Enclosing",
            Self::Nd => "Number, Decimal Digit",
            Self::Nl => "Number, Letter",
            Self::No => "Number, Other",
            Self::Zs => "Separator, Space",
            Self::Zl => "Separator, Line",
            // Previously "Separator: Paragraph"; the colon was a typo.
            Self::Zp => "Separator, Paragraph",
            Self::Cc => "Other, Control",
            Self::Cf => "Other, Format",
            Self::Cs => "Other, Surrogate",
            Self::Co => "Other, Private Use",
            Self::Cn => "Other, Not Assigned",
            Self::Lm => "Letter, Modifier",
            Self::Lo => "Letter, Other",
            Self::Pc => "Punctuation, Connector",
            Self::Pd => "Punctuation, Dash",
            Self::Ps => "Punctuation, Open",
            Self::Pe => "Punctuation, Close",
            Self::Pi => "Punctuation, Initial Quote",
            Self::Pf => "Punctuation, Final Quote",
            Self::Po => "Punctuation, Other",
            Self::Sm => "Symbol, Math",
            Self::Sc => "Symbol, Currency",
            Self::Sk => "Symbol, Modifier",
            Self::So => "Symbol, Other",
        }
    }
}
|
||||
|
||||
impl fmt::Display for Category {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.abbr())
|
||||
}
|
||||
}
|
||||
|
||||
/// A canonical combining class, stored as its raw numeric value.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct CombiningClass(pub u8);

impl CombiningClass {
    /// Numeric values that have a symbolic name, sorted by value so they can be binary-searched.
    const NAMED: [(u8, &'static str); 21] = [
        (0, "Not_Reordered"),
        (1, "Overlay"),
        (6, "Han_Reading"),
        (7, "Nukta"),
        (8, "Kana_Voicing"),
        (9, "Virama"),
        (200, "Attached_Below_Left"),
        (202, "Attached_Below"),
        (214, "Attached_Above"),
        (216, "Attached_Above_Right"),
        (218, "Below_Left"),
        (220, "Below"),
        (222, "Below_Right"),
        (224, "Left"),
        (226, "Right"),
        (228, "Above_Left"),
        (230, "Above"),
        (232, "Above_Right"),
        (233, "Double_Below"),
        (234, "Double_Above"),
        (240, "Iota_Subscript"),
    ];

    /// The symbolic name of this class, or `None` if the numeric value has no name.
    pub fn name(self) -> Option<&'static str> {
        Self::NAMED
            .binary_search_by_key(&self.0, |&(value, _)| value)
            .ok()
            .map(|position| Self::NAMED[position].1)
    }

    /// Whether the class is anything other than 0 (`Not_Reordered`).
    pub fn is_combining(self) -> bool {
        self.0 > 0
    }
}
|
||||
|
||||
impl fmt::Display for CombiningClass {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self.name() {
|
||||
Some(name) => write!(f, "{}", name),
|
||||
None => write!(f, "Ccc{}", self.0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,189 +0,0 @@
|
||||
use std::{collections::{HashMap, hash_map}, error, fmt, ops::Range};
|
||||
|
||||
use crate::{
|
||||
char_data::{CharData, Category, CombiningClass},
|
||||
string_table::{StringTableBufError, StringTableBuf, StringTable},
|
||||
};
|
||||
|
||||
/// Number of bytes occupied by one encoded character entry in the data table.
const DATA_ENTRY_SIZE: usize = 8;

/// Flag bit in an entry's first byte: set on every entry written by `encode_char_data`.
/// Entries without it are rejected by `decode_char_data`.
const DATA_INIT_FLAG: u8 = 0b0000_0001;
/// Flag bit in an entry's first byte: set when the entry was encoded with `repeated == true`.
const DATA_REPEATED_FLAG: u8 = 0b0000_0010;
|
||||
|
||||
fn encode_char_data(
|
||||
name_index: u32,
|
||||
category: Category,
|
||||
combining_class: CombiningClass,
|
||||
repeated: bool
|
||||
) -> [u8; DATA_ENTRY_SIZE]
|
||||
{
|
||||
let mut buf = [0u8; DATA_ENTRY_SIZE];
|
||||
|
||||
buf[0] |= DATA_INIT_FLAG;
|
||||
|
||||
if repeated {
|
||||
buf[0] |= DATA_REPEATED_FLAG;
|
||||
}
|
||||
|
||||
buf[1..5].copy_from_slice(&name_index.to_le_bytes());
|
||||
buf[5] = category.byte_repr();
|
||||
buf[6] = combining_class.0;
|
||||
|
||||
buf
|
||||
}
|
||||
|
||||
fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE])
|
||||
-> Option<(u32, Category, CombiningClass, bool)>
|
||||
{
|
||||
let flags = bytes[0];
|
||||
|
||||
if flags & DATA_INIT_FLAG == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let name_index = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
|
||||
let category = Category::from_byte(bytes[5])?;
|
||||
let combining_class = CombiningClass(bytes[6]);
|
||||
let repeated = flags & DATA_REPEATED_FLAG != 0;
|
||||
|
||||
Some((name_index, category, combining_class, repeated))
|
||||
}
|
||||
|
||||
pub struct DataStoreBuf {
|
||||
data: Vec<u8>,
|
||||
strings: StringTableBuf,
|
||||
strings_map: HashMap<String, u32>,
|
||||
}
|
||||
|
||||
impl DataStoreBuf {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
data: Vec::new(),
|
||||
strings: StringTableBuf::new(),
|
||||
strings_map: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_ref_type(&self) -> DataStore {
|
||||
DataStore { data: &self.data, strings: &*self.strings }
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, char_data: CharData, range: Range<u32>) -> Result<(), DataBufError> {
|
||||
if range.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let repeated = range.end
|
||||
.checked_sub(range.start)
|
||||
.map(|len| len > 1)
|
||||
.unwrap_or(false);
|
||||
|
||||
let range = {
|
||||
let start = usize::try_from(range.start)
|
||||
.map_err(|_| DataBufError::DataOutOfCapacity)?
|
||||
.checked_mul(DATA_ENTRY_SIZE)
|
||||
.ok_or(DataBufError::DataOutOfCapacity)?;
|
||||
let end = usize::try_from(range.end)
|
||||
.map_err(|_| DataBufError::DataOutOfCapacity)?
|
||||
.checked_mul(DATA_ENTRY_SIZE)
|
||||
.ok_or(DataBufError::DataOutOfCapacity)?;
|
||||
start..end
|
||||
};
|
||||
|
||||
if let Some(extra_capacity_needed) = range.end.checked_sub(self.data.len()) {
|
||||
self.data.try_reserve(extra_capacity_needed)
|
||||
.map_err(|_| DataBufError::DataOutOfCapacity)?;
|
||||
}
|
||||
|
||||
let name_index = self.add_string(char_data.name().to_owned())?;
|
||||
|
||||
let encoded_char_data = encode_char_data(
|
||||
name_index,
|
||||
char_data.category(),
|
||||
char_data.combining_class(),
|
||||
repeated
|
||||
);
|
||||
|
||||
if self.data.len() < range.end {
|
||||
// Using 0 means that the DATA_INIT_FLAG won't be set, so these won't be valid entries.
|
||||
self.data.resize(range.end, 0);
|
||||
}
|
||||
|
||||
for i in range.step_by(DATA_ENTRY_SIZE) {
|
||||
self.data[i..(i + DATA_ENTRY_SIZE)].copy_from_slice(&encoded_char_data);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn add_string(&mut self, name: String) -> Result<u32, DataBufError> {
|
||||
match self.strings_map.entry(name) {
|
||||
hash_map::Entry::Occupied(entry) => Ok(*entry.get()),
|
||||
hash_map::Entry::Vacant(entry) => {
|
||||
let index = self.strings.push(entry.key())?;
|
||||
entry.insert(index);
|
||||
Ok(index)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct DataStore<'a> {
|
||||
data: &'a [u8],
|
||||
strings: &'a StringTable,
|
||||
}
|
||||
|
||||
impl<'a> DataStore<'a> {
|
||||
pub fn get(self, codepoint: char) -> Option<CharData<'a>> {
|
||||
let index = usize::try_from(u32::from(codepoint)).ok()?;
|
||||
let start = index.checked_mul(DATA_ENTRY_SIZE)?;
|
||||
let end = start.checked_add(DATA_ENTRY_SIZE)?;
|
||||
let encoded = self.data.get(start..end)?;
|
||||
let (name_index, category, ccc, _repeated) = decode_char_data(encoded.try_into().unwrap())?;
|
||||
let name = self.strings.get(name_index)?;
|
||||
Some(CharData::from_parts(name, category, ccc))
|
||||
}
|
||||
|
||||
pub fn to_bytes(self) -> Option<([u8; 4], [&'a [u8]; 2])> {
|
||||
let strings = self.strings.to_bytes();
|
||||
let strings_len = u32::try_from(strings.len())
|
||||
.ok()?
|
||||
.to_le_bytes();
|
||||
Some((strings_len, [strings, self.data]))
|
||||
}
|
||||
|
||||
pub fn from_bytes(bytes: &'a [u8]) -> Option<Self> {
|
||||
let strings_len = usize::try_from(
|
||||
u32::from_le_bytes(bytes.get(..4)?.try_into().unwrap())
|
||||
).ok()?;
|
||||
let strings = StringTable::from_bytes(bytes.get(4..(4 + strings_len))?);
|
||||
let data = bytes.get((4 + strings_len)..)?;
|
||||
Some(Self { data, strings })
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum DataBufError {
|
||||
DataOutOfCapacity,
|
||||
StringsMapOutOfCapacity,
|
||||
StringTable(StringTableBufError),
|
||||
}
|
||||
|
||||
impl fmt::Display for DataBufError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Self::DataOutOfCapacity => write!(f, "data buf out of capacity"),
|
||||
Self::StringsMapOutOfCapacity => write!(f, "strings map out of capacity"),
|
||||
Self::StringTable(err) => write!(f, "string table error: {}", err),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for DataBufError {}
|
||||
|
||||
impl From<StringTableBufError> for DataBufError {
|
||||
fn from(err: StringTableBufError) -> Self {
|
||||
Self::StringTable(err)
|
||||
}
|
||||
}
|
||||
@ -1,6 +0,0 @@
|
||||
pub mod char_data;
|
||||
pub mod data_store;
|
||||
mod string_table;
|
||||
|
||||
pub use char_data::{CharData, Category, CombiningClass};
|
||||
pub use data_store::{DataStore, DataStoreBuf, DataBufError};
|
||||
@ -1,98 +0,0 @@
|
||||
use std::{fmt, error, str, ops::Deref};
|
||||
|
||||
/// A borrowed view of a string table such as the one owned by `StringTableBuf`. Strings are
/// laid out back to back, each one preceded by a single byte giving its length in bytes.
#[repr(transparent)]
pub struct StringTable {
    bytes: [u8],
}

impl StringTable {
    /// Reinterprets a byte slice as a string table. No validation is performed; malformed
    /// contents only surface as `None` results from [`StringTable::get`].
    pub fn from_bytes(bytes: &[u8]) -> &Self {
        let table = bytes as *const [u8] as *const Self;
        // SAFETY:
        // `StringTable` is `repr(transparent)` over `[u8]`, so both types share one memory
        // layout, and the pointer comes from a valid reference whose lifetime the returned
        // reference inherits.
        unsafe { &*table }
    }

    /// The raw bytes backing the table.
    pub fn to_bytes(&self) -> &[u8] {
        &self.bytes
    }

    /// Tries to read the string whose entry starts at byte offset `index`.
    ///
    /// `index` should be an offset at which an entry begins. The table cannot tell whether
    /// that is the case: an offset pointing into the middle of an entry may yield `None` or
    /// some unintended string, because whatever byte is found there is taken as a length
    /// prefix. This is never unsound, which is why the function is safe: the bytes are
    /// bounds-checked and verified to be valid UTF-8 before being returned.
    pub fn get(&self, index: u32) -> Option<&str> {
        let start = usize::try_from(index).ok()?;
        // The first byte of the entry is the length prefix; the string follows it.
        let (&len, rest) = self.bytes.get(start..)?.split_first()?;
        let payload = rest.get(..usize::from(len))?;
        str::from_utf8(payload).ok()
    }
}
|
||||
|
||||
/// An owned [`StringTable`](StringTable). Stores a collection of strings contiguously, with each
|
||||
/// string being prefixed by its length in bytes.
|
||||
pub struct StringTableBuf {
|
||||
buf: Vec<u8>,
|
||||
}
|
||||
|
||||
impl StringTableBuf {
|
||||
pub fn new() -> Self {
|
||||
Self { buf: Vec::new() }
|
||||
}
|
||||
|
||||
/// Append the given string to the table, returning the byte offset in the table at which it
|
||||
/// was stored. This byte offset can then be used to retrieve the string from the table later,
|
||||
/// via `StringTable::get`.
|
||||
pub fn push(&mut self, s: &str) -> Result<u32, StringTableBufError> {
|
||||
let len = u8::try_from(s.len())
|
||||
.map_err(|_| StringTableBufError::StringTooLong)?;
|
||||
|
||||
let index = u32::try_from(self.buf.len())
|
||||
.map_err(|_| StringTableBufError::OutOfCapacity)?;
|
||||
|
||||
self.buf.try_reserve(s.len() + 1)
|
||||
.map_err(|_| StringTableBufError::OutOfCapacity)?;
|
||||
|
||||
self.buf.push(len);
|
||||
self.buf.extend(s.bytes());
|
||||
|
||||
Ok(index)
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<StringTable> for StringTableBuf {
|
||||
fn as_ref(&self) -> &StringTable {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for StringTableBuf {
|
||||
type Target = StringTable;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
StringTable::from_bytes(&self.buf)
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors that can occur while appending to a `StringTableBuf`.
#[derive(Debug)]
pub enum StringTableBufError {
    /// The string's byte length does not fit in the one-byte length prefix.
    StringTooLong,
    /// The table cannot grow any further.
    OutOfCapacity,
}

impl fmt::Display for StringTableBufError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            StringTableBufError::StringTooLong => "string too long to add to table",
            StringTableBufError::OutOfCapacity => "string table out of capacity",
        };

        f.write_str(message)
    }
}

impl error::Error for StringTableBufError {}
|
||||
Loading…
Reference in New Issue