♻️ refactoring

main
pantonshire 3 years ago
parent 96d328c829
commit ef6765e037

@ -1,6 +1,6 @@
[workspace]
members = [
"utfdump_core",
"utfdump",
"utfdump_bin",
"core",
"lib",
"bin",
]

@ -8,7 +8,7 @@ repository = "https://github.com/pantonshire/utfdump"
description = "Command-line Unicode character info tool"
[dependencies]
utfdump = { path = "../utfdump" }
utfdump = { path = "../lib" }
libshire = { git = "https://github.com/pantonshire/libshire", branch = "main" }
tabled = "0.8.0"
clap = { version = "3.2.22", features = ["derive"] }

@ -1,6 +1,9 @@
use std::{collections::{HashMap, hash_map}, error, fmt, str, ops::Range};
use std::{collections::{HashMap, hash_map}, error, fmt, ops::Range};
use crate::chardata::{CharData, Category, CombiningClass};
use crate::{
char_data::{CharData, Category, CombiningClass},
string_table::{StringTableBufError, StringTableBuf, StringTable},
};
const DATA_ENTRY_SIZE: usize = 8;
@ -46,13 +49,13 @@ fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE])
Some((name_index, category, combining_class, repeated))
}
pub struct DataBuf {
pub struct DataStoreBuf {
data: Vec<u8>,
strings: StringTableBuf,
strings_map: HashMap<String, u32>,
}
impl DataBuf {
impl DataStoreBuf {
pub fn new() -> Self {
Self {
data: Vec::new(),
@ -61,8 +64,8 @@ impl DataBuf {
}
}
pub fn as_ref_type(&self) -> Data {
Data { data: &self.data, strings: self.strings.as_ref_type() }
pub fn as_ref_type(&self) -> DataStore {
DataStore { data: &self.data, strings: &*self.strings }
}
pub fn insert(&mut self, char_data: CharData, range: Range<u32>) -> Result<(), DataBufError> {
@ -126,12 +129,12 @@ impl DataBuf {
}
#[derive(Clone, Copy)]
pub struct Data<'a> {
pub struct DataStore<'a> {
data: &'a [u8],
strings: StringTable<'a>,
strings: &'a StringTable,
}
impl<'a> Data<'a> {
impl<'a> DataStore<'a> {
pub fn get(self, codepoint: char) -> Option<CharData<'a>> {
let index = usize::try_from(u32::from(codepoint)).ok()?;
let start = index.checked_mul(DATA_ENTRY_SIZE)?;
@ -184,73 +187,3 @@ impl From<StringTableBufError> for DataBufError {
Self::StringTable(err)
}
}
/// A cheap, copyable view over the bytes of a length-prefixed string table.
///
/// Every entry consists of a single length byte followed by that many bytes of string data.
#[derive(Clone, Copy)]
pub struct StringTable<'a> {
    bytes: &'a [u8],
}

impl<'a> StringTable<'a> {
    /// Wraps `bytes` as a string table. The contents are not validated here; validation happens
    /// lazily in [`StringTable::get`].
    pub fn from_bytes(bytes: &'a [u8]) -> Self {
        StringTable { bytes }
    }

    /// Returns the raw bytes backing this table.
    pub fn to_bytes(self) -> &'a [u8] {
        let StringTable { bytes } = self;
        bytes
    }

    /// Looks up the entry that starts at byte offset `index`.
    ///
    /// Returns `None` if `index` is past the end of the table, if the length byte found at
    /// `index` claims more bytes than the table holds, or if the entry is not valid UTF-8.
    pub fn get(self, index: u32) -> Option<&'a str> {
        let offset = usize::try_from(index).ok()?;
        // Peel the length byte off the front of whatever remains from `offset` onwards; an
        // offset at or beyond the end yields `None` here.
        let (&len, tail) = self.bytes.get(offset..)?.split_first()?;
        let entry = tail.get(..usize::from(len))?;
        str::from_utf8(entry).ok()
    }
}
/// A growable, owned string table. Strings are stored back to back, each one preceded by a
/// single byte holding its length.
pub struct StringTableBuf {
    buf: Vec<u8>,
}

impl StringTableBuf {
    /// Creates an empty table.
    pub fn new() -> Self {
        StringTableBuf { buf: Vec::new() }
    }

    /// Borrows the table's contents as a `StringTable` view.
    pub fn as_ref_type(&self) -> StringTable {
        StringTable { bytes: self.buf.as_slice() }
    }

    /// Appends `s` to the table and returns the byte offset at which its entry begins.
    ///
    /// # Errors
    ///
    /// - `StringTableBufError::StringTooLong` if the byte length of `s` does not fit in a `u8`.
    /// - `StringTableBufError::OutOfCapacity` if the current table length does not fit in a
    ///   `u32`, or if reserving space for the new entry fails.
    pub fn push(&mut self, s: &str) -> Result<u32, StringTableBufError> {
        let payload = s.as_bytes();

        let len = match u8::try_from(payload.len()) {
            Ok(len) => len,
            Err(_) => return Err(StringTableBufError::StringTooLong),
        };

        // The entry will start at the current end of the buffer.
        let index = match u32::try_from(self.buf.len()) {
            Ok(index) => index,
            Err(_) => return Err(StringTableBufError::OutOfCapacity),
        };

        // One extra byte for the length prefix.
        if self.buf.try_reserve(payload.len() + 1).is_err() {
            return Err(StringTableBufError::OutOfCapacity);
        }

        self.buf.push(len);
        self.buf.extend_from_slice(payload);
        Ok(index)
    }
}
/// Reasons why appending a string to a `StringTableBuf` can fail.
#[derive(Debug)]
pub enum StringTableBufError {
    /// The string's byte length cannot be stored in the one-byte length prefix.
    StringTooLong,
    /// The table cannot grow any further.
    OutOfCapacity,
}

impl fmt::Display for StringTableBufError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match *self {
            StringTableBufError::StringTooLong => "string too long to add to table",
            StringTableBufError::OutOfCapacity => "string table out of capacity",
        };
        f.write_str(message)
    }
}

impl error::Error for StringTableBufError {}

@ -0,0 +1,6 @@
pub mod char_data;
pub mod data_store;
mod string_table;
pub use char_data::{CharData, Category, CombiningClass};
pub use data_store::{DataStore, DataStoreBuf, DataBufError};

@ -0,0 +1,98 @@
use std::{fmt, error, str, ops::Deref};
/// A borrowed view into a [`StringTableBuf`]. The table holds a collection of strings laid out
/// contiguously, where every string is preceded by a single byte giving its length in bytes.
#[repr(transparent)]
pub struct StringTable {
    bytes: [u8],
}

impl StringTable {
    /// Reinterprets a byte slice as a string table. The contents are not validated here;
    /// validation happens lazily in [`StringTable::get`].
    pub fn from_bytes(bytes: &[u8]) -> &Self {
        let raw = bytes as *const [u8] as *const Self;
        // SAFETY:
        // `StringTable` is `repr(transparent)` over `[u8]`, so both types share the same memory
        // layout, and `raw` was derived from a valid shared reference whose lifetime the
        // returned reference inherits.
        unsafe { &*raw }
    }

    /// Returns the raw bytes backing this table.
    pub fn to_bytes(&self) -> &[u8] {
        &self.bytes
    }

    /// Attempt to retrieve the string stored at the given byte offset in the table. The offset
    /// should be the start of a table entry; any other offset may produce `None` or an
    /// unintended string.
    ///
    /// The table has no reliable way of telling whether an offset really is the start of an
    /// entry, so an invalid offset is not guaranteed to be rejected: the bytes found there may
    /// be misread as a well-formed entry. This can never cause unsoundness, which is why the
    /// function is not `unsafe`; the returned string is always checked to be valid UTF-8.
    ///
    /// Returns `None` if `index` is past the end of the table, if the length byte at `index`
    /// claims more bytes than the table holds, or if the entry is not valid UTF-8.
    pub fn get(&self, index: u32) -> Option<&str> {
        let offset = usize::try_from(index).ok()?;
        // Peel the length byte off the front of whatever remains from `offset` onwards; an
        // offset at or beyond the end yields `None` here.
        let (&len, tail) = self.bytes.get(offset..)?.split_first()?;
        let entry = tail.get(..usize::from(len))?;
        str::from_utf8(entry).ok()
    }
}
/// The owned counterpart of `StringTable`. Holds a collection of strings laid out contiguously,
/// where every string is preceded by a single byte giving its length in bytes.
pub struct StringTableBuf {
    buf: Vec<u8>,
}

impl StringTableBuf {
    /// Creates an empty table.
    pub fn new() -> Self {
        StringTableBuf { buf: Vec::new() }
    }

    /// Append the given string to the end of the table and return the byte offset at which its
    /// entry begins. That offset can later be handed to `StringTable::get` to read the string
    /// back.
    ///
    /// # Errors
    ///
    /// - `StringTableBufError::StringTooLong` if the byte length of `s` does not fit in a `u8`.
    /// - `StringTableBufError::OutOfCapacity` if the current table length does not fit in a
    ///   `u32`, or if reserving space for the new entry fails.
    pub fn push(&mut self, s: &str) -> Result<u32, StringTableBufError> {
        let payload = s.as_bytes();

        let len = match u8::try_from(payload.len()) {
            Ok(len) => len,
            Err(_) => return Err(StringTableBufError::StringTooLong),
        };

        // The entry will start at the current end of the buffer.
        let index = match u32::try_from(self.buf.len()) {
            Ok(index) => index,
            Err(_) => return Err(StringTableBufError::OutOfCapacity),
        };

        // One extra byte for the length prefix.
        if self.buf.try_reserve(payload.len() + 1).is_err() {
            return Err(StringTableBufError::OutOfCapacity);
        }

        self.buf.push(len);
        self.buf.extend_from_slice(payload);
        Ok(index)
    }
}
/// Lets a `StringTableBuf` be passed wherever a borrowed `StringTable` is accepted.
impl AsRef<StringTable> for StringTableBuf {
    fn as_ref(&self) -> &StringTable {
        // Explicitly go through the `Deref` impl rather than relying on deref coercion.
        &**self
    }
}
/// Dereferences the owned buffer to a `StringTable` view over its bytes.
impl Deref for StringTableBuf {
    type Target = StringTable;

    fn deref(&self) -> &Self::Target {
        StringTable::from_bytes(self.buf.as_slice())
    }
}
/// Reasons why appending a string to a `StringTableBuf` can fail.
#[derive(Debug)]
pub enum StringTableBufError {
    /// The string's byte length cannot be stored in the one-byte length prefix.
    StringTooLong,
    /// The table cannot grow any further.
    OutOfCapacity,
}

impl fmt::Display for StringTableBufError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match *self {
            StringTableBufError::StringTooLong => "string too long to add to table",
            StringTableBufError::OutOfCapacity => "string table out of capacity",
        };
        f.write_str(message)
    }
}

impl error::Error for StringTableBufError {}

@ -4,8 +4,8 @@ version = "0.1.0"
edition = "2021"
[dependencies]
utfdump_core = { path = "../utfdump_core" }
utfdump_core = { path = "../core" }
once_cell = "1.17.1"
[build-dependencies]
utfdump_core = { path = "../utfdump_core" }
utfdump_core = { path = "../core" }

@ -1,6 +1,6 @@
use std::{env, fs::File, io::{BufReader, BufRead, Write}, path::Path};
use utfdump_core::{chardata::CharData, encoded::DataBuf};
use utfdump_core::{CharData, DataStoreBuf};
const UNICODE_DATA_PATH: &str = "unicode_data_latest.txt";
const OUT_DATA_PATH: &str = "unicode_data_encoded";
@ -16,7 +16,7 @@ fn main() {
let buf_reader = BufReader::new(data_file);
let mut data = DataBuf::new();
let mut data = DataStoreBuf::new();
let mut start_codepoint = None;
for line in buf_reader.lines() {

@ -1,16 +1,16 @@
pub mod utf8;
pub use utfdump_core::chardata::{CharData, Category, CombiningClass};
pub use utfdump_core::{CharData, Category, CombiningClass};
use once_cell::sync::Lazy;
use utfdump_core::encoded::Data;
use utfdump_core::data_store::DataStore;
const UNICODE_DATA_BYTES: &[u8] = include_bytes!(
concat!(env!("OUT_DIR"), "/unicode_data_encoded")
);
static UNICODE_DATA: Lazy<Data> = Lazy::new(|| {
Data::from_bytes(UNICODE_DATA_BYTES).unwrap()
static UNICODE_DATA: Lazy<DataStore> = Lazy::new(|| {
DataStore::from_bytes(UNICODE_DATA_BYTES).unwrap()
});
pub fn char_data(c: char) -> Option<CharData<'static>> {

@ -1,2 +0,0 @@
pub mod chardata;
pub mod encoded;
Loading…
Cancel
Save