work on rust side of new encoded data format

main
pantonshire 3 years ago
parent d9b0c049ab
commit 24abc7ed79

66
Cargo.lock generated

@ -2,6 +2,12 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "atty"
version = "0.2.14"
@ -31,6 +37,12 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "3.2.25"
@ -70,6 +82,25 @@ dependencies = [
"os_str_bytes",
]
[[package]]
name = "crc32fast"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
dependencies = [
"cfg-if",
]
[[package]]
name = "flate2"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "fnv"
version = "1.0.7"
@ -121,11 +152,20 @@ dependencies = [
"serde",
]
[[package]]
name = "miniz_oxide"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
dependencies = [
"adler",
]
[[package]]
name = "once_cell"
version = "1.17.1"
version = "1.17.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b"
[[package]]
name = "os_str_bytes"
@ -170,18 +210,18 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.58"
version = "1.0.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.27"
version = "1.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
dependencies = [
"proc-macro2",
]
@ -233,6 +273,12 @@ dependencies = [
"syn",
]
[[package]]
name = "tap"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "termcolor"
version = "1.2.0"
@ -250,9 +296,9 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
[[package]]
name = "unicode-ident"
version = "1.0.8"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
[[package]]
name = "unicode-width"
@ -264,8 +310,8 @@ checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
name = "utfdump"
version = "0.1.0"
dependencies = [
"once_cell",
"utfdump_core",
"flate2",
"tap",
]
[[package]]

@ -4,8 +4,7 @@ version = "0.1.0"
edition = "2021"
[dependencies]
utfdump_core = { path = "../core" }
once_cell = "1.17.1"
tap = "1.0.1"
[build-dependencies]
utfdump_core = { path = "../core" }
flate2 = "1.0.26"

@ -1,71 +1,20 @@
use std::{env, fs::File, io::{BufReader, BufRead, Write}, path::Path};
use std::{env, fs::File, io, path::Path};
use utfdump_core::{CharData, DataStoreBuf};
const UNICODE_DATA_PATH: &str = "unicode_data_latest.txt";
const COMPRESSED_DATA_PATH: &str = "../unicode_data_encoded.gz";
const OUT_DATA_PATH: &str = "unicode_data_encoded";
fn main() {
println!("cargo:rerun-if-changed={}", UNICODE_DATA_PATH);
fn main() -> io::Result<()> {
println!("cargo:rerun-if-changed={}", COMPRESSED_DATA_PATH);
let out_dir = env::var_os("OUT_DIR").unwrap();
let out_path = Path::new(&out_dir).join(OUT_DATA_PATH);
let out_data_fd = File::create(out_path)?;
let mut decoder = flate2::write::GzDecoder::new(out_data_fd);
let mut compressed_data_fd = File::open(COMPRESSED_DATA_PATH)?;
let data_file = File::open(UNICODE_DATA_PATH)
.expect("failed to open unicode data file");
let buf_reader = BufReader::new(data_file);
let mut data = DataStoreBuf::new();
let mut start_codepoint = None;
for line in buf_reader.lines() {
let line = line.unwrap();
let (codepoint, char_data) = CharData::from_row(&line).unwrap();
match start_codepoint {
Some(start_codepoint_inner) => {
let prefix = char_data.name()
.strip_suffix(", Last>")
.expect("expected end of codepoint block");
let name = {
let mut buf = String::with_capacity(prefix.len() + 1);
buf.push_str(prefix);
buf.push('>');
buf
};
let char_data = char_data.with_name(&name);
data.insert(char_data, start_codepoint_inner..(codepoint + 1))
.unwrap();
start_codepoint = None;
},
None => {
if char_data.name().ends_with(", First>") {
start_codepoint = Some(codepoint);
} else {
data.insert(char_data, codepoint..(codepoint + 1))
.unwrap();
}
},
}
}
let (strings_len, [strings, data]) = data
.as_ref_type()
.to_bytes()
.unwrap();
let mut out_file = File::create(&out_path)
.expect("failed to open output file");
out_file.write_all(&strings_len).unwrap();
out_file.write_all(strings).unwrap();
out_file.write_all(data).unwrap();
io::copy(&mut compressed_data_fd, &mut decoder)?;
decoder.finish()?;
drop(out_file);
Ok(())
}

@ -1,18 +1,36 @@
pub mod unicode_data;
pub mod utf8;
pub use utfdump_core::{CharData, Category, CombiningClass};
// pub use utfdump_core::{CharData, Category, CombiningClass};
use once_cell::sync::Lazy;
use utfdump_core::data_store::DataStore;
// use once_cell::sync::Lazy;
// use utfdump_core::data_store::DataStore;
// const UNICODE_DATA_BYTES: &[u8] = include_bytes!(
// concat!(env!("OUT_DIR"), "/unicode_data_encoded")
// );
// static UNICODE_DATA: Lazy<DataStore> = Lazy::new(|| {
// DataStore::from_bytes(UNICODE_DATA_BYTES).unwrap()
// });
// pub fn char_data(c: char) -> Option<CharData<'static>> {
// UNICODE_DATA.get(c)
// }
const UNICODE_DATA_BYTES: &[u8] = include_bytes!(
concat!(env!("OUT_DIR"), "/unicode_data_encoded")
);
static UNICODE_DATA: Lazy<DataStore> = Lazy::new(|| {
DataStore::from_bytes(UNICODE_DATA_BYTES).unwrap()
});
#[cfg(test)]
mod tests {
use crate::{UNICODE_DATA_BYTES, unicode_data};
#[test]
fn test_encoded_data() {
let data = unicode_data::UnicodeData::from_bytes(UNICODE_DATA_BYTES)
.unwrap();
pub fn char_data(c: char) -> Option<CharData<'static>> {
UNICODE_DATA.get(c)
println!("{:#?}", data.groups());
}
}

@ -0,0 +1,217 @@
use core::{fmt, mem, slice};
use tap::Pipe;
/// Eight-byte signature expected at the very start of an encoded data blob.
/// `UnicodeData::from_bytes` consumes this many leading bytes and fails with
/// `UnicodeDataError::InvalidHeader` when they differ from this value.
const MAGIC_NUMBER: [u8; 8] = *b"UTFDUMP!";
/// Borrowed, zero-copy view over an encoded data blob.
///
/// The blob is laid out as: the magic number, three 4-byte little-endian
/// section lengths (group table, char table, string table), then the three
/// sections themselves in that same order.
#[derive(Clone, Copy)]
pub struct UnicodeData<'a> {
    group_table: GroupTable<'a>,
    char_table: CharTable<'a>,
    string_table: StringTable<'a>,
}

impl<'a> UnicodeData<'a> {
    /// Parses `bs` into views over its three sections without copying.
    ///
    /// # Errors
    ///
    /// - `InvalidHeader` if the leading bytes are not the magic number.
    /// - `InsufficientBytes` if the input ends before a header field or a
    ///   section is complete.
    /// - `OutOfBounds` if a section length does not fit in `usize`.
    /// - `InvalidTableSize` if the group table is not a whole number of entries.
    /// - `LeftoverBytes` if input remains after the string table.
    pub(crate) fn from_bytes(bs: &'a [u8]) -> Result<Self, UnicodeDataError> {
        let mut stream = ByteStream(bs);

        let header = stream.consume(MAGIC_NUMBER.len())?;
        if header != MAGIC_NUMBER {
            return Err(UnicodeDataError::InvalidHeader);
        }

        // All three lengths are stored up front, before any section data.
        let groups_len = stream.consume_4_byte_len()?;
        let chars_len = stream.consume_4_byte_len()?;
        let strings_len = stream.consume_4_byte_len()?;

        let group_table = GroupTable::new(stream.consume(groups_len)?)?;
        let char_table = CharTable::new(stream.consume(chars_len)?);
        let string_table = StringTable::new(stream.consume(strings_len)?);

        // Anything left at this point is not accounted for by the header.
        stream.check_empty()?;

        Ok(UnicodeData {
            group_table,
            char_table,
            string_table,
        })
    }

    /// Returns the group table view.
    pub(crate) fn groups(self) -> GroupTable<'a> {
        let UnicodeData { group_table, .. } = self;
        group_table
    }
}
/// Borrowed view over the group table section, exposed as a slice of
/// fixed-size [`GroupTableEntry`] records.
#[derive(Clone, Copy, Debug)]
pub(crate) struct GroupTable<'a> {
    entries: &'a [GroupTableEntry],
}

impl<'a> GroupTable<'a> {
    /// Reinterprets `bs` in place as a sequence of [`GroupTableEntry`] records.
    ///
    /// # Errors
    ///
    /// Returns `UnicodeDataError::InvalidTableSize` when `bs.len()` is not a
    /// whole multiple of `GroupTableEntry::SIZE`.
    fn new(bs: &'a [u8]) -> Result<Self, UnicodeDataError> {
        let byte_len = bs.len();
        let (entry_count, leftover) = (
            byte_len / GroupTableEntry::SIZE,
            byte_len % GroupTableEntry::SIZE,
        );

        if leftover != 0 {
            return Err(UnicodeDataError::InvalidTableSize);
        }

        let first_entry = bs.as_ptr().cast::<GroupTableEntry>();

        // SAFETY:
        // - Readable range: `entry_count` is `bs.len() / size_of::<GroupTableEntry>()`, so
        //   `entry_count * size_of::<GroupTableEntry>()` never exceeds `bs.len()`, and a
        //   pointer derived from `bs` is valid for reads of up to `bs.len()` bytes.
        //
        // - Alignment: `GroupTableEntry` is packed and therefore has alignment 1, the same
        //   as `u8`, so any pointer into `bs` is suitably aligned.
        //
        // - Initialisation: `bs` holds initialised bytes, and `GroupTableEntry` is made up
        //   solely of `u8` arrays of various lengths, for which every bit pattern is valid.
        //
        // - Aliasing: the pointer comes from the shared reference `bs`, so safe code cannot
        //   mutate the underlying bytes for the duration of `'a`.
        //
        // - Size: the resulting slice is no larger than `bs`, which is itself a valid slice
        //   and hence no larger than `isize::MAX` bytes.
        let entries = unsafe { slice::from_raw_parts(first_entry, entry_count) };

        Ok(GroupTable { entries })
    }
}
/// One fixed-size record of the group table, read directly from the encoded bytes.
///
/// The layout is `repr(C, packed)` and every field is a plain byte array (or a
/// single byte), so the struct has alignment 1 and occupies 4 + 4 + 4 + 1 = 13
/// bytes with no padding. `GroupTable::new` relies on this when it casts a byte
/// slice to a slice of entries, so field order and types must not change
/// without updating the encoder as well.
#[derive(Debug)]
#[repr(C, packed)]
struct GroupTableEntry {
// Little-endian 32-bit value; presumably the first code point of the group — TODO confirm.
start: U32Le,
// Little-endian 32-bit value; presumably the end of the group's range
// (inclusive vs. exclusive is not visible here) — TODO confirm.
end: U32Le,
// NOTE(review): looks like a running total of lengths of preceding groups — confirm
// against the encoder.
total_len_before: U32Le,
// Single-byte discriminant; the meaning of its values is not defined in this file.
kind: u8,
}
impl GroupTableEntry {
/// Size in bytes of one encoded entry; the group table section length must
/// be a multiple of this.
const SIZE: usize = mem::size_of::<Self>();
}
/// Borrowed view over the raw bytes of the char table section.
#[derive(Clone, Copy)]
struct CharTable<'a> {
    inner: &'a [u8],
}

impl<'a> CharTable<'a> {
    /// Wraps `bs` as-is; no validation or copying is performed.
    fn new(bs: &'a [u8]) -> Self {
        CharTable { inner: bs }
    }
}
/// Borrowed view over the raw bytes of the string table section.
#[derive(Clone, Copy)]
struct StringTable<'a> {
    inner: &'a [u8],
}

impl<'a> StringTable<'a> {
    /// Wraps `bs` as-is; no validation or copying is performed.
    fn new(bs: &'a [u8]) -> Self {
        StringTable { inner: bs }
    }
}
/// A 16-bit unsigned integer stored as two little-endian bytes.
///
/// Being a transparent wrapper around a byte array, it has alignment 1.
#[derive(Clone, Copy)]
#[repr(transparent)]
struct U16Le([u8; 2]);

impl U16Le {
    /// Decodes the stored bytes into a native `u16`.
    fn to_u16(self) -> u16 {
        let Self(bytes) = self;
        u16::from_le_bytes(bytes)
    }
}

impl fmt::Debug for U16Le {
    /// Formats as the decoded integer, honouring the formatter's flags.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let value = self.to_u16();
        <u16 as fmt::Debug>::fmt(&value, f)
    }
}
/// A 24-bit unsigned integer stored as three little-endian bytes.
///
/// Being a transparent wrapper around a byte array, it has alignment 1.
#[derive(Clone, Copy)]
#[repr(transparent)]
struct U24Le([u8; 3]);

impl U24Le {
    /// Decodes the stored bytes into a native `u32`, zero-extending the
    /// missing most-significant byte.
    fn to_u32(self) -> u32 {
        let [low, mid, high] = self.0;
        u32::from_le_bytes([low, mid, high, 0])
    }
}

impl fmt::Debug for U24Le {
    /// Formats as the decoded integer, honouring the formatter's flags.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let value = self.to_u32();
        <u32 as fmt::Debug>::fmt(&value, f)
    }
}
/// A 32-bit unsigned integer stored as four little-endian bytes.
///
/// Being a transparent wrapper around a byte array, it has alignment 1, which
/// lets it appear inside packed records read straight from a byte buffer.
#[derive(Clone, Copy)]
#[repr(transparent)]
struct U32Le([u8; 4]);

impl U32Le {
    /// Decodes the stored bytes into a native `u32`.
    fn to_u32(self) -> u32 {
        let Self(bytes) = self;
        u32::from_le_bytes(bytes)
    }
}

impl fmt::Debug for U32Le {
    /// Formats as the decoded integer, honouring the formatter's flags.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let value = self.to_u32();
        <u32 as fmt::Debug>::fmt(&value, f)
    }
}
struct ByteStream<'a>(&'a [u8]);
impl<'a> ByteStream<'a> {
fn consume(&mut self, n: usize) -> Result<&'a [u8], UnicodeDataError> {
if n > self.0.len() {
return Err(UnicodeDataError::InsufficientBytes);
}
let consumed = &self.0[..n];
self.0 = &self.0[n..];
Ok(consumed)
}
fn consume_4_byte_len(&mut self) -> Result<usize, UnicodeDataError> {
self.consume(4)?
.pipe(<[u8; 4]>::try_from)
.unwrap()
.pipe(u32::from_le_bytes)
.pipe(usize::try_from)
.map_err(|_| UnicodeDataError::OutOfBounds)
}
fn check_empty(&self) -> Result<(), UnicodeDataError> {
self.0
.is_empty()
.then_some(())
.ok_or(UnicodeDataError::LeftoverBytes)
}
}
/// Reasons an encoded data blob can be rejected while parsing.
///
/// The type is a set of field-less variants, so it is cheap to copy and can be
/// compared directly (e.g. `assert_eq!(err, UnicodeDataError::InvalidHeader)`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnicodeDataError {
    /// The blob does not start with the expected magic number.
    InvalidHeader,
    /// The blob ended before a requested number of bytes could be read.
    InsufficientBytes,
    /// A stored length or index could not be represented or used.
    OutOfBounds,
    /// Bytes remained after the last expected section.
    LeftoverBytes,
    /// A table's byte length is not a whole multiple of its entry size.
    InvalidTableSize,
}

impl fmt::Display for UnicodeDataError {
    /// Writes a short, lower-case, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The messages are constant, so select one and write it directly
        // rather than going through the formatting machinery per arm.
        let message = match self {
            Self::InvalidHeader => "invalid header",
            Self::InsufficientBytes => "fewer bytes than expected",
            Self::OutOfBounds => "index out of bounds",
            Self::LeftoverBytes => "unexpected bytes found after expected end of data",
            Self::InvalidTableSize => "invalid table size",
        };
        f.write_str(message)
    }
}

@ -1,4 +1,4 @@
use std::iter::Peekable;
use core::iter::Peekable;
pub trait ToByte {
fn to_byte(self) -> u8;
@ -186,7 +186,7 @@ impl Utf8Error {
#[cfg(test)]
mod tests {
use std::char::REPLACEMENT_CHARACTER;
use core::char::REPLACEMENT_CHARACTER;
use super::Utf8Decode;

Loading…
Cancel
Save