Initial commit
commit
172d1a14fe
@ -0,0 +1,2 @@
|
||||
/target
|
||||
unicode_data_latest.txt
|
||||
@ -0,0 +1,154 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "bytecount"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
|
||||
|
||||
[[package]]
|
||||
name = "libshire"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/pantonshire/libshire?branch=main#7253d950108c729141239f4add4b3df67a54db31"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "papergrid"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "453cf71f2a37af495a1a124bf30d4d7469cfbea58e9f2479be9d222396a518a2"
|
||||
dependencies = [
|
||||
"bytecount",
|
||||
"fnv",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||
dependencies = [
|
||||
"proc-macro-error-attr",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error-attr"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.43"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.144"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tabled"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5b2f8c37d26d87d2252187b0a45ea3cbf42baca10377c7e7eaaa2800fa9bf97"
|
||||
dependencies = [
|
||||
"papergrid",
|
||||
"tabled_derive",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tabled_derive"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9ee618502f497abf593e1c5c9577f34775b111480009ffccd7ad70d23fcaba8"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
|
||||
|
||||
[[package]]
|
||||
name = "utfdump"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"libshire",
|
||||
"tabled",
|
||||
"utfdump_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utfdump_core"
|
||||
version = "0.1.0"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||
@ -0,0 +1,5 @@
|
||||
[workspace]
|
||||
members = [
|
||||
"utfdump_core",
|
||||
"utfdump_bin",
|
||||
]
|
||||
@ -0,0 +1,2 @@
|
||||
#!/bin/bash
# Downloads the latest UnicodeData.txt from unicode.org into
# utfdump_bin/unicode_data_latest.txt (path is relative to the current directory).

# Abort on the first failing command, on unset variables and on pipeline failures.
set -euo pipefail

readonly url='https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt'
readonly dest='utfdump_bin/unicode_data_latest.txt'

# --fail makes curl exit non-zero on HTTP errors instead of saving the error page.
# The download goes to a temporary file first so that a failed transfer never
# replaces an existing good copy.
curl --fail --proto '=https' --tlsv1.2 --output "${dest}.tmp" "${url}"
mv -- "${dest}.tmp" "${dest}"
|
||||
@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "utfdump"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
utfdump_core = { path = "../utfdump_core" }
|
||||
libshire = { git = "https://github.com/pantonshire/libshire", branch = "main" }
|
||||
tabled = "0.8.0"
|
||||
|
||||
[build-dependencies]
|
||||
utfdump_core = { path = "../utfdump_core" }
|
||||
@ -0,0 +1,71 @@
|
||||
use std::{env, fs::File, io::{BufReader, BufRead, Write}, path::Path};
|
||||
|
||||
use utfdump_core::{chardata::CharData, encoded::DataBuf};
|
||||
|
||||
/// Path of the raw Unicode data file that the build script opens and reports to cargo through
/// `cargo:rerun-if-changed`.
const UNICODE_DATA_PATH: &str = "unicode_data_latest.txt";

/// File name, inside `OUT_DIR`, of the encoded table written by the build script.
const OUT_DATA_PATH: &str = "unicode_data_encoded";
|
||||
|
||||
fn main() {
|
||||
println!("cargo:rerun-if-changed={}", UNICODE_DATA_PATH);
|
||||
|
||||
let out_dir = env::var_os("OUT_DIR").unwrap();
|
||||
let out_path = Path::new(&out_dir).join(OUT_DATA_PATH);
|
||||
|
||||
let data_file = File::open(UNICODE_DATA_PATH)
|
||||
.expect("failed to open unicode data file");
|
||||
|
||||
let buf_reader = BufReader::new(data_file);
|
||||
|
||||
let mut data = DataBuf::new();
|
||||
let mut start_codepoint = None;
|
||||
|
||||
for line in buf_reader.lines() {
|
||||
let line = line.unwrap();
|
||||
let (codepoint, char_data) = CharData::from_row(&line).unwrap();
|
||||
|
||||
match start_codepoint {
|
||||
Some(start_codepoint_inner) => {
|
||||
let prefix = char_data.name()
|
||||
.strip_suffix(", Last>")
|
||||
.expect("expected end of codepoint block");
|
||||
|
||||
let name = {
|
||||
let mut buf = String::with_capacity(prefix.len() + 1);
|
||||
buf.push_str(prefix);
|
||||
buf.push('>');
|
||||
buf
|
||||
};
|
||||
|
||||
let char_data = char_data.with_name(&name);
|
||||
|
||||
data.insert(char_data, start_codepoint_inner..(codepoint + 1))
|
||||
.unwrap();
|
||||
|
||||
start_codepoint = None;
|
||||
},
|
||||
|
||||
None => {
|
||||
if char_data.name().ends_with(", First>") {
|
||||
start_codepoint = Some(codepoint);
|
||||
} else {
|
||||
data.insert(char_data, codepoint..(codepoint + 1))
|
||||
.unwrap();
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let (strings_len, [strings, data]) = data
|
||||
.as_ref_type()
|
||||
.to_bytes()
|
||||
.unwrap();
|
||||
|
||||
let mut out_file = File::create(&out_path)
|
||||
.expect("failed to open output file");
|
||||
|
||||
out_file.write_all(&strings_len).unwrap();
|
||||
out_file.write_all(strings).unwrap();
|
||||
out_file.write_all(data).unwrap();
|
||||
|
||||
drop(out_file);
|
||||
}
|
||||
@ -0,0 +1,126 @@
|
||||
use std::{fmt, io::{self, Read}};
|
||||
|
||||
use libshire::strings::CappedString;
|
||||
use tabled::{Tabled, Table, Style};
|
||||
|
||||
use utfdump_core::{chardata::Category, encoded::Data};
|
||||
|
||||
/// Encoded Unicode character table, embedded at compile time from the `unicode_data_encoded`
/// file found in `OUT_DIR`.
const UNICODE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/unicode_data_encoded"));
|
||||
|
||||
/// Reads all of standard input as UTF-8 and prints a table with one row per character, showing
/// the character, its codepoint, its UTF-8 bytes, and the name, category and combining class
/// looked up in the embedded Unicode table (`??` where the table has no entry).
///
/// # Panics
///
/// Panics if the embedded table cannot be decoded, if stdin cannot be read, or if the input is
/// not valid UTF-8.
fn main() {
    let data = Data::<'static>::from_bytes(UNICODE_DATA).unwrap();

    let mut raw_input = Vec::<u8>::new();
    {
        let stdin = io::stdin();
        stdin
            .lock()
            .read_to_end(&mut raw_input)
            .expect("failed to read stdin");
    }

    // TODO: just skip over invalid utf-8 characters
    let input = String::from_utf8(raw_input).expect("invalid utf-8");

    let rows = input.chars().map(|c| {
        // Look the character up once; everything stays unknown when the table has no entry.
        let (name, category, char_combining_class, combining) = match data.get(u32::from(c)) {
            Some(char_data) => {
                let ccc = char_data.ccc();
                (
                    Optional::Some(char_data.name()),
                    Optional::Some(char_data.category()),
                    Optional::Some(ccc),
                    ccc != 0,
                )
            }
            None => (Optional::None, Optional::None, Optional::None, false),
        };

        let mut display_char = CappedString::empty();
        if combining {
            // Characters with a non-zero combining class are shown after U+25CC.
            display_char.push_truncating('\u{25cc}');
        }
        display_char.push_truncating(c);

        OutRow {
            display_char,
            codepoint: Codepoint(c),
            utf_8_bytes: Utf8Bytes(c),
            name,
            category,
            char_combining_class,
        }
    });

    let table = Table::new(rows).with(Style::modern());

    println!("{}", table);
}
|
||||
|
||||
// One row of the printed table; each field becomes a column whose header is the `rename` value.
#[derive(Tabled)]
struct OutRow {
    // The character as displayed (column without a header).
    #[tabled(rename = "")]
    display_char: CappedString<8>,

    // Codepoint of the character.
    #[tabled(rename = "Codepoint")]
    codepoint: Codepoint,

    // UTF-8 encoding of the character.
    #[tabled(rename = "UTF-8")]
    utf_8_bytes: Utf8Bytes,

    // Character name, when known.
    #[tabled(rename = "Name")]
    name: Optional<&'static str>,

    // Character category, when known.
    #[tabled(rename = "Category")]
    category: Optional<Category>,

    // Combining class value, when known.
    #[tabled(rename = "Combining")]
    char_combining_class: Optional<u8>,
}
|
||||
|
||||
/// A value that may be absent; unlike `Option`, it implements `Display` and renders the absent
/// case as `??`.
#[derive(Debug)]
enum Optional<T> {
    /// A present value.
    Some(T),
    /// No value.
    None,
}

impl<T: fmt::Display> fmt::Display for Optional<T> {
    /// Formats the inner value with its own `Display` implementation, or writes `??` when there
    /// is none.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Self::Some(inner) = self {
            return fmt::Display::fmt(inner, f);
        }
        f.write_str("??")
    }
}
|
||||
|
||||
/// Wrapper that displays a character as `U+` followed by its scalar value in lowercase
/// hexadecimal, zero-padded to at least four digits.
#[derive(Debug)]
struct Codepoint(char);

impl fmt::Display for Codepoint {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let scalar = u32::from(self.0);
        write!(f, "U+{:04x}", scalar)
    }
}
|
||||
|
||||
/// Wrapper that displays the UTF-8 encoding of a character as space-separated bytes, each
/// written as `0x` followed by two lowercase hexadecimal digits.
#[derive(Debug)]
struct Utf8Bytes(char);

impl fmt::Display for Utf8Bytes {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // A char encodes to at most four UTF-8 bytes.
        let mut scratch = [0u8; 4];
        let encoded = self.0.encode_utf8(&mut scratch);

        for (position, byte) in encoded.bytes().enumerate() {
            // Separator goes between bytes, never before the first one.
            if position > 0 {
                f.write_str(" ")?;
            }
            write!(f, "0x{:02x}", byte)?;
        }

        Ok(())
    }
}
|
||||
@ -0,0 +1,6 @@
|
||||
[package]
|
||||
name = "utfdump_core"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
@ -0,0 +1,237 @@
|
||||
use std::fmt;
|
||||
|
||||
/// Properties of one character: its name, its category and its combining class value.
///
/// The name is borrowed, so a `CharData` can never outlive the text its name points into.
#[derive(Clone, Debug)]
pub struct CharData<'a> {
    name: &'a str,
    category: Category,
    ccc: u8,
}

impl<'a> CharData<'a> {
    /// Parses one semicolon-separated row.
    ///
    /// Only the first four fields are read, in this order: the codepoint in hexadecimal, the
    /// name, the category abbreviation, and the combining class in decimal. Any further fields
    /// are ignored.
    ///
    /// Returns the codepoint together with the parsed data, or `None` when one of the four
    /// fields is missing or cannot be parsed.
    pub fn from_row(row: &'a str) -> Option<(u32, Self)> {
        let mut fields = row.split(';');

        let codepoint = u32::from_str_radix(fields.next()?, 16).ok()?;
        let name = fields.next()?;
        let category = Category::from_abbr(fields.next()?)?;
        let ccc = fields.next()?.parse::<u8>().ok()?;

        Some((codepoint, Self::from_parts(name, category, ccc)))
    }

    /// Builds a `CharData` directly from its three components.
    pub fn from_parts(name: &'a str, category: Category, ccc: u8) -> Self {
        Self { name, category, ccc }
    }

    /// Returns the same data under a different name.
    ///
    /// The result borrows only from `name`, so it may outlive the text the previous name pointed
    /// into.
    pub fn with_name<'b>(self, name: &'b str) -> CharData<'b> {
        CharData {
            name,
            category: self.category,
            ccc: self.ccc,
        }
    }

    /// The character's name.
    pub fn name(&self) -> &'a str {
        self.name
    }

    /// The character's category.
    pub fn category(&self) -> Category {
        self.category
    }

    /// The character's combining class value.
    pub fn ccc(&self) -> u8 {
        self.ccc
    }
}
|
||||
|
||||
/// Character category, identified by a two-letter abbreviation.
///
/// The explicit discriminants are the single-byte representation returned by
/// [`Category::byte_repr`] and accepted by [`Category::from_byte`].
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(u8)]
pub enum Category {
    Lu = 0,
    Ll = 1,
    Lt = 2,
    Mn = 3,
    Mc = 4,
    Me = 5,
    Nd = 6,
    Nl = 7,
    No = 8,
    Zs = 9,
    Zl = 10,
    Zp = 11,
    Cc = 12,
    Cf = 13,
    Cs = 14,
    Co = 15,
    Cn = 16,
    Lm = 17,
    Lo = 18,
    Pc = 19,
    Pd = 20,
    Ps = 21,
    Pe = 22,
    Pi = 23,
    Pf = 24,
    Po = 25,
    Sm = 26,
    Sc = 27,
    Sk = 28,
    So = 29,
}

impl Category {
    /// Every category, stored at the index equal to its byte representation.
    ///
    /// `from_byte` and `from_abbr` are both derived from this list, so the byte, abbreviation
    /// and variant mappings cannot drift apart.
    const ALL: [Self; 30] = [
        Self::Lu, Self::Ll, Self::Lt, Self::Mn, Self::Mc, Self::Me,
        Self::Nd, Self::Nl, Self::No, Self::Zs, Self::Zl, Self::Zp,
        Self::Cc, Self::Cf, Self::Cs, Self::Co, Self::Cn, Self::Lm,
        Self::Lo, Self::Pc, Self::Pd, Self::Ps, Self::Pe, Self::Pi,
        Self::Pf, Self::Po, Self::Sm, Self::Sc, Self::Sk, Self::So,
    ];

    /// Converts a byte produced by [`Category::byte_repr`] back into a category.
    ///
    /// Returns `None` for bytes that do not correspond to any category.
    pub fn from_byte(b: u8) -> Option<Self> {
        Self::ALL.get(usize::from(b)).copied()
    }

    /// The single-byte representation of this category (its discriminant).
    pub fn byte_repr(self) -> u8 {
        self as u8
    }

    /// Parses a two-letter abbreviation such as `"Lu"`.
    ///
    /// The match is exact and case-sensitive; unknown abbreviations yield `None`.
    pub fn from_abbr(s: &str) -> Option<Self> {
        Self::ALL
            .iter()
            .copied()
            .find(|category| category.abbr() == s)
    }

    /// The two-letter abbreviation of this category.
    pub fn abbr(self) -> &'static str {
        match self {
            Self::Lu => "Lu",
            Self::Ll => "Ll",
            Self::Lt => "Lt",
            Self::Mn => "Mn",
            Self::Mc => "Mc",
            Self::Me => "Me",
            Self::Nd => "Nd",
            Self::Nl => "Nl",
            Self::No => "No",
            Self::Zs => "Zs",
            Self::Zl => "Zl",
            Self::Zp => "Zp",
            Self::Cc => "Cc",
            Self::Cf => "Cf",
            Self::Cs => "Cs",
            Self::Co => "Co",
            Self::Cn => "Cn",
            Self::Lm => "Lm",
            Self::Lo => "Lo",
            Self::Pc => "Pc",
            Self::Pd => "Pd",
            Self::Ps => "Ps",
            Self::Pe => "Pe",
            Self::Pi => "Pi",
            Self::Pf => "Pf",
            Self::Po => "Po",
            Self::Sm => "Sm",
            Self::Sc => "Sc",
            Self::Sk => "Sk",
            Self::So => "So",
        }
    }

    /// The descriptive name of this category, always in the form `"Group, Detail"`.
    pub fn full_name(self) -> &'static str {
        match self {
            Self::Lu => "Letter, Uppercase",
            Self::Ll => "Letter, Lowercase",
            Self::Lt => "Letter, Titlecase",
            Self::Mn => "Mark, Non-Spacing",
            Self::Mc => "Mark, Spacing Combining",
            Self::Me => "Mark, Enclosing",
            Self::Nd => "Number, Decimal Digit",
            Self::Nl => "Number, Letter",
            Self::No => "Number, Other",
            Self::Zs => "Separator, Space",
            Self::Zl => "Separator, Line",
            Self::Zp => "Separator, Paragraph",
            Self::Cc => "Other, Control",
            Self::Cf => "Other, Format",
            Self::Cs => "Other, Surrogate",
            Self::Co => "Other, Private Use",
            Self::Cn => "Other, Not Assigned",
            Self::Lm => "Letter, Modifier",
            Self::Lo => "Letter, Other",
            Self::Pc => "Punctuation, Connector",
            Self::Pd => "Punctuation, Dash",
            Self::Ps => "Punctuation, Open",
            Self::Pe => "Punctuation, Close",
            Self::Pi => "Punctuation, Initial Quote",
            Self::Pf => "Punctuation, Final Quote",
            Self::Po => "Punctuation, Other",
            Self::Sm => "Symbol, Math",
            Self::Sc => "Symbol, Currency",
            Self::Sk => "Symbol, Modifier",
            Self::So => "Symbol, Other",
        }
    }
}

impl fmt::Display for Category {
    /// Writes the abbreviation, a colon and a space, then the descriptive name,
    /// e.g. `Lu: Letter, Uppercase`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}: {}", self.abbr(), self.full_name())
    }
}
|
||||
|
||||
@ -0,0 +1,245 @@
|
||||
use std::{collections::{HashMap, hash_map}, error, fmt, str, ops::Range};
|
||||
|
||||
use crate::chardata::{CharData, Category};
|
||||
|
||||
/// Size in bytes of one encoded character entry.
const DATA_ENTRY_SIZE: usize = 8;

/// Flag bit in the first byte of an entry, set on every entry produced by `encode_char_data`.
/// Entries without it are treated as absent by `decode_char_data`.
const DATA_INIT_FLAG: u8 = 1;

/// Flag bit in the first byte of an entry, set when the entry was encoded with `repeated = true`.
const DATA_REPEATED_FLAG: u8 = 2;
|
||||
|
||||
/// Packs one character entry into its fixed-size byte form.
///
/// Layout: byte 0 holds the flags (`DATA_INIT_FLAG` always, `DATA_REPEATED_FLAG` when `repeated`
/// is true), bytes 1..5 hold `name_index` in little-endian order, byte 5 holds the category byte,
/// byte 6 holds `ccc`, and byte 7 is left as zero.
fn encode_char_data(name_index: u32, category: Category, ccc: u8, repeated: bool) -> [u8; DATA_ENTRY_SIZE] {
    let flags = if repeated {
        DATA_INIT_FLAG | DATA_REPEATED_FLAG
    } else {
        DATA_INIT_FLAG
    };

    let [n0, n1, n2, n3] = name_index.to_le_bytes();

    [flags, n0, n1, n2, n3, category.byte_repr(), ccc, 0]
}
|
||||
|
||||
/// Unpacks one fixed-size entry into `(name_index, category, ccc, repeated)`.
///
/// Returns `None` when the entry does not have `DATA_INIT_FLAG` set or when its category byte is
/// not a valid category. The last byte of the entry is ignored.
fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, u8, bool)> {
    let [flags, n0, n1, n2, n3, category_byte, ccc, _unused] = bytes;

    if flags & DATA_INIT_FLAG == 0 {
        return None;
    }

    let category = Category::from_byte(category_byte)?;
    let name_index = u32::from_le_bytes([n0, n1, n2, n3]);
    let repeated = flags & DATA_REPEATED_FLAG != 0;

    Some((name_index, category, ccc, repeated))
}
|
||||
|
||||
/// Owned, growable builder for the encoded character table.
///
/// `data` holds one `DATA_ENTRY_SIZE`-byte entry per codepoint, indexed by codepoint. `strings`
/// holds the character names, and `strings_map` remembers the index of every name already stored
/// so that equal names are stored only once.
pub struct DataBuf {
    data: Vec<u8>,
    strings: StringTableBuf,
    strings_map: HashMap<String, u32>,
}

impl DataBuf {
    /// Creates an empty buffer.
    pub fn new() -> Self {
        let data = Vec::new();
        let strings = StringTableBuf::new();
        let strings_map = HashMap::new();

        Self { data, strings, strings_map }
    }

    /// Borrows the buffer as a read-only [`Data`] view.
    pub fn as_ref_type(&self) -> Data<'_> {
        let strings = self.strings.as_ref_type();

        Data { data: self.data.as_slice(), strings }
    }

    /// Stores `char_data` for every codepoint in `range`, overwriting whatever was stored there.
    ///
    /// An empty range is a no-op. Entries covering more than one codepoint are marked as
    /// repeated. Slots between the previous end of the table and the start of `range` are
    /// zero-filled, which leaves them without the init flag.
    ///
    /// # Errors
    ///
    /// Returns `DataBufError::DataOutOfCapacity` when the byte offsets overflow or the table
    /// cannot grow, and `DataBufError::StringTable` when the name cannot be stored.
    pub fn insert(&mut self, char_data: CharData, range: Range<u32>) -> Result<(), DataBufError> {
        let Range { start: first, end: past_last } = range;

        if first >= past_last {
            return Ok(());
        }

        let repeated = past_last - first > 1;

        // Converts a codepoint into the byte offset of its entry.
        let to_byte_offset = |codepoint: u32| -> Result<usize, DataBufError> {
            usize::try_from(codepoint)
                .ok()
                .and_then(|index| index.checked_mul(DATA_ENTRY_SIZE))
                .ok_or(DataBufError::DataOutOfCapacity)
        };

        let bytes_start = to_byte_offset(first)?;
        let bytes_end = to_byte_offset(past_last)?;

        // Reserve fallibly before touching anything; reserving zero extra bytes is a no-op.
        let shortfall = bytes_end.saturating_sub(self.data.len());
        self.data
            .try_reserve(shortfall)
            .map_err(|_| DataBufError::DataOutOfCapacity)?;

        let name_index = self.add_string(char_data.name().to_owned())?;

        let entry = encode_char_data(
            name_index,
            char_data.category(),
            char_data.ccc(),
            repeated,
        );

        if bytes_end > self.data.len() {
            // Using 0 means that the DATA_INIT_FLAG won't be set, so these won't be valid entries.
            self.data.resize(bytes_end, 0);
        }

        for slot in self.data[bytes_start..bytes_end].chunks_exact_mut(DATA_ENTRY_SIZE) {
            slot.copy_from_slice(&entry);
        }

        Ok(())
    }

    /// Returns the string table index of `name`, storing it first if it has not been seen before.
    fn add_string(&mut self, name: String) -> Result<u32, DataBufError> {
        let vacant = match self.strings_map.entry(name) {
            hash_map::Entry::Occupied(known) => return Ok(*known.get()),
            hash_map::Entry::Vacant(vacant) => vacant,
        };

        let index = self.strings.push(vacant.key())?;
        vacant.insert(index);

        Ok(index)
    }
}
|
||||
|
||||
/// Borrowed, read-only view of the encoded character table: the entry table (`data`, one
/// `DATA_ENTRY_SIZE`-byte entry per codepoint) and the string table holding the names.
#[derive(Clone, Copy)]
pub struct Data<'a> {
    data: &'a [u8],
    strings: StringTable<'a>,
}

impl<'a> Data<'a> {
    /// Looks up the entry stored for `codepoint`.
    ///
    /// Returns `None` when the codepoint lies beyond the end of the table, when its entry cannot
    /// be decoded, or when its name cannot be read from the string table.
    pub fn get(self, codepoint: u32) -> Option<CharData<'a>> {
        let index = usize::try_from(codepoint).ok()?;
        let start = index.checked_mul(DATA_ENTRY_SIZE)?;
        let end = start.checked_add(DATA_ENTRY_SIZE)?;

        let encoded = <[u8; DATA_ENTRY_SIZE]>::try_from(self.data.get(start..end)?).ok()?;
        let (name_index, category, ccc, _repeated) = decode_char_data(encoded)?;
        let name = self.strings.get(name_index)?;

        Some(CharData::from_parts(name, category, ccc))
    }

    /// Splits the view into the pieces of its serialized form: the string table length as four
    /// little-endian bytes, followed by the string table bytes and the entry table bytes.
    ///
    /// Returns `None` when the string table length does not fit in a `u32`.
    pub fn to_bytes(self) -> Option<([u8; 4], [&'a [u8]; 2])> {
        let strings = self.strings.to_bytes();
        let strings_len = u32::try_from(strings.len()).ok()?.to_le_bytes();

        Some((strings_len, [strings, self.data]))
    }

    /// Parses the serialized form: a four-byte little-endian string table length, the string
    /// table, and the entry table filling the rest of `bytes`.
    ///
    /// Returns `None` when `bytes` is too short for the length prefix or for the string table
    /// it announces. The contents of the two tables are not validated here.
    pub fn from_bytes(bytes: &'a [u8]) -> Option<Self> {
        const LEN_PREFIX_SIZE: usize = 4;

        let prefix = <[u8; LEN_PREFIX_SIZE]>::try_from(bytes.get(..LEN_PREFIX_SIZE)?).ok()?;
        let strings_len = usize::try_from(u32::from_le_bytes(prefix)).ok()?;

        // The length comes from the input, so the addition must not be allowed to overflow
        // (it could where `usize` is 32 bits wide).
        let strings_end = LEN_PREFIX_SIZE.checked_add(strings_len)?;

        let strings = StringTable::from_bytes(bytes.get(LEN_PREFIX_SIZE..strings_end)?);
        let data = bytes.get(strings_end..)?;

        Some(Self { data, strings })
    }
}
|
||||
|
||||
/// Errors reported while building a `DataBuf`.
#[derive(Debug)]
pub enum DataBufError {
    /// The entry table cannot address or allocate the requested range.
    DataOutOfCapacity,
    /// The map of already-stored strings is out of capacity.
    StringsMapOutOfCapacity,
    /// The string table rejected a name.
    StringTable(StringTableBufError),
}

impl fmt::Display for DataBufError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            // The wrapped error is appended to a fixed prefix.
            Self::StringTable(err) => return write!(f, "string table error: {}", err),
            Self::DataOutOfCapacity => "data buf out of capacity",
            Self::StringsMapOutOfCapacity => "strings map out of capacity",
        };

        f.write_str(message)
    }
}

impl error::Error for DataBufError {}

impl From<StringTableBufError> for DataBufError {
    /// Wraps a string table error so that `?` can be used on string table operations.
    fn from(err: StringTableBufError) -> Self {
        DataBufError::StringTable(err)
    }
}
|
||||
|
||||
/// Borrowed view of a string table: a byte sequence in which every string is stored as one
/// length byte followed by that many bytes of text.
#[derive(Clone, Copy)]
pub struct StringTable<'a> {
    bytes: &'a [u8],
}

impl<'a> StringTable<'a> {
    /// Wraps raw table bytes without validating them.
    pub fn from_bytes(bytes: &'a [u8]) -> Self {
        StringTable { bytes }
    }

    /// Returns the raw table bytes.
    pub fn to_bytes(self) -> &'a [u8] {
        self.bytes
    }

    /// Reads the string whose length byte sits at byte offset `index`.
    ///
    /// Returns `None` when `index` is past the end of the table, when the announced length runs
    /// past the end of the table, or when the bytes are not valid UTF-8.
    pub fn get(self, index: u32) -> Option<&'a str> {
        let offset = usize::try_from(index).ok()?;

        // The first byte at the offset is the length, the text follows immediately.
        let (&len, rest) = self.bytes.get(offset..)?.split_first()?;
        let text = rest.get(..usize::from(len))?;

        str::from_utf8(text).ok()
    }
}
|
||||
|
||||
/// Owned, growable string table in which every string is stored as one length byte followed by
/// its bytes.
pub struct StringTableBuf {
    buf: Vec<u8>,
}

impl StringTableBuf {
    /// Creates an empty table.
    pub fn new() -> Self {
        StringTableBuf { buf: Vec::new() }
    }

    /// Borrows the table as a read-only [`StringTable`] view.
    pub fn as_ref_type(&self) -> StringTable<'_> {
        StringTable { bytes: self.buf.as_slice() }
    }

    /// Appends `s` to the table and returns the byte offset at which it was stored.
    ///
    /// # Errors
    ///
    /// Returns `StringTableBufError::StringTooLong` when the length of `s` in bytes does not fit
    /// in a `u8`, and `StringTableBufError::OutOfCapacity` when the current table size does not
    /// fit in a `u32` or the table cannot grow.
    pub fn push(&mut self, s: &str) -> Result<u32, StringTableBufError> {
        let text = s.as_bytes();

        let len_byte = match u8::try_from(text.len()) {
            Ok(len_byte) => len_byte,
            Err(_) => return Err(StringTableBufError::StringTooLong),
        };

        let index = match u32::try_from(self.buf.len()) {
            Ok(index) => index,
            Err(_) => return Err(StringTableBufError::OutOfCapacity),
        };

        // One extra byte for the length prefix.
        if self.buf.try_reserve(text.len() + 1).is_err() {
            return Err(StringTableBufError::OutOfCapacity);
        }

        self.buf.push(len_byte);
        self.buf.extend_from_slice(text);

        Ok(index)
    }
}
|
||||
|
||||
/// Errors reported when adding a string to a `StringTableBuf`.
#[derive(Debug)]
pub enum StringTableBufError {
    /// The string's length does not fit in the table's one-byte length prefix.
    StringTooLong,
    /// The table cannot hold any more data.
    OutOfCapacity,
}

impl fmt::Display for StringTableBufError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            Self::StringTooLong => "string too long to add to table",
            Self::OutOfCapacity => "string table out of capacity",
        };

        f.write_str(message)
    }
}

impl error::Error for StringTableBufError {}
|
||||
|
||||
@ -0,0 +1,2 @@
|
||||
/// Character property types.
pub mod chardata;

/// Encoded character table: builder, read-only view and string table.
pub mod encoded;
|
||||
Loading…
Reference in New Issue