diff --git a/bin/src/main.rs b/bin/src/main.rs index d86f2ab..c32045b 100644 --- a/bin/src/main.rs +++ b/bin/src/main.rs @@ -3,7 +3,7 @@ use std::{fmt, io::{self, Read}}; use clap::Parser; use libshire::strings::CappedString; use tabled::{Tabled, Table, Style}; -use utfdump::{char_data, CombiningClass, Category, utf8::{Utf8Decode, Utf8Error}}; +use utfdump::{CombiningClass, Category, utf8::{Utf8Decode, Utf8Error}, StaticUnicodeData}; #[derive(Parser)] #[clap(author, version, about, long_about = None)] @@ -14,6 +14,8 @@ struct Args { } fn main() { + let unicode_data = StaticUnicodeData::new().expect("embedded Unicode data table should always decode"); + let args = Args::parse(); let input = { @@ -27,7 +29,7 @@ fn main() { let rows = input .decode_utf8() - .map(|c| OutRow::from_char_result(c, args.full_category_names)); + .map(|c| OutRow::from_char_result(&unicode_data, c, args.full_category_names)); let table = Table::new(rows) .with(Style::modern()); @@ -48,25 +50,35 @@ struct OutRow { #[tabled(rename = "Category")] category: Optional, #[tabled(rename = "Combining")] - char_combining_class: Optional, + char_combining_class: Optional, } impl OutRow { - fn from_char_result(c: Result, full_category_names: bool) -> Self { + fn from_char_result( + unicode_data: &StaticUnicodeData, + c: Result, + full_category_names: bool + ) -> Self + { match c { - Ok(c) => Self::from_good_char(c, full_category_names), + Ok(c) => Self::from_good_char(unicode_data, c, full_category_names), Err(err) => Self::from_bad_char(err), } } - fn from_good_char(c: char, full_category_names: bool) -> Self { + fn from_good_char( + unicode_data: &StaticUnicodeData, + c: char, + full_category_names: bool + ) -> Self + { let mut name = Optional::None; let mut category = Optional::None; let mut char_combining_class = Optional::None; let mut combining = false; - if let Some(char_data) = char_data(c) { + if let Some(char_data) = unicode_data.get(u32::from(c)) { name = Optional::Some(char_data.name()); category = Optional::Some(DisplayCategory { category: 
char_data.category(), @@ -74,7 +86,7 @@ impl OutRow { }); let ccc = char_data.combining_class(); - char_combining_class = Optional::Some(ccc); + char_combining_class = Optional::Some(DisplayCombiningClass { ccc }); combining = ccc.is_combining(); } @@ -184,8 +196,20 @@ impl fmt::Display for DisplayCategory { if self.full_name { write!(f, "{}", self.category.full_name()) } else { - write!(f, "{}", self.category.abbr()) + write!(f, "{}", self.category.abbreviation()) } } } +struct DisplayCombiningClass { + ccc: CombiningClass, +} + +impl fmt::Display for DisplayCombiningClass { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.ccc.name() { + Some(name) => write!(f, "{}", name), + None => write!(f, "{}", self.ccc.0), + } + } +} diff --git a/lib/src/character.rs b/lib/src/character.rs index 9537143..dce2417 100644 --- a/lib/src/character.rs +++ b/lib/src/character.rs @@ -112,6 +112,39 @@ impl<'a> CharData<'a> { #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] pub struct CombiningClass(pub u8); +impl CombiningClass { + pub fn name(self) -> Option<&'static str> { + match self.0 { + 0 => Some("Not_Reordered"), + 1 => Some("Overlay"), + 6 => Some("Han_Reading"), + 7 => Some("Nukta"), + 8 => Some("Kana_Voicing"), + 9 => Some("Virama"), + 200 => Some("Attached_Below_Left"), + 202 => Some("Attached_Below"), + 214 => Some("Attached_Above"), + 216 => Some("Attached_Above_Right"), + 218 => Some("Below_Left"), + 220 => Some("Below"), + 222 => Some("Below_Right"), + 224 => Some("Left"), + 226 => Some("Right"), + 228 => Some("Above_Left"), + 230 => Some("Above"), + 232 => Some("Above_Right"), + 233 => Some("Double_Below"), + 234 => Some("Double_Above"), + 240 => Some("Iota_Subscript"), + _ => None, + } + } + + pub fn is_combining(self) -> bool { + self.0 != 0 + } +} + #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] pub enum Category { Lu, @@ -182,6 +215,76 @@ impl Category { _ => None, } } + + pub fn abbreviation(self) -> &'static str { 
+ match self { + Self::Lu => "Lu", + Self::Ll => "Ll", + Self::Lt => "Lt", + Self::Mn => "Mn", + Self::Mc => "Mc", + Self::Me => "Me", + Self::Nd => "Nd", + Self::Nl => "Nl", + Self::No => "No", + Self::Zs => "Zs", + Self::Zl => "Zl", + Self::Zp => "Zp", + Self::Cc => "Cc", + Self::Cf => "Cf", + Self::Cs => "Cs", + Self::Co => "Co", + Self::Cn => "Cn", + Self::Lm => "Lm", + Self::Lo => "Lo", + Self::Pc => "Pc", + Self::Pd => "Pd", + Self::Ps => "Ps", + Self::Pe => "Pe", + Self::Pi => "Pi", + Self::Pf => "Pf", + Self::Po => "Po", + Self::Sm => "Sm", + Self::Sc => "Sc", + Self::Sk => "Sk", + Self::So => "So", + } + } + + pub fn full_name(self) -> &'static str { + match self { + Self::Lu => "Letter, Uppercase", + Self::Ll => "Letter, Lowercase", + Self::Lt => "Letter, Titlecase", + Self::Mn => "Mark, Non-Spacing", + Self::Mc => "Mark, Spacing Combining", + Self::Me => "Mark, Enclosing", + Self::Nd => "Number, Decimal Digit", + Self::Nl => "Number, Letter", + Self::No => "Number, Other", + Self::Zs => "Separator, Space", + Self::Zl => "Separator, Line", + Self::Zp => "Separator, Paragraph", + Self::Cc => "Other, Control", + Self::Cf => "Other, Format", + Self::Cs => "Other, Surrogate", + Self::Co => "Other, Private Use", + Self::Cn => "Other, Not Assigned", + Self::Lm => "Letter, Modifier", + Self::Lo => "Letter, Other", + Self::Pc => "Punctuation, Connector", + Self::Pd => "Punctuation, Dash", + Self::Ps => "Punctuation, Open", + Self::Pe => "Punctuation, Close", + Self::Pi => "Punctuation, Initial Quote", + Self::Pf => "Punctuation, Final Quote", + Self::Po => "Punctuation, Other", + Self::Sm => "Symbol, Math", + Self::Sc => "Symbol, Currency", + Self::Sk => "Symbol, Modifier", + Self::So => "Symbol, Other", + } + } } #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] @@ -240,6 +343,62 @@ impl BidiCategory { _ => None, } } + + pub fn abbreviation(self) -> &'static str { + match self { + Self::L => "L", + Self::R => "R", + Self::Al => "AL", + Self::En => "EN", + 
Self::Es => "ES", + Self::Et => "ET", + Self::An => "AN", + Self::Cs => "CS", + Self::Nsm => "NSM", + Self::Bn => "BN", + Self::B => "B", + Self::S => "S", + Self::Ws => "WS", + Self::On => "ON", + Self::Lre => "LRE", + Self::Lro => "LRO", + Self::Rle => "RLE", + Self::Rlo => "RLO", + Self::Pdf => "PDF", + Self::Lri => "LRI", + Self::Rli => "RLI", + Self::Fsi => "FSI", + Self::Pdi => "PDI", + } + } + + pub fn full_name(self) -> &'static str { + match self { + Self::L => "Left_To_Right", + Self::R => "Right_To_Left", + Self::Al => "Arabic_Letter", + Self::En => "European_Number", + Self::Es => "European_Separator", + Self::Et => "European_Terminator", + Self::An => "Arabic_Number", + Self::Cs => "Common_Separator", + Self::Nsm => "Nonspacing_Mark", + Self::Bn => "Boundary_Neutral", + Self::B => "Paragraph_Separator", + Self::S => "Segment_Separator", + Self::Ws => "White_Space", + Self::On => "Other_Neutral", + Self::Lre => "Left_To_Right_Embedding", + Self::Lro => "Left_To_Right_Override", + Self::Rle => "Right_To_Left_Embedding", + Self::Rlo => "Right_To_Left_Override", + Self::Pdf => "Pop_Directional_Format", + Self::Lri => "Left_To_Right_Isolate", + Self::Rli => "Right_To_Left_Isolate", + Self::Fsi => "First_Strong_Isolate", + Self::Pdi => "Pop_Directional_Isolate", + } + } } #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] @@ -286,6 +445,29 @@ pub enum DecompKind { Narrow, } +impl DecompKind { + pub fn name(self) -> &'static str { + match self { + Self::Nobreak => "noBreak", + Self::Compat => "compat", + Self::Super => "super", + Self::Fraction => "fraction", + Self::Sub => "sub", + Self::Font => "font", + Self::Circle => "circle", + Self::Wide => "wide", + Self::Vertical => "vertical", + Self::Square => "square", + Self::Isolated => "isolated", + Self::Final => "final", + Self::Initial => "initial", + Self::Medial => "medial", + Self::Small => "small", + Self::Narrow => "narrow", + } + } +} + pub(crate) enum OptionalDecompKind { None, Anon, diff --git 
a/lib/src/lib.rs b/lib/src/lib.rs index 2765303..963ef2d 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -1,3 +1,14 @@ pub mod character; pub mod unicode_data; pub mod utf8; + +pub use character::{ + BidiCategory, + Category, + CharData, + CombiningClass, + DecompKind, + DecompMapping, +}; + +pub use unicode_data::{StaticUnicodeData, UnicodeData}; diff --git a/lib/src/unicode_data.rs b/lib/src/unicode_data.rs index 6d869eb..b365a88 100644 --- a/lib/src/unicode_data.rs +++ b/lib/src/unicode_data.rs @@ -13,6 +13,8 @@ use crate::character::{ const MAGIC_NUMBER: [u8; 8] = *b"UTFDUMP!"; +pub type StaticUnicodeData = UnicodeData<'static>; + #[derive(Clone, Copy)] pub struct UnicodeData<'a> { group_table: GroupTable<'a>, @@ -24,11 +26,13 @@ const UNICODE_DATA_BYTES: &[u8] = include_bytes!( concat!(env!("OUT_DIR"), "/unicode_data_encoded") ); -impl<'a> UnicodeData<'a> { +impl UnicodeData<'static> { pub fn new() -> Result { Self::from_bytes(UNICODE_DATA_BYTES) } +} +impl<'a> UnicodeData<'a> { pub(crate) fn from_bytes(bs: &'a [u8]) -> Result { let mut bs = ByteStream(bs); @@ -225,7 +229,6 @@ impl<'a> GroupTable<'a> { } } -const GROUP_KIND_NO_VALUE: u8 = 0; const GROUP_KIND_USE_PREV_VALUE: u8 = 1; #[derive(Debug)]