From 26ee43af2eee8571f6d5316c794b14e464c8bf73 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Tue, 20 Sep 2022 16:39:27 +0100 Subject: [PATCH] make Combining column human-readable This patch introduces a `CombiningClass` type to represent the canonical combining class of a codepoint. The `fmt::Display` implementation of this type writes the human-readable name of the combining class, if there is one. This replaces the previous behaviour, which was to just print the raw byte value of the combining class. --- utfdump_bin/src/main.rs | 6 ++-- utfdump_core/src/chardata.rs | 53 +++++++++++++++++++++++++++++++++--- utfdump_core/src/encoded.rs | 10 +++---- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/utfdump_bin/src/main.rs b/utfdump_bin/src/main.rs index 53338bb..b42bb7f 100644 --- a/utfdump_bin/src/main.rs +++ b/utfdump_bin/src/main.rs @@ -4,7 +4,7 @@ use clap::Parser; use libshire::strings::CappedString; use tabled::{Tabled, Table, Style}; -use utfdump_core::{chardata::Category, encoded::Data}; +use utfdump_core::{chardata::{Category, CombiningClass}, encoded::Data}; const UNICODE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/unicode_data_encoded")); @@ -50,7 +50,7 @@ fn main() { let ccc = char_data.ccc(); char_combining_class = Optional::Some(ccc); - combining = ccc != 0; + combining = ccc.is_combining(); } let display_char = { @@ -91,7 +91,7 @@ struct OutRow { #[tabled(rename = "Category")] category: Optional, #[tabled(rename = "Combining")] - char_combining_class: Optional, + char_combining_class: Optional, } #[derive(Debug)] diff --git a/utfdump_core/src/chardata.rs b/utfdump_core/src/chardata.rs index 1b5a100..f0eafd2 100644 --- a/utfdump_core/src/chardata.rs +++ b/utfdump_core/src/chardata.rs @@ -4,7 +4,7 @@ use std::fmt; pub struct CharData<'a> { name: &'a str, category: Category, - ccc: u8, + ccc: CombiningClass, } impl<'a> CharData<'a> { @@ -17,12 +17,12 @@ impl<'a> CharData<'a> { let codepoint = u32::from_str_radix(fields[0], 16).ok()?; let name = fields[1]; let category = Category::from_abbr(fields[2])?; - let ccc = u8::from_str_radix(fields[3], 10).ok()?; + let ccc = CombiningClass(u8::from_str_radix(fields[3], 10).ok()?); Some((codepoint, Self::from_parts(name, category, ccc))) } - pub fn from_parts(name: &'a str, category: Category, ccc: u8) -> Self { + pub fn from_parts(name: &'a str, category: Category, ccc: CombiningClass) -> Self { Self { name, category, ccc } } @@ -41,7 +41,7 @@ impl<'a> CharData<'a> { self.category } - pub fn ccc(&self) -> u8 { + pub fn ccc(&self) -> CombiningClass { self.ccc } } @@ -235,3 +235,48 @@ impl fmt::Display for Category { } } +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub struct CombiningClass(pub u8); + +impl CombiningClass { + pub fn name(self) -> Option<&'static str> { + match self.0 { + 0 => Some("Not_Reordered"), + 1 => Some("Overlay"), + 6 => Some("Han_Reading"), + 7 => Some("Nukta"), + 8 => Some("Kana_Voicing"), + 9 => Some("Virama"), + 200 => Some("Attached_Below_Left"), + 202 => Some("Attached_Below"), + 214 => Some("Attached_Above"), + 216 => Some("Attached_Above_Right"), + 218 => Some("Below_Left"), + 220 => Some("Below"), + 222 => Some("Below_Right"), + 224 => Some("Left"), + 226 => Some("Right"), + 228 => Some("Above_Left"), + 230 => Some("Above"), + 232 => Some("Above_Right"), + 233 => Some("Double_Below"), + 234 => Some("Double_Above"), + 240 => Some("Iota_Subscript"), + _ => None, + } + } + + pub fn is_combining(self) -> bool { + self.0 != 0 + } +} + +impl fmt::Display for CombiningClass { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.name() { + Some(name) => write!(f, "{}", name), + None => write!(f, "Ccc{}", self.0), + } + } +} + diff --git a/utfdump_core/src/encoded.rs b/utfdump_core/src/encoded.rs index 430114c..48d9f20 100644 --- a/utfdump_core/src/encoded.rs +++ b/utfdump_core/src/encoded.rs @@ -1,13 +1,13 @@ use std::{collections::{HashMap, hash_map}, error, fmt, str, ops::Range}; -use crate::chardata::{CharData, Category}; +use crate::chardata::{CharData, Category, CombiningClass}; const DATA_ENTRY_SIZE: usize = 8; const DATA_INIT_FLAG: u8 = 1; const DATA_REPEATED_FLAG: u8 = 2; -fn encode_char_data(name_index: u32, category: Category, ccc: u8, repeated: bool) -> [u8; DATA_ENTRY_SIZE] { +fn encode_char_data(name_index: u32, category: Category, ccc: CombiningClass, repeated: bool) -> [u8; DATA_ENTRY_SIZE] { let mut buf = [0u8; DATA_ENTRY_SIZE]; buf[0] |= DATA_INIT_FLAG; @@ -18,12 +18,12 @@ fn encode_char_data(name_index: u32, category: Category, ccc: u8, repeated: bool buf[1..5].copy_from_slice(&name_index.to_le_bytes()); buf[5] = category.byte_repr(); - buf[6] = ccc; + buf[6] = ccc.0; buf } -fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, u8, bool)> { +fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, CombiningClass, bool)> { let flags = bytes[0]; if flags & DATA_INIT_FLAG == 0 { @@ -32,7 +32,7 @@ fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, u8, let name_index = u32::from_le_bytes(bytes[1..5].try_into().unwrap()); let category = Category::from_byte(bytes[5])?; - let ccc = bytes[6]; + let ccc = CombiningClass(bytes[6]); let repeated = flags & DATA_REPEATED_FLAG != 0; Some((name_index, category, ccc, repeated))