From 96d328c829f59dddaeb6eca32ee26d931ddee2d7 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Thu, 18 May 2023 17:04:30 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20support=20for=20invalid=20utf8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 68 ++++---- Cargo.toml | 1 + get_data.sh | 2 +- utfdump/Cargo.toml | 11 ++ {utfdump_bin => utfdump}/build.rs | 0 utfdump/src/lib.rs | 18 +++ utfdump/src/utf8.rs | 255 ++++++++++++++++++++++++++++++ utfdump_bin/Cargo.toml | 7 +- utfdump_bin/src/main.rs | 138 ++++++++++------ utfdump_core/src/chardata.rs | 71 ++++----- utfdump_core/src/encoded.rs | 29 ++-- 11 files changed, 467 insertions(+), 133 deletions(-) create mode 100644 utfdump/Cargo.toml rename {utfdump_bin => utfdump}/build.rs (100%) create mode 100644 utfdump/src/lib.rs create mode 100644 utfdump/src/utf8.rs diff --git a/Cargo.lock b/Cargo.lock index 312f0a4..85026ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -33,9 +33,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" [[package]] name = "clap" -version = "3.2.22" +version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" +checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" dependencies = [ "atty", "bitflags", @@ -50,9 +50,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "3.2.18" +version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65" +checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008" dependencies = [ "heck", "proc-macro-error", @@ -84,9 +84,9 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "heck" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" @@ -99,9 +99,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.1" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown", @@ -109,29 +109,29 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.132" +version = "0.2.144" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5" +checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" [[package]] name = "libshire" version = "0.1.0" -source = "git+https://github.com/pantonshire/libshire?branch=main#7253d950108c729141239f4add4b3df67a54db31" +source = "git+https://github.com/pantonshire/libshire?branch=main#44e27e9d2387c092d66ddfd871932e85b135499f" dependencies = [ "serde", ] [[package]] name = "once_cell" -version = "1.14.0" +version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f7254b99e31cad77da24b08ebf628882739a608578bb1bcdfc1f9c21260d7c0" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "os_str_bytes" -version = "6.3.0" +version = "6.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" +checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" [[package]] name = "papergrid" @@ -170,27 +170,27 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.43" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" +checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.21" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" dependencies = [ "proc-macro2", ] [[package]] name = "serde" -version = "1.0.144" +version = "1.0.163" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860" +checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2" [[package]] name = "strsim" @@ -200,9 +200,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "1.0.99" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", @@ -235,24 +235,24 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.3" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" dependencies = [ "winapi-util", ] [[package]] name = "textwrap" -version = "0.15.1" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" +checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "unicode-ident" -version = "1.0.4" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" +checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" [[package]] name = "unicode-width" @@ -263,11 +263,19 @@ checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] name = "utfdump" version = "0.1.0" +dependencies = [ + "once_cell", + "utfdump_core", +] + +[[package]] +name = "utfdump_bin" +version = "0.1.0" dependencies = [ "clap", "libshire", "tabled", - "utfdump_core", + "utfdump", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 5170b07..e296b0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,6 @@ [workspace] members = [ "utfdump_core", + "utfdump", "utfdump_bin", ] diff --git a/get_data.sh b/get_data.sh index 652cfdc..0254d53 100755 --- a/get_data.sh +++ b/get_data.sh @@ -1,2 +1,2 @@ #!/bin/bash -curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump_bin/unicode_data_latest.txt +curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt diff --git a/utfdump/Cargo.toml b/utfdump/Cargo.toml new file mode 100644 index 0000000..2e7c677 --- /dev/null +++ b/utfdump/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "utfdump" +version = "0.1.0" +edition = "2021" + +[dependencies] +utfdump_core = { path = "../utfdump_core" } +once_cell = "1.17.1" + +[build-dependencies] +utfdump_core = { path = "../utfdump_core" } diff --git a/utfdump_bin/build.rs b/utfdump/build.rs similarity index 100% rename from utfdump_bin/build.rs rename to utfdump/build.rs diff --git a/utfdump/src/lib.rs b/utfdump/src/lib.rs new file mode 100644 index 0000000..01125f2 --- /dev/null +++ b/utfdump/src/lib.rs @@ -0,0 +1,18 @@ +pub mod utf8; + +pub use utfdump_core::chardata::{CharData, Category, CombiningClass}; + +use once_cell::sync::Lazy; +use utfdump_core::encoded::Data; + +const UNICODE_DATA_BYTES: &[u8] = include_bytes!( + concat!(env!("OUT_DIR"), "/unicode_data_encoded") +); + +static UNICODE_DATA: Lazy = Lazy::new(|| { + Data::from_bytes(UNICODE_DATA_BYTES).unwrap() +}); + +pub fn char_data(c: char) -> Option> { + UNICODE_DATA.get(c) +} diff --git a/utfdump/src/utf8.rs b/utfdump/src/utf8.rs new file mode 100644 index 0000000..8cbee11 --- /dev/null +++ b/utfdump/src/utf8.rs @@ -0,0 +1,255 @@ +use std::iter::Peekable; + +pub trait ToByte { + fn to_byte(self) -> u8; + + fn as_byte(&self) -> u8; +} + +impl ToByte for u8 { + fn to_byte(self) -> u8 { + self + } + + fn as_byte(&self) -> u8 { + *self + } +} + +impl<'a, B> ToByte for &'a B +where + B: ToByte, +{ + fn to_byte(self) -> u8 { + ::as_byte(self) + } + + fn as_byte(&self) -> u8 { + ::to_byte(*self) + } +} + +pub trait Utf8Decode { + type Iter: Iterator; + type Byte: ToByte; + + fn decode_utf8(self) -> Utf8Decoder; +} + +impl Utf8Decode for T +where + T: IntoIterator, + B: ToByte, +{ + type Iter = ::IntoIter; + type Byte = B; + + fn decode_utf8(self) -> Utf8Decoder { + Utf8Decoder::new(self.into_iter()) + } +} + +// https://encoding.spec.whatwg.org/#utf-8-decoder +pub struct Utf8Decoder +where + I: Iterator, + B: ToByte, +{ + bytes: Peekable, +} + +impl Utf8Decoder +where + I: Iterator, + B: ToByte, +{ + fn new(bytes: I) -> Self { + Self { + bytes: bytes.peekable(), + } + } +} + +impl Iterator for Utf8Decoder +where + I: Iterator, + B: ToByte, +{ + type Item = Result; + + fn next(&mut self) -> Option { + const DEFAULT_BOUNDARIES: (u8, u8) = (0x80, 0xbf); + + // Keep track of the bytes we have seen so far, so that if there is an error we can return + // the problematic bytes. There is no need for a variable to store the number of bytes we + // have put into this array, since we can always work it out from other sources. + let mut bytes_seen = [0u8; 4]; + + let mut codepoint: u32; + let bytes_needed: u8; + let mut lower_boundary: u8; + let mut upper_boundary: u8; + + let first_byte = self.bytes.next()?.to_byte(); + bytes_seen[0] = first_byte; + + match first_byte { + byte @ 0x00..=0x7f => { + return Some(Ok(char::from(byte))); + }, + + byte @ 0xc2..=0xdf => { + bytes_needed = 1; + codepoint = u32::from(byte & 0x1f) << 6; + (lower_boundary, upper_boundary) = DEFAULT_BOUNDARIES; + }, + + byte @ 0xe0..=0xef => { + bytes_needed = 2; + codepoint = u32::from(byte & 0x0f) << 12; + (lower_boundary, upper_boundary) = match byte { + 0xe0 => (0xa0, 0xbf), + 0xed => (0x80, 0x9f), + _ => DEFAULT_BOUNDARIES, + }; + }, + + byte @ 0xf0..=0xf4 => { + bytes_needed = 3; + codepoint = u32::from(byte & 0x07) << 18; + (lower_boundary, upper_boundary) = match byte { + 0xf0 => (0x90, 0xbf), + 0xf4 => (0x80, 0x8f), + _ => DEFAULT_BOUNDARIES, + }; + }, + + _ => { + return Some(Err(Utf8Error { + bad_bytes: bytes_seen, + num_bad_bytes: 1, + })); + }, + } + + for i in 0..bytes_needed { + // Peek the byte rather than consuming it; the specification says we should not consume + // the byte here if it is not between the upper and lower boundaries. + let byte = match self.bytes.peek() { + Some(byte) => byte.as_byte(), + None => return Some(Err(Utf8Error { + bad_bytes: bytes_seen, + num_bad_bytes: usize::from(i) + 1, + })), + }; + + bytes_seen[usize::from(i) + 1] = byte; + + if !(lower_boundary..=upper_boundary).contains(&byte) { + return Some(Err(Utf8Error { + bad_bytes: bytes_seen, + num_bad_bytes: usize::from(i) + 2, + })); + } + + // Consume the byte we peeked. + self.bytes.next(); + + (lower_boundary, upper_boundary) = DEFAULT_BOUNDARIES; + + // OR the 6 least significant bits into the codepoint. + codepoint |= u32::from(byte & 0x3f) << (6 * (bytes_needed - i - 1)); + } + + // FIXME: make this unchecked? + let codepoint = char::try_from(codepoint) + .unwrap(); + + Some(Ok(codepoint)) + } +} + +pub struct Utf8Error { + bad_bytes: [u8; 4], + num_bad_bytes: usize, +} + +impl Utf8Error { + pub fn bytes(&self) -> &[u8] { + &self.bad_bytes[..self.num_bad_bytes] + } + + pub fn into_parts(self) -> ([u8; 4], usize) { + (self.bad_bytes, self.num_bad_bytes) + } +} + +#[cfg(test)] +mod tests { + use std::char::REPLACEMENT_CHARACTER; + + use super::Utf8Decode; + + #[test] + fn test_utf8_decoder() { + assert_eq!( + &decode_collect_lossy(&[ + 0x68, 0x65, 0x6c, 0x6c, 0x6f + ]), + "hello" + ); + + assert_eq!( + &decode_collect_lossy(&[ + 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5 + ]), + "κόσμε" + ); + + assert_eq!( + &decode_collect_lossy(&[ + 0xf0, 0x9f, 0x8f, 0xb3, 0xef, 0xb8, 0x8f, 0xe2, 0x80, 0x8d, 0xe2, 0x9a, 0xa7, 0xef, + 0xb8, 0x8f + ]), + "\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}" + ); + + assert_eq!( + &decode_collect_lossy(&[ + 0xce, 0x61 + ]), + "\u{fffd}a" + ); + + assert_eq!( + &decode_collect_lossy(&[ + 0xce, 0xc2 + ]), + "\u{fffd}\u{fffd}" + ); + + assert_eq!( + &decode_collect_lossy(&[ + 0x80 + ]), + "\u{fffd}" + ); + + assert_eq!( + &decode_collect_lossy(&[ + 0x80, 0x80 + ]), + "\u{fffd}\u{fffd}" + ); + } + + fn decode_collect_lossy(bytes: &[u8]) -> String { + bytes + .decode_utf8() + .map(|res| match res { + Ok(c) => c, + Err(_) => REPLACEMENT_CHARACTER, + }) + .collect() + } +} diff --git a/utfdump_bin/Cargo.toml b/utfdump_bin/Cargo.toml index f4ec630..0ff8ce0 100644 --- a/utfdump_bin/Cargo.toml +++ b/utfdump_bin/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "utfdump" +name = "utfdump_bin" version = "0.1.0" edition = "2021" authors = ["Tom Panton "] @@ -8,10 +8,7 @@ repository = "https://github.com/pantonshire/utfdump" description = "Command-line Unicode character info tool" [dependencies] -utfdump_core = { path = "../utfdump_core" } +utfdump = { path = "../utfdump" } libshire = { git = "https://github.com/pantonshire/libshire", branch = "main" } tabled = "0.8.0" clap = { version = "3.2.22", features = ["derive"] } - -[build-dependencies] -utfdump_core = { path = "../utfdump_core" } diff --git a/utfdump_bin/src/main.rs b/utfdump_bin/src/main.rs index b42bb7f..d86f2ab 100644 --- a/utfdump_bin/src/main.rs +++ b/utfdump_bin/src/main.rs @@ -3,10 +3,7 @@ use std::{fmt, io::{self, Read}}; use clap::Parser; use libshire::strings::CappedString; use tabled::{Tabled, Table, Style}; - -use utfdump_core::{chardata::{Category, CombiningClass}, encoded::Data}; - -const UNICODE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/unicode_data_encoded")); +use utfdump::{char_data, CombiningClass, Category, utf8::{Utf8Decode, Utf8Error}}; #[derive(Parser)] #[clap(author, version, about, long_about = None)] @@ -19,58 +16,18 @@ struct Args { fn main() { let args = Args::parse(); - let data = Data::<'static>::from_bytes(UNICODE_DATA).unwrap(); - let input = { let mut buf = Vec::::new(); let stdin = io::stdin(); let mut guard = stdin.lock(); guard.read_to_end(&mut buf) .expect("failed to read stdin"); - // TODO: just skip over invalid utf-8 characters - String::from_utf8(buf) - .expect("invalid utf-8") + buf }; let rows = input - .chars() - .map(|c| { - let mut name = Optional::None; - let mut category = Optional::None; - let mut char_combining_class = Optional::None; - - let mut combining = false; - - if let Some(char_data) = data.get(c as u32) { - name = Optional::Some(char_data.name()); - category = Optional::Some(DisplayCategory { - category: char_data.category(), - full_name: args.full_category_names, - }); - - let ccc = char_data.ccc(); - char_combining_class = Optional::Some(ccc); - combining = ccc.is_combining(); - } - - let display_char = { - let mut buf = CappedString::empty(); - if combining { - buf.push_truncating('\u{25cc}'); - } - buf.push_truncating(c); - buf - }; - - OutRow { - display_char, - codepoint: Codepoint(c), - utf_8_bytes: Utf8Bytes(c), - name, - category, - char_combining_class, - } - }); + .decode_utf8() + .map(|c| OutRow::from_char_result(c, args.full_category_names)); let table = Table::new(rows) .with(Style::modern()); @@ -83,7 +40,7 @@ struct OutRow { #[tabled(rename = "")] display_char: CappedString<8>, #[tabled(rename = "Code")] - codepoint: Codepoint, + codepoint: Optional, #[tabled(rename = "UTF-8")] utf_8_bytes: Utf8Bytes, #[tabled(rename = "Name")] @@ -94,6 +51,69 @@ struct OutRow { char_combining_class: Optional, } +impl OutRow { + fn from_char_result(c: Result, full_category_names: bool) -> Self { + match c { + Ok(c) => Self::from_good_char(c, full_category_names), + Err(err) => Self::from_bad_char(err), + } + } + + fn from_good_char(c: char, full_category_names: bool) -> Self { + let mut name = Optional::None; + let mut category = Optional::None; + let mut char_combining_class = Optional::None; + + let mut combining = false; + + if let Some(char_data) = char_data(c) { + name = Optional::Some(char_data.name()); + category = Optional::Some(DisplayCategory { + category: char_data.category(), + full_name: full_category_names, + }); + + let ccc = char_data.combining_class(); + char_combining_class = Optional::Some(ccc); + combining = ccc.is_combining(); + } + + let display_char = { + let mut buf = CappedString::empty(); + if combining { + buf.push_truncating('\u{25cc}'); + } + buf.push_truncating(c); + buf + }; + + Self { + display_char, + codepoint: Optional::Some(Codepoint(c)), + utf_8_bytes: Utf8Bytes::from_char(c), + name, + category, + char_combining_class, + } + } + + fn from_bad_char(err: Utf8Error) -> Self { + let (bad_bytes, num_bad_bytes) = err.into_parts(); + + Self { + display_char: CappedString::new_truncating("\u{fffd}"), + codepoint: Optional::None, + utf_8_bytes: Utf8Bytes { + buf: bad_bytes, + len: num_bad_bytes, + }, + name: Optional::Some(""), + category: Optional::None, + char_combining_class: Optional::None, + } + } +} + #[derive(Debug)] enum Optional { Some(T), @@ -122,13 +142,27 @@ impl fmt::Display for Codepoint { } #[derive(Debug)] -struct Utf8Bytes(char); +struct Utf8Bytes { + buf: [u8; 4], + len: usize, +} + +impl Utf8Bytes { + fn from_char(c: char) -> Self { + let mut buf = [0u8; 4]; + let string = c.encode_utf8(&mut buf); + let len = string.len(); + Self { buf, len } + } + + fn bytes(&self) -> &[u8] { + &self.buf[..self.len] + } +} impl fmt::Display for Utf8Bytes { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut buf = [0u8; 4]; - let s = self.0.encode_utf8(&mut buf); - let mut bytes = s.bytes(); + let mut bytes = self.bytes().iter().copied(); if let Some(b) = bytes.next() { write!(f, "0x{:02x}", b)?; for b in bytes { diff --git a/utfdump_core/src/chardata.rs b/utfdump_core/src/chardata.rs index f0eafd2..4f5d2c0 100644 --- a/utfdump_core/src/chardata.rs +++ b/utfdump_core/src/chardata.rs @@ -4,7 +4,7 @@ use std::fmt; pub struct CharData<'a> { name: &'a str, category: Category, - ccc: CombiningClass, + combining_class: CombiningClass, } impl<'a> CharData<'a> { @@ -22,8 +22,8 @@ impl<'a> CharData<'a> { Some((codepoint, Self::from_parts(name, category, ccc))) } - pub fn from_parts(name: &'a str, category: Category, ccc: CombiningClass) -> Self { - Self { name, category, ccc } + pub fn from_parts(name: &'a str, category: Category, combining_class: CombiningClass) -> Self { + Self { name, category, combining_class } } pub fn with_name<'b>(self, name: &'a str) -> CharData<'b> @@ -41,44 +41,43 @@ impl<'a> CharData<'a> { self.category } - pub fn ccc(&self) -> CombiningClass { - self.ccc + pub fn combining_class(&self) -> CombiningClass { + self.combining_class } } #[derive(Clone, Copy, PartialEq, Eq, Debug)] -#[repr(u8)] pub enum Category { - Lu = 0, - Ll = 1, - Lt = 2, - Mn = 3, - Mc = 4, - Me = 5, - Nd = 6, - Nl = 7, - No = 8, - Zs = 9, - Zl = 10, - Zp = 11, - Cc = 12, - Cf = 13, - Cs = 14, - Co = 15, - Cn = 16, - Lm = 17, - Lo = 18, - Pc = 19, - Pd = 20, - Ps = 21, - Pe = 22, - Pi = 23, - Pf = 24, - Po = 25, - Sm = 26, - Sc = 27, - Sk = 28, - So = 29, + Lu, + Ll, + Lt, + Mn, + Mc, + Me, + Nd, + Nl, + No, + Zs, + Zl, + Zp, + Cc, + Cf, + Cs, + Co, + Cn, + Lm, + Lo, + Pc, + Pd, + Ps, + Pe, + Pi, + Pf, + Po, + Sm, + Sc, + Sk, + So, } impl Category { diff --git a/utfdump_core/src/encoded.rs b/utfdump_core/src/encoded.rs index 48d9f20..044ac43 100644 --- a/utfdump_core/src/encoded.rs +++ b/utfdump_core/src/encoded.rs @@ -7,7 +7,13 @@ const DATA_ENTRY_SIZE: usize = 8; const DATA_INIT_FLAG: u8 = 1; const DATA_REPEATED_FLAG: u8 = 2; -fn encode_char_data(name_index: u32, category: Category, ccc: CombiningClass, repeated: bool) -> [u8; DATA_ENTRY_SIZE] { +fn encode_char_data( + name_index: u32, + category: Category, + combining_class: CombiningClass, + repeated: bool +) -> [u8; DATA_ENTRY_SIZE] +{ let mut buf = [0u8; DATA_ENTRY_SIZE]; buf[0] |= DATA_INIT_FLAG; @@ -18,12 +24,14 @@ fn encode_char_data(name_index: u32, category: Category, ccc: CombiningClass, re buf[1..5].copy_from_slice(&name_index.to_le_bytes()); buf[5] = category.byte_repr(); - buf[6] = ccc.0; + buf[6] = combining_class.0; buf } -fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, CombiningClass, bool)> { +fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) + -> Option<(u32, Category, CombiningClass, bool)> +{ let flags = bytes[0]; if flags & DATA_INIT_FLAG == 0 { @@ -32,10 +40,10 @@ fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, Comb let name_index = u32::from_le_bytes(bytes[1..5].try_into().unwrap()); let category = Category::from_byte(bytes[5])?; - let ccc = CombiningClass(bytes[6]); + let combining_class = CombiningClass(bytes[6]); let repeated = flags & DATA_REPEATED_FLAG != 0; - Some((name_index, category, ccc, repeated)) + Some((name_index, category, combining_class, repeated)) } pub struct DataBuf { @@ -62,7 +70,10 @@ impl DataBuf { return Ok(()); } - let repeated = range.end - range.start > 1; + let repeated = range.end + .checked_sub(range.start) + .map(|len| len > 1) + .unwrap_or(false); let range = { let start = usize::try_from(range.start) @@ -86,7 +97,7 @@ impl DataBuf { let encoded_char_data = encode_char_data( name_index, char_data.category(), - char_data.ccc(), + char_data.combining_class(), repeated ); @@ -121,8 +132,8 @@ pub struct Data<'a> { } impl<'a> Data<'a> { - pub fn get(self, codepoint: u32) -> Option> { - let index = usize::try_from(codepoint).ok()?; + pub fn get(self, codepoint: char) -> Option> { + let index = usize::try_from(u32::from(codepoint)).ok()?; let start = index.checked_mul(DATA_ENTRY_SIZE)?; let end = start.checked_add(DATA_ENTRY_SIZE)?; let encoded = self.data.get(start..end)?;