support for invalid utf8

main
pantonshire 3 years ago
parent 26ee43af2e
commit 96d328c829

68
Cargo.lock generated

@ -33,9 +33,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
[[package]]
name = "clap"
version = "3.2.22"
version = "3.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750"
checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
dependencies = [
"atty",
"bitflags",
@ -50,9 +50,9 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "3.2.18"
version = "3.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65"
checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008"
dependencies = [
"heck",
"proc-macro-error",
@ -84,9 +84,9 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "heck"
version = "0.4.0"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
@ -99,9 +99,9 @@ dependencies = [
[[package]]
name = "indexmap"
version = "1.9.1"
version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg",
"hashbrown",
@ -109,29 +109,29 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.132"
version = "0.2.144"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5"
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
[[package]]
name = "libshire"
version = "0.1.0"
source = "git+https://github.com/pantonshire/libshire?branch=main#7253d950108c729141239f4add4b3df67a54db31"
source = "git+https://github.com/pantonshire/libshire?branch=main#44e27e9d2387c092d66ddfd871932e85b135499f"
dependencies = [
"serde",
]
[[package]]
name = "once_cell"
version = "1.14.0"
version = "1.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f7254b99e31cad77da24b08ebf628882739a608578bb1bcdfc1f9c21260d7c0"
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
[[package]]
name = "os_str_bytes"
version = "6.3.0"
version = "6.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff"
checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
[[package]]
name = "papergrid"
@ -170,27 +170,27 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.43"
version = "1.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.21"
version = "1.0.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
dependencies = [
"proc-macro2",
]
[[package]]
name = "serde"
version = "1.0.144"
version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860"
checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
[[package]]
name = "strsim"
@ -200,9 +200,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "syn"
version = "1.0.99"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
@ -235,24 +235,24 @@ dependencies = [
[[package]]
name = "termcolor"
version = "1.1.3"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755"
checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
dependencies = [
"winapi-util",
]
[[package]]
name = "textwrap"
version = "0.15.1"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16"
checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
[[package]]
name = "unicode-ident"
version = "1.0.4"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
[[package]]
name = "unicode-width"
@ -263,11 +263,19 @@ checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
[[package]]
name = "utfdump"
version = "0.1.0"
dependencies = [
"once_cell",
"utfdump_core",
]
[[package]]
name = "utfdump_bin"
version = "0.1.0"
dependencies = [
"clap",
"libshire",
"tabled",
"utfdump_core",
"utfdump",
]
[[package]]

@ -1,5 +1,6 @@
[workspace]
members = [
"utfdump_core",
"utfdump",
"utfdump_bin",
]

@ -1,2 +1,2 @@
#!/bin/bash
curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump_bin/unicode_data_latest.txt
curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt

@ -0,0 +1,11 @@
[package]
name = "utfdump"
version = "0.1.0"
edition = "2021"
[dependencies]
utfdump_core = { path = "../utfdump_core" }
once_cell = "1.17.1"
[build-dependencies]
utfdump_core = { path = "../utfdump_core" }

@ -0,0 +1,18 @@
pub mod utf8;
pub use utfdump_core::chardata::{CharData, Category, CombiningClass};
use once_cell::sync::Lazy;
use utfdump_core::encoded::Data;
/// Encoded Unicode character table, embedded into the library at compile time.
///
/// The file is read from `OUT_DIR`, so it is presumably generated by this crate's build script
/// (not visible from here).
const UNICODE_DATA_BYTES: &[u8] = include_bytes!(
    concat!(env!("OUT_DIR"), "/unicode_data_encoded")
);

/// Decoded view of [`UNICODE_DATA_BYTES`], constructed on first access.
static UNICODE_DATA: Lazy<Data> = Lazy::new(|| {
    // The bytes are fixed at compile time, so a decoding failure here is a build problem rather
    // than a recoverable runtime condition; say so in the panic message.
    Data::from_bytes(UNICODE_DATA_BYTES)
        .expect("embedded Unicode data table is malformed")
});
pub fn char_data(c: char) -> Option<CharData<'static>> {
UNICODE_DATA.get(c)
}

@ -0,0 +1,255 @@
use std::iter::Peekable;
/// A value that can be read as a single byte.
///
/// Implemented in this module for `u8` and for shared references to any other `ToByte` type, so
/// that iterators over both owned and borrowed bytes can be decoded.
pub trait ToByte {
/// Consumes the value and returns the byte it represents.
fn to_byte(self) -> u8;
/// Returns the byte this value represents without consuming the value.
fn as_byte(&self) -> u8;
}
impl ToByte for u8 {
fn to_byte(self) -> u8 {
self
}
fn as_byte(&self) -> u8 {
*self
}
}
/// A shared reference to a byte-like value is byte-like too; both conversions read through the
/// reference using the referent's `as_byte`.
impl<B> ToByte for &B
where
    B: ToByte,
{
    fn to_byte(self) -> u8 {
        B::as_byte(self)
    }

    fn as_byte(&self) -> u8 {
        // `self` is `&&B` here; peel one layer of reference and read the referent.
        B::as_byte(*self)
    }
}
/// Extension trait that turns a source of bytes into a [`Utf8Decoder`].
///
/// A blanket implementation in this module covers every `IntoIterator` whose items implement
/// [`ToByte`].
pub trait Utf8Decode {
/// Iterator that yields the bytes to be decoded.
type Iter: Iterator<Item = Self::Byte>;
/// Item type produced by the byte source.
type Byte: ToByte;
/// Consumes `self` and returns a decoder over its bytes.
fn decode_utf8(self) -> Utf8Decoder<Self::Iter, Self::Byte>;
}
/// Anything that can be iterated to produce byte-like items can be decoded as UTF-8.
impl<T, B> Utf8Decode for T
where
    B: ToByte,
    T: IntoIterator<Item = B>,
{
    type Byte = B;
    type Iter = T::IntoIter;

    fn decode_utf8(self) -> Utf8Decoder<Self::Iter, Self::Byte> {
        let source = IntoIterator::into_iter(self);
        Utf8Decoder::new(source)
    }
}
// https://encoding.spec.whatwg.org/#utf-8-decoder
/// Iterator adaptor that decodes a stream of bytes as UTF-8.
///
/// Each item is either a decoded `char` or a [`Utf8Error`] describing a malformed sequence; see
/// the `Iterator` implementation for the decoding rules.
pub struct Utf8Decoder<I, B>
where
I: Iterator<Item = B>,
B: ToByte,
{
// Wrapped in `Peekable` so that the decoder can inspect the next byte before deciding whether
// to consume it.
bytes: Peekable<I>,
}
impl<I, B> Utf8Decoder<I, B>
where
    I: Iterator<Item = B>,
    B: ToByte,
{
    /// Creates a decoder that reads its input from `bytes`.
    fn new(bytes: I) -> Self {
        let bytes = bytes.peekable();
        Self { bytes }
    }
}
impl<I, B> Iterator for Utf8Decoder<I, B>
where
    I: Iterator<Item = B>,
    B: ToByte,
{
    type Item = Result<char, Utf8Error>;

    /// Decodes the next character from the underlying byte stream.
    ///
    /// Returns `None` once the stream is exhausted, `Some(Ok(c))` for a well-formed sequence and
    /// `Some(Err(_))` for a malformed one. The error holds exactly the bytes that were consumed
    /// as part of the malformed sequence. A byte that is rejected as a continuation byte is left
    /// in the stream and becomes the first byte of the next item, so it is not part of the error.
    fn next(&mut self) -> Option<Self::Item> {
        const DEFAULT_BOUNDARIES: (u8, u8) = (0x80, 0xbf);

        // Keep track of the bytes consumed so far, so that if there is an error we can return
        // the problematic bytes. There is no need for a separate length variable, since the
        // number of consumed bytes always follows from the loop index.
        let mut bytes_seen = [0u8; 4];

        let mut codepoint: u32;
        let bytes_needed: u8;
        let mut lower_boundary: u8;
        let mut upper_boundary: u8;

        let first_byte = self.bytes.next()?.to_byte();
        bytes_seen[0] = first_byte;

        match first_byte {
            // ASCII: a complete character on its own.
            byte @ 0x00..=0x7f => {
                return Some(Ok(char::from(byte)));
            },
            // Lead byte of a two-byte sequence.
            byte @ 0xc2..=0xdf => {
                bytes_needed = 1;
                codepoint = u32::from(byte & 0x1f) << 6;
                (lower_boundary, upper_boundary) = DEFAULT_BOUNDARIES;
            },
            // Lead byte of a three-byte sequence. The narrowed ranges for the second byte reject
            // overlong encodings (0xe0) and surrogates (0xed).
            byte @ 0xe0..=0xef => {
                bytes_needed = 2;
                codepoint = u32::from(byte & 0x0f) << 12;
                (lower_boundary, upper_boundary) = match byte {
                    0xe0 => (0xa0, 0xbf),
                    0xed => (0x80, 0x9f),
                    _ => DEFAULT_BOUNDARIES,
                };
            },
            // Lead byte of a four-byte sequence. The narrowed ranges for the second byte reject
            // overlong encodings (0xf0) and values above U+10FFFF (0xf4).
            byte @ 0xf0..=0xf4 => {
                bytes_needed = 3;
                codepoint = u32::from(byte & 0x07) << 18;
                (lower_boundary, upper_boundary) = match byte {
                    0xf0 => (0x90, 0xbf),
                    0xf4 => (0x80, 0x8f),
                    _ => DEFAULT_BOUNDARIES,
                };
            },
            // Anything else cannot start a sequence.
            _ => {
                return Some(Err(Utf8Error {
                    bad_bytes: bytes_seen,
                    num_bad_bytes: 1,
                }));
            },
        }

        for i in 0..bytes_needed {
            // Bytes consumed so far: the lead byte plus `i` continuation bytes.
            let num_consumed = usize::from(i) + 1;

            // Peek the byte rather than consuming it; the specification says we should not
            // consume the byte here if it is not between the upper and lower boundaries.
            let byte = match self.bytes.peek() {
                Some(byte) => byte.as_byte(),
                None => return Some(Err(Utf8Error {
                    bad_bytes: bytes_seen,
                    num_bad_bytes: num_consumed,
                })),
            };

            if !(lower_boundary..=upper_boundary).contains(&byte) {
                // The offending byte stays in the stream and will be decoded by the next call,
                // so it must not be reported as part of this error as well.
                return Some(Err(Utf8Error {
                    bad_bytes: bytes_seen,
                    num_bad_bytes: num_consumed,
                }));
            }

            // Consume the byte we peeked, and only now record it as seen.
            self.bytes.next();
            bytes_seen[num_consumed] = byte;

            // Only the first continuation byte has a restricted range.
            (lower_boundary, upper_boundary) = DEFAULT_BOUNDARIES;

            // OR the 6 least significant bits into the codepoint.
            codepoint |= u32::from(byte & 0x3f) << (6 * (bytes_needed - i - 1));
        }

        // The lead-byte ranges and boundaries above exclude surrogates and anything above
        // U+10FFFF, so the conversion cannot fail.
        let codepoint = char::try_from(codepoint)
            .expect("boundary checks only admit valid Unicode scalar values");

        Some(Ok(codepoint))
    }
}
/// A malformed byte sequence encountered while decoding UTF-8.
///
/// The derives let callers use `unwrap`/`expect` on the decoder's `Result` items and compare or
/// print errors in tests and diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error {
    // Storage for the offending bytes; only the first `num_bad_bytes` entries are meaningful.
    bad_bytes: [u8; 4],
    // Number of leading entries of `bad_bytes` that belong to the malformed sequence.
    num_bad_bytes: usize,
}
impl Utf8Error {
    /// Returns the bytes that make up the malformed sequence.
    pub fn bytes(&self) -> &[u8] {
        let Self { bad_bytes, num_bad_bytes } = self;
        &bad_bytes[..*num_bad_bytes]
    }

    /// Consumes the error, returning the raw byte buffer together with the number of leading
    /// bytes in it that belong to the malformed sequence.
    pub fn into_parts(self) -> ([u8; 4], usize) {
        let Self { bad_bytes, num_bad_bytes } = self;
        (bad_bytes, num_bad_bytes)
    }
}
#[cfg(test)]
mod tests {
    use std::char::REPLACEMENT_CHARACTER;

    use super::Utf8Decode;

    /// Raw input bytes paired with the text expected from decoding them lossily, i.e. with each
    /// decoding error replaced by U+FFFD.
    const CASES: &[(&[u8], &str)] = &[
        // Plain ASCII.
        (&[0x68, 0x65, 0x6c, 0x6c, 0x6f], "hello"),
        // Two- and three-byte sequences.
        (
            &[0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5],
            "κόσμε",
        ),
        // A four-byte sequence followed by three-byte sequences.
        (
            &[
                0xf0, 0x9f, 0x8f, 0xb3, 0xef, 0xb8, 0x8f, 0xe2, 0x80, 0x8d, 0xe2, 0x9a, 0xa7, 0xef,
                0xb8, 0x8f,
            ],
            "\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}",
        ),
        // Lead byte followed by an ASCII byte instead of a continuation byte.
        (&[0xce, 0x61], "\u{fffd}a"),
        // Lead byte followed by another lead byte, then end of input.
        (&[0xce, 0xc2], "\u{fffd}\u{fffd}"),
        // Stray continuation bytes.
        (&[0x80], "\u{fffd}"),
        (&[0x80, 0x80], "\u{fffd}\u{fffd}"),
    ];

    #[test]
    fn test_utf8_decoder() {
        for &(input, expected) in CASES {
            assert_eq!(decode_collect_lossy(input), expected);
        }
    }

    /// Decodes `bytes`, substituting U+FFFD for every decoding error.
    fn decode_collect_lossy(bytes: &[u8]) -> String {
        bytes
            .decode_utf8()
            .map(|res| res.unwrap_or(REPLACEMENT_CHARACTER))
            .collect()
    }
}

@ -1,5 +1,5 @@
[package]
name = "utfdump"
name = "utfdump_bin"
version = "0.1.0"
edition = "2021"
authors = ["Tom Panton <pantonshire@gmail.com>"]
@ -8,10 +8,7 @@ repository = "https://github.com/pantonshire/utfdump"
description = "Command-line Unicode character info tool"
[dependencies]
utfdump_core = { path = "../utfdump_core" }
utfdump = { path = "../utfdump" }
libshire = { git = "https://github.com/pantonshire/libshire", branch = "main" }
tabled = "0.8.0"
clap = { version = "3.2.22", features = ["derive"] }
[build-dependencies]
utfdump_core = { path = "../utfdump_core" }

@ -3,10 +3,7 @@ use std::{fmt, io::{self, Read}};
use clap::Parser;
use libshire::strings::CappedString;
use tabled::{Tabled, Table, Style};
use utfdump_core::{chardata::{Category, CombiningClass}, encoded::Data};
const UNICODE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/unicode_data_encoded"));
use utfdump::{char_data, CombiningClass, Category, utf8::{Utf8Decode, Utf8Error}};
#[derive(Parser)]
#[clap(author, version, about, long_about = None)]
@ -19,58 +16,18 @@ struct Args {
fn main() {
let args = Args::parse();
let data = Data::<'static>::from_bytes(UNICODE_DATA).unwrap();
let input = {
let mut buf = Vec::<u8>::new();
let stdin = io::stdin();
let mut guard = stdin.lock();
guard.read_to_end(&mut buf)
.expect("failed to read stdin");
// TODO: just skip over invalid utf-8 characters
String::from_utf8(buf)
.expect("invalid utf-8")
buf
};
let rows = input
.chars()
.map(|c| {
let mut name = Optional::None;
let mut category = Optional::None;
let mut char_combining_class = Optional::None;
let mut combining = false;
if let Some(char_data) = data.get(c as u32) {
name = Optional::Some(char_data.name());
category = Optional::Some(DisplayCategory {
category: char_data.category(),
full_name: args.full_category_names,
});
let ccc = char_data.ccc();
char_combining_class = Optional::Some(ccc);
combining = ccc.is_combining();
}
let display_char = {
let mut buf = CappedString::empty();
if combining {
buf.push_truncating('\u{25cc}');
}
buf.push_truncating(c);
buf
};
OutRow {
display_char,
codepoint: Codepoint(c),
utf_8_bytes: Utf8Bytes(c),
name,
category,
char_combining_class,
}
});
.decode_utf8()
.map(|c| OutRow::from_char_result(c, args.full_category_names));
let table = Table::new(rows)
.with(Style::modern());
@ -83,7 +40,7 @@ struct OutRow {
#[tabled(rename = "")]
display_char: CappedString<8>,
#[tabled(rename = "Code")]
codepoint: Codepoint,
codepoint: Optional<Codepoint>,
#[tabled(rename = "UTF-8")]
utf_8_bytes: Utf8Bytes,
#[tabled(rename = "Name")]
@ -94,6 +51,69 @@ struct OutRow {
char_combining_class: Optional<CombiningClass>,
}
impl OutRow {
    /// Builds a table row from one item produced by the UTF-8 decoder.
    fn from_char_result(c: Result<char, Utf8Error>, full_category_names: bool) -> Self {
        c.map_or_else(Self::from_bad_char, |good| {
            Self::from_good_char(good, full_category_names)
        })
    }

    /// Builds the row for a successfully decoded character, filling in the name, category and
    /// combining class when `char_data` has an entry for it.
    fn from_good_char(c: char, full_category_names: bool) -> Self {
        let (name, category, char_combining_class, combining) = match char_data(c) {
            Some(data) => {
                let name = data.name();
                let category = DisplayCategory {
                    category: data.category(),
                    full_name: full_category_names,
                };
                let ccc = data.combining_class();
                (
                    Optional::Some(name),
                    Optional::Some(category),
                    Optional::Some(ccc),
                    ccc.is_combining(),
                )
            }
            None => (Optional::None, Optional::None, Optional::None, false),
        };

        // Combining characters are shown on top of a dotted circle (U+25CC).
        let mut display_char = CappedString::empty();
        if combining {
            display_char.push_truncating('\u{25cc}');
        }
        display_char.push_truncating(c);

        Self {
            display_char,
            codepoint: Optional::Some(Codepoint(c)),
            utf_8_bytes: Utf8Bytes::from_char(c),
            name,
            category,
            char_combining_class,
        }
    }

    /// Builds the row for a malformed byte sequence: it is displayed as U+FFFD, named
    /// `<invalid>`, and lists the offending bytes.
    fn from_bad_char(err: Utf8Error) -> Self {
        let (buf, len) = err.into_parts();

        Self {
            display_char: CappedString::new_truncating("\u{fffd}"),
            codepoint: Optional::None,
            utf_8_bytes: Utf8Bytes { buf, len },
            name: Optional::Some("<invalid>"),
            category: Optional::None,
            char_combining_class: Optional::None,
        }
    }
}
#[derive(Debug)]
enum Optional<T> {
Some(T),
@ -122,13 +142,27 @@ impl fmt::Display for Codepoint {
}
#[derive(Debug)]
struct Utf8Bytes(char);
struct Utf8Bytes {
buf: [u8; 4],
len: usize,
}
impl Utf8Bytes {
    /// Encodes `c` as UTF-8 and stores the resulting bytes along with their count.
    fn from_char(c: char) -> Self {
        let mut buf = [0u8; 4];
        let len = c.encode_utf8(&mut buf).len();
        Self { buf, len }
    }

    /// Returns the stored bytes, without the unused tail of the buffer.
    fn bytes(&self) -> &[u8] {
        let Self { buf, len } = self;
        &buf[..*len]
    }
}
impl fmt::Display for Utf8Bytes {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut buf = [0u8; 4];
let s = self.0.encode_utf8(&mut buf);
let mut bytes = s.bytes();
let mut bytes = self.bytes().iter().copied();
if let Some(b) = bytes.next() {
write!(f, "0x{:02x}", b)?;
for b in bytes {

@ -4,7 +4,7 @@ use std::fmt;
pub struct CharData<'a> {
name: &'a str,
category: Category,
ccc: CombiningClass,
combining_class: CombiningClass,
}
impl<'a> CharData<'a> {
@ -22,8 +22,8 @@ impl<'a> CharData<'a> {
Some((codepoint, Self::from_parts(name, category, ccc)))
}
pub fn from_parts(name: &'a str, category: Category, ccc: CombiningClass) -> Self {
Self { name, category, ccc }
pub fn from_parts(name: &'a str, category: Category, combining_class: CombiningClass) -> Self {
Self { name, category, combining_class }
}
pub fn with_name<'b>(self, name: &'a str) -> CharData<'b>
@ -41,44 +41,43 @@ impl<'a> CharData<'a> {
self.category
}
pub fn ccc(&self) -> CombiningClass {
self.ccc
pub fn combining_class(&self) -> CombiningClass {
self.combining_class
}
}
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(u8)]
pub enum Category {
Lu = 0,
Ll = 1,
Lt = 2,
Mn = 3,
Mc = 4,
Me = 5,
Nd = 6,
Nl = 7,
No = 8,
Zs = 9,
Zl = 10,
Zp = 11,
Cc = 12,
Cf = 13,
Cs = 14,
Co = 15,
Cn = 16,
Lm = 17,
Lo = 18,
Pc = 19,
Pd = 20,
Ps = 21,
Pe = 22,
Pi = 23,
Pf = 24,
Po = 25,
Sm = 26,
Sc = 27,
Sk = 28,
So = 29,
Lu,
Ll,
Lt,
Mn,
Mc,
Me,
Nd,
Nl,
No,
Zs,
Zl,
Zp,
Cc,
Cf,
Cs,
Co,
Cn,
Lm,
Lo,
Pc,
Pd,
Ps,
Pe,
Pi,
Pf,
Po,
Sm,
Sc,
Sk,
So,
}
impl Category {

@ -7,7 +7,13 @@ const DATA_ENTRY_SIZE: usize = 8;
const DATA_INIT_FLAG: u8 = 1;
const DATA_REPEATED_FLAG: u8 = 2;
fn encode_char_data(name_index: u32, category: Category, ccc: CombiningClass, repeated: bool) -> [u8; DATA_ENTRY_SIZE] {
fn encode_char_data(
name_index: u32,
category: Category,
combining_class: CombiningClass,
repeated: bool
) -> [u8; DATA_ENTRY_SIZE]
{
let mut buf = [0u8; DATA_ENTRY_SIZE];
buf[0] |= DATA_INIT_FLAG;
@ -18,12 +24,14 @@ fn encode_char_data(name_index: u32, category: Category, ccc: CombiningClass, re
buf[1..5].copy_from_slice(&name_index.to_le_bytes());
buf[5] = category.byte_repr();
buf[6] = ccc.0;
buf[6] = combining_class.0;
buf
}
fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, CombiningClass, bool)> {
fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE])
-> Option<(u32, Category, CombiningClass, bool)>
{
let flags = bytes[0];
if flags & DATA_INIT_FLAG == 0 {
@ -32,10 +40,10 @@ fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, Comb
let name_index = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
let category = Category::from_byte(bytes[5])?;
let ccc = CombiningClass(bytes[6]);
let combining_class = CombiningClass(bytes[6]);
let repeated = flags & DATA_REPEATED_FLAG != 0;
Some((name_index, category, ccc, repeated))
Some((name_index, category, combining_class, repeated))
}
pub struct DataBuf {
@ -62,7 +70,10 @@ impl DataBuf {
return Ok(());
}
let repeated = range.end - range.start > 1;
let repeated = range.end
.checked_sub(range.start)
.map(|len| len > 1)
.unwrap_or(false);
let range = {
let start = usize::try_from(range.start)
@ -86,7 +97,7 @@ impl DataBuf {
let encoded_char_data = encode_char_data(
name_index,
char_data.category(),
char_data.ccc(),
char_data.combining_class(),
repeated
);
@ -121,8 +132,8 @@ pub struct Data<'a> {
}
impl<'a> Data<'a> {
pub fn get(self, codepoint: u32) -> Option<CharData<'a>> {
let index = usize::try_from(codepoint).ok()?;
pub fn get(self, codepoint: char) -> Option<CharData<'a>> {
let index = usize::try_from(u32::from(codepoint)).ok()?;
let start = index.checked_mul(DATA_ENTRY_SIZE)?;
let end = start.checked_add(DATA_ENTRY_SIZE)?;
let encoded = self.data.get(start..end)?;

Loading…
Cancel
Save