✨ support for invalid utf8

3 years ago · 96d328c829
parent 26ee43af2e
commit 96d328c829
11 changed files with 467 additions and 133 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -33,9 +33,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
 [[package]]
 name = "clap"
-version = "3.2.22"
+version = "3.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750"
+checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
 dependencies = [
 "atty",
 "bitflags",
@ -50,9 +50,9 @@ dependencies = [
 [[package]]
 name = "clap_derive"
-version = "3.2.18"
+version = "3.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65"
+checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008"
 dependencies = [
 "heck",
 "proc-macro-error",
@ -84,9 +84,9 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
 [[package]]
 name = "heck"
-version = "0.4.0"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 [[package]]
 name = "hermit-abi"
@ -99,9 +99,9 @@ dependencies = [
 [[package]]
 name = "indexmap"
-version = "1.9.1"
+version = "1.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
 dependencies = [
 "autocfg",
 "hashbrown",
@ -109,29 +109,29 @@ dependencies = [
 [[package]]
 name = "libc"
-version = "0.2.132"
+version = "0.2.144"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5"
+checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
 [[package]]
 name = "libshire"
 version = "0.1.0"
-source = "git+https://github.com/pantonshire/libshire?branch=main#7253d950108c729141239f4add4b3df67a54db31"
+source = "git+https://github.com/pantonshire/libshire?branch=main#44e27e9d2387c092d66ddfd871932e85b135499f"
 dependencies = [
 "serde",
 ]
 [[package]]
 name = "once_cell"
-version = "1.14.0"
+version = "1.17.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f7254b99e31cad77da24b08ebf628882739a608578bb1bcdfc1f9c21260d7c0"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
 [[package]]
 name = "os_str_bytes"
-version = "6.3.0"
+version = "6.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff"
+checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
 [[package]]
 name = "papergrid"
@ -170,27 +170,27 @@ dependencies = [
 [[package]]
 name = "proc-macro2"
-version = "1.0.43"
+version = "1.0.58"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
+checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
 dependencies = [
 "unicode-ident",
 ]
 [[package]]
 name = "quote"
-version = "1.0.21"
+version = "1.0.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
+checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
 dependencies = [
 "proc-macro2",
 ]
 [[package]]
 name = "serde"
-version = "1.0.144"
+version = "1.0.163"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860"
+checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
 [[package]]
 name = "strsim"
@ -200,9 +200,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
 [[package]]
 name = "syn"
-version = "1.0.99"
+version = "1.0.109"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
 dependencies = [
 "proc-macro2",
 "quote",
@ -235,24 +235,24 @@ dependencies = [
 [[package]]
 name = "termcolor"
-version = "1.1.3"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755"
+checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
 dependencies = [
 "winapi-util",
 ]
 [[package]]
 name = "textwrap"
-version = "0.15.1"
+version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16"
+checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
 [[package]]
 name = "unicode-ident"
-version = "1.0.4"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
+checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
 [[package]]
 name = "unicode-width"
@ -263,11 +263,19 @@ checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
 [[package]]
 name = "utfdump"
 version = "0.1.0"
 dependencies = [
 "once_cell",
 "utfdump_core",
 ]
 [[package]]
 name = "utfdump_bin"
 version = "0.1.0"
 dependencies = [
 "clap",
 "libshire",
 "tabled",
- "utfdump_core",
+ "utfdump",
 ]
 [[package]]
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,5 +1,6 @@
 [workspace]
 members = [
    "utfdump_core",
    "utfdump",
    "utfdump_bin",
 ]
--- a/get_data.sh
+++ b/get_data.sh
@ -1,2 +1,2 @@
 #!/bin/bash
-curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump_bin/unicode_data_latest.txt
+curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt
--- a/utfdump/Cargo.toml
+++ b/utfdump/Cargo.toml
@ -0,0 +1,11 @@
 [package]
 name = "utfdump"
 version = "0.1.0"
 edition = "2021"
 [dependencies]
 utfdump_core = { path = "../utfdump_core" }
 once_cell = "1.17.1"
 [build-dependencies]
 utfdump_core = { path = "../utfdump_core" }
--- a/utfdump_bin/build.rs
+++ b/utfdump_bin/build.rs
--- a/utfdump/src/lib.rs
+++ b/utfdump/src/lib.rs
@ -0,0 +1,18 @@
 pub mod utf8;
 pub use utfdump_core::chardata::{CharData, Category, CombiningClass};
 use once_cell::sync::Lazy;
 use utfdump_core::encoded::Data;
 /// Encoded Unicode character data, embedded at compile time from the `unicode_data_encoded`
 /// file in Cargo's `OUT_DIR` (presumably written by this crate's build script).
 const UNICODE_DATA_BYTES: &[u8] = include_bytes!(
    concat!(env!("OUT_DIR"), "/unicode_data_encoded")
 );
 /// View over [`UNICODE_DATA_BYTES`], constructed the first time it is accessed.
 static UNICODE_DATA: Lazy<Data> = Lazy::new(|| {
    // The bytes are baked in at compile time, so failing to decode them is a broken build
    // rather than a recoverable runtime condition; say so in the panic message.
    Data::from_bytes(UNICODE_DATA_BYTES)
        .expect("embedded unicode data should be well-formed")
 });
 pub fn char_data(c: char) -> Option<CharData<'static>> {
    UNICODE_DATA.get(c)
 }
--- a/utfdump/src/utf8.rs
+++ b/utfdump/src/utf8.rs
@ -0,0 +1,255 @@
 use std::iter::Peekable;
 /// Conversion of a byte-like value into a plain `u8`.
 ///
 /// Implemented for `u8` itself and for references to any `ToByte` type, so that iterators over
 /// owned bytes and iterators over borrowed bytes can be handled uniformly.
 pub trait ToByte {
    /// Consumes the value and yields its byte.
    fn to_byte(self) -> u8;
    /// Yields the byte without consuming the value.
    fn as_byte(&self) -> u8;
 }
 impl ToByte for u8 {
    fn to_byte(self) -> u8 {
        self
    }
    fn as_byte(&self) -> u8 {
        // Deliberately a plain dereference: calling `self.to_byte()` here would resolve to the
        // reference impl below and recurse.
        *self
    }
 }
 impl<B: ToByte> ToByte for &B {
    fn to_byte(self) -> u8 {
        B::as_byte(self)
    }
    fn as_byte(&self) -> u8 {
        // `*self` is the inner `&B`; read the byte through it.
        B::as_byte(*self)
    }
 }
 /// Extension trait that adds [`decode_utf8`](Utf8Decode::decode_utf8) to byte sources.
 pub trait Utf8Decode {
    /// Iterator over the underlying byte-like items.
    type Iter: Iterator<Item = Self::Byte>;
    /// Byte-like item type produced by [`Self::Iter`].
    type Byte: ToByte;
    /// Wraps this byte source in a [`Utf8Decoder`].
    fn decode_utf8(self) -> Utf8Decoder<Self::Iter, Self::Byte>;
 }
 // Blanket impl: anything that can be iterated to produce byte-like items can be decoded.
 impl<T, B> Utf8Decode for T
 where
    B: ToByte,
    T: IntoIterator<Item = B>,
 {
    type Iter = T::IntoIter;
    type Byte = B;
    fn decode_utf8(self) -> Utf8Decoder<Self::Iter, B> {
        let bytes = self.into_iter();
        Utf8Decoder::new(bytes)
    }
 }
 /// Streaming UTF-8 decoder over an iterator of byte-like items.
 ///
 /// Based on the WHATWG UTF-8 decoder: <https://encoding.spec.whatwg.org/#utf-8-decoder>
 pub struct Utf8Decoder<I, B>
 where
    I: Iterator<Item = B>,
    B: ToByte,
 {
    // Wrapped in `Peekable` so the next byte can be inspected without being consumed.
    bytes: Peekable<I>,
 }
 impl<I, B> Utf8Decoder<I, B>
 where
    I: Iterator<Item = B>,
    B: ToByte,
 {
    /// Creates a decoder that reads from `bytes`.
    fn new(bytes: I) -> Self {
        let bytes = bytes.peekable();
        Self { bytes }
    }
 }
 /// Decodes one item per call, following the WHATWG UTF-8 decoder.
 ///
 /// Each item is either a decoded `char` or a [`Utf8Error`] holding exactly the bytes that were
 /// consumed for the ill-formed sequence. A byte that merely terminates an ill-formed sequence is
 /// left in the input and is decoded as the start of the next item, so every input byte is
 /// reported exactly once.
 impl<I, B> Iterator for Utf8Decoder<I, B>
 where
    I: Iterator<Item = B>,
    B: ToByte,
 {
    type Item = Result<char, Utf8Error>;
    fn next(&mut self) -> Option<Self::Item> {
        const DEFAULT_BOUNDARIES: (u8, u8) = (0x80, 0xbf);
        // Bytes consumed so far for the current item, so that an error can report them. Only
        // consumed bytes are ever stored here; a peeked byte that fails validation is not.
        let mut bytes_seen = [0u8; 4];
        let mut codepoint: u32;
        let bytes_needed: u8;
        let mut lower_boundary: u8;
        let mut upper_boundary: u8;
        let first_byte = self.bytes.next()?.to_byte();
        bytes_seen[0] = first_byte;
        match first_byte {
            // ASCII: a complete character on its own.
            byte @ 0x00..=0x7f => {
                return Some(Ok(char::from(byte)));
            },
            // Lead byte of a two-byte sequence.
            byte @ 0xc2..=0xdf => {
                bytes_needed = 1;
                codepoint = u32::from(byte & 0x1f) << 6;
                (lower_boundary, upper_boundary) = DEFAULT_BOUNDARIES;
            },
            // Lead byte of a three-byte sequence; 0xe0 and 0xed narrow the range allowed for the
            // second byte.
            byte @ 0xe0..=0xef => {
                bytes_needed = 2;
                codepoint = u32::from(byte & 0x0f) << 12;
                (lower_boundary, upper_boundary) = match byte {
                    0xe0 => (0xa0, 0xbf),
                    0xed => (0x80, 0x9f),
                    _ => DEFAULT_BOUNDARIES,
                };
            },
            // Lead byte of a four-byte sequence; 0xf0 and 0xf4 narrow the range allowed for the
            // second byte.
            byte @ 0xf0..=0xf4 => {
                bytes_needed = 3;
                codepoint = u32::from(byte & 0x07) << 18;
                (lower_boundary, upper_boundary) = match byte {
                    0xf0 => (0x90, 0xbf),
                    0xf4 => (0x80, 0x8f),
                    _ => DEFAULT_BOUNDARIES,
                };
            },
            // Anything else cannot start a sequence.
            _ => {
                return Some(Err(Utf8Error {
                    bad_bytes: bytes_seen,
                    num_bad_bytes: 1,
                }));
            },
        }
        for i in 0..bytes_needed {
            // Number of bytes consumed so far: the lead byte plus `i` continuation bytes.
            let consumed = usize::from(i) + 1;
            // Peek the byte rather than consuming it; the specification says we should not consume
            // the byte here if it is not between the upper and lower boundaries.
            let byte = match self.bytes.peek() {
                Some(byte) => byte.as_byte(),
                None => return Some(Err(Utf8Error {
                    bad_bytes: bytes_seen,
                    num_bad_bytes: consumed,
                })),
            };
            if !(lower_boundary..=upper_boundary).contains(&byte) {
                // The offending byte stays in the input and will be decoded by the next call, so
                // it must not be counted among this error's bytes.
                return Some(Err(Utf8Error {
                    bad_bytes: bytes_seen,
                    num_bad_bytes: consumed,
                }));
            }
            // Consume the byte we peeked, and only now record it as seen.
            self.bytes.next();
            bytes_seen[consumed] = byte;
            // Only the second byte of a sequence can have a restricted range.
            (lower_boundary, upper_boundary) = DEFAULT_BOUNDARIES;
            // OR the 6 least significant bits into the codepoint.
            codepoint |= u32::from(byte & 0x3f) << (6 * (bytes_needed - i - 1));
        }
        // The lead-byte ranges and second-byte boundaries above exclude surrogates and values
        // beyond U+10FFFF, so the conversion cannot fail.
        let codepoint = char::try_from(codepoint)
            .expect("boundary checks should only admit valid scalar values");
        Some(Ok(codepoint))
    }
 }
 /// Error describing an ill-formed UTF-8 byte sequence.
 ///
 /// Stores up to four bytes; only the first `num_bad_bytes` entries of `bad_bytes` are
 /// meaningful.
 #[derive(Debug, Clone)]
 pub struct Utf8Error {
    bad_bytes: [u8; 4],
    num_bad_bytes: usize,
 }
 impl Utf8Error {
    /// Returns the meaningful bytes of the ill-formed sequence.
    pub fn bytes(&self) -> &[u8] {
        &self.bad_bytes[..self.num_bad_bytes]
    }
    /// Splits the error into its raw four-byte buffer and the number of meaningful bytes in it.
    pub fn into_parts(self) -> ([u8; 4], usize) {
        (self.bad_bytes, self.num_bad_bytes)
    }
 }
 impl std::fmt::Display for Utf8Error {
    /// Writes a short message followed by the offending bytes in hexadecimal.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("invalid utf-8 sequence:")?;
        for byte in self.bytes() {
            write!(f, " 0x{:02x}", byte)?;
        }
        Ok(())
    }
 }
 impl std::error::Error for Utf8Error {}
 #[cfg(test)]
 mod tests {
    use std::char::REPLACEMENT_CHARACTER;
    use super::Utf8Decode;
    /// Checks lossy decoding of well-formed input, ill-formed input, and the boundary cases
    /// around the restricted second-byte ranges.
    #[test]
    fn test_utf8_decoder() {
        // Empty input produces no items.
        assert_lossy(&[], "");
        // Well-formed input of every sequence length.
        assert_lossy(&[0x68, 0x65, 0x6c, 0x6c, 0x6f], "hello");
        assert_lossy(
            &[0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5],
            "κόσμε",
        );
        assert_lossy(
            &[
                0xf0, 0x9f, 0x8f, 0xb3, 0xef, 0xb8, 0x8f, 0xe2, 0x80, 0x8d, 0xe2, 0x9a, 0xa7, 0xef,
                0xb8, 0x8f,
            ],
            "\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}",
        );
        // Code points at the edges of the restricted second-byte ranges.
        assert_lossy(&[0xe0, 0xa0, 0x80], "\u{800}");
        assert_lossy(&[0xed, 0x9f, 0xbf], "\u{d7ff}");
        assert_lossy(&[0xf0, 0x90, 0x80, 0x80], "\u{10000}");
        assert_lossy(&[0xf4, 0x8f, 0xbf, 0xbf], "\u{10ffff}");
        // A lead byte followed by a byte that is not a continuation byte: the follower is
        // decoded on its own.
        assert_lossy(&[0xce, 0x61], "\u{fffd}a");
        assert_lossy(&[0xce, 0xc2], "\u{fffd}\u{fffd}");
        // Lone continuation bytes.
        assert_lossy(&[0x80], "\u{fffd}");
        assert_lossy(&[0x80, 0x80], "\u{fffd}\u{fffd}");
        // Sequences cut short by the end of input yield a single error.
        assert_lossy(&[0xe2, 0x82], "\u{fffd}");
        assert_lossy(&[0xf0, 0x9f], "\u{fffd}");
        // Overlong encodings, surrogates and values above U+10FFFF are rejected byte by byte.
        assert_lossy(&[0xc0, 0x80], "\u{fffd}\u{fffd}");
        assert_lossy(&[0xe0, 0x80, 0x80], "\u{fffd}\u{fffd}\u{fffd}");
        assert_lossy(&[0xed, 0xa0, 0x80], "\u{fffd}\u{fffd}\u{fffd}");
        assert_lossy(&[0xf4, 0x90, 0x80, 0x80], "\u{fffd}\u{fffd}\u{fffd}\u{fffd}");
        assert_lossy(&[0xf5], "\u{fffd}");
    }
    /// Asserts that lossily decoding `input` produces `expected`.
    fn assert_lossy(input: &[u8], expected: &str) {
        assert_eq!(decode_collect_lossy(input), expected, "input: {:02x?}", input);
    }
    /// Decodes `bytes`, substituting U+FFFD for every decoding error.
    fn decode_collect_lossy(bytes: &[u8]) -> String {
        bytes
            .decode_utf8()
            .map(|res| res.unwrap_or(REPLACEMENT_CHARACTER))
            .collect()
    }
 }
--- a/utfdump_bin/Cargo.toml
+++ b/utfdump_bin/Cargo.toml
@ -1,5 +1,5 @@
 [package]
-name = "utfdump"
+name = "utfdump_bin"
 version = "0.1.0"
 edition = "2021"
 authors = ["Tom Panton <pantonshire@gmail.com>"]
@ -8,10 +8,7 @@ repository = "https://github.com/pantonshire/utfdump"
 description = "Command-line Unicode character info tool"
 [dependencies]
-utfdump_core = { path = "../utfdump_core" }
+utfdump = { path = "../utfdump" }
 libshire = { git = "https://github.com/pantonshire/libshire", branch = "main" }
 tabled = "0.8.0"
 clap = { version = "3.2.22", features = ["derive"] }
 [build-dependencies]
 utfdump_core = { path = "../utfdump_core" }
--- a/utfdump_bin/src/main.rs
+++ b/utfdump_bin/src/main.rs
@ -3,10 +3,7 @@ use std::{fmt, io::{self, Read}};
 use clap::Parser;
 use libshire::strings::CappedString;
 use tabled::{Tabled, Table, Style};
-
+use utfdump::{char_data, CombiningClass, Category, utf8::{Utf8Decode, Utf8Error}};
 use utfdump_core::{chardata::{Category, CombiningClass}, encoded::Data};
 const UNICODE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/unicode_data_encoded"));
 #[derive(Parser)]
 #[clap(author, version, about, long_about = None)]
@ -19,58 +16,18 @@ struct Args {
 fn main() {
    let args = Args::parse();
    let data = Data::<'static>::from_bytes(UNICODE_DATA).unwrap();
    let input = {
        let mut buf = Vec::<u8>::new();
        let stdin = io::stdin();
        let mut guard = stdin.lock();
        guard.read_to_end(&mut buf)
            .expect("failed to read stdin");
-        // TODO: just skip over invalid utf-8 characters
+        buf
        String::from_utf8(buf)
            .expect("invalid utf-8")
    };
    let rows = input
-        .chars()
+        .decode_utf8()
-        .map(|c| {
+        .map(|c| OutRow::from_char_result(c, args.full_category_names));
            let mut name = Optional::None;
            let mut category = Optional::None;
            let mut char_combining_class = Optional::None;
            let mut combining = false;
            if let Some(char_data) = data.get(c as u32) {
                name = Optional::Some(char_data.name());
                category = Optional::Some(DisplayCategory {
                    category: char_data.category(),
                    full_name: args.full_category_names,
                });
                let ccc = char_data.ccc();
                char_combining_class = Optional::Some(ccc);
                combining = ccc.is_combining();
            }
            let display_char = {
                let mut buf = CappedString::empty();
                if combining {
                    buf.push_truncating('\u{25cc}');
                }
                buf.push_truncating(c);
                buf
            };
            OutRow {
                display_char,
                codepoint: Codepoint(c),
                utf_8_bytes: Utf8Bytes(c),
                name,
                category,
                char_combining_class,
            }
        });
    let table = Table::new(rows)
        .with(Style::modern());
@ -83,7 +40,7 @@ struct OutRow {
    #[tabled(rename = "")]
    display_char: CappedString<8>, 
    #[tabled(rename = "Code")]
-    codepoint: Codepoint,
+    codepoint: Optional<Codepoint>,
    #[tabled(rename = "UTF-8")]
    utf_8_bytes: Utf8Bytes,
    #[tabled(rename = "Name")]
@ -94,6 +51,69 @@ struct OutRow {
    char_combining_class: Optional<CombiningClass>,
 }
 impl OutRow {
    /// Builds a table row from one decoder item: a decoded character or a decoding error.
    fn from_char_result(c: Result<char, Utf8Error>, full_category_names: bool) -> Self {
        c.map_or_else(
            Self::from_bad_char,
            |good| Self::from_good_char(good, full_category_names),
        )
    }
    /// Builds the row for a successfully decoded character.
    ///
    /// Name, category and combining class are filled in only when `char_data` has an entry for
    /// `c`; otherwise they are left as `Optional::None`.
    fn from_good_char(c: char, full_category_names: bool) -> Self {
        let (name, category, char_combining_class, combining) = match char_data(c) {
            Some(data) => {
                let name = data.name();
                let category = DisplayCategory {
                    category: data.category(),
                    full_name: full_category_names,
                };
                let ccc = data.combining_class();
                (
                    Optional::Some(name),
                    Optional::Some(category),
                    Optional::Some(ccc),
                    ccc.is_combining(),
                )
            },
            None => (Optional::None, Optional::None, Optional::None, false),
        };
        let mut display_char = CappedString::empty();
        if combining {
            // Put a dotted circle (U+25CC) in front of combining characters.
            display_char.push_truncating('\u{25cc}');
        }
        display_char.push_truncating(c);
        Self {
            display_char,
            codepoint: Optional::Some(Codepoint(c)),
            utf_8_bytes: Utf8Bytes::from_char(c),
            name,
            category,
            char_combining_class,
        }
    }
    /// Builds the row for an ill-formed byte sequence: shown as U+FFFD, with the offending bytes
    /// in the UTF-8 column and no codepoint, category or combining class.
    fn from_bad_char(err: Utf8Error) -> Self {
        let (buf, len) = err.into_parts();
        let utf_8_bytes = Utf8Bytes { buf, len };
        Self {
            display_char: CappedString::new_truncating("\u{fffd}"),
            codepoint: Optional::None,
            utf_8_bytes,
            name: Optional::Some("<invalid>"),
            category: Optional::None,
            char_combining_class: Optional::None,
        }
    }
 }
 #[derive(Debug)]
 enum Optional<T> {
    Some(T),
@ -122,13 +142,27 @@ impl fmt::Display for Codepoint {
 }
 #[derive(Debug)]
-struct Utf8Bytes(char);
+struct Utf8Bytes {
    buf: [u8; 4],
    len: usize,
 }
 impl Utf8Bytes {
    /// Encodes `c` as UTF-8 into a four-byte buffer and records how many bytes were written.
    fn from_char(c: char) -> Self {
        let mut buf = [0u8; 4];
        let len = c.encode_utf8(&mut buf).len();
        Self { buf, len }
    }
    /// Returns the first `len` bytes of the buffer.
    fn bytes(&self) -> &[u8] {
        let used = ..self.len;
        &self.buf[used]
    }
 }
 impl fmt::Display for Utf8Bytes {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let mut buf = [0u8; 4];
+        let mut bytes = self.bytes().iter().copied();
        let s = self.0.encode_utf8(&mut buf);
        let mut bytes = s.bytes();
        if let Some(b) = bytes.next() {
            write!(f, "0x{:02x}", b)?;
            for b in bytes {
--- a/utfdump_core/src/chardata.rs
+++ b/utfdump_core/src/chardata.rs
@ -4,7 +4,7 @@ use std::fmt;
 pub struct CharData<'a> {
    name: &'a str,
    category: Category,
-    ccc: CombiningClass,
+    combining_class: CombiningClass,
 }
 impl<'a> CharData<'a> {
@ -22,8 +22,8 @@ impl<'a> CharData<'a> {
        Some((codepoint, Self::from_parts(name, category, ccc)))
    }
-    pub fn from_parts(name: &'a str, category: Category, ccc: CombiningClass) -> Self {
+    pub fn from_parts(name: &'a str, category: Category, combining_class: CombiningClass) -> Self {
-        Self { name, category, ccc }
+        Self { name, category, combining_class }
    }
    pub fn with_name<'b>(self, name: &'a str) -> CharData<'b>
@ -41,44 +41,43 @@ impl<'a> CharData<'a> {
        self.category
    }
-    pub fn ccc(&self) -> CombiningClass {
+    pub fn combining_class(&self) -> CombiningClass {
-        self.ccc
+        self.combining_class
    }
 }
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 #[repr(u8)]
 pub enum Category {
-    Lu = 0,
+    Lu,
-    Ll = 1,
+    Ll,
-    Lt = 2,
+    Lt,
-    Mn = 3,
+    Mn,
-    Mc = 4,
+    Mc,
-    Me = 5,
+    Me,
-    Nd = 6,
+    Nd,
-    Nl = 7,
+    Nl,
-    No = 8,
+    No,
-    Zs = 9,
+    Zs,
-    Zl = 10,
+    Zl,
-    Zp = 11,
+    Zp,
-    Cc = 12,
+    Cc,
-    Cf = 13,
+    Cf,
-    Cs = 14,
+    Cs,
-    Co = 15,
+    Co,
-    Cn = 16,
+    Cn,
-    Lm = 17,
+    Lm,
-    Lo = 18,
+    Lo,
-    Pc = 19,
+    Pc,
-    Pd = 20,
+    Pd,
-    Ps = 21,
+    Ps,
-    Pe = 22,
+    Pe,
-    Pi = 23,
+    Pi,
-    Pf = 24,
+    Pf,
-    Po = 25,
+    Po,
-    Sm = 26,
+    Sm,
-    Sc = 27,
+    Sc,
-    Sk = 28,
+    Sk,
-    So = 29,
+    So,
 }
 impl Category {
--- a/utfdump_core/src/encoded.rs
+++ b/utfdump_core/src/encoded.rs
@ -7,7 +7,13 @@ const DATA_ENTRY_SIZE: usize = 8;
 const DATA_INIT_FLAG: u8 = 1;
 const DATA_REPEATED_FLAG: u8 = 2;
-fn encode_char_data(name_index: u32, category: Category, ccc: CombiningClass, repeated: bool) -> [u8; DATA_ENTRY_SIZE] {
+fn encode_char_data(
    name_index: u32,
    category: Category,
    combining_class: CombiningClass,
    repeated: bool
 ) -> [u8; DATA_ENTRY_SIZE]
 {
    let mut buf = [0u8; DATA_ENTRY_SIZE];
    buf[0] |= DATA_INIT_FLAG;
@ -18,12 +24,14 @@ fn encode_char_data(name_index: u32, category: Category, ccc: CombiningClass, re
    buf[1..5].copy_from_slice(&name_index.to_le_bytes());
    buf[5] = category.byte_repr();
-    buf[6] = ccc.0;
+    buf[6] = combining_class.0;
    buf
 }
-fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, CombiningClass, bool)> {
+fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE])
    -> Option<(u32, Category, CombiningClass, bool)>
 {
    let flags = bytes[0];
    if flags & DATA_INIT_FLAG == 0 {
@ -32,10 +40,10 @@ fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, Comb
    let name_index = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
    let category = Category::from_byte(bytes[5])?;
-    let ccc = CombiningClass(bytes[6]);
+    let combining_class = CombiningClass(bytes[6]);
    let repeated = flags & DATA_REPEATED_FLAG != 0;
-    Some((name_index, category, ccc, repeated))
+    Some((name_index, category, combining_class, repeated))
 }
 pub struct DataBuf {
@ -62,7 +70,10 @@ impl DataBuf {
            return Ok(());
        }
-        let repeated = range.end - range.start > 1;
+        let repeated = range.end
            .checked_sub(range.start)
            .map(|len| len > 1)
            .unwrap_or(false);
        let range = {
            let start = usize::try_from(range.start)
@ -86,7 +97,7 @@ impl DataBuf {
        let encoded_char_data = encode_char_data(
            name_index,
            char_data.category(),
-            char_data.ccc(),
+            char_data.combining_class(),
            repeated
        );
@ -121,8 +132,8 @@ pub struct Data<'a> {
 }
 impl<'a> Data<'a> {
-    pub fn get(self, codepoint: u32) -> Option<CharData<'a>> {
+    pub fn get(self, codepoint: char) -> Option<CharData<'a>> {
-        let index = usize::try_from(codepoint).ok()?;
+        let index = usize::try_from(u32::from(codepoint)).ok()?;
        let start = index.checked_mul(DATA_ENTRY_SIZE)?;
        let end = start.checked_add(DATA_ENTRY_SIZE)?;
        let encoded = self.data.get(start..end)?;
`@ -1,2 +1,2 @@`
	`#!/bin/bash`	`#!/bin/bash`
	`curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump_bin/unicode_data_latest.txt`	`curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt`