From 96d328c829f59dddaeb6eca32ee26d931ddee2d7 Mon Sep 17 00:00:00 2001
From: pantonshire <tom@tomandtally.co.uk>
Date: Thu, 18 May 2023 17:04:30 +0100
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20support=20for=20invalid=20utf8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Cargo.lock                        |  68 ++++----
 Cargo.toml                        |   1 +
 get_data.sh                       |   2 +-
 utfdump/Cargo.toml                |  11 ++
 {utfdump_bin => utfdump}/build.rs |   0
 utfdump/src/lib.rs                |  18 +++
 utfdump/src/utf8.rs               | 255 ++++++++++++++++++++++++++++++
 utfdump_bin/Cargo.toml            |   7 +-
 utfdump_bin/src/main.rs           | 138 ++++++++++------
 utfdump_core/src/chardata.rs      |  71 ++++-----
 utfdump_core/src/encoded.rs       |  29 ++--
 11 files changed, 467 insertions(+), 133 deletions(-)
 create mode 100644 utfdump/Cargo.toml
 rename {utfdump_bin => utfdump}/build.rs (100%)
 create mode 100644 utfdump/src/lib.rs
 create mode 100644 utfdump/src/utf8.rs

diff --git a/Cargo.lock b/Cargo.lock
index 312f0a4..85026ee 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -33,9 +33,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
 
 [[package]]
 name = "clap"
-version = "3.2.22"
+version = "3.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750"
+checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
 dependencies = [
  "atty",
  "bitflags",
@@ -50,9 +50,9 @@ dependencies = [
 
 [[package]]
 name = "clap_derive"
-version = "3.2.18"
+version = "3.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65"
+checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008"
 dependencies = [
  "heck",
  "proc-macro-error",
@@ -84,9 +84,9 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
 
 [[package]]
 name = "heck"
-version = "0.4.0"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 
 [[package]]
 name = "hermit-abi"
@@ -99,9 +99,9 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "1.9.1"
+version = "1.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
 dependencies = [
  "autocfg",
  "hashbrown",
@@ -109,29 +109,29 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.132"
+version = "0.2.144"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5"
+checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
 
 [[package]]
 name = "libshire"
 version = "0.1.0"
-source = "git+https://github.com/pantonshire/libshire?branch=main#7253d950108c729141239f4add4b3df67a54db31"
+source = "git+https://github.com/pantonshire/libshire?branch=main#44e27e9d2387c092d66ddfd871932e85b135499f"
 dependencies = [
  "serde",
 ]
 
 [[package]]
 name = "once_cell"
-version = "1.14.0"
+version = "1.17.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f7254b99e31cad77da24b08ebf628882739a608578bb1bcdfc1f9c21260d7c0"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
 
 [[package]]
 name = "os_str_bytes"
-version = "6.3.0"
+version = "6.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff"
+checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
 
 [[package]]
 name = "papergrid"
@@ -170,27 +170,27 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.43"
+version = "1.0.58"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
+checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
 dependencies = [
  "unicode-ident",
 ]
 
 [[package]]
 name = "quote"
-version = "1.0.21"
+version = "1.0.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
+checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
 dependencies = [
  "proc-macro2",
 ]
 
 [[package]]
 name = "serde"
-version = "1.0.144"
+version = "1.0.163"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860"
+checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
 
 [[package]]
 name = "strsim"
@@ -200,9 +200,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
 
 [[package]]
 name = "syn"
-version = "1.0.99"
+version = "1.0.109"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -235,24 +235,24 @@ dependencies = [
 
 [[package]]
 name = "termcolor"
-version = "1.1.3"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755"
+checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
 dependencies = [
  "winapi-util",
 ]
 
 [[package]]
 name = "textwrap"
-version = "0.15.1"
+version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16"
+checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.4"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
+checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
 
 [[package]]
 name = "unicode-width"
@@ -263,11 +263,19 @@ checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
 [[package]]
 name = "utfdump"
 version = "0.1.0"
+dependencies = [
+ "once_cell",
+ "utfdump_core",
+]
+
+[[package]]
+name = "utfdump_bin"
+version = "0.1.0"
 dependencies = [
  "clap",
  "libshire",
  "tabled",
- "utfdump_core",
+ "utfdump",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 5170b07..e296b0c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,6 @@
 [workspace]
 members = [
     "utfdump_core",
+    "utfdump",
     "utfdump_bin",
 ]
diff --git a/get_data.sh b/get_data.sh
index 652cfdc..0254d53 100755
--- a/get_data.sh
+++ b/get_data.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump_bin/unicode_data_latest.txt
+curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt
diff --git a/utfdump/Cargo.toml b/utfdump/Cargo.toml
new file mode 100644
index 0000000..2e7c677
--- /dev/null
+++ b/utfdump/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "utfdump"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+utfdump_core = { path = "../utfdump_core" }
+once_cell = "1.17.1"
+
+[build-dependencies]
+utfdump_core = { path = "../utfdump_core" }
diff --git a/utfdump_bin/build.rs b/utfdump/build.rs
similarity index 100%
rename from utfdump_bin/build.rs
rename to utfdump/build.rs
diff --git a/utfdump/src/lib.rs b/utfdump/src/lib.rs
new file mode 100644
index 0000000..01125f2
--- /dev/null
+++ b/utfdump/src/lib.rs
@@ -0,0 +1,18 @@
+pub mod utf8;
+
+pub use utfdump_core::chardata::{CharData, Category, CombiningClass};
+
+use once_cell::sync::Lazy;
+use utfdump_core::encoded::Data;
+
+const UNICODE_DATA_BYTES: &[u8] = include_bytes!(
+    concat!(env!("OUT_DIR"), "/unicode_data_encoded")
+);
+
+// Decoded on first access. The table is embedded at compile time, so a failure here is a bug.
+static UNICODE_DATA: Lazy<Data> =
+    Lazy::new(|| Data::from_bytes(UNICODE_DATA_BYTES).expect("embedded Unicode data is invalid"));
+
+pub fn char_data(c: char) -> Option<CharData<'static>> {
+    UNICODE_DATA.get(c)
+}
diff --git a/utfdump/src/utf8.rs b/utfdump/src/utf8.rs
new file mode 100644
index 0000000..8cbee11
--- /dev/null
+++ b/utfdump/src/utf8.rs
@@ -0,0 +1,255 @@
+use std::iter::Peekable;
+
+pub trait ToByte {
+    fn to_byte(self) -> u8;
+
+    fn as_byte(&self) -> u8;
+}
+
+impl ToByte for u8 {
+    fn to_byte(self) -> u8 {
+        self
+    }
+
+    fn as_byte(&self) -> u8 {
+        *self
+    }
+}
+
+impl<'a, B> ToByte for &'a B
+where
+    B: ToByte,
+{
+    fn to_byte(self) -> u8 {
+        <B as ToByte>::as_byte(self)
+    }
+
+    fn as_byte(&self) -> u8 {
+        <Self as ToByte>::to_byte(*self)
+    }
+}
+
+pub trait Utf8Decode {
+    type Iter: Iterator<Item = Self::Byte>;
+    type Byte: ToByte;
+
+    fn decode_utf8(self) -> Utf8Decoder<Self::Iter, Self::Byte>;
+}
+
+impl<T, B> Utf8Decode for T
+where
+    T: IntoIterator<Item = B>,
+    B: ToByte,
+{
+    type Iter = <T as IntoIterator>::IntoIter;
+    type Byte = B;
+
+    fn decode_utf8(self) -> Utf8Decoder<Self::Iter, B> {
+        Utf8Decoder::new(self.into_iter())
+    }
+}
+
+/// Iterator that decodes bytes as UTF-8; see <https://encoding.spec.whatwg.org/#utf-8-decoder>.
+pub struct Utf8Decoder<I, B>
+where
+    I: Iterator<Item = B>,
+    B: ToByte,
+{
+    bytes: Peekable<I>, // peekable so that a byte can be inspected without being consumed
+}
+
+impl<I, B> Utf8Decoder<I, B>
+where
+    I: Iterator<Item = B>,
+    B: ToByte,
+{
+    fn new(bytes: I) -> Self {
+        Self {
+            bytes: bytes.peekable(),
+        }
+    }
+}
+
+impl<I, B> Iterator for Utf8Decoder<I, B>
+where
+    I: Iterator<Item = B>,
+    B: ToByte,
+{
+    type Item = Result<char, Utf8Error>;
+
+    /// Decodes the next character from the underlying bytes.
+    ///
+    /// Returns `None` once the input is exhausted. An ill-formed sequence yields a
+    /// [`Utf8Error`] holding only the bytes that were consumed; the byte that cut the sequence
+    /// short stays in the input and is decoded again by the following call.
+    fn next(&mut self) -> Option<Self::Item> {
+        const DEFAULT_BOUNDARIES: (u8, u8) = (0x80, 0xbf);
+
+        // Keep track of the bytes we have consumed so far, so that if there is an error we can
+        // return the problematic bytes. There is no need for a variable to store the number of
+        // bytes we have put into this array, since we can always work it out from other sources.
+        let mut bytes_seen = [0u8; 4];
+
+        let mut codepoint: u32;
+        let bytes_needed: u8;
+        let mut lower_boundary: u8;
+        let mut upper_boundary: u8;
+
+        let first_byte = self.bytes.next()?.to_byte();
+        bytes_seen[0] = first_byte;
+
+        match first_byte {
+            byte @ 0x00..=0x7f => {
+                return Some(Ok(char::from(byte)));
+            },
+
+            byte @ 0xc2..=0xdf => {
+                bytes_needed = 1;
+                codepoint = u32::from(byte & 0x1f) << 6;
+                (lower_boundary, upper_boundary) = DEFAULT_BOUNDARIES;
+            },
+
+            byte @ 0xe0..=0xef => {
+                bytes_needed = 2;
+                codepoint = u32::from(byte & 0x0f) << 12;
+                (lower_boundary, upper_boundary) = match byte {
+                    0xe0 => (0xa0, 0xbf),
+                    0xed => (0x80, 0x9f),
+                    _ => DEFAULT_BOUNDARIES,
+                };
+            },
+
+            byte @ 0xf0..=0xf4 => {
+                bytes_needed = 3;
+                codepoint = u32::from(byte & 0x07) << 18;
+                (lower_boundary, upper_boundary) = match byte {
+                    0xf0 => (0x90, 0xbf),
+                    0xf4 => (0x80, 0x8f),
+                    _ => DEFAULT_BOUNDARIES,
+                };
+            },
+
+            _ => {
+                return Some(Err(Utf8Error {
+                    bad_bytes: bytes_seen,
+                    num_bad_bytes: 1,
+                }));
+            },
+        }
+
+        for i in 0..bytes_needed {
+            // Peek the byte rather than consuming it; the specification says a byte outside the
+            // boundaries is not part of this sequence, so it must be left for the next call.
+            let byte = match self.bytes.peek().map(|byte| byte.as_byte()) {
+                Some(byte) if (lower_boundary..=upper_boundary).contains(&byte) => byte,
+                // The input ended or the sequence was cut short. Report only the bytes consumed
+                // so far, so that the offending byte is not reported again by the next call.
+                _ => return Some(Err(Utf8Error {
+                    bad_bytes: bytes_seen,
+                    num_bad_bytes: usize::from(i) + 1,
+                })),
+            };
+
+            // The byte is a valid continuation: consume it and record it.
+            self.bytes.next();
+            bytes_seen[usize::from(i) + 1] = byte;
+
+            // Only the first continuation byte has restricted boundaries.
+            (lower_boundary, upper_boundary) = DEFAULT_BOUNDARIES;
+
+            // OR the 6 least significant bits into the codepoint.
+            codepoint |= u32::from(byte & 0x3f) << (6 * (bytes_needed - i - 1));
+        }
+
+        // The boundaries above exclude surrogates and values above U+10FFFF, so this cannot fail.
+        let codepoint = char::try_from(codepoint)
+            .expect("UTF-8 decoder produced an invalid scalar value");
+
+        Some(Ok(codepoint))
+    }
+}
+
+/// The bytes of an ill-formed UTF-8 sequence, as reported by [`Utf8Decoder`].
+#[derive(Debug)]
+pub struct Utf8Error {
+    // Only the first `num_bad_bytes` entries are meaningful.
+    bad_bytes: [u8; 4],
+    num_bad_bytes: usize,
+}
+
+impl Utf8Error {
+    /// Returns the bytes recorded for this error.
+    pub fn bytes(&self) -> &[u8] {
+        &self.bad_bytes[..self.num_bad_bytes]
+    }
+
+    /// Returns the backing four-byte buffer and the number of leading bytes that are in use.
+    pub fn into_parts(self) -> ([u8; 4], usize) {
+        (self.bad_bytes, self.num_bad_bytes)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::char::REPLACEMENT_CHARACTER;
+
+    use super::Utf8Decode;
+
+    #[test]
+    fn test_utf8_decoder() {
+        assert_eq!(&decode_collect_lossy(&[0x68, 0x65, 0x6c, 0x6c, 0x6f]), "hello");
+        // "κόσμε" written with escapes so that text normalisation cannot alter the expectation.
+        assert_eq!(
+            &decode_collect_lossy(&[
+                0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5
+            ]),
+            "\u{3ba}\u{1f79}\u{3c3}\u{3bc}\u{3b5}"
+        );
+
+        assert_eq!(
+            &decode_collect_lossy(&[
+                0xf0, 0x9f, 0x8f, 0xb3, 0xef, 0xb8, 0x8f, 0xe2, 0x80, 0x8d, 0xe2, 0x9a, 0xa7, 0xef,
+                0xb8, 0x8f
+            ]),
+            "\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}"
+        );
+
+        assert_eq!(&decode_collect_lossy(&[0xce, 0x61]), "\u{fffd}a");
+        assert_eq!(&decode_collect_lossy(&[0xce, 0xc2]), "\u{fffd}\u{fffd}");
+        assert_eq!(&decode_collect_lossy(&[0x80]), "\u{fffd}");
+        assert_eq!(&decode_collect_lossy(&[0x80, 0x80]), "\u{fffd}\u{fffd}");
+    }
+
+    #[test]
+    fn test_utf8_decoder_rejects_ill_formed_sequences() {
+        // Overlong encoding of U+0000.
+        assert_eq!(&decode_collect_lossy(&[0xc0, 0x80]), "\u{fffd}\u{fffd}");
+        // Encoded surrogate U+D800.
+        assert_eq!(&decode_collect_lossy(&[0xed, 0xa0, 0x80]), "\u{fffd}\u{fffd}\u{fffd}");
+        // Encoding of a value above U+10FFFF.
+        assert_eq!(
+            &decode_collect_lossy(&[0xf4, 0x90, 0x80, 0x80]),
+            "\u{fffd}\u{fffd}\u{fffd}\u{fffd}"
+        );
+        // Sequence truncated by the end of the input.
+        assert_eq!(&decode_collect_lossy(&[0x61, 0xe2, 0x82]), "a\u{fffd}");
+    }
+
+    #[test]
+    fn test_utf8_error_bytes() {
+        // A sequence cut short by the end of the input reports every byte that was consumed.
+        let mut decoder = [0xe2u8, 0x82].decode_utf8();
+        match decoder.next() {
+            Some(Err(err)) => assert_eq!(err.bytes(), &[0xe2u8, 0x82][..]),
+            _ => panic!("expected an error for a truncated sequence"),
+        }
+        assert!(decoder.next().is_none());
+    }
+
+    fn decode_collect_lossy(bytes: &[u8]) -> String {
+        bytes
+            .decode_utf8()
+            .map(|res| res.unwrap_or(REPLACEMENT_CHARACTER))
+            .collect()
+    }
+}
diff --git a/utfdump_bin/Cargo.toml b/utfdump_bin/Cargo.toml
index f4ec630..0ff8ce0 100644
--- a/utfdump_bin/Cargo.toml
+++ b/utfdump_bin/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "utfdump"
+name = "utfdump_bin"
 version = "0.1.0"
 edition = "2021"
 authors = ["Tom Panton <pantonshire@gmail.com>"]
@@ -8,10 +8,7 @@ repository = "https://github.com/pantonshire/utfdump"
 description = "Command-line Unicode character info tool"
 
 [dependencies]
-utfdump_core = { path = "../utfdump_core" }
+utfdump = { path = "../utfdump" }
 libshire = { git = "https://github.com/pantonshire/libshire", branch = "main" }
 tabled = "0.8.0"
 clap = { version = "3.2.22", features = ["derive"] }
-
-[build-dependencies]
-utfdump_core = { path = "../utfdump_core" }
diff --git a/utfdump_bin/src/main.rs b/utfdump_bin/src/main.rs
index b42bb7f..d86f2ab 100644
--- a/utfdump_bin/src/main.rs
+++ b/utfdump_bin/src/main.rs
@@ -3,10 +3,7 @@ use std::{fmt, io::{self, Read}};
 use clap::Parser;
 use libshire::strings::CappedString;
 use tabled::{Tabled, Table, Style};
-
-use utfdump_core::{chardata::{Category, CombiningClass}, encoded::Data};
-
-const UNICODE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/unicode_data_encoded"));
+use utfdump::{char_data, CombiningClass, Category, utf8::{Utf8Decode, Utf8Error}};
 
 #[derive(Parser)]
 #[clap(author, version, about, long_about = None)]
@@ -19,58 +16,18 @@ struct Args {
 fn main() {
     let args = Args::parse();
 
-    let data = Data::<'static>::from_bytes(UNICODE_DATA).unwrap();
-
     let input = {
         let mut buf = Vec::<u8>::new();
         let stdin = io::stdin();
         let mut guard = stdin.lock();
         guard.read_to_end(&mut buf)
             .expect("failed to read stdin");
-        // TODO: just skip over invalid utf-8 characters
-        String::from_utf8(buf)
-            .expect("invalid utf-8")
+        buf
     };
 
     let rows = input
-        .chars()
-        .map(|c| {
-            let mut name = Optional::None;
-            let mut category = Optional::None;
-            let mut char_combining_class = Optional::None;
-            
-            let mut combining = false;
-
-            if let Some(char_data) = data.get(c as u32) {
-                name = Optional::Some(char_data.name());
-                category = Optional::Some(DisplayCategory {
-                    category: char_data.category(),
-                    full_name: args.full_category_names,
-                });
-
-                let ccc = char_data.ccc();
-                char_combining_class = Optional::Some(ccc);
-                combining = ccc.is_combining();
-            }
-
-            let display_char = {
-                let mut buf = CappedString::empty();
-                if combining {
-                    buf.push_truncating('\u{25cc}');
-                }
-                buf.push_truncating(c);
-                buf
-            };
-
-            OutRow {
-                display_char,
-                codepoint: Codepoint(c),
-                utf_8_bytes: Utf8Bytes(c),
-                name,
-                category,
-                char_combining_class,
-            }
-        });
+        .decode_utf8()
+        .map(|c| OutRow::from_char_result(c, args.full_category_names));
 
     let table = Table::new(rows)
         .with(Style::modern());
@@ -83,7 +40,7 @@ struct OutRow {
     #[tabled(rename = "")]
     display_char: CappedString<8>, 
     #[tabled(rename = "Code")]
-    codepoint: Codepoint,
+    codepoint: Optional<Codepoint>,
     #[tabled(rename = "UTF-8")]
     utf_8_bytes: Utf8Bytes,
     #[tabled(rename = "Name")]
@@ -94,6 +51,69 @@ struct OutRow {
     char_combining_class: Optional<CombiningClass>,
 }
 
+impl OutRow {
+    fn from_char_result(c: Result<char, Utf8Error>, full_category_names: bool) -> Self {
+        match c {
+            Ok(c) => Self::from_good_char(c, full_category_names),
+            Err(err) => Self::from_bad_char(err),
+        }
+    }
+
+    fn from_good_char(c: char, full_category_names: bool) -> Self {
+        let mut name = Optional::None;
+        let mut category = Optional::None;
+        let mut char_combining_class = Optional::None;
+        
+        let mut combining = false;
+
+        if let Some(char_data) = char_data(c) {
+            name = Optional::Some(char_data.name());
+            category = Optional::Some(DisplayCategory {
+                category: char_data.category(),
+                full_name: full_category_names,
+            });
+
+            let ccc = char_data.combining_class();
+            char_combining_class = Optional::Some(ccc);
+            combining = ccc.is_combining();
+        }
+
+        let display_char = {
+            let mut buf = CappedString::empty();
+            if combining {
+                buf.push_truncating('\u{25cc}');
+            }
+            buf.push_truncating(c);
+            buf
+        };
+
+        Self {
+            display_char,
+            codepoint: Optional::Some(Codepoint(c)),
+            utf_8_bytes: Utf8Bytes::from_char(c),
+            name,
+            category,
+            char_combining_class,
+        }
+    }
+
+    fn from_bad_char(err: Utf8Error) -> Self {
+        let (bad_bytes, num_bad_bytes) = err.into_parts();
+
+        Self {
+            display_char: CappedString::new_truncating("\u{fffd}"),
+            codepoint: Optional::None,
+            utf_8_bytes: Utf8Bytes {
+                buf: bad_bytes,
+                len: num_bad_bytes,
+            },
+            name: Optional::Some("<invalid>"),
+            category: Optional::None,
+            char_combining_class: Optional::None,
+        }
+    }
+}
+
 #[derive(Debug)]
 enum Optional<T> {
     Some(T),
@@ -122,13 +142,27 @@ impl fmt::Display for Codepoint {
 }
 
 #[derive(Debug)]
-struct Utf8Bytes(char);
+struct Utf8Bytes {
+    buf: [u8; 4],
+    len: usize,
+}
+
+impl Utf8Bytes {
+    fn from_char(c: char) -> Self {
+        let mut buf = [0u8; 4];
+        let string = c.encode_utf8(&mut buf);
+        let len = string.len();
+        Self { buf, len }
+    }
+
+    fn bytes(&self) -> &[u8] {
+        &self.buf[..self.len]
+    }
+}
 
 impl fmt::Display for Utf8Bytes {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let mut buf = [0u8; 4];
-        let s = self.0.encode_utf8(&mut buf);
-        let mut bytes = s.bytes();
+        let mut bytes = self.bytes().iter().copied();
         if let Some(b) = bytes.next() {
             write!(f, "0x{:02x}", b)?;
             for b in bytes {
diff --git a/utfdump_core/src/chardata.rs b/utfdump_core/src/chardata.rs
index f0eafd2..4f5d2c0 100644
--- a/utfdump_core/src/chardata.rs
+++ b/utfdump_core/src/chardata.rs
@@ -4,7 +4,7 @@ use std::fmt;
 pub struct CharData<'a> {
     name: &'a str,
     category: Category,
-    ccc: CombiningClass,
+    combining_class: CombiningClass,
 }
 
 impl<'a> CharData<'a> {
@@ -22,8 +22,8 @@ impl<'a> CharData<'a> {
         Some((codepoint, Self::from_parts(name, category, ccc)))
     }
 
-    pub fn from_parts(name: &'a str, category: Category, ccc: CombiningClass) -> Self {
-        Self { name, category, ccc }
+    pub fn from_parts(name: &'a str, category: Category, combining_class: CombiningClass) -> Self {
+        Self { name, category, combining_class }
     }
 
     pub fn with_name<'b>(self, name: &'a str) -> CharData<'b>
@@ -41,44 +41,43 @@ impl<'a> CharData<'a> {
         self.category
     }
 
-    pub fn ccc(&self) -> CombiningClass {
-        self.ccc
+    pub fn combining_class(&self) -> CombiningClass {
+        self.combining_class
     }
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
-#[repr(u8)]
 pub enum Category {
-    Lu = 0,
-    Ll = 1,
-    Lt = 2,
-    Mn = 3,
-    Mc = 4,
-    Me = 5,
-    Nd = 6,
-    Nl = 7,
-    No = 8,
-    Zs = 9,
-    Zl = 10,
-    Zp = 11,
-    Cc = 12,
-    Cf = 13,
-    Cs = 14,
-    Co = 15,
-    Cn = 16,
-    Lm = 17,
-    Lo = 18,
-    Pc = 19,
-    Pd = 20,
-    Ps = 21,
-    Pe = 22,
-    Pi = 23,
-    Pf = 24,
-    Po = 25,
-    Sm = 26,
-    Sc = 27,
-    Sk = 28,
-    So = 29,
+    Lu,
+    Ll,
+    Lt,
+    Mn,
+    Mc,
+    Me,
+    Nd,
+    Nl,
+    No,
+    Zs,
+    Zl,
+    Zp,
+    Cc,
+    Cf,
+    Cs,
+    Co,
+    Cn,
+    Lm,
+    Lo,
+    Pc,
+    Pd,
+    Ps,
+    Pe,
+    Pi,
+    Pf,
+    Po,
+    Sm,
+    Sc,
+    Sk,
+    So,
 }
 
 impl Category {
diff --git a/utfdump_core/src/encoded.rs b/utfdump_core/src/encoded.rs
index 48d9f20..044ac43 100644
--- a/utfdump_core/src/encoded.rs
+++ b/utfdump_core/src/encoded.rs
@@ -7,7 +7,13 @@ const DATA_ENTRY_SIZE: usize = 8;
 const DATA_INIT_FLAG: u8 = 1;
 const DATA_REPEATED_FLAG: u8 = 2;
 
-fn encode_char_data(name_index: u32, category: Category, ccc: CombiningClass, repeated: bool) -> [u8; DATA_ENTRY_SIZE] {
+fn encode_char_data(
+    name_index: u32,
+    category: Category,
+    combining_class: CombiningClass,
+    repeated: bool
+) -> [u8; DATA_ENTRY_SIZE]
+{
     let mut buf = [0u8; DATA_ENTRY_SIZE];
 
     buf[0] |= DATA_INIT_FLAG;
@@ -18,12 +24,14 @@ fn encode_char_data(name_index: u32, category: Category, ccc: CombiningClass, re
 
     buf[1..5].copy_from_slice(&name_index.to_le_bytes());
     buf[5] = category.byte_repr();
-    buf[6] = ccc.0;
+    buf[6] = combining_class.0;
 
     buf
 }
 
-fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, CombiningClass, bool)> {
+fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE])
+    -> Option<(u32, Category, CombiningClass, bool)>
+{
     let flags = bytes[0];
     
     if flags & DATA_INIT_FLAG == 0 {
@@ -32,10 +40,10 @@ fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, Comb
 
     let name_index = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
     let category = Category::from_byte(bytes[5])?;
-    let ccc = CombiningClass(bytes[6]);
+    let combining_class = CombiningClass(bytes[6]);
     let repeated = flags & DATA_REPEATED_FLAG != 0;
 
-    Some((name_index, category, ccc, repeated))
+    Some((name_index, category, combining_class, repeated))
 }
 
 pub struct DataBuf {
@@ -62,7 +70,10 @@ impl DataBuf {
             return Ok(());
         }
 
-        let repeated = range.end - range.start > 1;
+        let repeated = range.end
+            .checked_sub(range.start)
+            .map(|len| len > 1)
+            .unwrap_or(false);
 
         let range = {
             let start = usize::try_from(range.start)
@@ -86,7 +97,7 @@ impl DataBuf {
         let encoded_char_data = encode_char_data(
             name_index,
             char_data.category(),
-            char_data.ccc(),
+            char_data.combining_class(),
             repeated
         );
 
@@ -121,8 +132,8 @@ pub struct Data<'a> {
 }
 
 impl<'a> Data<'a> {
-    pub fn get(self, codepoint: u32) -> Option<CharData<'a>> {
-        let index = usize::try_from(codepoint).ok()?;
+    pub fn get(self, codepoint: char) -> Option<CharData<'a>> {
+        let index = usize::try_from(u32::from(codepoint)).ok()?;
         let start = index.checked_mul(DATA_ENTRY_SIZE)?;
         let end = start.checked_add(DATA_ENTRY_SIZE)?;
         let encoded = self.data.get(start..end)?;