commit 172d1a14fe295f01b377157997dc02292c93455e
Author: pantonshire <tom@tomandtally.co.uk>
Date:   Sun Sep 18 18:05:00 2022 +0100

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9469cc1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+unicode_data_latest.txt
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..f903703
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,154 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "bytecount"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "heck"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
+
+[[package]]
+name = "libshire"
+version = "0.1.0"
+source = "git+https://github.com/pantonshire/libshire?branch=main#7253d950108c729141239f4add4b3df67a54db31"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "papergrid"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "453cf71f2a37af495a1a124bf30d4d7469cfbea58e9f2479be9d222396a518a2"
+dependencies = [
+ "bytecount",
+ "fnv",
+ "unicode-width",
+]
+
+[[package]]
+name = "proc-macro-error"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+dependencies = [
+ "proc-macro-error-attr",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro-error-attr"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.144"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860"
+
+[[package]]
+name = "syn"
+version = "1.0.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tabled"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5b2f8c37d26d87d2252187b0a45ea3cbf42baca10377c7e7eaaa2800fa9bf97"
+dependencies = [
+ "papergrid",
+ "tabled_derive",
+ "unicode-width",
+]
+
+[[package]]
+name = "tabled_derive"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9ee618502f497abf593e1c5c9577f34775b111480009ffccd7ad70d23fcaba8"
+dependencies = [
+ "heck",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
+
+[[package]]
+name = "unicode-width"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
+
+[[package]]
+name = "utfdump"
+version = "0.1.0"
+dependencies = [
+ "libshire",
+ "tabled",
+ "utfdump_core",
+]
+
+[[package]]
+name = "utfdump_core"
+version = "0.1.0"
+
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..5170b07
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,5 @@
+[workspace]
+members = [
+    "utfdump_core",
+    "utfdump_bin",
+]
diff --git a/get_data.sh b/get_data.sh
new file mode 100755
index 0000000..652cfdc
--- /dev/null
+++ b/get_data.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Downloads the latest UnicodeData.txt, which utfdump_bin/build.rs reads at build time.
+
+# Abort on the first failing command so a failed download is never mistaken for success.
+set -euo pipefail
+
+# Resolve the output path relative to this script so it works from any working directory.
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+out_file="${script_dir}/utfdump_bin/unicode_data_latest.txt"
+tmp_file="${out_file}.tmp"
+trap 'rm -f "${tmp_file}"' EXIT
+
+# --fail makes curl exit non-zero on HTTP errors instead of saving the error page. The download
+# goes to a temporary file first so a failed transfer cannot clobber existing data.
+curl --proto '=https' --tlsv1.2 --fail \
+    'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > "${tmp_file}"
+mv "${tmp_file}" "${out_file}"
diff --git a/utfdump_bin/Cargo.toml b/utfdump_bin/Cargo.toml
new file mode 100644
index 0000000..ef1f19d
--- /dev/null
+++ b/utfdump_bin/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "utfdump"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+utfdump_core = { path = "../utfdump_core" }
+libshire = { git = "https://github.com/pantonshire/libshire", branch = "main" }
+tabled = "0.8.0"
+
+[build-dependencies]
+utfdump_core = { path = "../utfdump_core" }
diff --git a/utfdump_bin/build.rs b/utfdump_bin/build.rs
new file mode 100644
index 0000000..cda2ff2
--- /dev/null
+++ b/utfdump_bin/build.rs
@@ -0,0 +1,79 @@
+//! Build script: encodes `UnicodeData.txt` into the compact binary table that the `utfdump`
+//! binary embeds with `include_bytes!`.
+
+use std::{env, fs::File, io::{BufReader, BufRead, Write}, path::Path};
+
+use utfdump_core::{chardata::CharData, encoded::DataBuf};
+
+/// Input file, relative to the package directory; fetched by `get_data.sh`.
+const UNICODE_DATA_PATH: &str = "unicode_data_latest.txt";
+/// Name of the encoded output file written into `OUT_DIR`.
+const OUT_DATA_PATH: &str = "unicode_data_encoded";
+
+/// Name suffix marking the first row of a codepoint range.
+const RANGE_FIRST_SUFFIX: &str = ", First>";
+/// Name suffix marking the last row of a codepoint range.
+const RANGE_LAST_SUFFIX: &str = ", Last>";
+
+fn main() {
+    println!("cargo:rerun-if-changed={}", UNICODE_DATA_PATH);
+
+    let out_dir = env::var_os("OUT_DIR").expect("OUT_DIR is set by cargo for build scripts");
+    let out_path = Path::new(&out_dir).join(OUT_DATA_PATH);
+
+    let data_file = File::open(UNICODE_DATA_PATH)
+        .expect("failed to open unicode data file");
+
+    let mut data = DataBuf::new();
+    // Start codepoint and line number of a `First>` row still waiting for its `Last>` row.
+    let mut range_start: Option<(u32, usize)> = None;
+
+    for (line_index, line) in BufReader::new(data_file).lines().enumerate() {
+        let line_no = line_index + 1;
+        let line = line
+            .unwrap_or_else(|err| panic!("failed to read line {}: {}", line_no, err));
+        let (codepoint, char_data) = CharData::from_row(&line)
+            .unwrap_or_else(|| panic!("malformed row on line {}: {:?}", line_no, line));
+
+        match range_start.take() {
+            Some((start, _)) => {
+                let prefix = char_data.name()
+                    .strip_suffix(RANGE_LAST_SUFFIX)
+                    .expect("expected end of codepoint block");
+
+                // The whole range shares one name: "<Foo, Last>" becomes "<Foo>".
+                let name = format!("{}>", prefix);
+
+                data.insert(char_data.with_name(&name), start..(codepoint + 1))
+                    .unwrap_or_else(|err| panic!("line {}: {}", line_no, err));
+            },
+
+            None if char_data.name().ends_with(RANGE_FIRST_SUFFIX) => {
+                range_start = Some((codepoint, line_no));
+            },
+
+            None => {
+                data.insert(char_data, codepoint..(codepoint + 1))
+                    .unwrap_or_else(|err| panic!("line {}: {}", line_no, err));
+            },
+        }
+    }
+
+    // A trailing `First>` row without its `Last>` row would otherwise be dropped silently.
+    if let Some((start, line_no)) = range_start {
+        panic!("codepoint block starting at U+{:04X} (line {}) is never closed", start, line_no);
+    }
+
+    let encoded = data.as_ref_type();
+    let (strings_len, [strings, entries]) = encoded
+        .to_bytes()
+        .expect("string table too large to encode");
+
+    let mut out_file = File::create(&out_path)
+        .expect("failed to open output file");
+
+    // Layout: 4-byte string table length, then the string table, then the entries.
+    out_file.write_all(&strings_len).expect("failed to write output file");
+    out_file.write_all(strings).expect("failed to write output file");
+    out_file.write_all(entries).expect("failed to write output file");
+}
diff --git a/utfdump_bin/src/main.rs b/utfdump_bin/src/main.rs
new file mode 100644
index 0000000..558677d
--- /dev/null
+++ b/utfdump_bin/src/main.rs
@@ -0,0 +1,160 @@
+//! `utfdump`: reads text from stdin and prints a table describing each character in it.
+
+use std::{fmt, io::{self, Read}, str};
+
+use libshire::strings::CappedString;
+use tabled::{Tabled, Table, Style};
+
+use utfdump_core::{chardata::Category, encoded::Data};
+
+/// Encoded character table written to `OUT_DIR` by the build script.
+const UNICODE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/unicode_data_encoded"));
+
+/// Dotted circle, shown before combining characters so that they have a base to attach to.
+const DOTTED_CIRCLE: char = '\u{25cc}';
+
+fn main() {
+    let data = Data::<'static>::from_bytes(UNICODE_DATA)
+        .expect("embedded unicode data is malformed");
+
+    let input = {
+        let mut buf = Vec::<u8>::new();
+        let stdin = io::stdin();
+        let mut guard = stdin.lock();
+        guard.read_to_end(&mut buf)
+            .expect("failed to read stdin");
+
+        // Invalid bytes are skipped rather than aborting the whole dump.
+        let (text, skipped) = decode_skipping_invalid(&buf);
+        if skipped > 0 {
+            eprintln!("warning: skipped {} byte(s) of invalid utf-8", skipped);
+        }
+        text
+    };
+
+    let rows = input.chars().map(|c| {
+        let char_data = data.get(c as u32);
+
+        // A character with a non-zero combining class is treated as combining.
+        let combining = char_data.as_ref().map_or(false, |d| d.ccc() != 0);
+
+        let mut display_char = CappedString::empty();
+        if combining {
+            display_char.push_truncating(DOTTED_CIRCLE);
+        }
+        display_char.push_truncating(c);
+
+        OutRow {
+            display_char,
+            codepoint: Codepoint(c),
+            utf_8_bytes: Utf8Bytes(c),
+            name: char_data.as_ref().map(|d| d.name()).into(),
+            category: char_data.as_ref().map(|d| d.category()).into(),
+            char_combining_class: char_data.as_ref().map(|d| d.ccc()).into(),
+        }
+    });
+
+    let table = Table::new(rows)
+        .with(Style::modern());
+
+    println!("{}", table);
+}
+
+/// Decodes `bytes` as UTF-8, dropping every byte that is not part of a valid sequence.
+///
+/// Returns the decoded text together with the number of bytes that were dropped.
+fn decode_skipping_invalid(mut bytes: &[u8]) -> (String, usize) {
+    let mut text = String::with_capacity(bytes.len());
+    let mut skipped = 0;
+
+    loop {
+        let err = match str::from_utf8(bytes) {
+            Ok(valid) => {
+                text.push_str(valid);
+                return (text, skipped);
+            },
+            Err(err) => err,
+        };
+
+        let (valid, rest) = bytes.split_at(err.valid_up_to());
+        text.push_str(str::from_utf8(valid).expect("prefix was validated by from_utf8"));
+
+        // `error_len` is `None` when the input ends in the middle of a sequence; in that case
+        // everything that is left is dropped.
+        let invalid_len = err.error_len().unwrap_or(rest.len());
+        skipped += invalid_len;
+        bytes = &rest[invalid_len..];
+    }
+}
+
+/// One row of the output table.
+#[derive(Tabled)]
+struct OutRow {
+    #[tabled(rename = "")]
+    display_char: CappedString<8>,
+    #[tabled(rename = "Codepoint")]
+    codepoint: Codepoint,
+    #[tabled(rename = "UTF-8")]
+    utf_8_bytes: Utf8Bytes,
+    #[tabled(rename = "Name")]
+    name: Optional<&'static str>,
+    #[tabled(rename = "Category")]
+    category: Optional<Category>,
+    #[tabled(rename = "Combining")]
+    char_combining_class: Optional<u8>,
+}
+
+/// Like `Option`, but displays a placeholder when no value is present.
+#[derive(Debug)]
+enum Optional<T> {
+    Some(T),
+    None,
+}
+
+impl<T> From<Option<T>> for Optional<T> {
+    fn from(value: Option<T>) -> Self {
+        match value {
+            Some(x) => Self::Some(x),
+            None => Self::None,
+        }
+    }
+}
+
+impl<T> fmt::Display for Optional<T>
+where
+    T: fmt::Display,
+{
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Some(x) => fmt::Display::fmt(x, f),
+            Self::None => f.write_str("??"),
+        }
+    }
+}
+
+/// Displays a character's codepoint as `U+` followed by at least four hex digits.
+#[derive(Debug)]
+struct Codepoint(char);
+
+impl fmt::Display for Codepoint {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "U+{:04x}", u32::from(self.0))
+    }
+}
+
+/// Displays a character's UTF-8 encoding as space-separated hex bytes.
+#[derive(Debug)]
+struct Utf8Bytes(char);
+
+impl fmt::Display for Utf8Bytes {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut buf = [0u8; 4];
+        for (i, b) in self.0.encode_utf8(&mut buf).bytes().enumerate() {
+            if i > 0 {
+                f.write_str(" ")?;
+            }
+            write!(f, "0x{:02x}", b)?;
+        }
+        Ok(())
+    }
+}
diff --git a/utfdump_core/Cargo.toml b/utfdump_core/Cargo.toml
new file mode 100644
index 0000000..d0be7de
--- /dev/null
+++ b/utfdump_core/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "utfdump_core"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
diff --git a/utfdump_core/src/chardata.rs b/utfdump_core/src/chardata.rs
new file mode 100644
index 0000000..9426bb8
--- /dev/null
+++ b/utfdump_core/src/chardata.rs
@@ -0,0 +1,198 @@
+//! Character properties parsed from the rows of `UnicodeData.txt`.
+
+use std::fmt;
+
+/// The name, general category and combining class recorded for a character.
+#[derive(Clone, Debug)]
+pub struct CharData<'a> {
+    name: &'a str,
+    category: Category,
+    ccc: u8,
+}
+
+impl<'a> CharData<'a> {
+    /// Parses one semicolon-separated row into a codepoint and its data.
+    ///
+    /// Returns `None` if the codepoint is not hexadecimal, the category abbreviation is not
+    /// recognised, or the combining class is not a decimal `u8`.
+    pub fn from_row(row: &'a str) -> Option<(u32, Self)> {
+        // Only the first four fields are used; a missing field reads as an empty one.
+        let mut fields = row.split(';');
+        let mut next_field = || fields.next().unwrap_or("");
+
+        let codepoint = u32::from_str_radix(next_field(), 16).ok()?;
+        let name = next_field();
+        let category = Category::from_abbr(next_field())?;
+        let ccc = next_field().parse::<u8>().ok()?;
+
+        Some((codepoint, Self::from_parts(name, category, ccc)))
+    }
+
+    /// Builds the data from its individual parts.
+    pub fn from_parts(name: &'a str, category: Category, ccc: u8) -> Self {
+        Self { name, category, ccc }
+    }
+
+    /// Returns the same data with its name replaced by `name`.
+    ///
+    /// The result borrows only from `name`, so it may outlive the original name.
+    pub fn with_name<'b>(self, name: &'b str) -> CharData<'b> {
+        CharData { name, category: self.category, ccc: self.ccc }
+    }
+
+    /// The character's name.
+    pub fn name(&self) -> &'a str {
+        self.name
+    }
+
+    /// The character's general category.
+    pub fn category(&self) -> Category {
+        self.category
+    }
+
+    /// The character's combining class.
+    pub fn ccc(&self) -> u8 {
+        self.ccc
+    }
+}
+
+/// A Unicode general category, named by its two-letter abbreviation.
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+#[repr(u8)]
+pub enum Category {
+    Lu = 0,
+    Ll = 1,
+    Lt = 2,
+    Mn = 3,
+    Mc = 4,
+    Me = 5,
+    Nd = 6,
+    Nl = 7,
+    No = 8,
+    Zs = 9,
+    Zl = 10,
+    Zp = 11,
+    Cc = 12,
+    Cf = 13,
+    Cs = 14,
+    Co = 15,
+    Cn = 16,
+    Lm = 17,
+    Lo = 18,
+    Pc = 19,
+    Pd = 20,
+    Ps = 21,
+    Pe = 22,
+    Pi = 23,
+    Pf = 24,
+    Po = 25,
+    Sm = 26,
+    Sc = 27,
+    Sk = 28,
+    So = 29,
+}
+
+impl Category {
+    /// Every category, positioned at the index equal to its byte representation.
+    const ALL: [Self; 30] = [
+        Self::Lu, Self::Ll, Self::Lt, Self::Mn, Self::Mc, Self::Me,
+        Self::Nd, Self::Nl, Self::No, Self::Zs, Self::Zl, Self::Zp,
+        Self::Cc, Self::Cf, Self::Cs, Self::Co, Self::Cn, Self::Lm,
+        Self::Lo, Self::Pc, Self::Pd, Self::Ps, Self::Pe, Self::Pi,
+        Self::Pf, Self::Po, Self::Sm, Self::Sc, Self::Sk, Self::So,
+    ];
+
+    /// Decodes the byte produced by [`Category::byte_repr`], or `None` if it is out of range.
+    pub fn from_byte(b: u8) -> Option<Self> {
+        Self::ALL.get(usize::from(b)).copied()
+    }
+
+    /// The discriminant of this category as a byte.
+    pub fn byte_repr(self) -> u8 {
+        self as u8
+    }
+
+    /// Looks up a category by its two-letter abbreviation, e.g. `"Lu"`.
+    pub fn from_abbr(s: &str) -> Option<Self> {
+        Self::ALL.iter().copied().find(|category| category.abbr() == s)
+    }
+
+    /// The two-letter abbreviation of this category.
+    pub fn abbr(self) -> &'static str {
+        match self {
+            Self::Lu => "Lu",
+            Self::Ll => "Ll",
+            Self::Lt => "Lt",
+            Self::Mn => "Mn",
+            Self::Mc => "Mc",
+            Self::Me => "Me",
+            Self::Nd => "Nd",
+            Self::Nl => "Nl",
+            Self::No => "No",
+            Self::Zs => "Zs",
+            Self::Zl => "Zl",
+            Self::Zp => "Zp",
+            Self::Cc => "Cc",
+            Self::Cf => "Cf",
+            Self::Cs => "Cs",
+            Self::Co => "Co",
+            Self::Cn => "Cn",
+            Self::Lm => "Lm",
+            Self::Lo => "Lo",
+            Self::Pc => "Pc",
+            Self::Pd => "Pd",
+            Self::Ps => "Ps",
+            Self::Pe => "Pe",
+            Self::Pi => "Pi",
+            Self::Pf => "Pf",
+            Self::Po => "Po",
+            Self::Sm => "Sm",
+            Self::Sc => "Sc",
+            Self::Sk => "Sk",
+            Self::So => "So",
+        }
+    }
+
+    /// The descriptive name of this category, e.g. `"Letter, Uppercase"`.
+    pub fn full_name(self) -> &'static str {
+        match self {
+            Self::Lu => "Letter, Uppercase",
+            Self::Ll => "Letter, Lowercase",
+            Self::Lt => "Letter, Titlecase",
+            Self::Mn => "Mark, Non-Spacing",
+            Self::Mc => "Mark, Spacing Combining",
+            Self::Me => "Mark, Enclosing",
+            Self::Nd => "Number, Decimal Digit",
+            Self::Nl => "Number, Letter",
+            Self::No => "Number, Other",
+            Self::Zs => "Separator, Space",
+            Self::Zl => "Separator, Line",
+            Self::Zp => "Separator, Paragraph",
+            Self::Cc => "Other, Control",
+            Self::Cf => "Other, Format",
+            Self::Cs => "Other, Surrogate",
+            Self::Co => "Other, Private Use",
+            Self::Cn => "Other, Not Assigned",
+            Self::Lm => "Letter, Modifier",
+            Self::Lo => "Letter, Other",
+            Self::Pc => "Punctuation, Connector",
+            Self::Pd => "Punctuation, Dash",
+            Self::Ps => "Punctuation, Open",
+            Self::Pe => "Punctuation, Close",
+            Self::Pi => "Punctuation, Initial Quote",
+            Self::Pf => "Punctuation, Final Quote",
+            Self::Po => "Punctuation, Other",
+            Self::Sm => "Symbol, Math",
+            Self::Sc => "Symbol, Currency",
+            Self::Sk => "Symbol, Modifier",
+            Self::So => "Symbol, Other",
+        }
+    }
+}
+
+impl fmt::Display for Category {
+    /// Formats as `"<abbr>: <full name>"`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}: {}", self.abbr(), self.full_name())
+    }
+}
diff --git a/utfdump_core/src/encoded.rs b/utfdump_core/src/encoded.rs
new file mode 100644
index 0000000..430114c
--- /dev/null
+++ b/utfdump_core/src/encoded.rs
@@ -0,0 +1,300 @@
+//! Compact binary encoding of the character table.
+//!
+//! The serialised form is the string table length as 4 little-endian bytes, then the string
+//! table, then one fixed-size entry per codepoint, indexed by codepoint.
+
+use std::{collections::{HashMap, hash_map}, error, fmt, str, ops::Range};
+
+use crate::chardata::{CharData, Category};
+
+/// Size in bytes of one encoded entry. Only the first seven bytes are currently used.
+const DATA_ENTRY_SIZE: usize = 8;
+
+/// Set on every written entry, which distinguishes it from zero-filled gaps.
+const DATA_INIT_FLAG: u8 = 1;
+/// Set on entries that were inserted as part of a range of more than one codepoint.
+const DATA_REPEATED_FLAG: u8 = 2;
+
+/// Packs an entry as: flags, name index (4 bytes, little-endian), category, combining class.
+fn encode_char_data(name_index: u32, category: Category, ccc: u8, repeated: bool) -> [u8; DATA_ENTRY_SIZE] {
+    let flags = if repeated {
+        DATA_INIT_FLAG | DATA_REPEATED_FLAG
+    } else {
+        DATA_INIT_FLAG
+    };
+
+    let mut buf = [0u8; DATA_ENTRY_SIZE];
+    buf[0] = flags;
+    buf[1..5].copy_from_slice(&name_index.to_le_bytes());
+    buf[5] = category.byte_repr();
+    buf[6] = ccc;
+    buf
+}
+
+/// Unpacks an entry into `(name_index, category, ccc, repeated)`.
+///
+/// Returns `None` for entries without `DATA_INIT_FLAG` or with an unknown category byte.
+fn decode_char_data(bytes: [u8; DATA_ENTRY_SIZE]) -> Option<(u32, Category, u8, bool)> {
+    let [flags, n0, n1, n2, n3, category, ccc, _unused] = bytes;
+
+    if flags & DATA_INIT_FLAG == 0 {
+        return None;
+    }
+
+    let name_index = u32::from_le_bytes([n0, n1, n2, n3]);
+    let category = Category::from_byte(category)?;
+    let repeated = flags & DATA_REPEATED_FLAG != 0;
+
+    Some((name_index, category, ccc, repeated))
+}
+
+/// Growable, owned builder for the encoded character table.
+pub struct DataBuf {
+    data: Vec<u8>,
+    strings: StringTableBuf,
+    strings_map: HashMap<String, u32>,
+}
+
+impl DataBuf {
+    /// Creates an empty table.
+    pub fn new() -> Self {
+        Self {
+            data: Vec::new(),
+            strings: StringTableBuf::new(),
+            strings_map: HashMap::new(),
+        }
+    }
+
+    /// Borrows the table as a read-only [`Data`] view.
+    pub fn as_ref_type(&self) -> Data<'_> {
+        Data { data: &self.data, strings: self.strings.as_ref_type() }
+    }
+
+    /// Stores `char_data` for every codepoint in `range`, overwriting any previous entries.
+    ///
+    /// An empty range is a no-op.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the table cannot grow to cover `range`, or if the name cannot be
+    /// added to the string table.
+    pub fn insert(&mut self, char_data: CharData<'_>, range: Range<u32>) -> Result<(), DataBufError> {
+        if range.is_empty() {
+            return Ok(());
+        }
+
+        let repeated = range.end - range.start > 1;
+
+        let byte_offset = |codepoint: u32| {
+            usize::try_from(codepoint)
+                .ok()
+                .and_then(|index| index.checked_mul(DATA_ENTRY_SIZE))
+                .ok_or(DataBufError::DataOutOfCapacity)
+        };
+        let byte_range = byte_offset(range.start)?..byte_offset(range.end)?;
+
+        // Reserve fallibly up front so that the `resize` below does not need to allocate.
+        if let Some(extra) = byte_range.end.checked_sub(self.data.len()) {
+            self.data.try_reserve(extra)
+                .map_err(|_| DataBufError::DataOutOfCapacity)?;
+        }
+
+        let name_index = self.add_string(char_data.name().to_owned())?;
+        let entry = encode_char_data(name_index, char_data.category(), char_data.ccc(), repeated);
+
+        if self.data.len() < byte_range.end {
+            // Using 0 means that the DATA_INIT_FLAG won't be set, so these won't be valid entries.
+            self.data.resize(byte_range.end, 0);
+        }
+
+        for slot in self.data[byte_range].chunks_exact_mut(DATA_ENTRY_SIZE) {
+            slot.copy_from_slice(&entry);
+        }
+
+        Ok(())
+    }
+
+    /// Returns the string table index of `name`, adding it to the table if it is new.
+    fn add_string(&mut self, name: String) -> Result<u32, DataBufError> {
+        match self.strings_map.entry(name) {
+            hash_map::Entry::Occupied(entry) => Ok(*entry.get()),
+            hash_map::Entry::Vacant(entry) => {
+                let index = self.strings.push(entry.key())?;
+                entry.insert(index);
+                Ok(index)
+            },
+        }
+    }
+}
+
+impl Default for DataBuf {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Borrowed, read-only view of the encoded character table.
+#[derive(Clone, Copy)]
+pub struct Data<'a> {
+    data: &'a [u8],
+    strings: StringTable<'a>,
+}
+
+impl<'a> Data<'a> {
+    /// Looks up the data stored for `codepoint`.
+    ///
+    /// Returns `None` if the codepoint is outside the table, has no entry, or its entry cannot
+    /// be decoded.
+    pub fn get(self, codepoint: u32) -> Option<CharData<'a>> {
+        let index = usize::try_from(codepoint).ok()?;
+        let start = index.checked_mul(DATA_ENTRY_SIZE)?;
+        let end = start.checked_add(DATA_ENTRY_SIZE)?;
+        let encoded: [u8; DATA_ENTRY_SIZE] = self.data.get(start..end)?.try_into().ok()?;
+        let (name_index, category, ccc, _repeated) = decode_char_data(encoded)?;
+        let name = self.strings.get(name_index)?;
+        Some(CharData::from_parts(name, category, ccc))
+    }
+
+    /// Splits the table into the pieces of its serialised form: the string table length as
+    /// little-endian bytes, followed by the string table and the entries.
+    ///
+    /// Returns `None` if the string table length does not fit in a `u32`.
+    pub fn to_bytes(self) -> Option<([u8; 4], [&'a [u8]; 2])> {
+        let strings = self.strings.to_bytes();
+        let strings_len = u32::try_from(strings.len()).ok()?;
+        Some((strings_len.to_le_bytes(), [strings, self.data]))
+    }
+
+    /// Reads a table from the serialised form described on [`Data::to_bytes`].
+    ///
+    /// Returns `None` if `bytes` is too short to hold the length prefix and the string table.
+    pub fn from_bytes(bytes: &'a [u8]) -> Option<Self> {
+        let len_prefix: [u8; 4] = bytes.get(..4)?.try_into().ok()?;
+        let strings_len = usize::try_from(u32::from_le_bytes(len_prefix)).ok()?;
+        // Checked, because `4 + strings_len` can overflow where `usize` is 32 bits wide.
+        let strings_end = strings_len.checked_add(4)?;
+
+        let strings = StringTable::from_bytes(bytes.get(4..strings_end)?);
+        let data = bytes.get(strings_end..)?;
+        Some(Self { data, strings })
+    }
+}
+
+/// Errors returned when inserting into a [`DataBuf`].
+#[derive(Debug)]
+pub enum DataBufError {
+    DataOutOfCapacity,
+    StringsMapOutOfCapacity,
+    StringTable(StringTableBufError),
+}
+
+impl fmt::Display for DataBufError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::DataOutOfCapacity => f.write_str("data buf out of capacity"),
+            Self::StringsMapOutOfCapacity => f.write_str("strings map out of capacity"),
+            Self::StringTable(err) => write!(f, "string table error: {}", err),
+        }
+    }
+}
+
+impl error::Error for DataBufError {}
+
+impl From<StringTableBufError> for DataBufError {
+    fn from(err: StringTableBufError) -> Self {
+        Self::StringTable(err)
+    }
+}
+
+/// Borrowed view of a table of length-prefixed strings.
+#[derive(Clone, Copy)]
+pub struct StringTable<'a> {
+    bytes: &'a [u8],
+}
+
+impl<'a> StringTable<'a> {
+    /// Wraps `bytes` without validating them; malformed data surfaces as `None` from `get`.
+    pub fn from_bytes(bytes: &'a [u8]) -> Self {
+        Self { bytes }
+    }
+
+    /// The raw bytes of the table.
+    pub fn to_bytes(self) -> &'a [u8] {
+        self.bytes
+    }
+
+    /// Reads the string whose length byte is at `index`.
+    ///
+    /// Returns `None` if the string would run past the end of the table or is not valid UTF-8.
+    pub fn get(self, index: u32) -> Option<&'a str> {
+        let index = usize::try_from(index).ok()?;
+        let len = usize::from(*self.bytes.get(index)?);
+        let start = index.checked_add(1)?;
+        let end = start.checked_add(len)?;
+        str::from_utf8(self.bytes.get(start..end)?).ok()
+    }
+}
+
+/// Growable, owned table of length-prefixed strings.
+pub struct StringTableBuf {
+    buf: Vec<u8>,
+}
+
+impl StringTableBuf {
+    /// Creates an empty table.
+    pub fn new() -> Self {
+        Self { buf: Vec::new() }
+    }
+
+    /// Borrows the table as a read-only [`StringTable`] view.
+    pub fn as_ref_type(&self) -> StringTable<'_> {
+        StringTable { bytes: &self.buf }
+    }
+
+    /// Appends `s` as a length byte followed by its UTF-8 bytes, returning the index of the
+    /// length byte.
+    ///
+    /// # Errors
+    ///
+    /// Returns `StringTooLong` if `s` is longer than 255 bytes, and `OutOfCapacity` if the
+    /// index does not fit in a `u32` or memory cannot be reserved.
+    pub fn push(&mut self, s: &str) -> Result<u32, StringTableBufError> {
+        let len = u8::try_from(s.len())
+            .map_err(|_| StringTableBufError::StringTooLong)?;
+
+        let index = u32::try_from(self.buf.len())
+            .map_err(|_| StringTableBufError::OutOfCapacity)?;
+
+        self.buf.try_reserve(s.len() + 1)
+            .map_err(|_| StringTableBufError::OutOfCapacity)?;
+
+        self.buf.push(len);
+        self.buf.extend_from_slice(s.as_bytes());
+
+        Ok(index)
+    }
+}
+
+impl Default for StringTableBuf {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Errors returned when pushing onto a [`StringTableBuf`].
+#[derive(Debug)]
+pub enum StringTableBufError {
+    StringTooLong,
+    OutOfCapacity,
+}
+
+impl fmt::Display for StringTableBufError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::StringTooLong => f.write_str("string too long to add to table"),
+            Self::OutOfCapacity => f.write_str("string table out of capacity"),
+        }
+    }
+}
+
+impl error::Error for StringTableBufError {}
diff --git a/utfdump_core/src/lib.rs b/utfdump_core/src/lib.rs
new file mode 100644
index 0000000..d21680b
--- /dev/null
+++ b/utfdump_core/src/lib.rs
@@ -0,0 +1,2 @@
+pub mod chardata;
+pub mod encoded;