Compare commits

...

10 Commits

Author SHA1 Message Date
pantonshire 69660a7e10 update dependencies and resolver 1 year ago
pantonshire e90b77256a version bump 3 years ago
pantonshire 402da43bc1 fix utfdump_wasm Cargo.toml 3 years ago
pantonshire 673364873d specify msrv 3 years ago
pantonshire 0d1872b902 wasm library 3 years ago
pantonshire 9d6eae0bd0 make lib no-std compatible 3 years ago
pantonshire effe6916d3 move unicode_data_encoded.gz to lib 3 years ago
pantonshire c274ba6f01 🐛 only show consumed bad bytes for invalid characters
Previously, the bytes displayed for invalid characters included bytes
from the byte stream that were peeked rather than consumed. This
resulted in certain bytes being displayed multiple times, since the
peeked byte could appear in the following character.

For example, `printf '\xce\x61' | utfdump_bin` would result in the byte
0x61 being displayed twice, once at the end of the invalid character and
once as the valid character `a`.

This patch modifies `utfdump::utf8::Utf8Error` so it also stores the
number of consumed bad bytes, enabling the binary to output only the
consumed bad bytes.
3 years ago
pantonshire ecf2abbdad retrieve data from unicode.org in data.py 3 years ago
pantonshire 6b911caa15 update Cargo.lock 3 years ago

3
.gitignore vendored

@ -1,5 +1,4 @@
/target
/build
/build.sh
pkg/
__pycache__
unicode_data_latest.txt

261
Cargo.lock generated

@ -3,10 +3,10 @@
version = 3
[[package]]
name = "adler"
version = "1.0.2"
name = "adler2"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "atty"
@ -21,9 +21,9 @@ dependencies = [
[[package]]
name = "autocfg"
version = "1.1.0"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "bitflags"
@ -31,11 +31,17 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bumpalo"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]]
name = "bytecount"
version = "0.6.3"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce"
[[package]]
name = "cfg-if"
@ -70,7 +76,7 @@ dependencies = [
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
"syn 1.0.109",
]
[[package]]
@ -84,18 +90,18 @@ dependencies = [
[[package]]
name = "crc32fast"
version = "1.3.2"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
dependencies = [
"cfg-if",
]
[[package]]
name = "flate2"
version = "1.0.26"
version = "1.0.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0"
dependencies = [
"crc32fast",
"miniz_oxide",
@ -140,38 +146,44 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.144"
version = "0.2.161"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1"
[[package]]
name = "libshire"
version = "0.1.0"
source = "git+https://github.com/pantonshire/libshire?branch=main#44e27e9d2387c092d66ddfd871932e85b135499f"
source = "git+https://github.com/pantonshire/libshire?branch=main#7858cd68cff790f56af78b667acff5d2e6522da1"
dependencies = [
"serde",
]
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "miniz_oxide"
version = "0.7.1"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1"
dependencies = [
"adler",
"adler2",
]
[[package]]
name = "once_cell"
version = "1.17.2"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "os_str_bytes"
version = "6.5.0"
version = "6.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1"
[[package]]
name = "papergrid"
@ -193,7 +205,7 @@ dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn",
"syn 1.0.109",
"version_check",
]
@ -210,27 +222,41 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.59"
version = "1.0.88"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.28"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "serde"
version = "1.0.163"
version = "1.0.211"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
checksum = "1ac55e59090389fb9f0dd9e0f3c09615afed1d19094284d0b200441f13550793"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.211"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54be4f245ce16bc58d57ef2716271d0d4519e0f6defa147f6e081005bcb278ff"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.82",
]
[[package]]
name = "strsim"
@ -249,6 +275,17 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tabled"
version = "0.8.0"
@ -270,7 +307,7 @@ dependencies = [
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
"syn 1.0.109",
]
[[package]]
@ -281,34 +318,34 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "termcolor"
version = "1.2.0"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
dependencies = [
"winapi-util",
]
[[package]]
name = "textwrap"
version = "0.16.0"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9"
[[package]]
name = "unicode-ident"
version = "1.0.9"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
[[package]]
name = "unicode-width"
version = "0.1.10"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
[[package]]
name = "utfdump"
version = "0.1.0"
version = "0.2.0"
dependencies = [
"flate2",
"tap",
@ -316,7 +353,7 @@ dependencies = [
[[package]]
name = "utfdump_bin"
version = "0.1.0"
version = "0.2.0"
dependencies = [
"clap",
"libshire",
@ -325,14 +362,73 @@ dependencies = [
]
[[package]]
name = "utfdump_core"
name = "utfdump_wasm"
version = "0.1.0"
dependencies = [
"utfdump",
"wasm-bindgen",
]
[[package]]
name = "version_check"
version = "0.9.4"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasm-bindgen"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e"
dependencies = [
"cfg-if",
"once_cell",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358"
dependencies = [
"bumpalo",
"log",
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.82",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.82",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d"
[[package]]
name = "winapi"
@ -352,11 +448,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.5"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"winapi",
"windows-sys",
]
[[package]]
@ -364,3 +460,76 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

@ -2,4 +2,13 @@
members = [
"lib",
"bin",
"wasm",
]
resolver = "2"
[profile.release]
panic = "abort"
lto = "fat"
[profile.release.package.utfdump_wasm]
opt-level = "z"

@ -1,6 +1,6 @@
[package]
name = "utfdump_bin"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
authors = ["Tom Panton <pantonshire@gmail.com>"]
license = "MIT"

@ -110,14 +110,14 @@ impl OutRow {
}
fn from_bad_char(err: Utf8Error) -> Self {
let (bad_bytes, num_bad_bytes) = err.into_parts();
let (bad_bytes, _num_bad_bytes, num_consumed_bad_bytes) = err.into_parts();
Self {
display_char: CappedString::new_truncating("\u{fffd}"),
codepoint: Optional::None,
utf_8_bytes: Utf8Bytes {
buf: bad_bytes,
len: num_bad_bytes,
len: num_consumed_bad_bytes,
},
name: Optional::Some("<invalid>"),
category: Optional::None,

@ -52,6 +52,12 @@ from enum import Enum
from struct import pack
from typing import Optional
from gzip import compress
from time import time
import http.client
unicode_data_host = 'www.unicode.org'
unicode_data_url_path = '/Public/UCD/latest/ucd/UnicodeData.txt'
out_data_path = 'lib/unicode_data_encoded.gz'
class StringTableIndex:
def __init__(self, bs: bytes):
@ -247,19 +253,31 @@ def encode_char_data(
return bytes(encoded)
with open('unicode_data_latest.txt', 'r') as fd:
input_data = fd.read()
# Download the latest UnicodeData.txt from unicode.org. HTTPS is used so the
# data that ends up embedded in the library cannot be altered in transit.
print('Fetching Unicode data from {}...'.format(unicode_data_host))
start_time = time()
conn = http.client.HTTPSConnection(unicode_data_host, timeout=30)
try:
    conn.request('GET', unicode_data_url_path)
    resp = conn.getresponse()
    # Fail loudly on redirects and errors instead of encoding an error page.
    if resp.status != 200:
        raise RuntimeError(
            'Failed to fetch Unicode data: HTTP {} {}'.format(resp.status, resp.reason)
        )
    resp_data = resp.read()
finally:
    # Release the socket whether or not the request succeeded.
    conn.close()
input_data = resp_data.decode('utf-8')
end_time = time()
print('Fetched Unicode data in {:.2f}s'.format(end_time - start_time))
char_data_table = bytearray()
string_table = StringTable()
# FIXME: calculate cumulative offset required for codepoints after each group
groups = []
in_group = False
prev_code = None
uniq_vals = {}
for row in input_data.splitlines():
rows = [row.strip() for row in input_data.splitlines() if len(row.strip()) > 0]
print('Encoding {} rows...'.format(len(rows)))
for row in rows:
is_group_start = False
[
@ -425,5 +443,9 @@ encoded_data.extend(string_table)
compressed_data = compress(encoded_data)
with open('unicode_data_encoded.gz', 'wb') as fd:
print('Writing encoded data to {}...'.format(out_data_path))
with open(out_data_path, 'wb') as fd:
fd.write(compressed_data)
print('Done!')

@ -1,2 +0,0 @@
#!/bin/bash
curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt

@ -1,8 +1,11 @@
[package]
name = "utfdump"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
[features]
std = []
[dependencies]
tap = "1.0.1"

@ -1,6 +1,6 @@
use std::{env, fs::File, io, path::Path};
const COMPRESSED_DATA_PATH: &str = "../unicode_data_encoded.gz";
const COMPRESSED_DATA_PATH: &str = "unicode_data_encoded.gz";
const OUT_DATA_PATH: &str = "unicode_data_encoded";
fn main() -> io::Result<()> {

@ -1,3 +1,5 @@
#![cfg_attr(not(feature = "std"), no_std)]
pub mod character;
pub mod unicode_data;
pub mod utf8;

@ -128,6 +128,7 @@ where
return Some(Err(Utf8Error {
bad_bytes: bytes_seen,
num_bad_bytes: 1,
num_consumed_bad_bytes: 1,
}));
},
}
@ -140,6 +141,7 @@ where
None => return Some(Err(Utf8Error {
bad_bytes: bytes_seen,
num_bad_bytes: usize::from(i) + 1,
num_consumed_bad_bytes: usize::from(i),
})),
};
@ -149,6 +151,7 @@ where
return Some(Err(Utf8Error {
bad_bytes: bytes_seen,
num_bad_bytes: usize::from(i) + 2,
num_consumed_bad_bytes: usize::from(i) + 1,
}));
}
@ -172,6 +175,7 @@ where
pub struct Utf8Error {
bad_bytes: [u8; 4],
num_bad_bytes: usize,
num_consumed_bad_bytes: usize,
}
impl Utf8Error {
@ -179,8 +183,9 @@ impl Utf8Error {
&self.bad_bytes[..self.num_bad_bytes]
}
pub fn into_parts(self) -> ([u8; 4], usize) {
(self.bad_bytes, self.num_bad_bytes)
// FIXME: return some type with u8 array + length
/// Consumes the error and returns its three fields as the tuple
/// `(bad_bytes, num_bad_bytes, num_consumed_bad_bytes)`.
///
/// `bad_bytes` is the fixed four-byte buffer; the two counts say how many of
/// its leading bytes were recorded as bad and how many were recorded as
/// consumed, respectively.
pub fn into_parts(self) -> ([u8; 4], usize, usize) {
(self.bad_bytes, self.num_bad_bytes, self.num_consumed_bad_bytes)
}
}
@ -192,64 +197,49 @@ mod tests {
#[test]
fn test_utf8_decoder() {
assert_eq!(
&decode_collect_lossy(&[
0x68, 0x65, 0x6c, 0x6c, 0x6f
]),
"hello"
);
assert_eq!(
&decode_collect_lossy(&[
0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5
]),
"κόσμε"
);
assert_eq!(
&decode_collect_lossy(&[
0xf0, 0x9f, 0x8f, 0xb3, 0xef, 0xb8, 0x8f, 0xe2, 0x80, 0x8d, 0xe2, 0x9a, 0xa7, 0xef,
0xb8, 0x8f
]),
"\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}"
);
assert_eq!(
&decode_collect_lossy(&[
0xce, 0x61
]),
"\u{fffd}a"
);
assert_eq!(
&decode_collect_lossy(&[
0xce, 0xc2
]),
"\u{fffd}\u{fffd}"
);
assert_eq!(
&decode_collect_lossy(&[
0x80
]),
"\u{fffd}"
);
assert_eq!(
&decode_collect_lossy(&[
0x80, 0x80
]),
"\u{fffd}\u{fffd}"
);
assert_decodes_to(&[
0x68, 0x65, 0x6c, 0x6c, 0x6f
], "hello");
assert_decodes_to(&[
0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5
], "κόσμε");
assert_decodes_to(&[
0xf0, 0x9f, 0x8f, 0xb3, 0xef, 0xb8, 0x8f, 0xe2, 0x80, 0x8d, 0xe2, 0x9a, 0xa7, 0xef,
0xb8, 0x8f
], "\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}");
assert_decodes_to(&[
0xce, 0x61
], "\u{fffd}a");
assert_decodes_to(&[
0xce, 0xc2
], "\u{fffd}\u{fffd}");
assert_decodes_to(&[
0x80
], "\u{fffd}");
assert_decodes_to(&[
0x80, 0x80
], "\u{fffd}\u{fffd}");
}
fn decode_collect_lossy(bytes: &[u8]) -> String {
bytes
.decode_utf8()
.map(|res| match res {
Ok(c) => c,
Err(_) => REPLACEMENT_CHARACTER,
})
.collect()
/// Asserts that decoding `bytes` produces exactly the characters of `expected`.
///
/// Every decoding error is compared as a single `REPLACEMENT_CHARACTER`, and the
/// decoder must be exhausted once all expected characters have been matched.
fn assert_decodes_to(bytes: &[u8], expected: &str) {
    let mut decoder = bytes.decode_utf8();

    for want in expected.chars() {
        // Collapse `Ok(c)` / `Err(_)` into the character a lossy decode would show.
        let got = decoder
            .next()
            .map(|res| res.unwrap_or(REPLACEMENT_CHARACTER));
        assert_eq!(got, Some(want));
    }

    // No trailing items may remain after the expected characters.
    assert!(decoder.next().is_none());
}
}

@ -0,0 +1,12 @@
[package]
name = "utfdump_wasm"
version = "0.1.0"
edition = "2021"
rust-version = "1.70.0"
[lib]
crate-type = ["cdylib", "rlib"]
[dependencies]
utfdump = { path = "../lib" }
wasm-bindgen = "0.2.86"

@ -0,0 +1,133 @@
use std::sync::OnceLock;
use utfdump::{UnicodeData, CombiningClass, CharData};
use wasm_bindgen::prelude::wasm_bindgen;
/// Newtype around a `'static` [`CharData`], annotated with `#[wasm_bindgen]`
/// so that it can be handed across the WebAssembly boundary.
#[wasm_bindgen]
pub struct WbgCharData(CharData<'static>);
#[wasm_bindgen]
impl WbgCharData {
    /// Returns an owned copy of the character's name.
    #[wasm_bindgen]
    pub fn name(&self) -> String {
        let name = self.0.name();
        name.to_owned()
    }

    /// Returns the UTF-8 encoding of the codepoint, or `None` when the
    /// codepoint cannot be converted to a `char`.
    #[wasm_bindgen]
    pub fn encoded_utf8(&self) -> Option<EncodedCodepoint> {
        let ch = char::try_from(self.0.codepoint()).ok()?;
        let mut utf8 = [0u8; 4];
        // A `char` encodes to at most four UTF-8 bytes, so the length fits in a `u8`.
        let byte_count = ch.encode_utf8(&mut utf8).len() as u8;
        Some(EncodedCodepoint::new(utf8, byte_count))
    }

    /// Returns the little-endian UTF-16 encoding of the codepoint as bytes, or
    /// `None` when the codepoint cannot be converted to a `char`.
    #[wasm_bindgen]
    pub fn encoded_utf16_le(&self) -> Option<EncodedCodepoint> {
        let (words, word_count) = self.encoded_utf16()?;
        let used_words = &words[..usize::from(word_count)];

        // Each 16-bit code unit occupies one two-byte slot of the output buffer.
        let mut bytes = [0u8; 4];
        for (slot, word) in bytes.chunks_exact_mut(2).zip(used_words) {
            slot.copy_from_slice(&word.to_le_bytes());
        }

        Some(EncodedCodepoint::new(bytes, word_count * 2))
    }

    /// Encodes the codepoint as UTF-16, returning the code-unit buffer together
    /// with the number of code units written, or `None` when the codepoint
    /// cannot be converted to a `char`.
    fn encoded_utf16(&self) -> Option<([u16; 2], u8)> {
        char::try_from(self.0.codepoint()).ok().map(|ch| {
            let mut words = [0u16; 2];
            // A `char` encodes to at most two UTF-16 code units.
            let word_count = ch.encode_utf16(&mut words).len() as u8;
            (words, word_count)
        })
    }

    /// Returns the abbreviation of the character's category.
    #[wasm_bindgen]
    pub fn category(&self) -> String {
        let category = self.0.category();
        category.abbreviation().to_owned()
    }

    /// Returns the full name of the character's category.
    #[wasm_bindgen]
    pub fn category_full(&self) -> String {
        let category = self.0.category();
        category.full_name().to_owned()
    }

    /// Returns the numeric value wrapped by the character's combining class.
    #[wasm_bindgen]
    pub fn combining_class(&self) -> u8 {
        let class = self.0.combining_class();
        class.0
    }

    /// Returns the abbreviation of the character's bidi category.
    #[wasm_bindgen]
    pub fn bidi(&self) -> String {
        let bidi = self.0.bidi_category();
        bidi.abbreviation().to_owned()
    }

    /// Returns the full name of the character's bidi category.
    #[wasm_bindgen]
    pub fn bidi_full(&self) -> String {
        let bidi = self.0.bidi_category();
        bidi.full_name().to_owned()
    }

    /// Returns an owned copy of the character's numeric value, if it has one.
    #[wasm_bindgen]
    pub fn numeric_value(&self) -> Option<String> {
        self.0.numeric_value().map(|value| value.to_owned())
    }

    /// Returns the character's mirrored flag.
    #[wasm_bindgen]
    pub fn mirrored(&self) -> bool {
        self.0.mirrored()
    }

    /// Returns an owned copy of the decomposition mapping's value, if the
    /// character has a decomposition mapping.
    #[wasm_bindgen]
    pub fn decomp_string(&self) -> Option<String> {
        self.0
            .decomp_mapping()
            .map(|mapping| mapping.value().to_owned())
    }

    /// Returns an owned copy of the character's uppercase mapping, if any.
    #[wasm_bindgen]
    pub fn uppercase_string(&self) -> Option<String> {
        self.0.uppercase().map(|upper| upper.to_owned())
    }

    /// Returns an owned copy of the character's lowercase mapping, if any.
    #[wasm_bindgen]
    pub fn lowercase_string(&self) -> Option<String> {
        self.0.lowercase().map(|lower| lower.to_owned())
    }

    /// Returns an owned copy of the character's titlecase mapping, if any.
    #[wasm_bindgen]
    pub fn titlecase_string(&self) -> Option<String> {
        self.0.titlecase().map(|title| title.to_owned())
    }
}
/// Returns an owned copy of the name of the combining class with the given
/// numeric value, or `None` when that class has no name.
#[wasm_bindgen]
pub fn combining_class_name(combining_class: u8) -> Option<String> {
    let class = CombiningClass(combining_class);
    class.name().map(|name| name.to_owned())
}
static UNICODE_DATA: OnceLock<UnicodeData> = OnceLock::new();
#[wasm_bindgen]
pub fn codepoint_char_data(codepoint: u32) -> Option<WbgCharData> {
let unicode_data = UNICODE_DATA.get_or_init(|| {
UnicodeData::new()
.unwrap()
});
unicode_data
.get(codepoint)
.map(WbgCharData)
}
/// An encoded codepoint: a fixed four-byte buffer plus the number of bytes in use.
///
/// Built by the `encoded_utf8` and `encoded_utf16_le` methods of `WbgCharData`.
#[wasm_bindgen]
pub struct EncodedCodepoint {
// `wasm-bindgen` unfortunately does not support arrays :(
// The four buffer bytes are therefore exposed as separate fields, in order.
pub b0: u8,
pub b1: u8,
pub b2: u8,
pub b3: u8,
// Length of the encoding in bytes, as reported by the encoder that filled the buffer.
pub len: u8,
}
impl EncodedCodepoint {
    /// Builds an `EncodedCodepoint` from a four-byte buffer and the number of
    /// bytes of that buffer that are in use.
    fn new(bytes: [u8; 4], len: u8) -> Self {
        // Spread the array over the individual fields, preserving byte order.
        let [b0, b1, b2, b3] = bytes;
        Self {
            b0,
            b1,
            b2,
            b3,
            len,
        }
    }
}
Loading…
Cancel
Save