Compare commits

..

No commits in common. '69660a7e10554bef309d035eee9dc299ded1cd51' and '6e8d197ae481668be8e9894504a1733e195a65c3' have entirely different histories.

3
.gitignore vendored

@ -1,4 +1,5 @@
/target /target
pkg/ /build
/build.sh
__pycache__ __pycache__
unicode_data_latest.txt unicode_data_latest.txt

261
Cargo.lock generated

@ -3,10 +3,10 @@
version = 3 version = 3
[[package]] [[package]]
name = "adler2" name = "adler"
version = "2.0.0" version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]] [[package]]
name = "atty" name = "atty"
@ -21,9 +21,9 @@ dependencies = [
[[package]] [[package]]
name = "autocfg" name = "autocfg"
version = "1.4.0" version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
@ -31,17 +31,11 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bumpalo"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]] [[package]]
name = "bytecount" name = "bytecount"
version = "0.6.8" version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
@ -76,7 +70,7 @@ dependencies = [
"proc-macro-error", "proc-macro-error",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 1.0.109", "syn",
] ]
[[package]] [[package]]
@ -90,18 +84,18 @@ dependencies = [
[[package]] [[package]]
name = "crc32fast" name = "crc32fast"
version = "1.4.2" version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]] [[package]]
name = "flate2" name = "flate2"
version = "1.0.34" version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
dependencies = [ dependencies = [
"crc32fast", "crc32fast",
"miniz_oxide", "miniz_oxide",
@ -146,44 +140,38 @@ dependencies = [
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.161" version = "0.2.144"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
[[package]] [[package]]
name = "libshire" name = "libshire"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/pantonshire/libshire?branch=main#7858cd68cff790f56af78b667acff5d2e6522da1" source = "git+https://github.com/pantonshire/libshire?branch=main#44e27e9d2387c092d66ddfd871932e85b135499f"
dependencies = [ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]] [[package]]
name = "miniz_oxide" name = "miniz_oxide"
version = "0.8.0" version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
dependencies = [ dependencies = [
"adler2", "adler",
] ]
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.20.2" version = "1.17.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b"
[[package]] [[package]]
name = "os_str_bytes" name = "os_str_bytes"
version = "6.6.1" version = "6.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1" checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
[[package]] [[package]]
name = "papergrid" name = "papergrid"
@ -205,7 +193,7 @@ dependencies = [
"proc-macro-error-attr", "proc-macro-error-attr",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 1.0.109", "syn",
"version_check", "version_check",
] ]
@ -222,41 +210,27 @@ dependencies = [
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.88" version = "1.0.59"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9" checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
dependencies = [ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]] [[package]]
name = "quote" name = "quote"
version = "1.0.37" version = "1.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
] ]
[[package]] [[package]]
name = "serde" name = "serde"
version = "1.0.211" version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ac55e59090389fb9f0dd9e0f3c09615afed1d19094284d0b200441f13550793" checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.211"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54be4f245ce16bc58d57ef2716271d0d4519e0f6defa147f6e081005bcb278ff"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.82",
]
[[package]] [[package]]
name = "strsim" name = "strsim"
@ -275,17 +249,6 @@ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "syn"
version = "2.0.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]] [[package]]
name = "tabled" name = "tabled"
version = "0.8.0" version = "0.8.0"
@ -307,7 +270,7 @@ dependencies = [
"proc-macro-error", "proc-macro-error",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 1.0.109", "syn",
] ]
[[package]] [[package]]
@ -318,34 +281,34 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]] [[package]]
name = "termcolor" name = "termcolor"
version = "1.4.1" version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
dependencies = [ dependencies = [
"winapi-util", "winapi-util",
] ]
[[package]] [[package]]
name = "textwrap" name = "textwrap"
version = "0.16.1" version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9" checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
[[package]] [[package]]
name = "unicode-ident" name = "unicode-ident"
version = "1.0.13" version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
[[package]] [[package]]
name = "unicode-width" name = "unicode-width"
version = "0.1.14" version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
[[package]] [[package]]
name = "utfdump" name = "utfdump"
version = "0.2.0" version = "0.1.0"
dependencies = [ dependencies = [
"flate2", "flate2",
"tap", "tap",
@ -353,7 +316,7 @@ dependencies = [
[[package]] [[package]]
name = "utfdump_bin" name = "utfdump_bin"
version = "0.2.0" version = "0.1.0"
dependencies = [ dependencies = [
"clap", "clap",
"libshire", "libshire",
@ -362,73 +325,14 @@ dependencies = [
] ]
[[package]] [[package]]
name = "utfdump_wasm" name = "utfdump_core"
version = "0.1.0" version = "0.1.0"
dependencies = [
"utfdump",
"wasm-bindgen",
]
[[package]] [[package]]
name = "version_check" name = "version_check"
version = "0.9.5" version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasm-bindgen"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e"
dependencies = [
"cfg-if",
"once_cell",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
dependencies = [
"bumpalo",
"log",
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.82",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.82",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d"
[[package]] [[package]]
name = "winapi" name = "winapi"
@ -448,11 +352,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]] [[package]]
name = "winapi-util" name = "winapi-util"
version = "0.1.9" version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
dependencies = [ dependencies = [
"windows-sys", "winapi",
] ]
[[package]] [[package]]
@ -460,76 +364,3 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0" version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

@ -2,13 +2,4 @@
members = [ members = [
"lib", "lib",
"bin", "bin",
"wasm",
] ]
resolver = "2"
[profile.release]
panic = "abort"
lto = "fat"
[profile.release.package.utfdump_wasm]
opt-level = "z"

@ -1,6 +1,6 @@
[package] [package]
name = "utfdump_bin" name = "utfdump_bin"
version = "0.2.0" version = "0.1.0"
edition = "2021" edition = "2021"
authors = ["Tom Panton <pantonshire@gmail.com>"] authors = ["Tom Panton <pantonshire@gmail.com>"]
license = "MIT" license = "MIT"

@ -110,14 +110,14 @@ impl OutRow {
} }
fn from_bad_char(err: Utf8Error) -> Self { fn from_bad_char(err: Utf8Error) -> Self {
let (bad_bytes, _num_bad_bytes, num_consumed_bad_bytes) = err.into_parts(); let (bad_bytes, num_bad_bytes) = err.into_parts();
Self { Self {
display_char: CappedString::new_truncating("\u{fffd}"), display_char: CappedString::new_truncating("\u{fffd}"),
codepoint: Optional::None, codepoint: Optional::None,
utf_8_bytes: Utf8Bytes { utf_8_bytes: Utf8Bytes {
buf: bad_bytes, buf: bad_bytes,
len: num_consumed_bad_bytes, len: num_bad_bytes,
}, },
name: Optional::Some("<invalid>"), name: Optional::Some("<invalid>"),
category: Optional::None, category: Optional::None,

@ -52,12 +52,6 @@ from enum import Enum
from struct import pack from struct import pack
from typing import Optional from typing import Optional
from gzip import compress from gzip import compress
from time import time
import http.client
unicode_data_host = 'www.unicode.org'
unicode_data_url_path = '/Public/UCD/latest/ucd/UnicodeData.txt'
out_data_path = 'lib/unicode_data_encoded.gz'
class StringTableIndex: class StringTableIndex:
def __init__(self, bs: bytes): def __init__(self, bs: bytes):
@ -253,31 +247,19 @@ def encode_char_data(
return bytes(encoded) return bytes(encoded)
print('Fetching Unicode data from {}...'.format(unicode_data_host)) with open('unicode_data_latest.txt', 'r') as fd:
start_time = time() input_data = fd.read()
conn = http.client.HTTPConnection(unicode_data_host, timeout=30)
conn.request('GET', unicode_data_url_path)
resp = conn.getresponse()
resp_data = resp.read()
input_data = resp_data.decode('utf-8')
end_time = time()
print('Fetched Unicode data in {:.2f}s'.format(end_time - start_time))
char_data_table = bytearray() char_data_table = bytearray()
string_table = StringTable() string_table = StringTable()
# FIXME: calculate cumulative offset required for codepoints after each group
groups = [] groups = []
in_group = False in_group = False
prev_code = None prev_code = None
uniq_vals = {} uniq_vals = {}
rows = [row.strip() for row in input_data.splitlines() if len(row.strip()) > 0] for row in input_data.splitlines():
print('Encoding {} rows...'.format(len(rows)))
for row in rows:
is_group_start = False is_group_start = False
[ [
@ -443,9 +425,5 @@ encoded_data.extend(string_table)
compressed_data = compress(encoded_data) compressed_data = compress(encoded_data)
print('Writing encoded data to {}...'.format(out_data_path)) with open('unicode_data_encoded.gz', 'wb') as fd:
with open(out_data_path, 'wb') as fd:
fd.write(compressed_data) fd.write(compressed_data)
print('Done!')

@ -0,0 +1,2 @@
#!/bin/bash
curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt

@ -1,11 +1,8 @@
[package] [package]
name = "utfdump" name = "utfdump"
version = "0.2.0" version = "0.1.0"
edition = "2021" edition = "2021"
[features]
std = []
[dependencies] [dependencies]
tap = "1.0.1" tap = "1.0.1"

@ -1,6 +1,6 @@
use std::{env, fs::File, io, path::Path}; use std::{env, fs::File, io, path::Path};
const COMPRESSED_DATA_PATH: &str = "unicode_data_encoded.gz"; const COMPRESSED_DATA_PATH: &str = "../unicode_data_encoded.gz";
const OUT_DATA_PATH: &str = "unicode_data_encoded"; const OUT_DATA_PATH: &str = "unicode_data_encoded";
fn main() -> io::Result<()> { fn main() -> io::Result<()> {

@ -1,5 +1,3 @@
#![cfg_attr(not(feature = "std"), no_std)]
pub mod character; pub mod character;
pub mod unicode_data; pub mod unicode_data;
pub mod utf8; pub mod utf8;

@ -128,7 +128,6 @@ where
return Some(Err(Utf8Error { return Some(Err(Utf8Error {
bad_bytes: bytes_seen, bad_bytes: bytes_seen,
num_bad_bytes: 1, num_bad_bytes: 1,
num_consumed_bad_bytes: 1,
})); }));
}, },
} }
@ -141,7 +140,6 @@ where
None => return Some(Err(Utf8Error { None => return Some(Err(Utf8Error {
bad_bytes: bytes_seen, bad_bytes: bytes_seen,
num_bad_bytes: usize::from(i) + 1, num_bad_bytes: usize::from(i) + 1,
num_consumed_bad_bytes: usize::from(i),
})), })),
}; };
@ -151,7 +149,6 @@ where
return Some(Err(Utf8Error { return Some(Err(Utf8Error {
bad_bytes: bytes_seen, bad_bytes: bytes_seen,
num_bad_bytes: usize::from(i) + 2, num_bad_bytes: usize::from(i) + 2,
num_consumed_bad_bytes: usize::from(i) + 1,
})); }));
} }
@ -175,7 +172,6 @@ where
pub struct Utf8Error { pub struct Utf8Error {
bad_bytes: [u8; 4], bad_bytes: [u8; 4],
num_bad_bytes: usize, num_bad_bytes: usize,
num_consumed_bad_bytes: usize,
} }
impl Utf8Error { impl Utf8Error {
@ -183,9 +179,8 @@ impl Utf8Error {
&self.bad_bytes[..self.num_bad_bytes] &self.bad_bytes[..self.num_bad_bytes]
} }
// FIXME: return some type with u8 array + length pub fn into_parts(self) -> ([u8; 4], usize) {
pub fn into_parts(self) -> ([u8; 4], usize, usize) { (self.bad_bytes, self.num_bad_bytes)
(self.bad_bytes, self.num_bad_bytes, self.num_consumed_bad_bytes)
} }
} }
@ -197,49 +192,64 @@ mod tests {
#[test] #[test]
fn test_utf8_decoder() { fn test_utf8_decoder() {
assert_decodes_to(&[ assert_eq!(
&decode_collect_lossy(&[
0x68, 0x65, 0x6c, 0x6c, 0x6f 0x68, 0x65, 0x6c, 0x6c, 0x6f
], "hello"); ]),
"hello"
);
assert_decodes_to(&[ assert_eq!(
&decode_collect_lossy(&[
0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5
], "κόσμε"); ]),
"κόσμε"
);
assert_decodes_to(&[ assert_eq!(
&decode_collect_lossy(&[
0xf0, 0x9f, 0x8f, 0xb3, 0xef, 0xb8, 0x8f, 0xe2, 0x80, 0x8d, 0xe2, 0x9a, 0xa7, 0xef, 0xf0, 0x9f, 0x8f, 0xb3, 0xef, 0xb8, 0x8f, 0xe2, 0x80, 0x8d, 0xe2, 0x9a, 0xa7, 0xef,
0xb8, 0x8f 0xb8, 0x8f
], "\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}"); ]),
"\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}"
);
assert_decodes_to(&[ assert_eq!(
&decode_collect_lossy(&[
0xce, 0x61 0xce, 0x61
], "\u{fffd}a"); ]),
"\u{fffd}a"
);
assert_decodes_to(&[ assert_eq!(
&decode_collect_lossy(&[
0xce, 0xc2 0xce, 0xc2
], "\u{fffd}\u{fffd}"); ]),
"\u{fffd}\u{fffd}"
);
assert_decodes_to(&[ assert_eq!(
&decode_collect_lossy(&[
0x80 0x80
], "\u{fffd}"); ]),
"\u{fffd}"
);
assert_decodes_to(&[ assert_eq!(
&decode_collect_lossy(&[
0x80, 0x80 0x80, 0x80
], "\u{fffd}\u{fffd}"); ]),
} "\u{fffd}\u{fffd}"
);
fn assert_decodes_to(bytes: &[u8], expected: &str) { }
let mut decoded = bytes.decode_utf8();
fn decode_collect_lossy(bytes: &[u8]) -> String {
for expected_char in expected.chars() { bytes
let decoded_char = match decoded.next() { .decode_utf8()
Some(Ok(c)) => Some(c), .map(|res| match res {
Some(Err(_)) => Some(REPLACEMENT_CHARACTER), Ok(c) => c,
None => None, Err(_) => REPLACEMENT_CHARACTER,
}; })
.collect()
assert_eq!(decoded_char, Some(expected_char));
}
assert!(decoded.next().is_none());
} }
} }

@ -1,12 +0,0 @@
[package]
name = "utfdump_wasm"
version = "0.1.0"
edition = "2021"
rust-version = "1.70.0"
[lib]
crate-type = ["cdylib", "rlib"]
[dependencies]
utfdump = { path = "../lib" }
wasm-bindgen = "0.2.86"

@ -1,133 +0,0 @@
use std::sync::OnceLock;
use utfdump::{UnicodeData, CombiningClass, CharData};
use wasm_bindgen::prelude::wasm_bindgen;
#[wasm_bindgen]
pub struct WbgCharData(CharData<'static>);
#[wasm_bindgen]
impl WbgCharData {
#[wasm_bindgen]
pub fn name(&self) -> String {
self.0.name().to_owned()
}
#[wasm_bindgen]
pub fn encoded_utf8(&self) -> Option<EncodedCodepoint> {
let c = char::try_from(self.0.codepoint()).ok()?;
let mut buf = [0u8; 4];
let len = c.encode_utf8(&mut buf).len() as u8;
Some(EncodedCodepoint::new(buf, len))
}
#[wasm_bindgen]
pub fn encoded_utf16_le(&self) -> Option<EncodedCodepoint> {
let (word_buf, num_words) = self.encoded_utf16()?;
let mut byte_buf = [0u8; 4];
for (i, word) in word_buf.iter().take(usize::from(num_words)).enumerate() {
let le_bytes = word.to_le_bytes();
byte_buf[(i * 2)..(i * 2 + 2)].copy_from_slice(&le_bytes);
}
Some(EncodedCodepoint::new(byte_buf, num_words * 2))
}
fn encoded_utf16(&self) -> Option<([u16; 2], u8)> {
let c = char::try_from(self.0.codepoint()).ok()?;
let mut word_buf = [0u16; 2];
let num_words = c.encode_utf16(&mut word_buf).len() as u8;
Some((word_buf, num_words))
}
#[wasm_bindgen]
pub fn category(&self) -> String {
self.0.category().abbreviation().to_owned()
}
#[wasm_bindgen]
pub fn category_full(&self) -> String {
self.0.category().full_name().to_owned()
}
#[wasm_bindgen]
pub fn combining_class(&self) -> u8 {
self.0.combining_class().0
}
#[wasm_bindgen]
pub fn bidi(&self) -> String {
self.0.bidi_category().abbreviation().to_owned()
}
#[wasm_bindgen]
pub fn bidi_full(&self) -> String {
self.0.bidi_category().full_name().to_owned()
}
#[wasm_bindgen]
pub fn numeric_value(&self) -> Option<String> {
self.0.numeric_value().map(ToOwned::to_owned)
}
#[wasm_bindgen]
pub fn mirrored(&self) -> bool {
self.0.mirrored()
}
#[wasm_bindgen]
pub fn decomp_string(&self) -> Option<String> {
self.0.decomp_mapping().map(|d| d.value().to_owned())
}
#[wasm_bindgen]
pub fn uppercase_string(&self) -> Option<String> {
self.0.uppercase().map(ToOwned::to_owned)
}
#[wasm_bindgen]
pub fn lowercase_string(&self) -> Option<String> {
self.0.lowercase().map(ToOwned::to_owned)
}
#[wasm_bindgen]
pub fn titlecase_string(&self) -> Option<String> {
self.0.titlecase().map(ToOwned::to_owned)
}
}
#[wasm_bindgen]
pub fn combining_class_name(combining_class: u8) -> Option<String> {
CombiningClass(combining_class)
.name()
.map(ToOwned::to_owned)
}
static UNICODE_DATA: OnceLock<UnicodeData> = OnceLock::new();
#[wasm_bindgen]
pub fn codepoint_char_data(codepoint: u32) -> Option<WbgCharData> {
let unicode_data = UNICODE_DATA.get_or_init(|| {
UnicodeData::new()
.unwrap()
});
unicode_data
.get(codepoint)
.map(WbgCharData)
}
#[wasm_bindgen]
pub struct EncodedCodepoint {
// `wasm-bindgen` unfortunately does not support arrays :(
pub b0: u8,
pub b1: u8,
pub b2: u8,
pub b3: u8,
pub len: u8,
}
impl EncodedCodepoint {
fn new([b0, b1, b2, b3]: [u8; 4], len: u8) -> Self {
Self { b0, b1, b2, b3, len }
}
}
Loading…
Cancel
Save