initial commit

main
pantonshire 2 years ago
commit f07fc58d1f

1
.gitignore vendored

@ -0,0 +1 @@
/target

7
Cargo.lock generated

@ -0,0 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "tpstrutil"
version = "0.1.0"

@ -0,0 +1,8 @@
[package]
name = "tpstrutil"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

@ -0,0 +1,124 @@
/// Copies the longest prefix of `s` that fits in `N` bytes into a zero-initialised `[u8; N]`.
///
/// The prefix length is chosen by [`truncated_str_len`], so the copied bytes always end on a
/// UTF-8 code point boundary and are themselves valid UTF-8.
///
/// Returns the buffer together with the number of bytes copied into it. Bytes of the buffer past
/// that length are left as zero.
pub fn truncate_str_to_array<const N: usize>(s: &str) -> ([u8; N], usize) {
    let mut buf = [0u8; N];
    let tr_len = truncated_str_len(s, N);

    // `truncated_str_len` returns a length no greater than either `s.len()` or `N`, so neither
    // slice below can go out of bounds. Checked slicing is used instead of `get_unchecked` so
    // that a future mistake in the length computation shows up as a panic rather than as
    // undefined behaviour; the two bounds checks are negligible next to the copy itself.
    buf[..tr_len].copy_from_slice(&s.as_bytes()[..tr_len]);

    (buf, tr_len)
}
/// Returns the length in bytes of the longest prefix of `s` that is at most `n` bytes long and
/// ends on a UTF-8 code point boundary.
///
/// If `s` already fits in `n` bytes, its full length is returned. The result is always less than
/// or equal to both `s.len()` and `n`, so `&s[..result]` is always a valid string slice.
///
/// Only code points are kept intact: a sequence of several code points that renders as a single
/// character (for example `e` followed by the combining accent U+0301) may still be cut between
/// its code points.
pub fn truncated_str_len(s: &str, n: usize) -> usize {
    if s.len() <= n {
        return s.len();
    }

    // Here `n < s.len()`, so `n` is a valid index into `s`. Walk backwards from `n` until the
    // cut lands on a code point boundary. A code point occupies at most 4 bytes, so this steps
    // back at most 3 times, and index 0 is always a boundary, so the subtraction cannot
    // underflow.
    let mut tr_len = n;
    while !s.is_char_boundary(tr_len) {
        tr_len -= 1;
    }
    tr_len
}
/// Reports whether `b` is a UTF-8 continuation byte, i.e. has the bit pattern `10xxxxxx`.
///
/// The misspelling in the function name is part of the public interface and is kept as is.
pub fn is_utf8_continutation(b: u8) -> bool {
    // `10xxxxxx` covers exactly the byte values 0x80 through 0xBF.
    matches!(b, 0x80..=0xBF)
}
#[cfg(test)]
mod tests {
    use std::str;

    use super::{truncate_str_to_array, truncated_str_len};

    /// Two three-byte code points (6 bytes in total).
    const NIHON: &str = "日本";

    /// `cafe`, then the two-byte combining acute accent U+0301, then `s` (7 bytes in total).
    const CAFES: &str = "cafe\u{0301}s";

    #[test]
    fn test_truncate_str_len() {
        // Each entry is (input, byte limit, expected truncated length).
        let cases: &[(&str, usize, usize)] = &[
            ("", 0, 0),
            ("", 1, 0),
            ("", 2, 0),
            ("", 3, 0),
            ("", usize::MAX, 0),
            ("hi", 0, 0),
            ("hi", 1, 1),
            ("hi", 2, 2),
            ("hi", 3, 2),
            ("hi", usize::MAX, 2),
            (NIHON, 0, 0),
            (NIHON, 1, 0),
            (NIHON, 2, 0),
            (NIHON, 3, 3),
            (NIHON, 4, 3),
            (NIHON, 5, 3),
            (NIHON, 6, 6),
            (NIHON, 7, 6),
            (NIHON, 8, 6),
            (NIHON, 9, 6),
            (NIHON, usize::MAX, 6),
            (CAFES, 0, 0),
            (CAFES, 1, 1),
            (CAFES, 2, 2),
            (CAFES, 3, 3),
            (CAFES, 4, 4),
            (CAFES, 5, 4),
            (CAFES, 6, 6),
            (CAFES, 7, 7),
            (CAFES, 8, 7),
            (CAFES, usize::MAX, 7),
        ];

        for &(input, limit, expected) in cases {
            assert_eq!(
                truncated_str_len(input, limit),
                expected,
                "input {:?}, limit {}",
                input,
                limit
            );
        }
    }

    #[test]
    fn test_truncate_str_to_array() {
        /// Truncates `input` into an `N`-byte array and checks that the filled part of the array
        /// decodes to `expected`.
        fn check<const N: usize>(input: &str, expected: &str) {
            let (buf, len) = truncate_str_to_array::<N>(input);
            assert_eq!(str::from_utf8(&buf[..len]).unwrap(), expected);
        }

        check::<0>("", "");
        check::<1>("", "");
        check::<2>("", "");
        check::<3>("", "");

        check::<0>("hi", "");
        check::<1>("hi", "h");
        check::<2>("hi", "hi");
        check::<3>("hi", "hi");

        check::<0>(NIHON, "");
        check::<1>(NIHON, "");
        check::<2>(NIHON, "");
        check::<3>(NIHON, "日");
        check::<4>(NIHON, "日");
        check::<5>(NIHON, "日");
        check::<6>(NIHON, "日本");
        check::<7>(NIHON, "日本");

        check::<0>(CAFES, "");
        check::<1>(CAFES, "c");
        check::<2>(CAFES, "ca");
        check::<3>(CAFES, "caf");
        check::<4>(CAFES, "cafe");
        check::<5>(CAFES, "cafe");
        check::<6>(CAFES, "cafe\u{0301}");
        check::<7>(CAFES, "cafe\u{0301}s");
        check::<8>(CAFES, "cafe\u{0301}s");
    }
}
Loading…
Cancel
Save