// Provenance: `src/lib.rs` of crate `tpstrutil` 0.1.0 (edition 2021, no dependencies), as added
// by commit f07fc58d1f4ef9794d8349dc547e32dd50a08fa5 ("initial commit", author pantonshire,
// Mon Jan 1 12:32:41 2024 +0000). The same commit also added `.gitignore` (containing
// `/target`), `Cargo.lock` and `Cargo.toml`.

/// Copies the longest prefix of `s` that fits in `N` bytes and ends on a UTF-8 codepoint
/// boundary into a fixed-size array.
///
/// Returns the array together with the number of bytes copied. The array starts out
/// zero-filled, so every byte at or after the returned length is `0`. The first `len` bytes of
/// the array are always valid UTF-8.
pub fn truncate_str_to_array<const N: usize>(s: &str) -> ([u8; N], usize) {
    let mut buf = [0u8; N];
    let tr_len = truncated_str_len(s, N);
    // `truncated_str_len` returns a length less than or equal to both the string length and the
    // maximum truncated length `N`, so both slices below are in bounds.
    buf[..tr_len].copy_from_slice(&s.as_bytes()[..tr_len]);
    (buf, tr_len)
}

/// Returns the length in bytes of the longest prefix of `s` that is at most `n` bytes long and
/// ends on a UTF-8 codepoint boundary.
///
/// If `s` already fits in `n` bytes, its full length is returned.
pub fn truncated_str_len(s: &str, n: usize) -> usize {
    let bs = s.as_bytes();

    if bs.len() <= n {
        return bs.len();
    }

    // `bs[n]` is the byte immediately after the end of the candidate truncated string. We are
    // splitting the string at a codepoint boundary (and therefore have a valid truncated string)
    // iff this byte is not a continuation byte, so walk backwards from `bs[n]` and shorten the
    // candidate by one for every continuation byte seen before the first non-continuation byte.
    //
    // There can be a maximum of 3 consecutive continuation bytes, so after seeing 3 of them the
    // next byte cannot be a continuation byte and does not need to be checked; `take(3)` encodes
    // that bound and hopefully lets the compiler unroll the scan.
    //
    // `bs.len() > n` holds here, so the `..=n` slice never panics.
    let continuation_run = bs[..=n]
        .iter()
        .rev()
        .take(3)
        .take_while(|&&b| is_utf8_continuation(b))
        .count();

    // This cannot underflow: the first byte of a UTF-8 string is never a continuation byte, so
    // the run counted above is never longer than `n`.
    n - continuation_run
}

/// Returns `true` if `b` is a UTF-8 continuation byte, i.e. its two high bits are `10`.
pub fn is_utf8_continuation(b: u8) -> bool {
    b & 0b1100_0000 == 0b1000_0000
}

/// Misspelled original name of [`is_utf8_continuation`], kept so existing callers keep working.
pub fn is_utf8_continutation(b: u8) -> bool {
    is_utf8_continuation(b)
}

#[cfg(test)]
mod tests {
    use std::str;

    use super::{truncated_str_len, truncate_str_to_array};

    #[test]
    fn test_truncate_str_len() {
        assert_eq!(truncated_str_len("", 0), 0);
        assert_eq!(truncated_str_len("", 1), 0);
        assert_eq!(truncated_str_len("", 2), 0);
        assert_eq!(truncated_str_len("", 3), 0);
        assert_eq!(truncated_str_len("", usize::MAX), 0);

        assert_eq!(truncated_str_len("hi", 0), 0);
        assert_eq!(truncated_str_len("hi", 1), 1);
        assert_eq!(truncated_str_len("hi", 2), 2);
        assert_eq!(truncated_str_len("hi", 3), 2);
        assert_eq!(truncated_str_len("hi", usize::MAX), 2);

        assert_eq!(truncated_str_len("日本", 0), 0);
        assert_eq!(truncated_str_len("日本", 1), 0);
        assert_eq!(truncated_str_len("日本", 2), 0);
        assert_eq!(truncated_str_len("日本", 3), 3);
        assert_eq!(truncated_str_len("日本", 4), 3);
        assert_eq!(truncated_str_len("日本", 5), 3);
        assert_eq!(truncated_str_len("日本", 6), 6);
        assert_eq!(truncated_str_len("日本", 7), 6);
        assert_eq!(truncated_str_len("日本", 8), 6);
        assert_eq!(truncated_str_len("日本", 9), 6);
        assert_eq!(truncated_str_len("日本", usize::MAX), 6);

        assert_eq!(truncated_str_len("cafe\u{0301}s", 0), 0);
        assert_eq!(truncated_str_len("cafe\u{0301}s", 1), 1);
        assert_eq!(truncated_str_len("cafe\u{0301}s", 2), 2);
        assert_eq!(truncated_str_len("cafe\u{0301}s", 3), 3);
        assert_eq!(truncated_str_len("cafe\u{0301}s", 4), 4);
        assert_eq!(truncated_str_len("cafe\u{0301}s", 5), 4);
        assert_eq!(truncated_str_len("cafe\u{0301}s", 6), 6);
        assert_eq!(truncated_str_len("cafe\u{0301}s", 7), 7);
        assert_eq!(truncated_str_len("cafe\u{0301}s", 8), 7);
        assert_eq!(truncated_str_len("cafe\u{0301}s", usize::MAX), 7);
    }

    #[test]
    fn test_truncate_str_to_array() {
        // Views the filled prefix of a `truncate_str_to_array` result as a string, panicking if
        // that prefix is not valid UTF-8.
        fn array_to_str<const N: usize>(x: &([u8; N], usize)) -> &str {
            str::from_utf8(&x.0[..x.1]).unwrap()
        }

        assert_eq!(array_to_str(&truncate_str_to_array::<0>("")), "");
        assert_eq!(array_to_str(&truncate_str_to_array::<1>("")), "");
        assert_eq!(array_to_str(&truncate_str_to_array::<2>("")), "");
        assert_eq!(array_to_str(&truncate_str_to_array::<3>("")), "");

        assert_eq!(array_to_str(&truncate_str_to_array::<0>("hi")), "");
        assert_eq!(array_to_str(&truncate_str_to_array::<1>("hi")), "h");
        assert_eq!(array_to_str(&truncate_str_to_array::<2>("hi")), "hi");
        assert_eq!(array_to_str(&truncate_str_to_array::<3>("hi")), "hi");

        assert_eq!(array_to_str(&truncate_str_to_array::<0>("日本")), "");
        assert_eq!(array_to_str(&truncate_str_to_array::<1>("日本")), "");
        assert_eq!(array_to_str(&truncate_str_to_array::<2>("日本")), "");
        assert_eq!(array_to_str(&truncate_str_to_array::<3>("日本")), "日");
        assert_eq!(array_to_str(&truncate_str_to_array::<4>("日本")), "日");
        assert_eq!(array_to_str(&truncate_str_to_array::<5>("日本")), "日");
        assert_eq!(array_to_str(&truncate_str_to_array::<6>("日本")), "日本");
        assert_eq!(array_to_str(&truncate_str_to_array::<7>("日本")), "日本");

        assert_eq!(array_to_str(&truncate_str_to_array::<0>("cafe\u{0301}s")), "");
        assert_eq!(array_to_str(&truncate_str_to_array::<1>("cafe\u{0301}s")), "c");
        assert_eq!(array_to_str(&truncate_str_to_array::<2>("cafe\u{0301}s")), "ca");
        assert_eq!(array_to_str(&truncate_str_to_array::<3>("cafe\u{0301}s")), "caf");
        assert_eq!(array_to_str(&truncate_str_to_array::<4>("cafe\u{0301}s")), "cafe");
        assert_eq!(array_to_str(&truncate_str_to_array::<5>("cafe\u{0301}s")), "cafe");
        assert_eq!(array_to_str(&truncate_str_to_array::<6>("cafe\u{0301}s")), "cafe\u{0301}");
        assert_eq!(array_to_str(&truncate_str_to_array::<7>("cafe\u{0301}s")), "cafe\u{0301}s");
        assert_eq!(array_to_str(&truncate_str_to_array::<8>("cafe\u{0301}s")), "cafe\u{0301}s");
    }
}