From 4917be0963d48059354feb82e16e60792a2b233f Mon Sep 17 00:00:00 2001 From: pantonshire Date: Fri, 9 Sep 2022 21:20:30 +0100 Subject: [PATCH] strings: WIP CappedString refactoring and improvements This patch changes the name of the CappedString error type to be more descriptive, adds inline and must_use annotations to more public functions, and begins implementing a wider variety of ways to create a CappedString. --- src/strings/capped.rs | 213 +++++++++++++++++++++++++++++++++--------- src/strings/mod.rs | 2 +- 2 files changed, 170 insertions(+), 45 deletions(-) diff --git a/src/strings/capped.rs b/src/strings/capped.rs index 84b8080..751972d 100644 --- a/src/strings/capped.rs +++ b/src/strings/capped.rs @@ -20,6 +20,18 @@ use alloc::{ #[cfg(feature = "std")] use std::borrow::Cow; +#[derive(Debug)] +pub struct CapacityError; + +impl fmt::Display for CapacityError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "`CappedString` capacity exceeded") + } +} + +#[cfg(feature = "std")] +impl std::error::Error for CapacityError {} + /// A string type which stores at most `N` bytes of string data. The string data is stored inline /// rather than using a heap allocation. /// @@ -50,8 +62,8 @@ impl CappedString { /// Creates a new `CappedString` from a given byte buffer and length. /// /// # Safety - /// - /// The first `len` bytes of `buf` (i.e. `buf[..len]`) must be initialised and valid UTF-8. + /// The first `len` bytes of `buf` (i.e. `buf[..len]`) must be initialised and valid UTF-8. + /// `len` must be less than or equal to `N`. #[inline] #[must_use] pub const unsafe fn from_raw_parts(buf: [MaybeUninit; N], len: u8) -> Self { @@ -60,10 +72,38 @@ impl CappedString { #[inline] #[must_use] - pub fn into_raw_parts(self) -> ([MaybeUninit; N], u8) { + pub const fn into_raw_parts(self) -> ([MaybeUninit; N], u8) { (self.buf, self.len) } + /// # Safety + /// `src` must point to `len` bytes of valid, initialised, UTF-8 string data. `len` must be less + /// than or equal to `N`. + #[inline] + unsafe fn from_raw_ptr(src: *const u8, len: u8) -> Self { + // `u8` has the same memory layout as `MaybeUninit`, so this cast is valid. + let src = src as *const MaybeUninit; + + // SAFETY: + // `MaybeUninit::uninit()` is a valid value for `[MaybeUninit; N]`, since each element + // of the array is allowed to be uninitialised. + let mut buf = unsafe { MaybeUninit::<[MaybeUninit; N]>::uninit().assume_init() }; + + // SAFETY: + // The caller is responsible for ensuring that `src` points to a valid string, which means + // that it must not overlap with the new local variable `buf`. The caller is responsible + // for ensuring that `src` is valid for reads of `len` bytes. The caller is responsible for + // ensuring that `len <= N`, so `buf` is valid for writes of `len` bytes. `src` and `buf` + // are both trivially properly aligned, since they both have an alignment of 1. + unsafe { ptr::copy_nonoverlapping(src, buf.as_mut_ptr(), usize::from(len)); } + + // SAFETY: + // The caller is responsible for ensuring that `src` points to `len` bytes of valid UTF-8 + // data. `src` is copied into the start of `buf`, so the first `len` bytes of `buf` are + // valid UTF-8. The caller is responsible for ensuring that `len <= N`. + unsafe { Self::from_raw_parts(buf, len) } + } + /// Returns a new empty `CappedString`. /// /// ``` @@ -100,23 +140,18 @@ impl CappedString { /// # } /// ``` #[inline] - pub fn new(s: &S) -> Result + pub fn new(src: &S) -> Result where S: AsRef + ?Sized, { // Convert the string to a byte slice, which is guaranteed to be valid UTF-8 since this is // an invariant of `str`. - let src = >::as_ref(s).as_bytes(); + let src = >::as_ref(src).as_bytes(); // If the length of the string is greater than `Self::MAX_LEN`, it will not fit in the - // buffer so return `None`. - let len = u8::try_from(src.len()) - .ok() - .and_then(|len| (len <= Self::MAX_LEN).then_some(len)) - .ok_or(Error { - max_len: N, - actual_len: src.len(), - })?; + // buffer so return an error. + let len = Self::bound_to_max_len(src.len()) + .ok_or(CapacityError)?; // SAFETY: // `MaybeUninit::uninit()` is a valid value for `[MaybeUninit; N]`, since each element @@ -139,6 +174,52 @@ impl CappedString { unsafe { Ok(Self::from_raw_parts(buf, len)) } } + #[inline] + #[must_use] + pub fn new_truncating(src: &S) -> Self + where + S: AsRef + ?Sized, + { + let (src, len) = truncate_str(>::as_ref(src), Self::MAX_LEN); + + // SAFETY: + // It is part of the contract of `truncate_str` that it returns a pointer to a valid UTF-8 + // string of length `len`, and that `len` is less than or equal to the provided maximum + // length, which is `Self::MAX_LEN` (which is equal to `N`) in this case. + unsafe { Self::from_raw_ptr(src, len) } + } + + /// Returns the length as a `u8` if it is less than or equal to `Self::MAX_LEN` (which is the + /// `u8` representation of `N`). Otherwise, returns `None`. + #[inline] + fn bound_to_max_len(len: usize) -> Option { + u8::try_from(len) + .ok() + .and_then(|len| (len <= Self::MAX_LEN).then_some(len)) + } + + pub fn push(&mut self, c: char) -> Result<(), CapacityError> { + todo!() + } + + pub fn push_truncating(&mut self, c: char) { + todo!() + } + + pub fn push_str(&mut self, s: &S) -> Result<(), CapacityError> + where + S: AsRef + ?Sized, + { + todo!() + } + + pub fn push_str_truncating(&mut self, s: &S) + where + S: AsRef + ?Sized, + { + todo!() + } + /// Returns a string slice pointing to the underlying string data. #[inline] #[must_use] @@ -194,10 +275,14 @@ impl CappedString { unsafe { &mut *(data_slice as *mut [MaybeUninit] as *mut [u8]) } } + #[inline] + #[must_use] pub fn len(&self) -> usize { usize::from(self.len) } + #[inline] + #[must_use] pub fn is_empty(&self) -> bool { self.len == 0 } @@ -205,10 +290,14 @@ impl CappedString { #[cfg(feature = "alloc")] impl CappedString { + #[inline] + #[must_use] pub fn into_boxed_str(self) -> Box { self.as_str().into() } + #[inline] + #[must_use] pub fn into_string(self) -> String { self.as_str().to_owned() } @@ -266,7 +355,7 @@ impl borrow::BorrowMut for CappedString { } impl<'a, const N: usize> TryFrom<&'a str> for CappedString { - type Error = Error; + type Error = CapacityError; #[inline] fn try_from(s: &'a str) -> Result { @@ -276,7 +365,7 @@ impl<'a, const N: usize> TryFrom<&'a str> for CappedString { #[cfg(feature = "alloc")] impl TryFrom for CappedString { - type Error = Error; + type Error = CapacityError; #[inline] fn try_from(s: String) -> Result { @@ -286,7 +375,7 @@ impl TryFrom for CappedString { #[cfg(feature = "alloc")] impl TryFrom> for CappedString { - type Error = Error; + type Error = CapacityError; #[inline] fn try_from(s: Box) -> Result { @@ -296,7 +385,7 @@ impl TryFrom> for CappedString { #[cfg(feature = "alloc")] impl<'a, const N: usize> TryFrom> for CappedString { - type Error = Error; + type Error = CapacityError; #[inline] fn try_from(s: Cow<'a, str>) -> Result { @@ -351,7 +440,7 @@ impl Hash for CappedString { } impl str::FromStr for CappedString { - type Err = Error; + type Err = CapacityError; #[inline] fn from_str(s: &str) -> Result { @@ -373,32 +462,68 @@ impl fmt::Display for CappedString { } } -#[derive(Debug)] -pub struct Error { - max_len: usize, - actual_len: usize, -} - -impl Error { - pub fn max_len(&self) -> usize { - self.max_len - } - - pub fn actual_len(&self) -> usize { - self.actual_len - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "string of length {} exceeds limit for `CappedString<{}>`", - self.actual_len, - self.max_len - ) +/// Returns a pointer to the longest prefix of `src` which is valid UTF-8 and whose length is +/// shorter than `max_len`, and returns the length of this prefix. +#[inline] +fn truncate_str(src: &str, max_len: u8) -> (*const u8, u8) { + match u8::try_from(src.len()) { + // If the length of the `src` string is less than or equal to `max_len`, there is no need to + // truncate it. + Ok(src_len) if src_len <= max_len => (src.as_ptr(), src_len), + + // If the length of `src` is greater than `max_len`, we need to truncate it. Note that + // `u8::try_from` returning an error means that `src.len() > max_len`, since `max_len` is a + // `u8` and `src.len()` is a `usize`. + _ => { + let src = src.as_bytes(); + + let mut i = max_len; + + // Find the rightmost codepoint which starts at an index less than or equal to + // `max_len`. Everything to the left of this will be valid UTF-8 with a length less + // than or equal to `max_len`. We only need to do 3 iterations because codepoints have + // a maximum length of 4 bytes. + for _ in 0..3 { + // The first byte in the string must always be the start of a codepoint. + if i == 0 { + break; + } + + // SAFETY: + // `i <= max_len`, since it is never incremented. If this branch is run, then either + // `src.len(): usize` does not fit into a `u8`, in which case it must be greater + // than `max_len: u8`, or it does fit into a `u8` but it is greater than `max_len`. + // Therefore, `src.len() > max_len` must hold. Substitution gives `i < src.len()`, + // so `i` is a valid index into `src`. + let byte = unsafe { *src.get_unchecked(usize::from(i)) }; + + // If the byte is not of the form 0b10xxxxxx, then it is the start of a codepoint. + if byte & 0xc0 != 0x80 { + break; + } + + i -= 1; + } + + // // SAFETY: + // // As discussed above, `i < src.len()` always holds, so `..i` is a valid range over + // // `src`. + // let src_truncated = unsafe { src.get_unchecked(..usize::from(i)) }; + + // // SAFETY: + // // `i` is the index of a start of a codepoint, and codepoints are contiguous, so the + // // substring `src[..i]` must be valid UTF-8. + // let src_truncated = unsafe { str::from_utf8_unchecked(src_truncated) }; + + // (src_truncated, i) + + // `i < src.len()` always holds as discussed above, so the pointer `src.as_ptr()` is + // valid for reads of `i` bytes. `i` is the index of the start of a codepoint, and + // codepoints are contiguous, so the `i` bytes being pointed to must be valid UTF-8. + (src.as_ptr(), i) + }, } } -#[cfg(feature = "std")] -impl std::error::Error for Error {} +#[cfg(test)] +mod tests {} diff --git a/src/strings/mod.rs b/src/strings/mod.rs index 6facc8d..3825407 100644 --- a/src/strings/mod.rs +++ b/src/strings/mod.rs @@ -4,6 +4,6 @@ pub mod capped; pub mod inlining; pub use fixed::{FixedString, Error as FixedStringError}; -pub use capped::{CappedString, Error as CappedStringError}; +pub use capped::{CappedString, CapacityError as CappedStringError}; #[cfg(feature = "alloc")] pub use inlining::{InliningString, InliningString23};