strings: WIP CappedString refactoring and improvements

This patch changes the name of the CappedString error type to be more descriptive, adds inline and must_use annotations to more public functions, and begins implementing a wider variety of ways to create a CappedString.
3 years ago · 4917be0963
parent 96a085686b
commit 4917be0963
2 changed files with 170 additions and 45 deletions
--- a/src/strings/capped.rs
+++ b/src/strings/capped.rs
@ -20,6 +20,18 @@ use alloc::{
 #[cfg(feature = "std")]
 use std::borrow::Cow;

+#[derive(Debug)]
+pub struct CapacityError;
+
+impl fmt::Display for CapacityError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "`CappedString` capacity exceeded")
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for CapacityError {}
+
 /// A string type which stores at most `N` bytes of string data. The string data is stored inline
 /// rather than using a heap allocation.
 ///
@ -50,8 +62,8 @@ impl<const N: usize> CappedString<N> {
    /// Creates a new `CappedString` from a given byte buffer and length.
    ///
    /// # Safety
-    ///
-    /// The first `len` bytes of `buf` (i.e. `buf[..len]`) must be initialised and valid UTF-8.
+    /// The first `len` bytes of `buf` (i.e. `buf[..len]`) must be initialised and valid UTF-8. 
+    /// `len` must be less than or equal to `N`.
    #[inline]
    #[must_use]
    pub const unsafe fn from_raw_parts(buf: [MaybeUninit<u8>; N], len: u8) -> Self {
@ -60,10 +72,38 @@ impl<const N: usize> CappedString<N> {

    #[inline]
    #[must_use]
-    pub fn into_raw_parts(self) -> ([MaybeUninit<u8>; N], u8) {
+    pub const fn into_raw_parts(self) -> ([MaybeUninit<u8>; N], u8) {
        (self.buf, self.len)
    }

+    /// # Safety
+    /// `src` must point to `len` bytes of valid, initialised, UTF-8 string data. `len` must be less
+    /// than or equal to `N`.
+    #[inline]
+    unsafe fn from_raw_ptr(src: *const u8, len: u8) -> Self {
+        // `u8` has the same memory layout as `MaybeUninit<u8>`, so this cast is valid.
+        let src = src as *const MaybeUninit<u8>;
+
+        // SAFETY:
+        // `MaybeUninit::uninit()` is a valid value for `[MaybeUninit<u8>; N]`, since each element
+        // of the array is allowed to be uninitialised.
+        let mut buf = unsafe { MaybeUninit::<[MaybeUninit<u8>; N]>::uninit().assume_init() };
+
+        // SAFETY:
+        // The caller is responsible for ensuring that `src` points to a valid string, which means
+        // that it must not overlap with the new local variable `buf`. The caller is responsible
+        // for ensuring that `src` is valid for reads of `len` bytes. The caller is responsible for
+        // ensuring that `len <= N`, so `buf` is valid for writes of `len` bytes. `src` and `buf`
+        // are both trivially properly aligned, since they both have an alignment of 1.
+        unsafe { ptr::copy_nonoverlapping(src, buf.as_mut_ptr(), usize::from(len)); }
+
+        // SAFETY:
+        // The caller is responsible for ensuring that `src` points to `len` bytes of valid UTF-8
+        // data. `src` is copied into the start of `buf`, so the first `len` bytes of `buf` are
+        // valid UTF-8. The caller is responsible for ensuring that `len <= N`.
+        unsafe { Self::from_raw_parts(buf, len) }
+    }
+
    /// Returns a new empty `CappedString`.
    ///
    /// ```
@ -100,23 +140,18 @@ impl<const N: usize> CappedString<N> {
    /// # }
    /// ```
    #[inline]
-    pub fn new<S>(s: &S) -> Result<Self, Error>
+    pub fn new<S>(src: &S) -> Result<Self, CapacityError>
    where
        S: AsRef<str> + ?Sized,
    {
        // Convert the string to a byte slice, which is guaranteed to be valid UTF-8 since this is
        // an invariant of `str`.
-        let src = <S as AsRef<str>>::as_ref(s).as_bytes();
+        let src = <S as AsRef<str>>::as_ref(src).as_bytes();

        // If the length of the string is greater than `Self::MAX_LEN`, it will not fit in the
-        // buffer so return `None`.
-        let len = u8::try_from(src.len())
-            .ok()
-            .and_then(|len| (len <= Self::MAX_LEN).then_some(len))
-            .ok_or(Error {
-                max_len: N,
-                actual_len: src.len(),
-            })?;
+        // buffer so return an error.
+        let len = Self::bound_to_max_len(src.len())
+            .ok_or(CapacityError)?;

        // SAFETY:
        // `MaybeUninit::uninit()` is a valid value for `[MaybeUninit<u8>; N]`, since each element
@ -139,6 +174,52 @@ impl<const N: usize> CappedString<N> {
        unsafe { Ok(Self::from_raw_parts(buf, len)) }
    }

+    #[inline]
+    #[must_use]
+    pub fn new_truncating<S>(src: &S) -> Self
+    where
+        S: AsRef<str> + ?Sized,
+    {
+        let (src, len) = truncate_str(<S as AsRef<str>>::as_ref(src), Self::MAX_LEN);
+
+        // SAFETY:
+        // It is part of the contract of `truncate_str` that it returns a pointer to a valid UTF-8
+        // string of length `len`, and that `len` is less than or equal to the provided maximum
+        // length, which is `Self::MAX_LEN` (which is equal to `N`) in this case.
+        unsafe { Self::from_raw_ptr(src, len) }
+    }
+
+    /// Returns the length as a `u8` if it is less than or equal to `Self::MAX_LEN` (which is the
+    /// `u8` representation of `N`). Otherwise, returns `None`.
+    #[inline]
+    fn bound_to_max_len(len: usize) -> Option<u8> {
+        u8::try_from(len)
+            .ok()
+            .and_then(|len| (len <= Self::MAX_LEN).then_some(len))
+    }
+
+    pub fn push(&mut self, c: char) -> Result<(), CapacityError> {
+        todo!()
+    }
+
+    pub fn push_truncating(&mut self, c: char) {
+        todo!()
+    }
+
+    pub fn push_str<S>(&mut self, s: &S) -> Result<(), CapacityError>
+    where
+        S: AsRef<str> + ?Sized,
+    {
+        todo!()
+    }
+
+    pub fn push_str_truncating<S>(&mut self, s: &S)
+    where
+        S: AsRef<str> + ?Sized,
+    {
+        todo!()
+    }
+
    /// Returns a string slice pointing to the underlying string data.
    #[inline]
    #[must_use]
@ -194,10 +275,14 @@ impl<const N: usize> CappedString<N> {
        unsafe { &mut *(data_slice as *mut [MaybeUninit<u8>] as *mut [u8]) }
    }

+    #[inline]
+    #[must_use]
    pub fn len(&self) -> usize {
        usize::from(self.len)
    }

+    #[inline]
+    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }
@ -205,10 +290,14 @@ impl<const N: usize> CappedString<N> {

 #[cfg(feature = "alloc")]
 impl<const N: usize> CappedString<N> {
+    #[inline]
+    #[must_use]
    pub fn into_boxed_str(self) -> Box<str> {
        self.as_str().into()
    }

+    #[inline]
+    #[must_use]
    pub fn into_string(self) -> String {
        self.as_str().to_owned()
    }
@ -266,7 +355,7 @@ impl<const N: usize> borrow::BorrowMut<str> for CappedString<N> {
 }

 impl<'a, const N: usize> TryFrom<&'a str> for CappedString<N> {
-    type Error = Error;
+    type Error = CapacityError;

    #[inline]
    fn try_from(s: &'a str) -> Result<Self, Self::Error> {
@ -276,7 +365,7 @@ impl<'a, const N: usize> TryFrom<&'a str> for CappedString<N> {

 #[cfg(feature = "alloc")]
 impl<const N: usize> TryFrom<String> for CappedString<N> {
-    type Error = Error;
+    type Error = CapacityError;

    #[inline]
    fn try_from(s: String) -> Result<Self, Self::Error> {
@ -286,7 +375,7 @@ impl<const N: usize> TryFrom<String> for CappedString<N> {

 #[cfg(feature = "alloc")]
 impl<const N: usize> TryFrom<Box<str>> for CappedString<N> {
-    type Error = Error;
+    type Error = CapacityError;

    #[inline]
    fn try_from(s: Box<str>) -> Result<Self, Self::Error> {
@ -296,7 +385,7 @@ impl<const N: usize> TryFrom<Box<str>> for CappedString<N> {

 #[cfg(feature = "alloc")]
 impl<'a, const N: usize> TryFrom<Cow<'a, str>> for CappedString<N> {
-    type Error = Error;
+    type Error = CapacityError;

    #[inline]
    fn try_from(s: Cow<'a, str>) -> Result<Self, Self::Error> {
@ -351,7 +440,7 @@ impl<const N: usize> Hash for CappedString<N> {
 }

 impl<const N: usize> str::FromStr for CappedString<N> {
-    type Err = Error;
+    type Err = CapacityError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
@ -373,32 +462,68 @@ impl<const N: usize> fmt::Display for CappedString<N> {
    }
 }

-#[derive(Debug)]
-pub struct Error {
-    max_len: usize,
-    actual_len: usize,
-}
-
-impl Error {
-    pub fn max_len(&self) -> usize {
-        self.max_len
-    }
-
-    pub fn actual_len(&self) -> usize {
-        self.actual_len
-    }
-}
-
-impl fmt::Display for Error {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "string of length {} exceeds limit for `CappedString<{}>`",
-            self.actual_len,
-            self.max_len
-        )
+/// Returns a pointer to the longest prefix of `src` which is valid UTF-8 and whose length is
+/// shorter than `max_len`, and returns the length of this prefix.
+#[inline]
+fn truncate_str(src: &str, max_len: u8) -> (*const u8, u8) {
+    match u8::try_from(src.len()) {
+        // If the length of the `src` string is less than or equal to `max_len`, there is no need to
+        // truncate it.
+        Ok(src_len) if src_len <= max_len => (src.as_ptr(), src_len),
+
+        // If the length of `src` is greater than `max_len`, we need to truncate it. Note that
+        // `u8::try_from` returning an error means that `src.len() > max_len`, since `max_len` is a
+        // `u8` and `src.len()` is a `usize`.
+        _ => {
+            let src = src.as_bytes();
+
+            let mut i = max_len;
+
+            // Find the rightmost codepoint which starts at an index less than or equal to
+            // `max_len`. Everything to the left of this will be valid UTF-8 with a length less
+            // than or equal to `max_len`. We only need to do 3 iterations because codepoints have
+            // a maximum length of 4 bytes.
+            for _ in 0..3 {
+                // The first byte in the string must always be the start of a codepoint.
+                if i == 0 {
+                    break;
+                }
+
+                // SAFETY:
+                // `i <= max_len`, since it is never incremented. If this branch is run, then either
+                // `src.len(): usize` does not fit into a `u8`, in which case it must be greater
+                // than `max_len: u8`, or it does fit into a `u8` but it is greater than `max_len`.
+                // Therefore, `src.len() > max_len` must hold. Substitution gives `i < src.len()`,
+                // so `i` is a valid index into `src`.
+                let byte = unsafe { *src.get_unchecked(usize::from(i)) };
+
+                // If the byte is not of the form 0b10xxxxxx, then it is the start of a codepoint.
+                if byte & 0xc0 != 0x80 {
+                    break;
+                }
+
+                i -= 1;
+            }
+
+            // // SAFETY:
+            // // As discussed above, `i < src.len()` always holds, so `..i` is a valid range over
+            // // `src`.
+            // let src_truncated = unsafe { src.get_unchecked(..usize::from(i)) };
+
+            // // SAFETY:
+            // // `i` is the index of a start of a codepoint, and codepoints are contiguous, so the
+            // // substring `src[..i]` must be valid UTF-8.
+            // let src_truncated = unsafe { str::from_utf8_unchecked(src_truncated) };
+
+            // (src_truncated, i)
+
+            // `i < src.len()` always holds as discussed above, so the pointer `src.as_ptr()` is
+            // valid for reads of `i` bytes. `i` is the index of the start of a codepoint, and
+            // codepoints are contiguous, so the `i` bytes being pointed to must be valid UTF-8.
+            (src.as_ptr(), i)
+        },
    }
 }

-#[cfg(feature = "std")]
-impl std::error::Error for Error {}
+#[cfg(test)]
+mod tests {}
--- a/src/strings/mod.rs
+++ b/src/strings/mod.rs
@ -4,6 +4,6 @@ pub mod capped;
 pub mod inlining;

 pub use fixed::{FixedString, Error as FixedStringError};
-pub use capped::{CappedString, Error as CappedStringError};
+pub use capped::{CappedString, CapacityError as CappedStringError};
 #[cfg(feature = "alloc")]
 pub use inlining::{InliningString, InliningString23};