From 4917be0963d48059354feb82e16e60792a2b233f Mon Sep 17 00:00:00 2001 From: pantonshire Date: Fri, 9 Sep 2022 21:20:30 +0100 Subject: [PATCH 1/7] strings: WIP CappedString refactoring and improvements This patch changes the name of the CappedString error type to be more descriptive, adds inline and must_use annotations to more public functions, and begins implementing a wider variety of ways to create a CappedString. --- src/strings/capped.rs | 213 +++++++++++++++++++++++++++++++++--------- src/strings/mod.rs | 2 +- 2 files changed, 170 insertions(+), 45 deletions(-) diff --git a/src/strings/capped.rs b/src/strings/capped.rs index 84b8080..751972d 100644 --- a/src/strings/capped.rs +++ b/src/strings/capped.rs @@ -20,6 +20,18 @@ use alloc::{ #[cfg(feature = "std")] use std::borrow::Cow; +#[derive(Debug)] +pub struct CapacityError; + +impl fmt::Display for CapacityError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "`CappedString` capacity exceeded") + } +} + +#[cfg(feature = "std")] +impl std::error::Error for CapacityError {} + /// A string type which stores at most `N` bytes of string data. The string data is stored inline /// rather than using a heap allocation. /// @@ -50,8 +62,8 @@ impl CappedString { /// Creates a new `CappedString` from a given byte buffer and length. /// /// # Safety - /// - /// The first `len` bytes of `buf` (i.e. `buf[..len]`) must be initialised and valid UTF-8. + /// The first `len` bytes of `buf` (i.e. `buf[..len]`) must be initialised and valid UTF-8. + /// `len` must be less than or equal to `N`. #[inline] #[must_use] pub const unsafe fn from_raw_parts(buf: [MaybeUninit; N], len: u8) -> Self { @@ -60,10 +72,38 @@ impl CappedString { #[inline] #[must_use] - pub fn into_raw_parts(self) -> ([MaybeUninit; N], u8) { + pub const fn into_raw_parts(self) -> ([MaybeUninit; N], u8) { (self.buf, self.len) } + /// # Safety + /// `src` must point to `len` bytes of valid, initialised, UTF-8 string data. `len` must be less + /// than or equal to `N`. + #[inline] + unsafe fn from_raw_ptr(src: *const u8, len: u8) -> Self { + // `u8` has the same memory layout as `MaybeUninit`, so this cast is valid. + let src = src as *const MaybeUninit; + + // SAFETY: + // `MaybeUninit::uninit()` is a valid value for `[MaybeUninit; N]`, since each element + // of the array is allowed to be uninitialised. + let mut buf = unsafe { MaybeUninit::<[MaybeUninit; N]>::uninit().assume_init() }; + + // SAFETY: + // The caller is responsible for ensuring that `src` points to a valid string, which means + // that it must not overlap with the new local variable `buf`. The caller is responsible + // for ensuring that `src` is valid for reads of `len` bytes. The caller is responsible for + // ensuring that `len <= N`, so `buf` is valid for writes of `len` bytes. `src` and `buf` + // are both trivially properly aligned, since they both have an alignment of 1. + unsafe { ptr::copy_nonoverlapping(src, buf.as_mut_ptr(), usize::from(len)); } + + // SAFETY: + // The caller is responsible for ensuring that `src` points to `len` bytes of valid UTF-8 + // data. `src` is copied into the start of `buf`, so the first `len` bytes of `buf` are + // valid UTF-8. The caller is responsible for ensuring that `len <= N`. + unsafe { Self::from_raw_parts(buf, len) } + } + /// Returns a new empty `CappedString`. /// /// ``` @@ -100,23 +140,18 @@ impl CappedString { /// # } /// ``` #[inline] - pub fn new(s: &S) -> Result + pub fn new(src: &S) -> Result where S: AsRef + ?Sized, { // Convert the string to a byte slice, which is guaranteed to be valid UTF-8 since this is // an invariant of `str`. - let src = >::as_ref(s).as_bytes(); + let src = >::as_ref(src).as_bytes(); // If the length of the string is greater than `Self::MAX_LEN`, it will not fit in the - // buffer so return `None`. - let len = u8::try_from(src.len()) - .ok() - .and_then(|len| (len <= Self::MAX_LEN).then_some(len)) - .ok_or(Error { - max_len: N, - actual_len: src.len(), - })?; + // buffer so return an error. + let len = Self::bound_to_max_len(src.len()) + .ok_or(CapacityError)?; // SAFETY: // `MaybeUninit::uninit()` is a valid value for `[MaybeUninit; N]`, since each element @@ -139,6 +174,52 @@ impl CappedString { unsafe { Ok(Self::from_raw_parts(buf, len)) } } + #[inline] + #[must_use] + pub fn new_truncating(src: &S) -> Self + where + S: AsRef + ?Sized, + { + let (src, len) = truncate_str(>::as_ref(src), Self::MAX_LEN); + + // SAFETY: + // It is part of the contract of `truncate_str` that it returns a pointer to a valid UTF-8 + // string of length `len`, and that `len` is less than or equal to the provided maximum + // length, which is `Self::MAX_LEN` (which is equal to `N`) in this case. + unsafe { Self::from_raw_ptr(src, len) } + } + + /// Returns the length as a `u8` if it is less than or equal to `Self::MAX_LEN` (which is the + /// `u8` representation of `N`). Otherwise, returns `None`. + #[inline] + fn bound_to_max_len(len: usize) -> Option { + u8::try_from(len) + .ok() + .and_then(|len| (len <= Self::MAX_LEN).then_some(len)) + } + + pub fn push(&mut self, c: char) -> Result<(), CapacityError> { + todo!() + } + + pub fn push_truncating(&mut self, c: char) { + todo!() + } + + pub fn push_str(&mut self, s: &S) -> Result<(), CapacityError> + where + S: AsRef + ?Sized, + { + todo!() + } + + pub fn push_str_truncating(&mut self, s: &S) + where + S: AsRef + ?Sized, + { + todo!() + } + /// Returns a string slice pointing to the underlying string data. #[inline] #[must_use] @@ -194,10 +275,14 @@ impl CappedString { unsafe { &mut *(data_slice as *mut [MaybeUninit] as *mut [u8]) } } + #[inline] + #[must_use] pub fn len(&self) -> usize { usize::from(self.len) } + #[inline] + #[must_use] pub fn is_empty(&self) -> bool { self.len == 0 } @@ -205,10 +290,14 @@ impl CappedString { #[cfg(feature = "alloc")] impl CappedString { + #[inline] + #[must_use] pub fn into_boxed_str(self) -> Box { self.as_str().into() } + #[inline] + #[must_use] pub fn into_string(self) -> String { self.as_str().to_owned() } @@ -266,7 +355,7 @@ impl borrow::BorrowMut for CappedString { } impl<'a, const N: usize> TryFrom<&'a str> for CappedString { - type Error = Error; + type Error = CapacityError; #[inline] fn try_from(s: &'a str) -> Result { @@ -276,7 +365,7 @@ impl<'a, const N: usize> TryFrom<&'a str> for CappedString { #[cfg(feature = "alloc")] impl TryFrom for CappedString { - type Error = Error; + type Error = CapacityError; #[inline] fn try_from(s: String) -> Result { @@ -286,7 +375,7 @@ impl TryFrom for CappedString { #[cfg(feature = "alloc")] impl TryFrom> for CappedString { - type Error = Error; + type Error = CapacityError; #[inline] fn try_from(s: Box) -> Result { @@ -296,7 +385,7 @@ impl TryFrom> for CappedString { #[cfg(feature = "alloc")] impl<'a, const N: usize> TryFrom> for CappedString { - type Error = Error; + type Error = CapacityError; #[inline] fn try_from(s: Cow<'a, str>) -> Result { @@ -351,7 +440,7 @@ impl Hash for CappedString { } impl str::FromStr for CappedString { - type Err = Error; + type Err = CapacityError; #[inline] fn from_str(s: &str) -> Result { @@ -373,32 +462,68 @@ impl fmt::Display for CappedString { } } -#[derive(Debug)] -pub struct Error { - max_len: usize, - actual_len: usize, -} - -impl Error { - pub fn max_len(&self) -> usize { - self.max_len - } - - pub fn actual_len(&self) -> usize { - self.actual_len - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "string of length {} exceeds limit for `CappedString<{}>`", - self.actual_len, - self.max_len - ) +/// Returns a pointer to the longest prefix of `src` which is valid UTF-8 and whose length is +/// shorter than `max_len`, and returns the length of this prefix. +#[inline] +fn truncate_str(src: &str, max_len: u8) -> (*const u8, u8) { + match u8::try_from(src.len()) { + // If the length of the `src` string is less than or equal to `max_len`, there is no need to + // truncate it. + Ok(src_len) if src_len <= max_len => (src.as_ptr(), src_len), + + // If the length of `src` is greater than `max_len`, we need to truncate it. Note that + // `u8::try_from` returning an error means that `src.len() > max_len`, since `max_len` is a + // `u8` and `src.len()` is a `usize`. + _ => { + let src = src.as_bytes(); + + let mut i = max_len; + + // Find the rightmost codepoint which starts at an index less than or equal to + // `max_len`. Everything to the left of this will be valid UTF-8 with a length less + // than or equal to `max_len`. We only need to do 3 iterations because codepoints have + // a maximum length of 4 bytes. + for _ in 0..3 { + // The first byte in the string must always be the start of a codepoint. + if i == 0 { + break; + } + + // SAFETY: + // `i <= max_len`, since it is never incremented. If this branch is run, then either + // `src.len(): usize` does not fit into a `u8`, in which case it must be greater + // than `max_len: u8`, or it does fit into a `u8` but it is greater than `max_len`. + // Therefore, `src.len() > max_len` must hold. Substitution gives `i < src.len()`, + // so `i` is a valid index into `src`. + let byte = unsafe { *src.get_unchecked(usize::from(i)) }; + + // If the byte is not of the form 0b10xxxxxx, then it is the start of a codepoint. + if byte & 0xc0 != 0x80 { + break; + } + + i -= 1; + } + + // // SAFETY: + // // As discussed above, `i < src.len()` always holds, so `..i` is a valid range over + // // `src`. + // let src_truncated = unsafe { src.get_unchecked(..usize::from(i)) }; + + // // SAFETY: + // // `i` is the index of a start of a codepoint, and codepoints are contiguous, so the + // // substring `src[..i]` must be valid UTF-8. + // let src_truncated = unsafe { str::from_utf8_unchecked(src_truncated) }; + + // (src_truncated, i) + + // `i < src.len()` always holds as discussed above, so the pointer `src.as_ptr()` is + // valid for reads of `i` bytes. `i` is the index of the start of a codepoint, and + // codepoints are contiguous, so the `i` bytes being pointed to must be valid UTF-8. + (src.as_ptr(), i) + }, } } -#[cfg(feature = "std")] -impl std::error::Error for Error {} +#[cfg(test)] +mod tests {} diff --git a/src/strings/mod.rs b/src/strings/mod.rs index 6facc8d..3825407 100644 --- a/src/strings/mod.rs +++ b/src/strings/mod.rs @@ -4,6 +4,6 @@ pub mod capped; pub mod inlining; pub use fixed::{FixedString, Error as FixedStringError}; -pub use capped::{CappedString, Error as CappedStringError}; +pub use capped::{CappedString, CapacityError as CappedStringError}; #[cfg(feature = "alloc")] pub use inlining::{InliningString, InliningString23}; From c50b3c7daadbfd06039559c12bb15fd6822ef7ba Mon Sep 17 00:00:00 2001 From: pantonshire Date: Sat, 10 Sep 2022 20:10:19 +0100 Subject: [PATCH 2/7] strings: implement `CappedString::new` using `CappedString::from_raw_ptr` This patch replaces the logic in `CappedString::new` for copying the string data into a `[MaybeUninit]` buffer with a call to `CappedString::from_raw_ptr`, which performs the same task. --- src/strings/capped.rs | 47 +++++++++++++------------------------------ 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/src/strings/capped.rs b/src/strings/capped.rs index 751972d..ab0ff1e 100644 --- a/src/strings/capped.rs +++ b/src/strings/capped.rs @@ -146,32 +146,20 @@ impl CappedString { { // Convert the string to a byte slice, which is guaranteed to be valid UTF-8 since this is // an invariant of `str`. - let src = >::as_ref(src).as_bytes(); + let src = >::as_ref(src); - // If the length of the string is greater than `Self::MAX_LEN`, it will not fit in the - // buffer so return an error. - let len = Self::bound_to_max_len(src.len()) - .ok_or(CapacityError)?; + // If the length of the `src` string does not fit into a `u8` or is greater than + // `Self::MAX_LEN`, we can't fit it into the new `CappedString` so return an error. + let len = match u8::try_from(src.len()) { + Ok(len) if len <= Self::MAX_LEN => len, + _ => return Err(CapacityError), + }; // SAFETY: - // `MaybeUninit::uninit()` is a valid value for `[MaybeUninit; N]`, since each element - // of the array is allowed to be uninitialised. - let mut buf = unsafe { MaybeUninit::<[MaybeUninit; N]>::uninit().assume_init() }; - - let src_ptr = src.as_ptr() as *const MaybeUninit; - - // SAFETY: - // The source and destination to not overlap, since `buf` is a new local variable which is - // completely separate from the provided source string `s`. The source is valid for reads of - // `len` bytes since `len == src.len()`, and the destination is valid for writes of `len` - // bytes since `len <= N`. The source and destination are both trivially properly aligned, - // since they both have an alignment of 1. - unsafe { ptr::copy_nonoverlapping(src_ptr, buf.as_mut_ptr(), usize::from(len)) } - - // SAFETY: - // The first `len` bytes of the buffer are valid UTF-8 because the first `len` bytes of - // the buffer contain data copied from a `&str`, and `&str` is always valid UTF-8. - unsafe { Ok(Self::from_raw_parts(buf, len)) } + // `src.as_ptr()` points to `len` bytes of valid UTF-8 string data since `src` is a `&str` + // and `len` is its length. `len` is less than or equal to `Self::MAX_LEN`, which is equal + // to `N`. + unsafe { Ok(Self::from_raw_ptr(src.as_ptr(), len)) } } #[inline] @@ -180,7 +168,9 @@ impl CappedString { where S: AsRef + ?Sized, { - let (src, len) = truncate_str(>::as_ref(src), Self::MAX_LEN); + let src = >::as_ref(src); + + let (src, len) = truncate_str(src, Self::MAX_LEN); // SAFETY: // It is part of the contract of `truncate_str` that it returns a pointer to a valid UTF-8 @@ -189,15 +179,6 @@ impl CappedString { unsafe { Self::from_raw_ptr(src, len) } } - /// Returns the length as a `u8` if it is less than or equal to `Self::MAX_LEN` (which is the - /// `u8` representation of `N`). Otherwise, returns `None`. - #[inline] - fn bound_to_max_len(len: usize) -> Option { - u8::try_from(len) - .ok() - .and_then(|len| (len <= Self::MAX_LEN).then_some(len)) - } - pub fn push(&mut self, c: char) -> Result<(), CapacityError> { todo!() } From 2d5343681a4223bd43e657f76963d4c4a3cff719 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Sat, 10 Sep 2022 21:55:39 +0100 Subject: [PATCH 3/7] strings: WIP CappedString::push_str and friends This patch implements `CappedString::push_str` and its truncating counterpart, `CappedString::push_str_truncating`. These methods provide a safe API to append additional string data to the end of a `CappedString`. This is a work-in-progress and, like the rest of `CappedString`, requires unit testing. --- src/strings/capped.rs | 71 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/src/strings/capped.rs b/src/strings/capped.rs index ab0ff1e..a9c05d9 100644 --- a/src/strings/capped.rs +++ b/src/strings/capped.rs @@ -77,8 +77,8 @@ impl CappedString { } /// # Safety - /// `src` must point to `len` bytes of valid, initialised, UTF-8 string data. `len` must be less - /// than or equal to `N`. + /// `src` must point to `len` bytes of valid, UTF-8 string data. `len` must be less than or + /// equal to `N`. #[inline] unsafe fn from_raw_ptr(src: *const u8, len: u8) -> Self { // `u8` has the same memory layout as `MaybeUninit`, so this cast is valid. @@ -104,6 +104,25 @@ impl CappedString { unsafe { Self::from_raw_parts(buf, len) } } + /// # Safety + /// `src` must point to `len` bytes of valid UTF-8 string data. `len` must be less than or equal + /// to `N - self.len`. + unsafe fn append_to_buf(&mut self, src: *const u8, len: u8) { + // `u8` has the same memory layout as `MaybeUninit`, so this cast is valid. + let src = src as *const MaybeUninit; + + // SAFETY: + // `self.len <= N` is an invariant of `CappedString`, so `self.len..` is a valid range over + // `self.buf`. + let dst = unsafe { self.buf.get_unchecked_mut(usize::from(self.len)..) }; + + // SAFETY: + // + unsafe { ptr::copy_nonoverlapping(src, dst.as_mut_ptr(), usize::from(len)); } + + self.len += len; + } + /// Returns a new empty `CappedString`. /// /// ``` @@ -179,26 +198,66 @@ impl CappedString { unsafe { Self::from_raw_ptr(src, len) } } + /// Appends the given character to the end of this `CappedString`, returning an error if there + /// is insufficient capacity remaining to do so. + /// + /// If you do not care whether or not the append succeeds, see [`Self::push_truncating`]. pub fn push(&mut self, c: char) -> Result<(), CapacityError> { todo!() } + /// Appends the given character to the end of this `CappedString`, failing silently if there is + /// insufficient capacity remaining to do so. + /// + /// If you would like to know whether or not the append succeeds, see [`Self::push`]. pub fn push_truncating(&mut self, c: char) { todo!() } - pub fn push_str(&mut self, s: &S) -> Result<(), CapacityError> + /// Appends the given string slice to the end of this `CappedString`, returning an error if + /// there is insufficient capacity remaining to do so. + /// + /// If you would like a version which cannot fail, see [`Self::push_str_truncating`]. + pub fn push_str(&mut self, src: &S) -> Result<(), CapacityError> where S: AsRef + ?Sized, { - todo!() + let src = >::as_ref(src); + + let len = match u8::try_from(src.len()) { + Ok(len) if len <= Self::MAX_LEN - self.len => len, + _ => return Err(CapacityError), + }; + + // SAFETY: + // + unsafe { self.append_to_buf(src.as_ptr(), len); } + + Ok(()) } - pub fn push_str_truncating(&mut self, s: &S) + /// Appends as many of the characters of the given string slice to the end of this + /// `CappedString` as can fit. Any remaining characters will not be added. + /// + /// If you would like a version which returns an error if there is not enough capacity remaining + /// to append the entire string slice, see [`Self::push_str`]. + pub fn push_str_truncating(&mut self, src: &S) where S: AsRef + ?Sized, { - todo!() + let remaining_cap = Self::MAX_LEN - self.len; + + if remaining_cap == 0 { + return; + } + + let src = >::as_ref(src); + + let (src, len) = truncate_str(src, remaining_cap); + + // SAFETY: + // + unsafe { self.append_to_buf(src, len); } } /// Returns a string slice pointing to the underlying string data. From 0d8877536674899abd2de4cbe89437342f94cc27 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Mon, 12 Sep 2022 16:32:34 +0100 Subject: [PATCH 4/7] strings: implement push and push_truncating for CappedString This patch implements the `CappedString::push` and `CappedString::push_truncating` methods, which are like `CappedString::push_str` and friends but take a single character rather than a string slice. --- src/strings/capped.rs | 54 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/src/strings/capped.rs b/src/strings/capped.rs index a9c05d9..817be91 100644 --- a/src/strings/capped.rs +++ b/src/strings/capped.rs @@ -104,10 +104,23 @@ impl CappedString { unsafe { Self::from_raw_parts(buf, len) } } + /// # Safety + /// `self.len` must be less than `N`, so that there is space in the buffer to append the byte. + #[inline] + unsafe fn append_byte(&mut self, byte: u8) { + // SAFETY: + // The caller is responsible for ensuring that `self.len < N`. + let dst = unsafe { self.buf.get_unchecked_mut(usize::from(self.len)) }; + + *dst = MaybeUninit::new(byte); + self.len += 1; + } + /// # Safety /// `src` must point to `len` bytes of valid UTF-8 string data. `len` must be less than or equal /// to `N - self.len`. - unsafe fn append_to_buf(&mut self, src: *const u8, len: u8) { + #[inline] + unsafe fn append_bytes(&mut self, src: *const u8, len: u8) { // `u8` has the same memory layout as `MaybeUninit`, so this cast is valid. let src = src as *const MaybeUninit; @@ -117,7 +130,12 @@ impl CappedString { let dst = unsafe { self.buf.get_unchecked_mut(usize::from(self.len)..) }; // SAFETY: - // + // The caller is responsible for ensuring that `src` points to a valid string, which means + // that it cannot overlap with the new local variable `buf`. The caller is responsible for + // ensuring that `src` is valid for reads of `len` bytes. The caller is responsible for + // ensuring that `len <= N - self.len`, so the destination `dst = self.buf[self.len..]` is + // valid for writes of `len` bytes. `src` and `dst` are both trivially properly aligned, + // since they both have an alignment of 1. unsafe { ptr::copy_nonoverlapping(src, dst.as_mut_ptr(), usize::from(len)); } self.len += len; @@ -202,22 +220,45 @@ impl CappedString { /// is insufficient capacity remaining to do so. /// /// If you do not care whether or not the append succeeds, see [`Self::push_truncating`]. + #[inline] pub fn push(&mut self, c: char) -> Result<(), CapacityError> { - todo!() + let mut char_buf = [0u8; 4]; + let encoded = c.encode_utf8(&mut char_buf); + + match encoded.len() { + 1 => { + if self.len == Self::MAX_LEN { + return Err(CapacityError); + } + + // SAFETY: + // + unsafe { self.append_byte(encoded.as_bytes()[0]) } + + Ok(()) + }, + + _ => self.push_str(encoded), + } } /// Appends the given character to the end of this `CappedString`, failing silently if there is /// insufficient capacity remaining to do so. /// /// If you would like to know whether or not the append succeeds, see [`Self::push`]. + #[inline] pub fn push_truncating(&mut self, c: char) { - todo!() + // Unlike `Self::push_str_truncating`, we can just use `Self::push` and swallow the error + // because a single character will never be partially pushed; it is either pushed or it + // isn't. + self.push(c).ok(); } /// Appends the given string slice to the end of this `CappedString`, returning an error if /// there is insufficient capacity remaining to do so. /// /// If you would like a version which cannot fail, see [`Self::push_str_truncating`]. + #[inline] pub fn push_str(&mut self, src: &S) -> Result<(), CapacityError> where S: AsRef + ?Sized, @@ -231,7 +272,7 @@ impl CappedString { // SAFETY: // - unsafe { self.append_to_buf(src.as_ptr(), len); } + unsafe { self.append_bytes(src.as_ptr(), len); } Ok(()) } @@ -241,6 +282,7 @@ impl CappedString { /// /// If you would like a version which returns an error if there is not enough capacity remaining /// to append the entire string slice, see [`Self::push_str`]. + #[inline] pub fn push_str_truncating(&mut self, src: &S) where S: AsRef + ?Sized, @@ -257,7 +299,7 @@ impl CappedString { // SAFETY: // - unsafe { self.append_to_buf(src, len); } + unsafe { self.append_bytes(src, len); } } /// Returns a string slice pointing to the underlying string data. From 119a32840af242289d68e4ec1bcccac69d9fd902 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Mon, 12 Sep 2022 17:46:00 +0100 Subject: [PATCH 5/7] strings: fill in safety comments for CappedString This patch fills in some missing safety comments in the implementation of `CappedString`, and adds some additional comments for clarity. --- src/strings/capped.rs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/strings/capped.rs b/src/strings/capped.rs index 817be91..7c5bda0 100644 --- a/src/strings/capped.rs +++ b/src/strings/capped.rs @@ -271,7 +271,8 @@ impl CappedString { }; // SAFETY: - // + // `src` is a valid string slice with length `len`. We have checked that + // `len <= N - self.len` holds above (note that `Self::MAX_LEN == N`). unsafe { self.append_bytes(src.as_ptr(), len); } Ok(()) @@ -289,16 +290,20 @@ impl CappedString { { let remaining_cap = Self::MAX_LEN - self.len; + // Short-circuit if we have no space left to copy into. if remaining_cap == 0 { return; } let src = >::as_ref(src); + // Find the longest valid UTF-8 prefix which fits into the remaining space. let (src, len) = truncate_str(src, remaining_cap); // SAFETY: - // + // `truncate_str` returns a pointer to `len` bytes of valid UTF-8 string data. The returned + // `len` will always be less than or equal to `remaining_cap`, which is equal to + // `N - self.len` (note that `Self::MAX_LEN == N`). unsafe { self.append_bytes(src, len); } } @@ -545,7 +550,7 @@ impl fmt::Display for CappedString { } /// Returns a pointer to the longest prefix of `src` which is valid UTF-8 and whose length is -/// shorter than `max_len`, and returns the length of this prefix. +/// less than or equal to `max_len`, and returns the length of this prefix. #[inline] fn truncate_str(src: &str, max_len: u8) -> (*const u8, u8) { match u8::try_from(src.len()) { @@ -587,18 +592,6 @@ fn truncate_str(src: &str, max_len: u8) -> (*const u8, u8) { i -= 1; } - // // SAFETY: - // // As discussed above, `i < src.len()` always holds, so `..i` is a valid range over - // // `src`. - // let src_truncated = unsafe { src.get_unchecked(..usize::from(i)) }; - - // // SAFETY: - // // `i` is the index of a start of a codepoint, and codepoints are contiguous, so the - // // substring `src[..i]` must be valid UTF-8. - // let src_truncated = unsafe { str::from_utf8_unchecked(src_truncated) }; - - // (src_truncated, i) - // `i < src.len()` always holds as discussed above, so the pointer `src.as_ptr()` is // valid for reads of `i` bytes. `i` is the index of the start of a codepoint, and // codepoints are contiguous, so the `i` bytes being pointed to must be valid UTF-8. From 383f0ae3588b329843b8ab73f80015c005396b79 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Wed, 14 Sep 2022 15:08:39 +0100 Subject: [PATCH 6/7] strings: add CappedString::clear method This patch implements `CappedString::clear`, which provides an easy way for users to clear the contents of a `CappedString`. --- src/strings/capped.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/strings/capped.rs b/src/strings/capped.rs index 7c5bda0..3bf87cd 100644 --- a/src/strings/capped.rs +++ b/src/strings/capped.rs @@ -307,6 +307,28 @@ impl CappedString { unsafe { self.append_bytes(src, len); } } + /// Empties the contents of this `CappedString`. + /// + /// ``` + /// # use libshire::strings::CappedString; + /// # fn main() -> Result<(), libshire::strings::capped::CapacityError> { + /// let mut s = CappedString::<16>::new("hello")?; + /// + /// assert!(!s.is_empty()); + /// assert_eq!(&*s, "hello"); + /// + /// s.clear(); + /// + /// assert!(s.is_empty()); + /// assert_eq!(&*s, ""); + /// # Ok(()) + /// # } + /// ``` + #[inline] + pub fn clear(&mut self) { + self.len = 0; + } + /// Returns a string slice pointing to the underlying string data. #[inline] #[must_use] From cad45f5bce23191653540d0503ce8650e53eb02b Mon Sep 17 00:00:00 2001 From: pantonshire Date: Wed, 14 Sep 2022 16:10:07 +0100 Subject: [PATCH 7/7] strings: unit tests and documentation for CappedString This patch adds several unit tests for `CappedString`, which are intended to be run under miri since `CappedString` uses lots of unsafe code. It also adds documentation and documentation tests for a number of previously undocumented `CappedString` methods. --- src/strings/capped.rs | 282 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 275 insertions(+), 7 deletions(-) diff --git a/src/strings/capped.rs b/src/strings/capped.rs index 3bf87cd..7104df6 100644 --- a/src/strings/capped.rs +++ b/src/strings/capped.rs @@ -37,7 +37,7 @@ impl std::error::Error for CapacityError {} /// /// ``` /// # use libshire::strings::CappedString; -/// # fn main() -> Result<(), libshire::strings::capped::Error> { +/// # fn main() -> Result<(), libshire::strings::capped::CapacityError> { /// let s = CappedString::<16>::new("hello world")?; /// assert_eq!(&*s, "hello world"); /// # Ok(()) @@ -106,6 +106,7 @@ impl CappedString { /// # Safety /// `self.len` must be less than `N`, so that there is space in the buffer to append the byte. + /// The byte must be a valid UTF-8 codepoint; it must be in the range `0..=127`. #[inline] unsafe fn append_byte(&mut self, byte: u8) { // SAFETY: @@ -167,10 +168,12 @@ impl CappedString { /// Returns a new `CappedString` containing the given string data. The string data will be /// stored inline; no heap allocation is used. An error will be returned if the length of the /// provided string exceeds the `CappedString`'s maximum length, `N`. + /// + /// If you would like a version which never returns an error, see [`Self::new_truncating`]. /// /// ``` /// # use libshire::strings::CappedString; - /// # fn main() -> Result<(), libshire::strings::capped::Error> { + /// # fn main() -> Result<(), libshire::strings::capped::CapacityError> { /// let s = CappedString::<16>::new("hello world")?; /// assert_eq!(&*s, "hello world"); /// # Ok(()) @@ -199,6 +202,21 @@ impl CappedString { unsafe { Ok(Self::from_raw_ptr(src.as_ptr(), len)) } } + /// Returns a new `CappedString` containing the given string data. The string data will be + /// stored inline; no heap allocation is used. If the length of the provided string exceeds the + /// `CappedString`'s maximum length, `N`, it will be truncated to fit. + /// + /// If you would like a version which returns an error rather than truncating the string, see + /// [`Self::new`]. + /// + /// ``` + /// # use libshire::strings::CappedString; + /// let s1 = CappedString::<15>::new_truncating("こんにちは"); + /// assert_eq!(&*s1, "こんにちは"); + /// + /// let s2 = CappedString::<10>::new_truncating("こんにちは"); + /// assert_eq!(&*s2, "こんに"); + /// ``` #[inline] #[must_use] pub fn new_truncating(src: &S) -> Self @@ -232,7 +250,11 @@ impl CappedString { } // SAFETY: - // + // We have checked that `self.len != N` (`Self::MAX_LEN == N`). Since it is an + // invariant of `CappedString` that `self.len <= N`, it must hold that + // `self.len < N`. The first byte of a `str` of length 1 must be a valid UTF-8 + // codepoint; it must be in the range `0..=127`, since anything outside this range + // implies the presence of further bytes. unsafe { self.append_byte(encoded.as_bytes()[0]) } Ok(()) @@ -258,6 +280,20 @@ impl CappedString { /// there is insufficient capacity remaining to do so. /// /// If you would like a version which cannot fail, see [`Self::push_str_truncating`]. + /// + /// ``` + /// # use libshire::strings::CappedString; + /// let mut s = CappedString::<8>::empty(); + /// + /// assert!(s.push_str("hello").is_ok()); + /// assert_eq!(&*s, "hello"); + /// + /// assert!(s.push_str(" world").is_err()); + /// assert_eq!(&*s, "hello"); + /// + /// assert!(s.push_str("!!!").is_ok()); + /// assert_eq!(&*s, "hello!!!"); + /// ``` #[inline] pub fn push_str(&mut self, src: &S) -> Result<(), CapacityError> where @@ -283,6 +319,20 @@ impl CappedString { /// /// If you would like a version which returns an error if there is not enough capacity remaining /// to append the entire string slice, see [`Self::push_str`]. + /// + /// ``` + /// # use libshire::strings::CappedString; + /// let mut s = CappedString::<10>::empty(); + /// + /// s.push_str_truncating("hello"); + /// assert_eq!(&*s, "hello"); + /// + /// s.push_str_truncating(" 世界"); + /// assert_eq!(&*s, "hello 世"); + /// + /// s.push_str_truncating("!!!"); + /// assert_eq!(&*s, "hello 世!"); + /// ``` #[inline] pub fn push_str_truncating(&mut self, src: &S) where @@ -326,6 +376,15 @@ impl CappedString { /// ``` #[inline] pub fn clear(&mut self) { + // Setting the length to 0 is enough to clear the `CappedString`; we don't need to replace + // any of the old bytes in the buffer, as setting the length to 0 makes all of the old bytes + // inaccessible via safe methods, and means that any future calls to `Self::push` and + // friends will write over the old bytes. + // + // It may be desirable for security-critical code to zero the old buffer to prevent cleared + // data from being exposed via buffer-overflow exploits or similar. However, this should be + // implemented in a separate function so that regular users don't have to pay the cost of + // zeroing the buffer. self.len = 0; } @@ -340,15 +399,37 @@ impl CappedString { } /// Returns a mutable string slice pointing to the underlying string data. + /// + /// ``` + /// # use libshire::strings::CappedString; + /// # fn main() -> Result<(), libshire::strings::capped::CapacityError> { + /// let mut s = CappedString::<16>::new("hello!")?; + /// s.as_str_mut().make_ascii_uppercase(); + /// assert_eq!(&*s, "HELLO!"); + /// # Ok(()) + /// # } + /// ``` #[inline] #[must_use] pub fn as_str_mut(&mut self) -> &mut str { // SAFETY: // The first `self.len` bytes of `self.buf` (which is returned by `Self::as_bytes_mut`) - // being valid UTF-8 is an invariant of `CappedString`. + // being valid UTF-8 is an invariant of `CappedString`. Since we are returning a `&mut str` + // to the caller, the caller cannot safely use it to mutate this `CappedString`'s buffer in + // a way that violates the UTF-8 property. unsafe { str::from_utf8_unchecked_mut(self.as_bytes_mut()) } } + /// Returns a byte slice containing the UTF-8 bytes representing the string. + /// + /// ``` + /// # use libshire::strings::CappedString; + /// # fn main() -> Result<(), libshire::strings::capped::CapacityError> { + /// let s = CappedString::<16>::new("hello!")?; + /// assert_eq!(s.as_bytes(), &[0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x21]); + /// # Ok(()) + /// # } + /// ``` #[inline] #[must_use] pub fn as_bytes(&self) -> &[u8] { @@ -366,8 +447,8 @@ impl CappedString { } /// # Safety - /// The caller is responsible for ensuring that the slice is valid UTF-8 when the mutable - /// borrow ends. + /// The slice must be valid UTF-8 when the mutable borrow ends and this `CappedString` is used + /// again. #[inline] #[must_use] pub unsafe fn as_bytes_mut(&mut self) -> &mut [u8] { @@ -623,4 +704,191 @@ fn truncate_str(src: &str, max_len: u8) -> (*const u8, u8) { } #[cfg(test)] -mod tests {} +mod tests { + use super::CappedString; + + #[test] + fn test_truncate_str() { + use super::truncate_str; + + let s1 = "hello"; + assert_eq!(truncate_str(s1, 0), (s1.as_ptr(), 0)); + assert_eq!(truncate_str(s1, 1), (s1.as_ptr(), 1)); + assert_eq!(truncate_str(s1, 5), (s1.as_ptr(), 5)); + assert_eq!(truncate_str(s1, 6), (s1.as_ptr(), 5)); + + let s2 = "こんにちは"; + assert_eq!(truncate_str(s2, 0), (s2.as_ptr(), 0)); + assert_eq!(truncate_str(s2, 1), (s2.as_ptr(), 0)); + assert_eq!(truncate_str(s2, 2), (s2.as_ptr(), 0)); + assert_eq!(truncate_str(s2, 3), (s2.as_ptr(), 3)); + assert_eq!(truncate_str(s2, 4), (s2.as_ptr(), 3)); + assert_eq!(truncate_str(s2, 5), (s2.as_ptr(), 3)); + assert_eq!(truncate_str(s2, 6), (s2.as_ptr(), 6)); + assert_eq!(truncate_str(s2, 14), (s2.as_ptr(), 12)); + assert_eq!(truncate_str(s2, 15), (s2.as_ptr(), 15)); + assert_eq!(truncate_str(s2, 16), (s2.as_ptr(), 15)); + assert_eq!(truncate_str(s2, 18), (s2.as_ptr(), 15)); + + let s3 = "🤖 こんにちは, world 🤖"; + assert_eq!(truncate_str(s3, 0), (s3.as_ptr(), 0)); + assert_eq!(truncate_str(s3, 1), (s3.as_ptr(), 0)); + assert_eq!(truncate_str(s3, 2), (s3.as_ptr(), 0)); + assert_eq!(truncate_str(s3, 3), (s3.as_ptr(), 0)); + assert_eq!(truncate_str(s3, 4), (s3.as_ptr(), 4)); + assert_eq!(truncate_str(s3, 5), (s3.as_ptr(), 5)); + assert_eq!(truncate_str(s3, 6), (s3.as_ptr(), 5)); + assert_eq!(truncate_str(s3, 7), (s3.as_ptr(), 5)); + assert_eq!(truncate_str(s3, 8), (s3.as_ptr(), 8)); + assert_eq!(truncate_str(s3, 28), (s3.as_ptr(), 28)); + assert_eq!(truncate_str(s3, 29), (s3.as_ptr(), 28)); + assert_eq!(truncate_str(s3, 30), (s3.as_ptr(), 28)); + assert_eq!(truncate_str(s3, 31), (s3.as_ptr(), 28)); + assert_eq!(truncate_str(s3, 32), (s3.as_ptr(), 32)); + assert_eq!(truncate_str(s3, 33), (s3.as_ptr(), 32)); + assert_eq!(truncate_str(s3, 36), (s3.as_ptr(), 32)); + + let s4 = "a"; + assert_eq!(truncate_str(s4, 0), (s4.as_ptr(), 0)); + assert_eq!(truncate_str(s4, 1), (s4.as_ptr(), 1)); + assert_eq!(truncate_str(s4, 2), (s4.as_ptr(), 1)); + assert_eq!(truncate_str(s4, 3), (s4.as_ptr(), 1)); + assert_eq!(truncate_str(s4, 4), (s4.as_ptr(), 1)); + + let s5 = ""; + assert_eq!(truncate_str(s5, 0), (s5.as_ptr(), 0)); + assert_eq!(truncate_str(s5, 1), (s5.as_ptr(), 0)); + assert_eq!(truncate_str(s5, 2), (s5.as_ptr(), 0)); + assert_eq!(truncate_str(s5, 3), (s5.as_ptr(), 0)); + assert_eq!(truncate_str(s5, 4), (s5.as_ptr(), 0)); + + let s6 = "На берегу пустынных волн\n\ + Стоял он, дум великих полн,\n\ + И вдаль глядел. Пред ним широко\n\ + Река неслася; бедный чёлн\n\ + По ней стремился одиноко.\n\ + По мшистым, топким берегам\n\ + Чернели избы здесь и там,\n\ + Приют убогого чухонца;\n\ + И лес, неведомый лучам\n\ + В тумане спрятанного солнца,\n\ + Кругом шумел."; + assert_eq!(truncate_str(s6, 0), (s6.as_ptr(), 0)); + assert_eq!(truncate_str(s6, 1), (s6.as_ptr(), 0)); + assert_eq!(truncate_str(s6, 2), (s6.as_ptr(), 2)); + assert_eq!(truncate_str(s6, 3), (s6.as_ptr(), 2)); + assert_eq!(truncate_str(s6, 4), (s6.as_ptr(), 4)); + assert_eq!(truncate_str(s6, 254), (s6.as_ptr(), 253)); + assert_eq!(truncate_str(s6, 255), (s6.as_ptr(), 255)); + } + + #[test] + fn test_new() { + assert_eq!(&*CappedString::<5>::new("").unwrap(), ""); + assert_eq!(&*CappedString::<5>::new("a").unwrap(), "a"); + assert_eq!(&*CappedString::<5>::new("hello").unwrap(), "hello"); + assert_eq!(&*CappedString::<6>::new("hello").unwrap(), "hello"); + assert!(CappedString::<5>::new("hello!").is_err()); + assert_eq!(&*CappedString::<6>::new("hello!").unwrap(), "hello!"); + assert_eq!(&*CappedString::<5>::new("こ").unwrap(), "こ"); + assert!(CappedString::<5>::new("こん").is_err()); + assert_eq!(&*CappedString::<6>::new("こん").unwrap(), "こん"); + assert!(CappedString::<6>::new("こんにちは").is_err()); + assert_eq!(&*CappedString::<0>::new("").unwrap(), ""); + assert!(CappedString::<0>::new("a").is_err()); + } + + #[test] + fn test_new_truncating() { + assert_eq!(&*CappedString::<5>::new_truncating(""), ""); + assert_eq!(&*CappedString::<5>::new_truncating("a"), "a"); + assert_eq!(&*CappedString::<5>::new_truncating("hello"), "hello"); + assert_eq!(&*CappedString::<6>::new_truncating("hello"), "hello"); + assert_eq!(&*CappedString::<5>::new_truncating("hello!"), "hello"); + assert_eq!(&*CappedString::<6>::new_truncating("hello!"), "hello!"); + assert_eq!(&*CappedString::<5>::new_truncating("こ"), "こ"); + assert_eq!(&*CappedString::<5>::new_truncating("こん"), "こ"); + assert_eq!(&*CappedString::<6>::new_truncating("こん"), "こん"); + assert_eq!(&*CappedString::<6>::new_truncating("こんにちは"), "こん"); + assert_eq!(&*CappedString::<7>::new_truncating("こんにちは"), "こん"); + assert_eq!(&*CappedString::<8>::new_truncating("こんにちは"), "こん"); + assert_eq!(&*CappedString::<9>::new_truncating("こんにちは"), "こんに"); + assert_eq!(&*CappedString::<3>::new_truncating("🤖 hello 🤖"), ""); + assert_eq!(&*CappedString::<4>::new_truncating("🤖 hello 🤖"), "🤖"); + assert_eq!(&*CappedString::<14>::new_truncating("🤖 hello 🤖"), "🤖 hello "); + assert_eq!(&*CappedString::<15>::new_truncating("🤖 hello 🤖"), "🤖 hello 🤖"); + assert_eq!(&*CappedString::<20>::new_truncating("🤖 hello 🤖"), "🤖 hello 🤖"); + assert_eq!(&*CappedString::<0>::new_truncating(""), ""); + assert_eq!(&*CappedString::<0>::new_truncating("a"), ""); + } + + #[test] + fn test_push() { + let mut s = CappedString::<6>::empty(); + s.push_str("").unwrap(); + assert_eq!(&*s, ""); + s.push('h').unwrap(); + assert_eq!(&*s, "h"); + s.push_str("ello").unwrap(); + assert_eq!(&*s, "hello"); + assert!(s.push_str(", world").is_err()); + assert_eq!(&*s, "hello"); + } + + #[test] + fn test_push_truncating() { + let mut s = CappedString::<6>::empty(); + + s.push_str_truncating(""); + assert_eq!(&*s, ""); + s.push_truncating('h'); + assert_eq!(&*s, "h"); + s.push_str_truncating("ello"); + assert_eq!(&*s, "hello"); + s.push_str_truncating(", world"); + assert_eq!(&*s, "hello,"); + + s.clear(); + + s.push_truncating('こ'); + assert_eq!(&*s, "こ"); + s.push_truncating('ん'); + assert_eq!(&*s, "こん"); + s.push_truncating('に'); + assert_eq!(&*s, "こん"); + + s.clear(); + + s.push_truncating('🤖'); + assert_eq!(&*s, "🤖"); + s.push_truncating('🤖'); + assert_eq!(&*s, "🤖"); + s.push_str_truncating("!!!"); + assert_eq!(&*s, "🤖!!"); + + s.clear(); + + s.push_str_truncating("🤖 "); + assert_eq!(&*s, "🤖 "); + s.push_truncating('🤖'); + assert_eq!(&*s, "🤖 "); + + s.clear(); + + s.push_str_truncating(" "); + assert_eq!(&*s, " "); + s.push_str_truncating("🤖🤖🤖"); + assert_eq!(&*s, " 🤖"); + s.push_truncating('!'); + assert_eq!(&*s, " 🤖"); + + s.clear(); + + s.push_str_truncating(" "); + assert_eq!(&*s, " "); + s.push_truncating('🤖'); + assert_eq!(&*s, " "); + s.push_str_truncating("こんにちは"); + assert_eq!(&*s, " こ"); + } +}