From 78ee3cfb4e4dcf9cd212ffd7568576ae9bc9e374 Mon Sep 17 00:00:00 2001 From: Pantonshire Date: Wed, 13 Jul 2022 14:49:38 +0100 Subject: [PATCH] Experimental ShString now supports efficient conversion to String --- src/strings/experimental.rs | 258 +++++++++++++++++++++++------------- 1 file changed, 166 insertions(+), 92 deletions(-) diff --git a/src/strings/experimental.rs b/src/strings/experimental.rs index a2533e2..49eed24 100644 --- a/src/strings/experimental.rs +++ b/src/strings/experimental.rs @@ -1,13 +1,14 @@ use std::{ - borrow, fmt, - mem::{ManuallyDrop, MaybeUninit}, + borrow::{self, Cow}, + convert::Infallible, + fmt, + mem::{self, ManuallyDrop, MaybeUninit}, ops, ptr::{self, addr_of, addr_of_mut}, - slice, str, + slice, + str, }; -use crate::either::Either::{self, Inl, Inr}; - pub type ShString23 = ShString<23>; /// An experimental alternative to `libshire::strings::ShString`, which is able to store one extra @@ -32,7 +33,7 @@ pub struct ShString { #[repr(C, packed)] union Repr { stack: [MaybeUninit; N], - heap: ManuallyDrop>, + heap: ManuallyDrop>>, } impl ShString { @@ -94,7 +95,7 @@ impl ShString { { Self { repr: Repr { - heap: ManuallyDrop::new(Box::::from(s)), + heap: ManuallyDrop::new(MaybeUninit::new(Box::from(s))), }, len: u8::MAX, _align: [], @@ -104,120 +105,148 @@ impl ShString { #[inline] #[must_use] pub fn as_str(&self) -> &str { - match self.variant() { - Inl(stack) => stack, - Inr(heap) => heap, + if self.heap_allocated() { + // SAFETY: + // `len` is greater than `Self::MAX_LEN`, which means that the `heap` field is + // active. `heap` is properly aligned because it is stored at offset 0 of + // `ShString` (since both `ShString` and `Repr` use `repr(C)`), and the alignment + // of `ShString` is equal to the alignment of `Box`. + let box_str = unsafe { &*addr_of!(self.repr.heap) }; + + unsafe { box_str.assume_init_ref() } + } else { + // Get a pointer to the `stack` field of the union. + // SAFETY: + // Since `len` is less no greater than `MAX_LEN`, the `stack` field must be + // active. + let ptr = unsafe { addr_of!(self.repr.stack) } + as *const MaybeUninit + as *const u8; + + // Construct a byte slice from the pointer to the string data and the length. + // SAFETY: + // The first `len` bytes of `stack` are always initialised, as this is an + // invariant of `ShString`. + let bytes = unsafe { slice::from_raw_parts(ptr, usize::from(self.len)) }; + + // Perform an unchecked conversion from the byte slice to a string slice. + // SAFETY: + // The first `len` bytes of `stack` is always valid UTF-8, as this is an + // invariant of `ShString`. + unsafe { str::from_utf8_unchecked(bytes) } } } #[inline] #[must_use] pub fn as_str_mut(&mut self) -> &mut str { - match self.variant_mut() { - Inl(stack) => stack, - Inr(heap) => heap, - } - } + if self.heap_allocated() { + // SAFETY: + // `len` is greater than `Self::MAX_LEN`, which means that the `heap` field is + // active. `heap` is properly aligned because it is stored at offset 0 of + // `ShString` (since both `ShString` and `Repr` use `repr(C)`), and the alignment + // of `ShString` is equal to the alignment of `Box`. + let box_str = unsafe { &mut *addr_of_mut!(self.repr.heap) }; - // #[inline] - // #[must_use] - // pub fn into_string(self) -> String { - // match self.variant() { - // Inl(stack) => stack.to_owned(), - // Inr(heap) => heap.into_string(), - // } - // } + unsafe { box_str.assume_init_mut() } + } else { + // Get a pointer to the `stack` field of the union. + // SAFETY: + // Since `len` is less no greater than `MAX_LEN`, the `stack` field must be + // active. + let ptr = unsafe { addr_of_mut!(self.repr.stack) } + as *mut MaybeUninit + as *mut u8; - #[inline] - #[must_use] - pub fn heap_allocated(&self) -> bool { - match self.variant() { - Inl(_) => false, - Inr(_) => true, + // Construct a byte slice from the pointer to the string data and the length. + // SAFETY: + // The first `len` bytes of `stack` are always initialised, as this is an + // invariant of `ShString`. + let bytes = unsafe { slice::from_raw_parts_mut(ptr, usize::from(self.len)) }; + + // Perform an unchecked conversion from the byte slice to a string slice. + // SAFETY: + // The first `len` bytes of `stack` is always valid UTF-8, as this is an + // invariant of `ShString`. + unsafe { str::from_utf8_unchecked_mut(bytes) } } } #[inline] #[must_use] - pub fn len(&self) -> usize { - match self.variant() { - Inl(stack) => stack.len(), - Inr(heap) => heap.len(), + pub fn into_string(self) -> String { + if self.heap_allocated() { + // Disable the destructor for `self`; we are transferring ownership of the allocated + // memory to the caller, so we don't want to run the drop implementation which would + // free the memory. + let mut this = ManuallyDrop::new(self); + + // SAFETY: + // `len` is greater than `Self::MAX_LEN`, which means that the `heap` field is + // active. `heap` is properly aligned because it is stored at offset 0 of + // `ShString` (since both `ShString` and `Repr` use `repr(C)`), and the alignment + // of `ShString` is equal to the alignment of `Box`. + let field_ref = unsafe { &mut *addr_of_mut!(this.repr.heap) }; + + let manual_box_str = mem::replace(field_ref, ManuallyDrop::new(MaybeUninit::uninit())); + + let maybe_box_str = ManuallyDrop::into_inner(manual_box_str); + + let box_str = unsafe { maybe_box_str.assume_init() }; + + box_str.into_string() + } else { + // Get a pointer to the `stack` field of the union. + // SAFETY: + // Since `len` is less no greater than `MAX_LEN`, the `stack` field must be + // active. + let ptr = unsafe { addr_of!(self.repr.stack) } + as *const MaybeUninit + as *const u8; + + // Construct a byte slice from the pointer to the string data and the length. + // SAFETY: + // The first `len` bytes of `stack` are always initialised, as this is an + // invariant of `ShString`. + let bytes = unsafe { slice::from_raw_parts(ptr, usize::from(self.len)) }; + + // Perform an unchecked conversion from the byte slice to a string slice. + // SAFETY: + // The first `len` bytes of `stack` is always valid UTF-8, as this is an + // invariant of `ShString`. + let str_slice = unsafe { str::from_utf8_unchecked(bytes) }; + + str_slice.to_owned() } } #[inline] #[must_use] - pub fn is_empty(&self) -> bool { - match self.variant() { - Inl(stack) => stack.is_empty(), - Inr(heap) => heap.is_empty(), - } + pub fn heap_allocated(&self) -> bool { + self.len > Self::MAX_LEN } - #[inline(always)] + #[inline] #[must_use] - fn variant(&self) -> Either<&str, &ManuallyDrop>> { - if self.len <= Self::MAX_LEN { - let slice = unsafe { - // Get a pointer to the `stack` field of the union. - // SAFETY: - // Since `len` is less no greater than `MAX_LEN`, the `stack` field must be active. - let ptr = addr_of!(self.repr.stack) as *const MaybeUninit as *const u8; - - // SAFETY: - // The first `len` bytes of `stack` are always initialised, as this is an invariant - // of `ShString`. - let bytes = slice::from_raw_parts(ptr, usize::from(self.len)); - - // Perform an unchecked conversion from the byte slice to a string slice. - // SAFETY: - // The first `len` bytes of `stack` is always valid UTF-8, as this is an invariant - // of `ShString`. - str::from_utf8_unchecked(bytes) - }; - Inl(slice) - } else { - // SAFETY: - // `len` is greater than `Self::MAX_LEN`, which means that the `heap` field is active. - // `heap` is properly aligned because it is stored at offset 0 of `ShString` (since - // both `ShString` and `Repr` use `repr(C)`), and the alignment of `ShString` is equal - // to the alignment of `Box`. - let heap = unsafe { &*addr_of!(self.repr.heap) }; - Inr(heap) - } + pub fn len(&self) -> usize { + self.as_str().len() } - #[inline(always)] + #[inline] #[must_use] - fn variant_mut(&mut self) -> Either<&mut str, &mut ManuallyDrop>> { - if self.len <= Self::MAX_LEN { - let slice = unsafe { - let ptr = addr_of_mut!(self.repr.stack) as *mut MaybeUninit as *mut u8; - - let bytes = slice::from_raw_parts_mut(ptr, usize::from(self.len)); - - // Perform an unchecked conversion from the byte slice to a string slice. This is - // sound because the first `len` bytes of `stack` is always valid UTF-8 when it is - // active, as this is an invariant of `ShString`. - str::from_utf8_unchecked_mut(bytes) - }; - Inl(slice) - } else { - let heap = unsafe { &mut *addr_of_mut!(self.repr.heap) }; - Inr(heap) - } + pub fn is_empty(&self) -> bool { + self.as_str().is_empty() } } impl Drop for ShString { fn drop(&mut self) { - if let Inr(heap) = self.variant_mut() { + if self.heap_allocated() { + let heap = unsafe { &mut *addr_of_mut!(self.repr.heap) }; + // SAFETY: // Since this is a drop implementation, `heap` will not be used again after this. - unsafe { - let _ = ManuallyDrop::take(heap); - } + let _ = unsafe { ManuallyDrop::take(heap).assume_init() }; } } } @@ -266,6 +295,36 @@ impl borrow::BorrowMut for ShString { } } +impl<'a, const N: usize> From<&'a str> for ShString { + #[inline] + fn from(s: &'a str) -> Self { + Self::new(s) + } +} + +impl From for ShString { + #[inline] + fn from(s: String) -> Self { + Self::new(s) + } +} + +impl<'a, const N: usize> From> for ShString { + #[inline] + fn from(s: Cow<'a, str>) -> Self { + Self::new(s) + } +} + +impl str::FromStr for ShString { + type Err = Infallible; + + #[inline] + fn from_str(s: &str) -> Result { + Ok(Self::new(s)) + } +} + impl fmt::Debug for ShString { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&**self, f) @@ -278,8 +337,6 @@ impl fmt::Display for ShString { } } -// TODO: ** lots of MIRI tests! ** - #[cfg(test)] mod tests { use std::borrow::Cow; @@ -327,6 +384,23 @@ mod tests { assert_eq!(s2.as_str(), "THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG"); } + #[test] + fn test_into_string() { + let test_strings = [ + "".to_owned(), + "Hello".to_owned(), + "Somethingfortheweekend".to_owned(), + "Dichlorodifluoromethane".to_owned(), + "Electrocardiographically".to_owned(), + "こんにちは".to_owned(), + "❤️🧡💛💚💙💜".to_owned(), + ]; + + for s in test_strings { + assert_eq!(ShString23::new(&*s).into_string(), s); + } + } + #[test] fn test_len() { assert_eq!(ShString23::new("").len(), 0);