From c077cdf610e0445e777b03587eafc45f192fc6d2 Mon Sep 17 00:00:00 2001 From: Pantonshire Date: Thu, 2 Jun 2022 21:27:43 +0100 Subject: [PATCH] Proof of concept improvement for ShString --- Cargo.toml | 3 - src/strings/experimental.rs | 223 ++++++++++++++++++++++++++++++++++++ src/strings/mod.rs | 1 + src/uuid.rs | 22 ---- 4 files changed, 224 insertions(+), 25 deletions(-) create mode 100644 src/strings/experimental.rs diff --git a/Cargo.toml b/Cargo.toml index 22edea6..2f17f18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,3 @@ name = "libshire" version = "0.1.0" edition = "2021" - -[dependencies] -serde = { version = "1", optional = true } diff --git a/src/strings/experimental.rs b/src/strings/experimental.rs new file mode 100644 index 0000000..9eed5c9 --- /dev/null +++ b/src/strings/experimental.rs @@ -0,0 +1,223 @@ +use std::{ + borrow, + fmt, + mem::ManuallyDrop, + ops, + ptr::{addr_of, addr_of_mut}, + slice, + str, +}; + +use crate::either::Either::{self, Inl, Inr}; + +/// An experimental alternative to `libshire::strings::ShString`, which is able to store one extra +/// byte of string data on the stack in the same amount of space. +// `repr(C)` is necessary to ensure that `Repr` starts at offset zero, so that it's properly +// aligned within the struct. 
+#[repr(C)] +pub struct ShString { + repr: Repr, + len: u8, + _align: [Box; 0], +} + +#[repr(C, packed)] +union Repr { + stack: [u8; N], + heap: ManuallyDrop>, +} + +impl ShString { + const MAX_LEN: u8 = { + #[allow(clippy::cast_possible_truncation, clippy::checked_conversions)] + if N < u8::MAX as usize { + N as u8 + } else { + panic!("`N` must be less than `u8::MAX`") + } + }; + + #[must_use] + pub fn new(s: S) -> Self + where + S: AsRef, + Box: From, + { + let bytes = s.as_ref().as_bytes(); + match u8::try_from(bytes.len()) { + Ok(len) if len <= Self::MAX_LEN => { + let mut buf = [0u8; N]; + buf[..usize::from(len)].copy_from_slice(bytes); + // SAFETY: + // The first `len` bytes of `buf` are copied from a `&str`, so the first `len` + // bytes are valid UTF-8. We have already checked that `len` is less than or equal + // to `Self::MAX_LEN`. + unsafe { Self::stack_from_raw_parts(buf, len) } + }, + _ => Self::new_heap(s), + } + } + + /// # Safety + /// The first `len` bytes of `buf` must be valid UTF-8. `len` must be less than or equal to + /// `Self::MAX_LEN` (which is equal to `N`). + unsafe fn stack_from_raw_parts(buf: [u8; N], len: u8) -> Self { + Self { + repr: Repr { stack: buf }, + len, + _align: [] + } + } + + fn new_heap(s: S) -> Self + where + Box: From, + { + Self { + repr: Repr { heap: ManuallyDrop::new(Box::::from(s)) }, + len: u8::MAX, + _align: [], + } + } + + #[inline] + #[must_use] + pub fn as_str(&self) -> &str { + match self.variant() { + // SAFETY: + // `stack` being valid UTF-8 when active is an invariant of `ShString`. + Inl(stack) => unsafe { str::from_utf8_unchecked(stack) }, + Inr(heap) => heap, + } + } + + #[inline] + #[must_use] + pub fn as_str_mut(&mut self) -> &mut str { + match self.variant_mut() { + // SAFETY: + // `stack` being valid UTF-8 when active is an invariant of `ShString`. 
+ Inl(stack) => unsafe { str::from_utf8_unchecked_mut(stack) }, + Inr(heap) => heap, + } + } + + #[inline(always)] + #[must_use] + fn variant(&self) -> Either<&[u8], &ManuallyDrop>> { + if self.len <= Self::MAX_LEN { + let slice = unsafe { + // The preferred way to read the fields of a packed struct is with `addr_of`. + let ptr = addr_of!(self.repr.stack) as *const u8; + let len = usize::from(self.len); + slice::from_raw_parts(ptr, len) + }; + Inl(slice) + } else { + // SAFETY: + // `len` is greater than `Self::MAX_LEN`, which means that the `heap` field is active. + // `heap` is properly aligned because it is stored at offset 0 of `ShString` (since + // both `ShString` and `Repr` use `repr(C)`), and the alignment of `ShString` is equal + // to the alignment of `Box`. + let heap = unsafe { &*addr_of!(self.repr.heap) }; + Inr(heap) + } + } + + #[inline(always)] + #[must_use] + fn variant_mut(&mut self) -> Either<&mut [u8], &mut ManuallyDrop>> { + if self.len <= Self::MAX_LEN { + let slice = unsafe { + let ptr = addr_of_mut!(self.repr.stack) as *mut u8; + let len = usize::from(self.len); + slice::from_raw_parts_mut(ptr, len) + }; + Inl(slice) + } else { + let heap = unsafe { &mut *addr_of_mut!(self.repr.heap) }; + Inr(heap) + } + } +} + +impl Drop for ShString { + fn drop(&mut self) { + if let Inr(heap) = self.variant_mut() { + // SAFETY: + // Since this is a drop implementation, `heap` will not be used again after this. 
+ unsafe { + let _ = ManuallyDrop::take(heap); + } + } + } +} + +impl ops::Deref for ShString { + type Target = str; + + #[inline] + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + +impl ops::DerefMut for ShString { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + self.as_str_mut() + } +} + +impl AsRef for ShString { + #[inline] + fn as_ref(&self) -> &str { + self + } +} + +impl AsMut for ShString { + #[inline] + fn as_mut(&mut self) -> &mut str { + self + } +} + +impl borrow::Borrow for ShString { + #[inline] + fn borrow(&self) -> &str { + self + } +} + +impl borrow::BorrowMut for ShString { + #[inline] + fn borrow_mut(&mut self) -> &mut str { + self + } +} + +impl fmt::Debug for ShString { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&**self, f) + } +} + +impl fmt::Display for ShString { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&**self, f) + } +} + +// TODO: ** lots of MIRI tests! ** + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_shstring_align() { + use std::mem::align_of; + assert_eq!(align_of::>(), align_of::>()); + } +} diff --git a/src/strings/mod.rs b/src/strings/mod.rs index d5f6f52..07b01df 100644 --- a/src/strings/mod.rs +++ b/src/strings/mod.rs @@ -1,3 +1,4 @@ +pub mod experimental; pub mod fixed_string; pub mod shstring; diff --git a/src/uuid.rs b/src/uuid.rs index 0f1a552..249fc4c 100644 --- a/src/uuid.rs +++ b/src/uuid.rs @@ -158,28 +158,6 @@ impl fmt::Display for Uuid { } } -#[cfg(feature = "serde")] -impl serde::Serialize for Uuid { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer - { - serializer.serialize_str(&self.as_string()) - } -} - -#[cfg(feature = "serde")] -impl<'de> serde::Deserialize<'de> for Uuid { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de> - { - let s: &str = serde::Deserialize::deserialize(deserializer)?; - s.parse() - 
.map_err(serde::de::Error::custom) - } -} - #[derive(Debug)] pub enum ParseError { NotEnoughGroups(usize),