From 8f263e330a81b6f029008d6abd219cd60c883841 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Tue, 6 Sep 2022 21:23:51 +0100 Subject: [PATCH] Optional decoding of plus character in percent encoded strings --- src/encoding/url.rs | 236 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 188 insertions(+), 48 deletions(-) diff --git a/src/encoding/url.rs b/src/encoding/url.rs index 1139966..52c3a86 100644 --- a/src/encoding/url.rs +++ b/src/encoding/url.rs @@ -5,12 +5,12 @@ use core::str; #[cfg(all(feature = "alloc", not(feature = "std")))] -use alloc::{borrow::Cow, string::String, vec::Vec}; +use alloc::{borrow::Cow, boxed::Box, string::String, vec::Vec}; #[cfg(feature = "std")] use std::borrow::Cow; -use crate::{strings::FixedString, sink::StrSink}; +use crate::{sink::StrSink, strings::FixedString}; use super::hex; @@ -35,12 +35,12 @@ where // SAFETY: // We have already checked that `i < xs.len()`, so `..i` is in bounds for `xs`. let prefix = unsafe { xs.get_unchecked(..i) }; - + // SAFETY: // We have already checked that `i < xs.len()`, so `i + 1 <= xs.len()` must hold. // Therefore, `(i + 1)..` is in bounds for `xs`. let suffix = unsafe { xs.get_unchecked((i + 1)..) }; - + return (prefix, Some((x, suffix))); } @@ -87,9 +87,10 @@ impl<'a> Iterator for PercentEncoder<'a> { // a `prefix` consisting entirely of characters which do not need to be percent-encoded, // followed by a `suffix` which is either `None` or starts which a character which needs // to be percent-encoded. - let (prefix, suffix) = split_at(self.remaining, |b| { - !matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'-' | b'.' | b'_' | b'~') - }); + let (prefix, suffix) = split_at( + self.remaining, + |b| !matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'-' | b'.' | b'_' | b'~'), + ); // SAFETY: // `prefix` only contains characters in the unreserved set, which are all valid ASCII @@ -103,7 +104,7 @@ impl<'a> Iterator for PercentEncoder<'a> { Some((byte, suffix)) => { self.remaining = suffix; Some((prefix, Some(Self::percent_encode_byte(byte)))) - }, + } // If there's no suffix, then we've reached the end of the input string. Therefore, we // set the length of the iterator's slice to 0 to indicate that we are done, and then @@ -111,7 +112,7 @@ impl<'a> Iterator for PercentEncoder<'a> { None => { self.remaining = &self.remaining[self.remaining.len()..]; Some((prefix, None)) - }, + } } } } @@ -130,7 +131,7 @@ where buf.push_str(prefix); buf.push_str(&encoded_byte); - for (prefix, encoded_byte) in encoder { + for (prefix, encoded_byte) in encoder { buf.push_str(prefix); if let Some(encoded_byte) = encoded_byte { buf.push_str(&encoded_byte); @@ -138,7 +139,7 @@ where } Cow::Owned(buf) - }, + } Some((prefix, None)) => Cow::Borrowed(prefix), @@ -174,22 +175,27 @@ where Ok(()) } -pub struct PercentDecoder<'a> { +pub struct PercentDecoder<'a, M> { remaining: &'a [u8], + mode: M, } -impl<'a> PercentDecoder<'a> { - pub fn new(bytes: &'a B) -> Self +impl<'a, M> PercentDecoder<'a, M> { + pub fn new(bytes: &'a B, mode: M) -> Self where B: AsRef<[u8]> + ?Sized, { Self { remaining: bytes.as_ref(), + mode, } } } -impl<'a> Iterator for PercentDecoder<'a> { +impl<'a, M> Iterator for PercentDecoder<'a, M> +where + M: PercentDecodeMode, +{ type Item = (&'a [u8], Option); fn next(&mut self) -> Option { @@ -200,6 +206,23 @@ impl<'a> Iterator for PercentDecoder<'a> { let mut i = 0; while i < self.remaining.len() { + // The '+' character being decoded to a space does not appear in the URL standard + // section on percent-encoding, but it does appear in the section on + // application/x-www-form-urlencoded. We implement it here as an optional feature to + // simplify things. + if self.mode.plus_space() && self.remaining[i] == b'+' { + // SAFETY: + // `i < self.remaining.len()`, so `..i` is a valid range over the slice. + let prefix = unsafe { self.remaining.get_unchecked(..i) }; + + // SAFETY: + // `i < self.remaining.len()`, so `i + 1 <= self.remaining.len()`. Therefore, + // `(i + 1)..` is a valid range over the slice. + self.remaining = unsafe { self.remaining.get_unchecked((i + 1)..) }; + + return Some((prefix, Some(b' '))); + } + // According to the URL standard, the only special case we need to handle is when the // percent character '%' is followed immediately by two hex digits. We check that there // are at least two characters after the percent with `self.remaining.len() - i > 2`, @@ -214,9 +237,12 @@ impl<'a> Iterator for PercentDecoder<'a> { // gives `i + 2 < self.remaining.len()`. Therefore, `i + 1` and `i + 2` are valid // indexes into the slice. let (msb, lsb) = unsafe { - (*self.remaining.get_unchecked(i + 1), *self.remaining.get_unchecked(i + 2)) + ( + *self.remaining.get_unchecked(i + 1), + *self.remaining.get_unchecked(i + 2), + ) }; - + // If the two bytes are valid hex digits, decode the hex number. if let Ok(decoded) = hex::hex_to_byte(msb, lsb) { // SAFETY: @@ -243,12 +269,66 @@ impl<'a> Iterator for PercentDecoder<'a> { } } +pub trait PercentDecodeMode: percent_decode_mode::PercentDecodeModeSealed {} + +impl PercentDecodeMode for &T {} + +#[cfg(feature = "alloc")] +impl PercentDecodeMode for Box {} + +pub struct StandardDecode; + +impl PercentDecodeMode for StandardDecode {} + +pub struct FormDecode; + +impl PercentDecodeMode for FormDecode {} + +mod percent_decode_mode { + #[cfg(all(feature = "alloc", not(feature = "std")))] + use alloc::boxed::Box; + + pub trait PercentDecodeModeSealed { + fn plus_space(&self) -> bool; + } + + impl PercentDecodeModeSealed for &T { + #[inline] + fn plus_space(&self) -> bool { + T::plus_space(self) + } + } + + #[cfg(feature = "alloc")] + impl PercentDecodeModeSealed for Box { + #[inline] + fn plus_space(&self) -> bool { + T::plus_space(&**self) + } + } + + impl PercentDecodeModeSealed for super::StandardDecode { + #[inline] + fn plus_space(&self) -> bool { + false + } + } + + impl PercentDecodeModeSealed for super::FormDecode { + #[inline] + fn plus_space(&self) -> bool { + true + } + } +} + #[cfg(feature = "alloc")] -pub fn percent_decode(bytes: &B) -> Cow<[u8]> +pub fn percent_decode(bytes: &B, mode: M) -> Cow<[u8]> where B: AsRef<[u8]> + ?Sized, + M: PercentDecodeMode, { - let mut decoder = PercentDecoder::new(bytes); + let mut decoder = PercentDecoder::new(bytes, mode); match decoder.next() { Some((prefix, Some(byte))) => { @@ -264,7 +344,7 @@ where } Cow::Owned(buf) - }, + } Some((prefix, None)) => Cow::Borrowed(prefix), @@ -273,23 +353,27 @@ where } #[cfg(feature = "alloc")] -pub fn percent_decode_utf8(bytes: &B) -> Cow +pub fn percent_decode_utf8(bytes: &B, mode: M) -> Cow where B: AsRef<[u8]> + ?Sized, + M: PercentDecodeMode, { - match percent_decode(bytes) { + match percent_decode(bytes, mode) { Cow::Borrowed(decoded) => String::from_utf8_lossy(decoded), Cow::Owned(decoded) => match String::from_utf8_lossy(&decoded) { Cow::Borrowed(decoded_str) => { debug_assert_eq!(decoded_str.len(), decoded.len()); - debug_assert_eq!(decoded_str.as_bytes().as_ptr() as *const u8, decoded.as_ptr()); + debug_assert_eq!( + decoded_str.as_bytes().as_ptr() as *const u8, + decoded.as_ptr() + ); // SAFETY: // `String::from_utf8_lossy` returned a `Cow::Borrowed`, which means that // `decoded` is valid UTF-8. let decoded = unsafe { String::from_utf8_unchecked(decoded) }; Cow::Owned(decoded) - }, + } Cow::Owned(decoded) => Cow::Owned(decoded), }, } @@ -311,9 +395,18 @@ mod tests { assert!(matches!(percent_encode(""), Cow::Borrowed(""))); assert!(matches!(percent_encode("foobar"), Cow::Borrowed("foobar"))); - assert_eq!(&*percent_encode("Ladies + Gentlemen"), "Ladies%20%2B%20Gentlemen"); - assert_eq!(&*percent_encode("An encoded string!"), "An%20encoded%20string%21"); - assert_eq!(&*percent_encode("Dogs, Cats & Mice"), "Dogs%2C%20Cats%20%26%20Mice"); + assert_eq!( + &*percent_encode("Ladies + Gentlemen"), + "Ladies%20%2B%20Gentlemen" + ); + assert_eq!( + &*percent_encode("An encoded string!"), + "An%20encoded%20string%21" + ); + assert_eq!( + &*percent_encode("Dogs, Cats & Mice"), + "Dogs%2C%20Cats%20%26%20Mice" + ); assert_eq!(&*percent_encode("☃"), "%E2%98%83"); } @@ -326,26 +419,73 @@ mod tests { #[cfg(feature = "std")] use std::borrow::Cow; - use super::{percent_decode_utf8}; - - assert!(matches!(percent_decode_utf8(""), Cow::Borrowed(""))); - assert!(matches!(percent_decode_utf8("foobar"), Cow::Borrowed("foobar"))); - - assert_eq!(&*percent_decode_utf8("Ladies%20%2B%20Gentlemen"), "Ladies + Gentlemen"); - assert_eq!(&*percent_decode_utf8("An%20encoded%20string%21"), "An encoded string!"); - assert_eq!(&*percent_decode_utf8("Dogs%2C%20Cats%20%26%20Mice"), "Dogs, Cats & Mice"); - assert_eq!(&*percent_decode_utf8("%E2%98%83"), "☃"); - - assert_eq!(&*percent_decode_utf8("%e2%98%83"), "☃"); - - assert_eq!(&*percent_decode_utf8("%41%6E%20%65%6E%63%6F%64%65%64%20%73%74%72%69%6E%67%21"), "An encoded string!"); - - assert_eq!(&*percent_decode_utf8("hello!"), "hello!"); - assert_eq!(&*percent_decode_utf8("hello%"), "hello%"); - assert_eq!(&*percent_decode_utf8("%a"), "%a"); - assert_eq!(&*percent_decode_utf8("%za"), "%za"); - assert_eq!(&*percent_decode_utf8("%az"), "%az"); - - assert_eq!(&*percent_decode_utf8("hello%FFworld"), "hello�world"); + use super::{percent_decode_utf8, FormDecode, StandardDecode}; + + assert!(matches!( + percent_decode_utf8("", StandardDecode), + Cow::Borrowed("") + )); + assert!(matches!( + percent_decode_utf8("foobar", StandardDecode), + Cow::Borrowed("foobar") + )); + + assert_eq!( + &*percent_decode_utf8("Ladies%20%2B%20Gentlemen", StandardDecode), + "Ladies + Gentlemen" + ); + assert_eq!( + &*percent_decode_utf8("An%20encoded%20string%21", StandardDecode), + "An encoded string!" + ); + assert_eq!( + &*percent_decode_utf8("Dogs%2C%20Cats%20%26%20Mice", StandardDecode), + "Dogs, Cats & Mice" + ); + assert_eq!( + &*percent_decode_utf8("%E2%98%83", StandardDecode), + "☃" + ); + + assert_eq!( + &*percent_decode_utf8("%e2%98%83", StandardDecode), + "☃" + ); + + assert_eq!( + &*percent_decode_utf8( + "%41%6E%20%65%6E%63%6F%64%65%64%20%73%74%72%69%6E%67%21", + StandardDecode + ), + "An encoded string!" + ); + + assert_eq!(&*percent_decode_utf8("hello!", StandardDecode), "hello!"); + assert_eq!(&*percent_decode_utf8("hello%", StandardDecode), "hello%"); + assert_eq!(&*percent_decode_utf8("%a", StandardDecode), "%a"); + assert_eq!(&*percent_decode_utf8("%za", StandardDecode), "%za"); + assert_eq!(&*percent_decode_utf8("%az", StandardDecode), "%az"); + + assert_eq!( + &*percent_decode_utf8("hello%FFworld", StandardDecode), + "hello�world" + ); + + assert_eq!( + &*percent_decode_utf8("hello+world", StandardDecode), + "hello+world" + ); + assert_eq!( + &*percent_decode_utf8("hello+world", FormDecode), + "hello world" + ); + assert_eq!( + &*percent_decode_utf8("hello++world", FormDecode), + "hello world" + ); + assert_eq!( + &*percent_decode_utf8("+hello+world+", FormDecode), + " hello world " + ); } }