🐛 only show consumed bad bytes for invalid characters

Previously, the bytes displayed for invalid characters included bytes
from the byte stream that were peeked rather than consumed. This
resulted in certain bytes being displayed multiple times, since the
peeked byte could appear in the following character.

For example, `printf '\xce\x61' | utfdump_bin` would result in the byte
0x61 being displayed twice, once at the end of the invalid character and
once as the valid character `a`.

This patch modifies `utfdump::utf8::Utf8Error` so it also stores the
number of consumed bad bytes, enabling the binary to output only the
consumed bad bytes.
main
pantonshire 3 years ago
parent ecf2abbdad
commit c274ba6f01

@ -110,14 +110,14 @@ impl OutRow {
}
fn from_bad_char(err: Utf8Error) -> Self {
let (bad_bytes, num_bad_bytes) = err.into_parts();
let (bad_bytes, _num_bad_bytes, num_consumed_bad_bytes) = err.into_parts();
Self {
display_char: CappedString::new_truncating("\u{fffd}"),
codepoint: Optional::None,
utf_8_bytes: Utf8Bytes {
buf: bad_bytes,
len: num_bad_bytes,
len: num_consumed_bad_bytes,
},
name: Optional::Some("<invalid>"),
category: Optional::None,

@ -128,6 +128,7 @@ where
return Some(Err(Utf8Error {
bad_bytes: bytes_seen,
num_bad_bytes: 1,
num_consumed_bad_bytes: 1,
}));
},
}
@ -140,6 +141,7 @@ where
None => return Some(Err(Utf8Error {
bad_bytes: bytes_seen,
num_bad_bytes: usize::from(i) + 1,
num_consumed_bad_bytes: usize::from(i),
})),
};
@ -149,6 +151,7 @@ where
return Some(Err(Utf8Error {
bad_bytes: bytes_seen,
num_bad_bytes: usize::from(i) + 2,
num_consumed_bad_bytes: usize::from(i) + 1,
}));
}
@ -172,6 +175,7 @@ where
/// Error produced when an invalid UTF-8 sequence is encountered while
/// decoding a byte stream.
pub struct Utf8Error {
/// Fixed-size buffer holding the offending bytes; only a leading prefix
/// of it is meaningful (see the two length fields below).
bad_bytes: [u8; 4],
/// Number of bytes in `bad_bytes` that were seen for the invalid
/// character. This may count a byte that was only peeked at, not consumed.
num_bad_bytes: usize,
/// Number of bytes in `bad_bytes` that were actually consumed from the
/// stream for the invalid character.
num_consumed_bad_bytes: usize,
}
impl Utf8Error {
@ -179,8 +183,9 @@ impl Utf8Error {
&self.bad_bytes[..self.num_bad_bytes]
}
pub fn into_parts(self) -> ([u8; 4], usize) {
(self.bad_bytes, self.num_bad_bytes)
// FIXME: return some type with u8 array + length
/// Consumes the error and returns its raw parts as a tuple, in order:
/// the bad-byte buffer, the number of bad bytes, and the number of
/// consumed bad bytes.
pub fn into_parts(self) -> ([u8; 4], usize, usize) {
(self.bad_bytes, self.num_bad_bytes, self.num_consumed_bad_bytes)
}
}

Loading…
Cancel
Save