From c274ba6f018e233fb827e61fc320e88a2b019c93 Mon Sep 17 00:00:00 2001 From: pantonshire Date: Mon, 5 Jun 2023 11:04:00 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20only=20show=20consumed=20bad=20b?= =?UTF-8?q?ytes=20for=20invalid=20characters?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the bytes displayed for invalid characters included bytes from the byte stream that were peeked rather than consumed. This resulted in certain bytes being displayed multiple times, since the peeked byte could appear in the following character. For example, `printf '\xce\x61' | utfdump_bin` would result in the byte 0x61 being displayed twice, once at the end of the invalid character and once as the valid character `a`. This patch modifies `utfdump::utf8::Utf8Error` so it also stores the number of consumed bad bytes, enabling the binary to output only the consumed bad bytes. --- bin/src/main.rs | 4 ++-- lib/src/utf8.rs | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bin/src/main.rs b/bin/src/main.rs index c32045b..5f97f63 100644 --- a/bin/src/main.rs +++ b/bin/src/main.rs @@ -110,14 +110,14 @@ impl OutRow { } fn from_bad_char(err: Utf8Error) -> Self { - let (bad_bytes, num_bad_bytes) = err.into_parts(); + let (bad_bytes, _num_bad_bytes, num_consumed_bad_bytes) = err.into_parts(); Self { display_char: CappedString::new_truncating("\u{fffd}"), codepoint: Optional::None, utf_8_bytes: Utf8Bytes { buf: bad_bytes, - len: num_bad_bytes, + len: num_consumed_bad_bytes, }, name: Optional::Some(""), category: Optional::None, diff --git a/lib/src/utf8.rs b/lib/src/utf8.rs index a15e18e..e98e4c3 100644 --- a/lib/src/utf8.rs +++ b/lib/src/utf8.rs @@ -128,6 +128,7 @@ where return Some(Err(Utf8Error { bad_bytes: bytes_seen, num_bad_bytes: 1, + num_consumed_bad_bytes: 1, })); }, } @@ -140,6 +141,7 @@ where None => return Some(Err(Utf8Error { bad_bytes: bytes_seen, num_bad_bytes: usize::from(i) + 1, + 
num_consumed_bad_bytes: usize::from(i), })), }; @@ -149,6 +151,7 @@ where return Some(Err(Utf8Error { bad_bytes: bytes_seen, num_bad_bytes: usize::from(i) + 2, + num_consumed_bad_bytes: usize::from(i) + 1, })); } @@ -172,6 +175,7 @@ where pub struct Utf8Error { bad_bytes: [u8; 4], num_bad_bytes: usize, + num_consumed_bad_bytes: usize, } impl Utf8Error { @@ -179,8 +183,9 @@ impl Utf8Error { &self.bad_bytes[..self.num_bad_bytes] } - pub fn into_parts(self) -> ([u8; 4], usize) { - (self.bad_bytes, self.num_bad_bytes) + // FIXME: return some type with u8 array + length + pub fn into_parts(self) -> ([u8; 4], usize, usize) { + (self.bad_bytes, self.num_bad_bytes, self.num_consumed_bad_bytes) } }