utfdump/lib/src/unicode_data.rs

use core::{fmt, mem, slice, str};

use tap::Pipe;

use crate::character::{
    CharData,
    Category,
    BidiCategory,
    OptionalDecompKind,
    CombiningClass,
    DecompMapping,
};

/// Magic number expected at the very start of the serialised Unicode data.
///
/// `UnicodeData::from_bytes` compares the first `MAGIC_NUMBER.len()` bytes of its input against
/// this value and fails with `UnicodeDataError::InvalidHeader` when they differ.
const MAGIC_NUMBER: [u8; 8] = *b"UTFDUMP!";

/// Borrowed, zero-copy view over a serialised Unicode data blob.
///
/// The three tables all borrow from the byte slice handed to `UnicodeData::from_bytes`, which is
/// why the type is cheap to copy.
#[derive(Clone, Copy)]
pub struct UnicodeData<'a> {
    /// Maps a codepoint to an index into `char_table`.
    group_table: GroupTable<'a>,
    /// Per-character records; string-valued fields are indices into `string_table`.
    char_table: CharTable<'a>,
    /// Length-prefixed strings referenced from `char_table` entries.
    string_table: StringTable<'a>,
}

impl<'a> UnicodeData<'a> {
    /// Parses the serialised Unicode data in `bs`, borrowing from it rather than copying.
    ///
    /// The input is read in this order: the magic number, three 4-byte little-endian table
    /// lengths (group table, character table, string table), and then the three tables
    /// themselves, back to back in that same order.
    ///
    /// # Errors
    ///
    /// - `InvalidHeader` if the input does not start with the magic number.
    /// - `InsufficientBytes` if the input ends before a header field or table is complete.
    /// - `OutOfBounds` if a table length does not fit in a `usize`.
    /// - `InvalidTableSize` if the group or character table is not a whole number of entries.
    /// - `LeftoverBytes` if any bytes remain after the string table.
    pub(crate) fn from_bytes(bs: &'a [u8]) -> Result<Self, UnicodeDataError> {
        let mut stream = ByteStream(bs);

        let magic = stream.consume(MAGIC_NUMBER.len())?;
        if magic != MAGIC_NUMBER {
            return Err(UnicodeDataError::InvalidHeader);
        }

        // All three lengths are stored up front, before any of the table data.
        let group_table_len = stream.consume_4_byte_len()?;
        let char_table_len = stream.consume_4_byte_len()?;
        let string_table_len = stream.consume_4_byte_len()?;

        // Each table is validated as soon as its bytes have been consumed, so the first problem
        // encountered in stream order is the one that gets reported.
        let group_table = GroupTable::new(stream.consume(group_table_len)?)?;
        let char_table = CharTable::new(stream.consume(char_table_len)?)?;
        let string_table = StringTable::new(stream.consume(string_table_len)?);

        stream.check_empty()?;

        Ok(Self { group_table, char_table, string_table })
    }

    /// Looks up the character data for `codepoint`.
    ///
    /// Returns `None` when the codepoint has no character table entry, when any of the packed
    /// category fields fails to decode, or when the entry's name cannot be resolved in the
    /// string table. Every other string-valued property is optional and is simply left as
    /// `None` when it cannot be resolved.
    pub fn get(self, codepoint: u32) -> Option<CharData<'a>> {
        let entry = self.char_entry_for(codepoint)?;
        let strings = self.string_table;

        // Layout of the packed 16-bit field, from the least significant bit upwards:
        // 5 bits of category, 5 bits of bidi category, 5 bits of decomposition kind, and a
        // single "mirrored" flag in the top bit.
        let packed = entry.flags_and_categories.to_u16();
        let five_bits_at = |shift: u16| ((packed >> shift) & 0x1f) as u8;

        let category = Category::decode(five_bits_at(0))?;
        let bidi = BidiCategory::decode(five_bits_at(5))?;
        let decomp_kind = OptionalDecompKind::decode(five_bits_at(10))?;
        let mirrored = (packed & 0x8000) != 0;

        let name = strings.get_u24_le(entry.name)?;

        // A decomposition mapping only exists when the entry has both a decomposition kind and
        // a resolvable decomposition string.
        let decomp = match decomp_kind {
            OptionalDecompKind::None => None,
            OptionalDecompKind::Anon => strings
                .get_u24_le(entry.decomp)
                .map(|value| DecompMapping::new(None, value)),
            OptionalDecompKind::Named(kind) => strings
                .get_u24_le(entry.decomp)
                .map(|value| DecompMapping::new(Some(kind), value)),
        };

        // The two digit values share one byte, one nibble each; a nibble of `0xf` means the
        // value is absent.
        let nibble = |value: u8| (value != 0xf).then_some(value);
        let decimal_digit = nibble(entry.digit & 0xf);
        let digit = nibble(entry.digit >> 4);

        Some(CharData {
            codepoint,
            name,
            category,
            combining: CombiningClass(entry.combining),
            bidi,
            decomp,
            decimal_digit,
            digit,
            numeric: strings.get_u24_le(entry.numeric),
            mirrored,
            old_name: strings.get_u24_le(entry.old_name),
            comment: strings.get_u24_le(entry.comment),
            uppercase: strings.get_u24_le(entry.uppercase),
            lowercase: strings.get_u24_le(entry.lowercase),
            titlecase: strings.get_u24_le(entry.titlecase),
        })
    }

    /// Resolves `codepoint` to its character table entry via the group table.
    ///
    /// Returns `None` if the group table yields no index, if the index does not fit in a
    /// `usize`, or if it lies beyond the end of the character table.
    fn char_entry_for(self, codepoint: u32) -> Option<&'a CharTableEntry> {
        let raw_index = self.group_table.char_table_index_for(codepoint)?;
        let index = usize::try_from(raw_index).ok()?;

        self.char_table.get(index)
    }
}

/// Borrowed view over the group table: a list of codepoint ranges ("groups") used to translate a
/// codepoint into an index into the character table.
///
/// `char_table_index_for` binary-searches `entries`, so lookups are only meaningful when the
/// entries are ordered by codepoint range and do not overlap.
/// NOTE(review): that ordering is not validated anywhere in this file; confirm that the data
/// generator guarantees it.
#[derive(Clone, Copy, Debug)]
struct GroupTable<'a> {
    entries: &'a [GroupTableEntry],
}

impl<'a> GroupTable<'a> {
    /// Reinterprets `bs` in place as a slice of `GroupTableEntry` values, without copying.
    ///
    /// # Errors
    ///
    /// Returns `UnicodeDataError::InvalidTableSize` if `bs.len()` is not a multiple of
    /// `GroupTableEntry::SIZE`.
    fn new(bs: &'a [u8]) -> Result<Self, UnicodeDataError> {
        if bs.len() % GroupTableEntry::SIZE != 0 {
            return Err(UnicodeDataError::InvalidTableSize);
        }

        let num_entries = bs.len() / GroupTableEntry::SIZE;

        // SAFETY:
        // - The pointer is valid for reads of `num_entries * mem::size_of::<GroupTableEntry>()`
        //   bytes; `num_entries = bs.len() / mem::size_of::<GroupTableEntry>()`, so
        //   `num_entries * mem::size_of::<GroupTableEntry>() <= bs.len()` (the inequality is due
        //   to flooring integer division), and clearly a pointer to `bs` is valid for reads of
        //   <= `bs.len()` bytes.
        //
        // - `u8` and `GroupTableEntry` both have an alignment of 1 (since `GroupTableEntry` is
        //   packed), so the pointer is correctly aligned.
        //
        // - The pointer points to `num_entries` consecutive properly-initialised `GroupTableEntry`
        //   values, as `bs` contains initialised data and `GroupTableEntry` consists only of
        //   `u8` values and arrays of `u8` of varying sizes, for which any bit pattern is valid.
        //
        // - Since we obtained the pointer from an immutable reference `bs`, the data cannot be
        //   mutated by safe code for the duration of the lifetime `'a`.
        //
        // - The total length of the slice does not exceed `isize::MAX`, since it is no larger
        //   than `bs` which is a valid slice and therefore no larger than `isize::MAX`.
        let entries = unsafe {
            slice::from_raw_parts(
                bs.as_ptr() as *const GroupTableEntry,
                num_entries
            )
        };

        Ok(Self { entries })
    }

    // TODO: compare performance of binary search to linear search
    // TODO: fast path for characters before the first group
    /// Translates `codepoint` into an index into the character table.
    ///
    /// Binary-searches the groups for one containing `codepoint`:
    ///
    /// - If the codepoint lies in a `GROUP_KIND_USE_PREV_VALUE` group, the result is the index of
    ///   the codepoint immediately before the group's start.
    /// - If it lies in a group of any other kind, the result is `None`.
    /// - If it lies in no group, the result is `codepoint` minus the cumulative length of all
    ///   groups that end before it (`None` if that subtraction would underflow).
    ///
    /// # Panics
    ///
    /// Panics if the table data violates the invariants named in the `expect` messages below
    /// (for example a group whose `start` is greater than its `end`).
    fn char_table_index_for(self, codepoint: u32) -> Option<u32> {
        // `entries` is the part of the table still under consideration; `offset` is the
        // cumulative length of all groups known so far to end before `codepoint`.
        let mut entries = self.entries;
        let mut offset = 0;

        loop {
            if entries.len() == 0 {
                // No group contains the codepoint: it has its own character table entry, located
                // `offset` positions earlier than its codepoint value.
                break codepoint.checked_sub(offset);
            }

            let midpoint = entries.len() / 2;
            let entry = &entries[midpoint];
            let start = entry.start.to_u32();
            let end = entry.end.to_u32();
            let total_len_before = entry.total_len_before.to_u32();

            if start <= codepoint && codepoint <= end {
                match entry.kind {
                    GROUP_KIND_USE_PREV_VALUE => {
                        // This group uses the same character data as the codepoint immediately
                        // before the group start (`start - 1`). Subtract `total_len_before`, which
                        // is the total length of all groups before this group, from `start - 1` to
                        // find the index of its character data in the character table.
                        break start
                            .checked_sub(1)
                            .expect("first codepoint for a USE_PREV_VALUE group should always be at least 1")
                            .checked_sub(total_len_before)
                            .expect("computed character data index should not underflow")
                            .pipe(Some)
                    },

                    // If the codepoint is in a group which is not `USE_PREV_VALUE`, we take it to
                    // be a codepoint with no associated character data.
                    _ => break None,
                }
            } else if codepoint > end {
                // Since the `end` is inclusive, the length of the group is calculated as
                // `(end - start) + 1`.
                let group_len = end
                    .checked_sub(start)
                    .expect("group start should be less than or equal to the group end")
                    .checked_add(1)
                    .expect("group length should not overflow a u32");

                // `total_len_before` is the total length of all groups before this group, so we
                // can calculate the total length of all groups up to and including this group by
                // adding `group_len` to it. We assign this to `offset` because this is the group
                // with the largest `end` value that is less than the codepoint, and is therefore
                // the offset that should be used to calculate the character table index for this
                // codepoint in the event that there are no groups with a larger `end` value less
                // than the codepoint and the codepoint is not contained in a group.
                offset = total_len_before
                    .checked_add(group_len)
                    .expect("cumulative group length should not overflow a u32");

                // Continue in the upper half, past this group.
                entries = &entries[(midpoint + 1)..];
            } else {
                // The codepoint is below this group: continue in the lower half.
                entries = &entries[..midpoint];
            }
        }
    }
}

/// Group kind for a range of codepoints with no associated character data.
///
/// This constant is not referenced by name in the lookup code: `GroupTable::char_table_index_for`
/// treats every kind other than `GROUP_KIND_USE_PREV_VALUE` as "no data".
const GROUP_KIND_NO_VALUE: u8 = 0;
/// Group kind for a range of codepoints that all share the character data of the codepoint
/// immediately before the start of the group.
const GROUP_KIND_USE_PREV_VALUE: u8 = 1;

/// One record of the group table, laid out exactly as it appears in the serialised data.
///
/// The struct is `repr(C, packed)` and consists solely of byte-sized fields, so it has an
/// alignment of 1 and no padding; `GroupTable::new` relies on this to view raw bytes as entries.
#[derive(Debug)]
#[repr(C, packed)]
struct GroupTableEntry {
    /// First codepoint in the group.
    start: U32Le,
    /// Last codepoint in the group (inclusive).
    end: U32Le,
    /// Total length of all groups that come before this one.
    total_len_before: U32Le,
    /// Kind of the group; compared against `GROUP_KIND_USE_PREV_VALUE` during lookup.
    kind: u8,
}

impl GroupTableEntry {
    /// Size in bytes of one serialised entry.
    const SIZE: usize = mem::size_of::<Self>();
}

/// Borrowed view over the character table: a flat array of fixed-size `CharTableEntry` records.
#[derive(Clone, Copy, Debug)]
struct CharTable<'a> {
    entries: &'a [CharTableEntry],
}

impl<'a> CharTable<'a> {
    /// Reinterprets `bs` in place as a slice of `CharTableEntry` values, without copying.
    ///
    /// # Errors
    ///
    /// Returns `UnicodeDataError::InvalidTableSize` if `bs.len()` is not a multiple of
    /// `CharTableEntry::SIZE`.
    fn new(bs: &'a [u8]) -> Result<Self, UnicodeDataError> {
        let byte_len = bs.len();
        let leftover = byte_len % CharTableEntry::SIZE;
        if leftover != 0 {
            return Err(UnicodeDataError::InvalidTableSize);
        }

        let num_entries = byte_len / CharTableEntry::SIZE;
        let first_entry = bs.as_ptr().cast::<CharTableEntry>();

        // SAFETY:
        // - Validity for reads: `num_entries` is `bs.len()` divided (rounding down) by the entry
        //   size, so `num_entries` entries span at most `bs.len()` bytes, all of which belong to
        //   the single allocation behind `bs`.
        //
        // - Alignment: `CharTableEntry` is packed, so like `u8` it has an alignment of 1 and any
        //   pointer into `bs` is suitably aligned for it.
        //
        // - Initialisation: `bs` holds initialised bytes, and `CharTableEntry` is made up solely
        //   of `u8` values and `u8` arrays, for which every bit pattern is valid.
        //
        // - Aliasing: the pointer is derived from the shared reference `bs`, so safe code cannot
        //   mutate the bytes for the duration of `'a`.
        //
        // - Size: the resulting slice covers no more bytes than `bs`, which as a valid slice is
        //   itself no larger than `isize::MAX` bytes.
        let entries = unsafe { slice::from_raw_parts(first_entry, num_entries) };

        Ok(Self { entries })
    }

    /// Returns the entry at index `i`, or `None` if `i` is past the end of the table.
    fn get(self, i: usize) -> Option<&'a CharTableEntry> {
        let Self { entries } = self;
        entries.get(i)
    }
}

/// One record of the character table, laid out exactly as it appears in the serialised data.
///
/// The struct is `repr(C, packed)` and consists solely of byte-sized fields, so it has an
/// alignment of 1 and no padding; `CharTable::new` relies on this to view raw bytes as entries.
///
/// Every `U24Le` field is an index into the string table, where the all-`0xff` pattern means
/// "no value" (see `StringTable::get_u24_le`).
#[derive(Debug)]
#[repr(C, packed)]
struct CharTableEntry {
    /// Packed bit field, from the least significant bit: 5 bits of general category, 5 bits of
    /// bidi category, 5 bits of decomposition kind, and 1 bit for the "mirrored" flag.
    flags_and_categories: U16Le,
    /// Character name; `UnicodeData::get` returns `None` if this cannot be resolved.
    name: U24Le,
    /// Decomposition mapping value; only used when the decomposition kind is not "none".
    decomp: U24Le,
    /// Numeric value.
    numeric: U24Le,
    /// Old name.
    old_name: U24Le,
    /// Comment.
    comment: U24Le,
    /// Uppercase mapping.
    uppercase: U24Le,
    /// Lowercase mapping.
    lowercase: U24Le,
    /// Titlecase mapping.
    titlecase: U24Le,
    /// Raw combining class value, wrapped in `CombiningClass` by `UnicodeData::get`.
    combining: u8,
    /// Two packed nibbles: the low nibble is the decimal digit value and the high nibble is the
    /// digit value. A nibble of `0xf` means the value is absent.
    digit: u8,
}

impl CharTableEntry {
    /// Size in bytes of one serialised entry.
    const SIZE: usize = mem::size_of::<Self>();
}

/// Borrowed view over the string table: a byte blob of length-prefixed UTF-8 strings, addressed
/// by the byte offset of each string's length prefix.
#[derive(Clone, Copy)]
struct StringTable<'a> {
    inner: &'a [u8],
}

impl<'a> StringTable<'a> {
    /// Wraps `bs` as a string table. No validation happens here; strings are checked on lookup.
    fn new(bs: &'a [u8]) -> Self {
        Self { inner: bs }
    }

    /// Returns the string whose one-byte length prefix is at offset `i`.
    ///
    /// Returns `None` if `i` is out of bounds, if the string would run past the end of the
    /// table, or if its bytes are not valid UTF-8.
    fn get(self, i: usize) -> Option<&'a str> {
        // The byte at `i` is the length; the string's bytes follow immediately after it.
        let (&len, rest) = self.inner.get(i..)?.split_first()?;
        let bytes = rest.get(..usize::from(len))?;

        str::from_utf8(bytes).ok()
    }

    /// Like `get`, but takes a 24-bit little-endian index in which the all-`0xff` pattern is
    /// reserved to mean "no string".
    fn get_u24_le(self, i: U24Le) -> Option<&'a str> {
        const NIL_INDEX_PATTERN: [u8; 3] = [0xff; 3];

        if i.0 == NIL_INDEX_PATTERN {
            None
        } else {
            let index = i.to_usize()?;
            self.get(index)
        }
    }
}

/// A 16-bit unsigned integer stored as two little-endian bytes, with an alignment of 1.
#[derive(Clone, Copy)]
#[repr(transparent)]
struct U16Le([u8; 2]);

impl U16Le {
    /// Decodes the stored bytes into a native `u16`.
    fn to_u16(self) -> u16 {
        let Self(bytes) = self;
        u16::from_le_bytes(bytes)
    }
}

/// Formats as the decoded integer rather than as the raw byte array.
impl fmt::Debug for U16Le {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let value = self.to_u16();
        <u16 as fmt::Debug>::fmt(&value, f)
    }
}

/// A 24-bit unsigned integer stored as three little-endian bytes, with an alignment of 1.
#[derive(Clone, Copy)]
#[repr(transparent)]
struct U24Le([u8; 3]);

impl U24Le {
    /// Decodes the stored bytes into a native `u32`; the most significant byte is always zero.
    fn to_u32(self) -> u32 {
        let [low, mid, high] = self.0;
        u32::from_le_bytes([low, mid, high, 0])
    }

    /// Decodes the stored bytes into a `usize`, or `None` if the value does not fit.
    fn to_usize(self) -> Option<usize> {
        let widened = self.to_u32();
        usize::try_from(widened).ok()
    }
}

/// Formats as the decoded integer rather than as the raw byte array.
impl fmt::Debug for U24Le {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let value = self.to_u32();
        <u32 as fmt::Debug>::fmt(&value, f)
    }
}

/// A 32-bit unsigned integer stored as four little-endian bytes, with an alignment of 1.
#[derive(Clone, Copy)]
#[repr(transparent)]
struct U32Le([u8; 4]);

impl U32Le {
    /// Decodes the stored bytes into a native `u32`.
    fn to_u32(self) -> u32 {
        let Self(bytes) = self;
        u32::from_le_bytes(bytes)
    }
}

/// Formats as the decoded integer rather than as the raw byte array.
impl fmt::Debug for U32Le {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let value = self.to_u32();
        <u32 as fmt::Debug>::fmt(&value, f)
    }
}

/// Cursor over a byte slice; the wrapped slice is whatever has not been consumed yet.
struct ByteStream<'a>(&'a [u8]);

impl<'a> ByteStream<'a> {
    /// Removes the next `n` bytes from the front of the stream and returns them.
    ///
    /// # Errors
    ///
    /// Returns `UnicodeDataError::InsufficientBytes` (leaving the stream untouched) if fewer
    /// than `n` bytes remain.
    fn consume(&mut self, n: usize) -> Result<&'a [u8], UnicodeDataError> {
        let remaining = self.0;
        if remaining.len() < n {
            return Err(UnicodeDataError::InsufficientBytes);
        }

        let (consumed, rest) = remaining.split_at(n);
        self.0 = rest;
        Ok(consumed)
    }

    /// Consumes a 4-byte little-endian unsigned integer and returns it as a `usize`.
    ///
    /// # Errors
    ///
    /// Returns `UnicodeDataError::InsufficientBytes` if fewer than 4 bytes remain, or
    /// `UnicodeDataError::OutOfBounds` if the value does not fit in a `usize`.
    fn consume_4_byte_len(&mut self) -> Result<usize, UnicodeDataError> {
        let raw = self.consume(4)?;

        // `consume(4)` only succeeds with exactly four bytes, so this conversion cannot fail.
        let bytes = <[u8; 4]>::try_from(raw).unwrap();
        let len = u32::from_le_bytes(bytes);

        usize::try_from(len).map_err(|_| UnicodeDataError::OutOfBounds)
    }

    /// Succeeds only if every byte of the stream has been consumed.
    ///
    /// # Errors
    ///
    /// Returns `UnicodeDataError::LeftoverBytes` if any bytes remain.
    fn check_empty(&self) -> Result<(), UnicodeDataError> {
        if self.0.is_empty() {
            Ok(())
        } else {
            Err(UnicodeDataError::LeftoverBytes)
        }
    }
}

/// Reasons why a serialised Unicode data blob can be rejected while parsing.
#[derive(Debug)]
pub enum UnicodeDataError {
    /// The data does not begin with the expected magic number.
    InvalidHeader,
    /// The data ended before a header field or table could be read in full.
    InsufficientBytes,
    /// A length read from the header does not fit in a `usize`.
    OutOfBounds,
    /// Bytes remain after the last table.
    LeftoverBytes,
    /// A table's byte length is not a whole multiple of its entry size.
    InvalidTableSize,
}

impl fmt::Display for UnicodeDataError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match *self {
            Self::InvalidHeader => "invalid header",
            Self::InsufficientBytes => "fewer bytes than expected",
            Self::OutOfBounds => "index out of bounds",
            Self::LeftoverBytes => "unexpected bytes found after expected end of data",
            Self::InvalidTableSize => "invalid table size",
        };

        f.write_str(message)
    }
}