Skip to content

Commit

Permalink
Context 0.1: MultiLookupHuffmanTable
Browse files Browse the repository at this point in the history
  • Loading branch information
Yoric committed Sep 17, 2019
1 parent 7e8d52f commit 9d85128
Show file tree
Hide file tree
Showing 2 changed files with 441 additions and 68 deletions.
194 changes: 158 additions & 36 deletions crates/binjs_io/src/context/huffman/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,20 +82,23 @@ impl BitSequence {
pub fn new(bits: u32, bit_len: BitLen) -> Self {
Self { bits, bit_len }
}

/// The raw bits stored in this sequence, exactly as held in the `bits` field.
pub fn bits(&self) -> u32 {
self.bits
}

/// The number of bits of `bits` to use.
///
/// Returns the `bit_len` field as stored in this sequence.
pub fn bit_len(&self) -> BitLen {
self.bit_len
}

/// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
/// bits.
///
/// # Failure
///
/// This function panics if `bit_len > self.bit_len`.
pub fn split(&self, bit_len: BitLen) -> (u32, u32) {
pub fn split_bits(&self, bit_len: BitLen) -> (u32, u32) {
let shift = self.bit_len - bit_len;
match shift.into() {
0u8 => (self.bits, 0), // Special case: cannot >> 32
Expand All @@ -106,6 +109,25 @@ impl BitSequence {
),
}
}

/// Cut this sequence in two: the first `bit_len` bits become the prefix and the
/// remaining `self.bit_len - bit_len` bits become the suffix.
///
/// Unlike `split_bits`, both halves are returned as `BitSequence`s that carry
/// their own bit length.
///
/// # Failure
///
/// This function panics if `bit_len > self.bit_len`.
pub fn split(&self, bit_len: BitLen) -> (BitSequence, BitSequence) {
    let (prefix_bits, suffix_bits) = self.split_bits(bit_len);
    let prefix = BitSequence::new(prefix_bits, bit_len);
    let suffix = BitSequence::new(suffix_bits, self.bit_len - bit_len);
    (prefix, suffix)
}

/// Add lowest-weight bits to this bit sequence until it reaches
/// a sufficient bit length.
///
/// Does nothing if the bit sequence already has a sufficient bitlength.
pub fn pad_lowest_to(&self, total_bit_len: BitLen) -> Cow<BitSequence> {
assert!(total_bit_len.0 <= 32u8);
if total_bit_len <= self.bit_len {
Expand All @@ -117,21 +139,29 @@ impl BitSequence {
}
Cow::Owned(BitSequence::new(self.bits << shift, total_bit_len))
}

/// Prepend a sequence of bits to this sequence.
///
/// The bits of `prefix` are placed in the higher-weight positions, immediately
/// above the bits of `self`. The resulting length is the sum of both lengths.
///
/// # Failure
///
/// This function panics if the combined bit length exceeds 32.
pub fn with_prefix(&self, prefix: &BitSequence) -> Self {
    assert!((prefix.bit_len() + self.bit_len()).as_u8() <= 32);
    let bit_len = self.bit_len + prefix.bit_len;
    // Special case: cannot << 32. If `self` already fills all 32 bits, the
    // assertion above guarantees that `prefix` is empty, so it contributes no bits.
    let prefix_bits = prefix
        .bits()
        .checked_shl(u32::from(self.bit_len.as_u8()))
        .unwrap_or(0);
    BitSequence::new(self.bits | prefix_bits, bit_len)
}
}

#[test]
fn test_bit_sequence_split() {
let bits = 0b11111111_11111111_00000000_00000000;
let key = BitSequence::new(bits, BitLen(32));
assert_eq!(key.split(BitLen(0)), (0, bits));
assert_eq!(key.split(BitLen(32)), (bits, 0));
assert_eq!(key.split(BitLen(16)), (0b11111111_11111111, 0));
assert_eq!(key.split_bits(BitLen(0)), (0, bits));
assert_eq!(key.split_bits(BitLen(32)), (bits, 0));
assert_eq!(key.split_bits(BitLen(16)), (0b11111111_11111111, 0));

let bits = 0b00000000_00000000_00000000_11111111;
let key = BitSequence::new(bits, BitLen(16));
assert_eq!(key.split(BitLen(0)), (0, bits));
assert_eq!(key.split(BitLen(16)), (bits, 0));
assert_eq!(key.split(BitLen(8)), (0, 0b11111111));
assert_eq!(key.split_bits(BitLen(0)), (0, bits));
assert_eq!(key.split_bits(BitLen(16)), (bits, 0));
assert_eq!(key.split_bits(BitLen(8)), (0, 0b11111111));
}

/// A Huffman key
Expand Down Expand Up @@ -159,6 +189,10 @@ impl Key {
Key(BitSequence { bits, bit_len })
}

pub fn from_bit_sequence(sequence: BitSequence) -> Self {
Self::new(sequence.bits, sequence.bit_len)
}

/// The bits in this Key.
///
/// # Invariant
Expand All @@ -176,6 +210,11 @@ impl Key {
pub fn as_bit_sequence(&self) -> &BitSequence {
&self.0
}

/// Return a new `Key` made of `prefix` followed by the bits of this key.
///
/// Delegates to `BitSequence::with_prefix` on the wrapped sequence.
pub fn with_prefix(&self, prefix: &BitSequence) -> Self {
    Key::from_bit_sequence(self.0.with_prefix(prefix))
}
}

/// A node in the Huffman tree.
Expand Down Expand Up @@ -219,43 +258,46 @@ impl<T> PartialEq for Node<T> {
}
impl<T> Eq for Node<T> {}

/// Keys associated to a sequence of values.
/// Codebook associated to a sequence of values.
#[derive(Clone, Debug)]
pub struct Keys<T> {
/// The longest bit length that actually appears in `keys`.
pub struct Codebook<T> {
/// The longest bit length that actually appears in `mappings`.
highest_bit_len: BitLen,

/// The sequence of keys.
///
/// Order is meaningful.
keys: Vec<(T, Key)>,
mappings: Vec<(T, Key)>,
}

impl<T> Keys<T> {
impl<T> Codebook<T> {
/// The number of elements in this Codebook.
pub fn len(&self) -> usize {
self.keys.len()
self.mappings.len()
}

/// The longest bit length that actually appears in this Codebook.
pub fn highest_bit_len(&self) -> BitLen {
self.highest_bit_len
}
}

impl<T> IntoIterator for Keys<T> {
impl<T> IntoIterator for Codebook<T> {
type Item = (T, Key);
type IntoIter = std::vec::IntoIter<(T, Key)>;
fn into_iter(self) -> Self::IntoIter {
self.keys.into_iter()
self.mappings.into_iter()
}
}

impl<T> Keys<T>
impl<T> Codebook<T>
where
T: Ord + Clone,
{
/// Compute a `Keys` from a sequence of values.
/// Compute a `Codebook` from a sequence of values.
///
/// Optionally, `max_bit_len` may specify a largest acceptable bit length.
/// If `Keys` may not be computed without exceeding this bit length,
/// If the `Codebook` may not be computed without exceeding this bit length,
/// fail with `Err(problematic_bit_len)`.
///
/// The current implementation only attempts to produce the best compression
Expand All @@ -278,11 +320,11 @@ where
let counter = map.entry(item).or_insert(0.into());
*counter += 1.into();
}
// Then compute the `Keys`.
// Then compute the `Codebook`.
Self::from_instances(map, max_bit_len)
}

/// Compute a `Keys` from a sequence of values
/// Compute a `Codebook` from a sequence of values
/// with a number of instances already attached.
///
/// The current implementation only attempts to produce the best compression
Expand All @@ -305,27 +347,27 @@ where

// The bits associated to the next value.
let mut bits = 0;
let mut keys = Vec::with_capacity(bit_lengths.len());
let mut mappings = Vec::with_capacity(bit_lengths.len());

for i in 0..bit_lengths.len() - 1 {
let (bit_len, symbol, next_bit_len) = (
bit_lengths[i].1,
bit_lengths[i].0.clone(),
bit_lengths[i + 1].1,
);
keys.push((symbol.clone(), Key::new(bits, bit_len)));
mappings.push((symbol.clone(), Key::new(bits, bit_len)));
bits = (bits + 1) << (next_bit_len - bit_len);
if bit_len > highest_bit_len {
highest_bit_len = bit_len;
}
}
// Handle the last element.
let (ref symbol, bit_len) = bit_lengths[bit_lengths.len() - 1];
keys.push((symbol.clone(), Key::new(bits, bit_len)));
mappings.push((symbol.clone(), Key::new(bits, bit_len)));

return Ok(Self {
highest_bit_len,
keys,
mappings,
});
}

Expand Down Expand Up @@ -412,26 +454,106 @@ where
#[test]
fn test_coded_from_sequence() {
let sample = "appl";
let coded = Keys::from_sequence(sample.chars(), std::u8::MAX).unwrap();
let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();

// Symbol 'p' appears twice, we should see 3 codes.
assert_eq!(coded.keys.len(), 3);
assert_eq!(coded.mappings.len(), 3);

// Check order of symbols.
assert_eq!(coded.keys[0].0, 'p');
assert_eq!(coded.keys[1].0, 'a');
assert_eq!(coded.keys[2].0, 'l');
assert_eq!(coded.mappings[0].0, 'p');
assert_eq!(coded.mappings[1].0, 'a');
assert_eq!(coded.mappings[2].0, 'l');

// Check bit length of symbols.
assert_eq!(coded.keys[0].1.bit_len(), 1.into());
assert_eq!(coded.keys[1].1.bit_len(), 2.into());
assert_eq!(coded.keys[2].1.bit_len(), 2.into());
assert_eq!(coded.mappings[0].1.bit_len(), 1.into());
assert_eq!(coded.mappings[1].1.bit_len(), 2.into());
assert_eq!(coded.mappings[2].1.bit_len(), 2.into());

// Check code of symbols.
assert_eq!(coded.keys[0].1.bits(), 0b00);
assert_eq!(coded.keys[1].1.bits(), 0b10);
assert_eq!(coded.keys[2].1.bits(), 0b11);
assert_eq!(coded.mappings[0].1.bits(), 0b00);
assert_eq!(coded.mappings[1].1.bits(), 0b10);
assert_eq!(coded.mappings[2].1.bits(), 0b11);

// Let's try again with a limit to 1 bit paths.
assert_eq!(Keys::from_sequence(sample.chars(), 1).unwrap_err(), 2);
assert_eq!(Codebook::from_sequence(sample.chars(), 1).unwrap_err(), 2);
}

impl<T> Codebook<T> {
    /// Consume the Codebook and return its `(value, key)` mappings, in order.
    pub fn mappings(self) -> Vec<(T, Key)> {
        self.mappings
    }

    /// Split a Codebook into several Codebooks grouped by a common prefix.
    ///
    /// The result always contains `2^prefix_len` Codebooks. The Codebook at index `i`
    /// holds every key whose first `prefix_len` bits are `i`, with that prefix removed.
    /// Mappings keep their relative order within each bucket.
    ///
    /// For instance, if `prefix_len` is 2, the result will be a vector of size 2^2
    /// containing:
    ///
    /// - at index 0 (= 0b00), all the keys starting with 0b00, minus the prefix 0b00;
    /// - at index 1 (= 0b01), all the keys starting with 0b01, minus the prefix 0b01;
    /// - at index 2 (= 0b10), all the keys starting with 0b10, minus the prefix 0b10;
    /// - at index 3 (= 0b11), all the keys starting with 0b11, minus the prefix 0b11.
    ///
    /// # Panics
    ///
    /// Panics if `prefix_len` is not strictly smaller than `highest_bit_len()`.
    ///
    /// Each key is cut with `BitSequence::split`, which is documented to panic when
    /// the requested prefix is longer than the sequence. A key shorter than
    /// `prefix_len` therefore also causes a panic.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// let sample = "appl";
    /// let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();
    /// // 0b0  => p
    /// // 0b10 => a
    /// // 0b11 => l
    ///
    /// let mut buckets = coded.bucket_by_prefix(BitLen(1)).into_iter();
    ///
    /// // The first bucket contains the keys that start with `0`.
    /// let bucket = buckets.next().unwrap().mappings();
    /// assert_eq!(bucket.len(), 1);
    /// assert_eq!(bucket[0].0, 'p');
    /// assert_eq!(bucket[0].1, Key::new(0, BitLen(0))); // Key was 0b0, now empty.
    ///
    /// // The second bucket contains the keys that start with `1`.
    /// let bucket = buckets.next().unwrap().mappings();
    /// assert_eq!(bucket.len(), 2);
    /// assert_eq!(bucket[0].0, 'a');
    /// assert_eq!(bucket[0].1, Key::new(0, BitLen(1))); // Key was 0b10, now 0b0.
    /// assert_eq!(bucket[1].0, 'l');
    /// assert_eq!(bucket[1].1, Key::new(1, BitLen(1))); // Key was 0b11, now 0b1.
    ///
    /// // A 1-bit prefix yields exactly 2 buckets.
    /// assert!(buckets.next().is_none());
    /// ```
    pub fn bucket_by_prefix(self, prefix_len: BitLen) -> Vec<Codebook<T>> {
        assert!(prefix_len < self.highest_bit_len);

        // Prepare one empty bucket per possible prefix value.
        let bucket_count = 1usize << prefix_len;
        let mut result = Vec::with_capacity(bucket_count);
        result.resize_with(bucket_count, || Codebook {
            highest_bit_len: 0.into(),
            mappings: vec![],
        });

        // Dispatch each (value, key) to the bucket indexed by the key's prefix.
        for (value, key) in self {
            let (prefix, suffix) = key.as_bit_sequence().split(prefix_len);
            let bucket = &mut result[prefix.bits() as usize];
            // Keep each bucket's `highest_bit_len` in sync with the suffixes it holds.
            if suffix.bit_len() > bucket.highest_bit_len {
                bucket.highest_bit_len = suffix.bit_len();
            }
            bucket
                .mappings
                .push((value, Key::from_bit_sequence(suffix)));
        }

        result
    }

    /// Convert every value of this Codebook with `f`.
    ///
    /// Keys, the order of the mappings and `highest_bit_len` are carried over
    /// unchanged. `f` is called once per mapping, in order.
    pub fn map<F, U>(self, mut f: F) -> Codebook<U>
    where
        F: FnMut(T) -> U,
    {
        Codebook {
            highest_bit_len: self.highest_bit_len,
            mappings: self
                .mappings
                .into_iter()
                .map(|(value, key)| (f(value), key))
                .collect(),
        }
    }
}
Loading

0 comments on commit 9d85128

Please sign in to comment.