From 945b0d025fae3819f02dc1076fb0e7270199d143 Mon Sep 17 00:00:00 2001
From: Zoltan Szabadka <szabadka@google.com>
Date: Thu, 7 May 2015 17:23:07 +0200
Subject: [PATCH] Use a static context map with two buckets for UTF8 data.

Enabled for quality >= 4, and if there are no obvious
UTF8 violations detected.
For each block, we gather two separate histograms, one
for continuation bytes and one for ASCII or lead bytes.
---
 enc/encode.cc    |  74 +++++++++++++-
 enc/metablock.cc | 244 +++++++++++++++++++++++++++++++++++++++++++++++
 enc/metablock.h  |  18 ++++
 3 files changed, 331 insertions(+), 5 deletions(-)

diff --git a/enc/encode.cc b/enc/encode.cc
index b115786..b064f39 100644
--- a/enc/encode.cc
+++ b/enc/encode.cc
@@ -351,6 +351,53 @@ bool BrotliCompressor::WriteBrotliData(const bool is_last,
   return WriteMetaBlockInternal(is_last, utf8_mode, out_size, output);
 }
 
+// Selects CONTEXT_UTF8 plus a static two-bucket literal context map when the
+// input looks like UTF8; in every other case the outputs are left untouched.
+void DecideOverLiteralContextModeling(const uint8_t* input,
+                                      size_t start_pos,
+                                      size_t length,
+                                      size_t mask,
+                                      int quality,
+                                      int* literal_context_mode,
+                                      int* num_literal_contexts,
+                                      const int** literal_context_map) {
+  if (quality <= 3 || length < 64) {
+    return;
+  }
+  // Cheap UTF8 heuristic: look for the two obvious violations, a continuation
+  // byte right after an ASCII byte, or an ASCII or lead byte right after a
+  // lead byte. Any violation means "not UTF8". To keep this fast only a 64
+  // byte stride is examined at each 4kB interval; if none of them contains a
+  // violation the whole input is assumed to be UTF8.
+  const size_t end_pos = start_pos + length;
+  // "<=" so that a block of exactly 64 bytes still has its one stride checked.
+  for (; start_pos + 64 <= end_pos; start_pos += 4096) {
+    const size_t stride_end_pos = start_pos + 64;
+    uint8_t prev = input[start_pos & mask];
+    for (size_t pos = start_pos + 1; pos < stride_end_pos; ++pos) {
+      const uint8_t literal = input[pos & mask];
+      if ((prev < 128 && (literal & 0xc0) == 0x80) ||
+          (prev >= 192 && (literal & 0xc0) != 0x80)) {
+        return;
+      }
+      prev = literal;
+    }
+  }
+  *literal_context_mode = CONTEXT_UTF8;
+  // For UTF8 data the context value derived from the last two bytes is 2 or 3
+  // if and only if the next byte is a continuation byte (see table in
+  // context.h), so bucket 1 collects continuation bytes and bucket 0 the rest.
+  static const int kStaticContextMap[64] = {
+    0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  };
+  static const int kNumLiteralContexts = 2;
+  *num_literal_contexts = kNumLiteralContexts;
+  *literal_context_map = kStaticContextMap;
+}
+
 bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
                                               const bool utf8_mode,
                                               size_t* out_size,
@@ -406,8 +453,6 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
                                 num_direct_distance_codes,
                                 distance_postfix_bits);
     }
-    int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
-    MetaBlockSplit mb;
     if (params_.quality == 1) {
       if (!StoreMetaBlockTrivial(data, last_flush_pos_, bytes, mask, is_last,
                                  commands_.get(), num_commands_,
@@ -416,10 +461,29 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
         return false;
       }
     } else {
+      MetaBlockSplit mb;
+      int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
       if (params_.greedy_block_split) {
-        BuildMetaBlockGreedy(data, last_flush_pos_, mask,
-                             commands_.get(), num_commands_,
-                             &mb);
+        int num_literal_contexts = 1;
+        const int* literal_context_map = NULL;
+        DecideOverLiteralContextModeling(data, last_flush_pos_, bytes, mask,
+                                         params_.quality,
+                                         &literal_context_mode,
+                                         &num_literal_contexts,
+                                         &literal_context_map);
+        if (literal_context_map == NULL) {
+          BuildMetaBlockGreedy(data, last_flush_pos_, mask,
+                               commands_.get(), num_commands_,
+                               &mb);
+        } else {
+          BuildMetaBlockGreedyWithContexts(data, last_flush_pos_, mask,
+                                           prev_byte_, prev_byte2_,
+                                           literal_context_mode,
+                                           num_literal_contexts,
+                                           literal_context_map,
+                                           commands_.get(), num_commands_,
+                                           &mb);
+        }
       } else {
         BuildMetaBlock(data, last_flush_pos_, mask,
                        prev_byte_, prev_byte2_,
diff --git a/enc/metablock.cc b/enc/metablock.cc
index 19f4d4a..44f8267 100644
--- a/enc/metablock.cc
+++ b/enc/metablock.cc
@@ -18,6 +18,7 @@
 #include "./metablock.h"
 
 #include "./block_splitter.h"
+#include "./context.h"
 #include "./cluster.h"
 #include "./histogram.h"
 
@@ -297,6 +298,249 @@ void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
   dist_blocks.FinishBlock(/* is_final = */ true);
 }
 
+// Greedy block splitter for one block category (literal, command or distance)
+// that gathers a separate histogram per context bucket for every block type.
+template<typename HistogramType>
+class ContextBlockSplitter {
+ public:
+  ContextBlockSplitter(int alphabet_size,
+                       int num_contexts,
+                       int min_block_size,
+                       double split_threshold,
+                       int num_symbols,
+                       BlockSplit* split,
+                       std::vector<HistogramType>* histograms)
+      : alphabet_size_(alphabet_size),
+        num_contexts_(num_contexts),
+        max_block_types_(kMaxBlockTypes / num_contexts),
+        min_block_size_(min_block_size),
+        split_threshold_(split_threshold),
+        num_blocks_(0),
+        split_(split),
+        histograms_(histograms),
+        target_block_size_(min_block_size),
+        block_size_(0),
+        curr_histogram_ix_(0),
+        last_entropy_(2 * num_contexts),
+        merge_last_count_(0) {
+    int max_num_blocks = num_symbols / min_block_size + 1;
+    // We have to allocate one more histogram set than the maximum number of
+    // block types for the current histograms when the meta-block is too big.
+    int max_num_types = std::min(max_num_blocks, max_block_types_ + 1);
+    split_->lengths.resize(max_num_blocks);
+    split_->types.resize(max_num_blocks);
+    histograms_->resize(max_num_types * num_contexts);
+    last_histogram_ix_[0] = last_histogram_ix_[1] = 0;
+  }
+
+  // Adds the next symbol to the histogram of the given context bucket of the
+  // current block. At the target block size, decides on merging the block.
+  void AddSymbol(int symbol, int context) {
+    (*histograms_)[curr_histogram_ix_ + context].Add(symbol);
+    ++block_size_;
+    if (block_size_ == target_block_size_) {
+      FinishBlock(/* is_final = */ false);
+    }
+  }
+
+  // Does one of three things:
+  //   (1) emits the current block with a new block type;
+  //   (2) emits the current block with the type of the second last block;
+  //   (3) merges the current block with the last block.
+  void FinishBlock(bool is_final) {
+    if (block_size_ < min_block_size_) {
+      block_size_ = min_block_size_;  // NOTE(review): pads length; confirm.
+    }
+    if (num_blocks_ == 0) {
+      // Create first block.
+      split_->lengths[0] = block_size_;
+      split_->types[0] = 0;
+      for (int i = 0; i < num_contexts_; ++i) {
+        last_entropy_[i] =
+            BitsEntropy(&(*histograms_)[i].data_[0], alphabet_size_);
+        last_entropy_[num_contexts_ + i] = last_entropy_[i];
+      }
+      ++num_blocks_;
+      ++split_->num_types;
+      curr_histogram_ix_ += num_contexts_;
+      block_size_ = 0;
+    } else if (block_size_ > 0) {
+      // Try merging the set of histograms for the current block type with the
+      // respective set of histograms for the last and second last block types.
+      // Decide over the split based on the total change of entropy across
+      // all contexts.
+      std::vector<double> entropy(num_contexts_);
+      std::vector<HistogramType> combined_histo(2 * num_contexts_);
+      std::vector<double> combined_entropy(2 * num_contexts_);
+      double diff[2] = { 0.0 };  // Summed merge cost vs. last two types.
+      for (int i = 0; i < num_contexts_; ++i) {
+        int curr_histo_ix = curr_histogram_ix_ + i;
+        entropy[i] = BitsEntropy(&(*histograms_)[curr_histo_ix].data_[0],
+                                 alphabet_size_);
+        for (int j = 0; j < 2; ++j) {
+          int jx = j * num_contexts_ + i;
+          int last_histogram_ix = last_histogram_ix_[j] + i;
+          combined_histo[jx] = (*histograms_)[curr_histo_ix];
+          combined_histo[jx].AddHistogram((*histograms_)[last_histogram_ix]);
+          combined_entropy[jx] = BitsEntropy(
+              &combined_histo[jx].data_[0], alphabet_size_);
+          diff[j] += combined_entropy[jx] - entropy[i] - last_entropy_[jx];
+        }
+      }
+
+      if (split_->num_types < max_block_types_ &&
+          diff[0] > split_threshold_ &&
+          diff[1] > split_threshold_) {
+        // Create new block.
+        split_->lengths[num_blocks_] = block_size_;
+        split_->types[num_blocks_] = split_->num_types;
+        last_histogram_ix_[1] = last_histogram_ix_[0];
+        last_histogram_ix_[0] = split_->num_types * num_contexts_;
+        for (int i = 0; i < num_contexts_; ++i) {
+          last_entropy_[num_contexts_ + i] = last_entropy_[i];
+          last_entropy_[i] = entropy[i];
+        }
+        ++num_blocks_;
+        ++split_->num_types;
+        curr_histogram_ix_ += num_contexts_;
+        block_size_ = 0;
+        merge_last_count_ = 0;
+        target_block_size_ = min_block_size_;
+      } else if (diff[1] < diff[0] - 20.0) {  // Second last must win by > 20.
+        // Combine this block with second last block.
+        split_->lengths[num_blocks_] = block_size_;
+        split_->types[num_blocks_] = split_->types[num_blocks_ - 2];
+        std::swap(last_histogram_ix_[0], last_histogram_ix_[1]);
+        for (int i = 0; i < num_contexts_; ++i) {
+          (*histograms_)[last_histogram_ix_[0] + i] =
+              combined_histo[num_contexts_ + i];
+          last_entropy_[num_contexts_ + i] = last_entropy_[i];
+          last_entropy_[i] = combined_entropy[num_contexts_ + i];
+          (*histograms_)[curr_histogram_ix_ + i].Clear();
+        }
+        ++num_blocks_;
+        block_size_ = 0;
+        merge_last_count_ = 0;
+        target_block_size_ = min_block_size_;
+      } else {
+        // Combine this block with last block.
+        split_->lengths[num_blocks_ - 1] += block_size_;
+        for (int i = 0; i < num_contexts_; ++i) {
+          (*histograms_)[last_histogram_ix_[0] + i] = combined_histo[i];
+          last_entropy_[i] = combined_entropy[i];
+          if (split_->num_types == 1) {
+            last_entropy_[num_contexts_ + i] = last_entropy_[i];
+          }
+          (*histograms_)[curr_histogram_ix_ + i].Clear();
+        }
+        block_size_ = 0;
+        if (++merge_last_count_ > 1) {  // Repeated merges: grow the target.
+          target_block_size_ += min_block_size_;
+        }
+      }
+    }
+    if (is_final) {  // Trim the vectors to what was actually used.
+      (*histograms_).resize(split_->num_types * num_contexts_);
+      split_->types.resize(num_blocks_);
+      split_->lengths.resize(num_blocks_);
+    }
+  }
+
+ private:
+  static const int kMaxBlockTypes = 256;
+
+  // Alphabet size of particular block category.
+  const int alphabet_size_;
+  const int num_contexts_;
+  const int max_block_types_;
+  // We collect at least this many symbols for each block.
+  const int min_block_size_;
+  // We merge histograms A and B if
+  //   entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
+  // where A is the current histogram and B is the histogram of the last or the
+  // second last block type.
+  const double split_threshold_;
+
+  int num_blocks_;
+  BlockSplit* split_;  // not owned
+  std::vector<HistogramType>* histograms_;  // not owned
+
+  // The number of symbols that we want to collect before deciding on whether
+  // or not to merge the block with a previous one or emit a new block.
+  int target_block_size_;
+  // The number of symbols in the current block, summed over all contexts.
+  int block_size_;
+  // Offset of the first histogram of the current block type.
+  int curr_histogram_ix_;
+  // Offset of the histograms of the previous two block types.
+  int last_histogram_ix_[2];
+  // Entropy per context of the last type, then of the second last type.
+  std::vector<double> last_entropy_;
+  // The number of times we merged the current block with the last one.
+  int merge_last_count_;
+};
+
+// Greedy meta-block builder: splits literals with a histogram per static
+// context bucket and fills mb->literal_context_map from static_context_map.
+void BuildMetaBlockGreedyWithContexts(const uint8_t* ringbuffer,
+                                      size_t pos,
+                                      size_t mask,
+                                      uint8_t prev_byte,
+                                      uint8_t prev_byte2,
+                                      int literal_context_mode,
+                                      int num_contexts,
+                                      const int* static_context_map,
+                                      const Command *commands,
+                                      size_t n_commands,
+                                      MetaBlockSplit* mb) {
+  int num_literals = 0;
+  for (size_t i = 0; i < n_commands; ++i) {
+    num_literals += commands[i].insert_len_;
+  }
+  ContextBlockSplitter<HistogramLiteral> lit_blocks(
+      256, num_contexts, 512, 400.0, num_literals,
+      &mb->literal_split, &mb->literal_histograms);
+  BlockSplitter<HistogramCommand> cmd_blocks(
+      kNumCommandPrefixes, 1024, 500.0, n_commands,
+      &mb->command_split, &mb->command_histograms);
+  BlockSplitter<HistogramDistance> dist_blocks(
+      64, 512, 100.0, n_commands,
+      &mb->distance_split, &mb->distance_histograms);
+
+  for (size_t i = 0; i < n_commands; ++i) {
+    const Command& cmd = commands[i];  // Read-only: no per-command copy.
+    cmd_blocks.AddSymbol(cmd.cmd_prefix_);
+    for (int j = 0; j < cmd.insert_len_; ++j) {
+      int context = Context(prev_byte, prev_byte2, literal_context_mode);
+      uint8_t literal = ringbuffer[pos & mask];
+      lit_blocks.AddSymbol(literal, static_context_map[context]);
+      prev_byte2 = prev_byte;
+      prev_byte = literal;
+      ++pos;
+    }
+    pos += cmd.copy_len_;
+    if (cmd.copy_len_ > 0) {  // Refresh context bytes from the ring buffer.
+      prev_byte2 = ringbuffer[(pos - 2) & mask];
+      prev_byte = ringbuffer[(pos - 1) & mask];
+      if (cmd.cmd_prefix_ >= 128) {
+        dist_blocks.AddSymbol(cmd.dist_prefix_);
+      }
+    }
+  }
+  lit_blocks.FinishBlock(/* is_final = */ true);
+  cmd_blocks.FinishBlock(/* is_final = */ true);
+  dist_blocks.FinishBlock(/* is_final = */ true);
+  // Block type i maps context j to histogram i * num_contexts + bucket(j).
+  mb->literal_context_map.resize(
+      mb->literal_split.num_types << kLiteralContextBits);
+  for (int i = 0; i < mb->literal_split.num_types; ++i) {
+    for (int j = 0; j < (1 << kLiteralContextBits); ++j) {
+      mb->literal_context_map[(i << kLiteralContextBits) + j] =
+          i * num_contexts + static_context_map[j];
+    }
+  }
+}
+
 void OptimizeHistograms(int num_direct_distance_codes,
                         int distance_postfix_bits,
                         MetaBlockSplit* mb) {
diff --git a/enc/metablock.h b/enc/metablock.h
index 54a9138..1946436 100644
--- a/enc/metablock.h
+++ b/enc/metablock.h
@@ -44,6 +44,7 @@ struct MetaBlockSplit {
   std::vector<HistogramDistance> distance_histograms;
 };
 
+// Uses the slow shortest-path block splitter and does context clustering.
 void BuildMetaBlock(const uint8_t* ringbuffer,
                     const size_t pos,
                     const size_t mask,
@@ -55,6 +56,8 @@ void BuildMetaBlock(const uint8_t* ringbuffer,
                     bool enable_context_modleing,
                     MetaBlockSplit* mb);
 
+// Uses a fast greedy block splitter that tries to merge current block with the
+// last or the second last block and does not do any context modeling.
 void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
                           size_t pos,
                           size_t mask,
@@ -62,6 +65,21 @@ void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
                           size_t n_commands,
                           MetaBlockSplit* mb);
 
+// Uses a fast greedy block splitter that tries to merge current block with the
+// last or the second last block and uses a static context clustering which
+// is the same for all block types.
+void BuildMetaBlockGreedyWithContexts(const uint8_t* ringbuffer,
+                                      size_t pos,
+                                      size_t mask,
+                                      uint8_t prev_byte,
+                                      uint8_t prev_byte2,
+                                      int literal_context_mode,
+                                      int num_contexts,
+                                      const int* static_context_map,
+                                      const Command *commands,
+                                      size_t n_commands,
+                                      MetaBlockSplit* mb);
+
 void OptimizeHistograms(int num_direct_distance_codes,
                         int distance_postfix_bits,
                         MetaBlockSplit* mb);