mirror of https://github.com/google/brotli
Merge pull request #96 from szabadka/master
Use a static context map with two buckets for UTF8 data.
This commit is contained in:
commit
e4a309ac07
|
@ -351,6 +351,53 @@ bool BrotliCompressor::WriteBrotliData(const bool is_last,
|
|||
return WriteMetaBlockInternal(is_last, utf8_mode, out_size, output);
|
||||
}
|
||||
|
||||
// Inspects a sample of the input and, if it looks like valid UTF8, switches
// literal context modeling to CONTEXT_UTF8 with a two-bucket static context
// map. On low quality settings, short inputs, or when a UTF8 violation is
// found, the output parameters are left untouched.
void DecideOverLiteralContextModeling(const uint8_t* input,
                                      size_t start_pos,
                                      size_t length,
                                      size_t mask,
                                      int quality,
                                      int* literal_context_mode,
                                      int* num_literal_contexts,
                                      const int** literal_context_map) {
  if (quality <= 3 || length < 64) {
    return;
  }
  // Simple heuristics to guess if the data is UTF8 or not. The goal is to
  // recognize non-UTF8 data quickly by searching for the following obvious
  // violations: a continuation byte following an ASCII byte or an ASCII or
  // lead byte following a lead byte. If we find such violation we decide that
  // the data is not UTF8. To make the analysis of UTF8 data faster we only
  // examine 64 byte long strides at every 4kB intervals, if there are no
  // violations found, we assume the whole data is UTF8.
  const size_t end_pos = start_pos + length;
  while (start_pos + 64 < end_pos) {
    const size_t stride_end = start_pos + 64;
    uint8_t prev = input[start_pos & mask];
    for (size_t pos = start_pos + 1; pos < stride_end; ++pos) {
      const uint8_t byte = input[pos & mask];
      const bool is_continuation = (byte & 0xc0) == 0x80;
      const bool violation = (prev < 128 && is_continuation) ||
                             (prev >= 192 && !is_continuation);
      if (violation) {
        return;  // Not UTF8; keep the caller's defaults.
      }
      prev = byte;
    }
    start_pos += 4096;  // Skip ahead to the next sampled stride.
  }
  *literal_context_mode = CONTEXT_UTF8;
  // If the data is UTF8, this static context map distinguishes between ASCII
  // or lead bytes and continuation bytes: the UTF8 context value based on the
  // last two bytes is 2 or 3 if and only if the next byte is a continuation
  // byte (see table in context.h).
  static const int kStaticContextMap[64] = {
    0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  *num_literal_contexts = 2;
  *literal_context_map = kStaticContextMap;
}
|
||||
|
||||
bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
|
||||
const bool utf8_mode,
|
||||
size_t* out_size,
|
||||
|
@ -406,8 +453,6 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
|
|||
num_direct_distance_codes,
|
||||
distance_postfix_bits);
|
||||
}
|
||||
int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
|
||||
MetaBlockSplit mb;
|
||||
if (params_.quality == 1) {
|
||||
if (!StoreMetaBlockTrivial(data, last_flush_pos_, bytes, mask, is_last,
|
||||
commands_.get(), num_commands_,
|
||||
|
@ -416,10 +461,29 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
|
|||
return false;
|
||||
}
|
||||
} else {
|
||||
MetaBlockSplit mb;
|
||||
int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
|
||||
if (params_.greedy_block_split) {
|
||||
BuildMetaBlockGreedy(data, last_flush_pos_, mask,
|
||||
commands_.get(), num_commands_,
|
||||
&mb);
|
||||
int num_literal_contexts = 1;
|
||||
const int* literal_context_map = NULL;
|
||||
DecideOverLiteralContextModeling(data, last_flush_pos_, bytes, mask,
|
||||
params_.quality,
|
||||
&literal_context_mode,
|
||||
&num_literal_contexts,
|
||||
&literal_context_map);
|
||||
if (literal_context_map == NULL) {
|
||||
BuildMetaBlockGreedy(data, last_flush_pos_, mask,
|
||||
commands_.get(), num_commands_,
|
||||
&mb);
|
||||
} else {
|
||||
BuildMetaBlockGreedyWithContexts(data, last_flush_pos_, mask,
|
||||
prev_byte_, prev_byte2_,
|
||||
literal_context_mode,
|
||||
num_literal_contexts,
|
||||
literal_context_map,
|
||||
commands_.get(), num_commands_,
|
||||
&mb);
|
||||
}
|
||||
} else {
|
||||
BuildMetaBlock(data, last_flush_pos_, mask,
|
||||
prev_byte_, prev_byte2_,
|
||||
|
|
244
enc/metablock.cc
244
enc/metablock.cc
|
@ -18,6 +18,7 @@
|
|||
#include "./metablock.h"
|
||||
|
||||
#include "./block_splitter.h"
|
||||
#include "./context.h"
|
||||
#include "./cluster.h"
|
||||
#include "./histogram.h"
|
||||
|
||||
|
@ -297,6 +298,249 @@ void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
|
|||
dist_blocks.FinishBlock(/* is_final = */ true);
|
||||
}
|
||||
|
||||
// Greedy block splitter for one block category (literal, command or distance).
// Gathers histograms for all context buckets.
//
// Unlike the plain BlockSplitter, each block type here owns num_contexts
// histograms (one per context bucket), and merge/split decisions are made on
// the total entropy change summed across all contexts.
template<typename HistogramType>
class ContextBlockSplitter {
 public:
  // alphabet_size:   symbol alphabet size of this block category.
  // num_contexts:    number of context buckets per block type.
  // min_block_size:  we collect at least this many symbols per block.
  // split_threshold: entropy-reduction threshold for opening a new block type.
  // num_symbols:     total symbols that will be added; used to pre-size the
  //                  output vectors.
  // split, histograms: outputs; not owned by this class.
  ContextBlockSplitter(int alphabet_size,
                       int num_contexts,
                       int min_block_size,
                       double split_threshold,
                       int num_symbols,
                       BlockSplit* split,
                       std::vector<HistogramType>* histograms)
      : alphabet_size_(alphabet_size),
        num_contexts_(num_contexts),
        max_block_types_(kMaxBlockTypes / num_contexts),
        min_block_size_(min_block_size),
        split_threshold_(split_threshold),
        num_blocks_(0),
        split_(split),
        histograms_(histograms),
        target_block_size_(min_block_size),
        block_size_(0),
        curr_histogram_ix_(0),
        last_entropy_(2 * num_contexts),
        merge_last_count_(0) {
    int max_num_blocks = num_symbols / min_block_size + 1;
    // We have to allocate one more histogram than the maximum number of block
    // types for the current histogram when the meta-block is too big.
    int max_num_types = std::min(max_num_blocks, max_block_types_ + 1);
    split_->lengths.resize(max_num_blocks);
    split_->types.resize(max_num_blocks);
    histograms_->resize(max_num_types * num_contexts);
    last_histogram_ix_[0] = last_histogram_ix_[1] = 0;
  }

  // Adds the next symbol to the current block type and context. When the
  // current block reaches the target size, decides on merging the block.
  void AddSymbol(int symbol, int context) {
    (*histograms_)[curr_histogram_ix_ + context].Add(symbol);
    ++block_size_;
    if (block_size_ == target_block_size_) {
      FinishBlock(/* is_final = */ false);
    }
  }

  // Does either of three things:
  // (1) emits the current block with a new block type;
  // (2) emits the current block with the type of the second last block;
  // (3) merges the current block with the last block.
  void FinishBlock(bool is_final) {
    if (block_size_ < min_block_size_) {
      block_size_ = min_block_size_;
    }
    if (num_blocks_ == 0) {
      // Create first block.
      split_->lengths[0] = block_size_;
      split_->types[0] = 0;
      for (int i = 0; i < num_contexts_; ++i) {
        // Seed both "last" and "second last" entropies from the first block,
        // so that the first real merge decision compares against it.
        last_entropy_[i] =
            BitsEntropy(&(*histograms_)[i].data_[0], alphabet_size_);
        last_entropy_[num_contexts_ + i] = last_entropy_[i];
      }
      ++num_blocks_;
      ++split_->num_types;
      curr_histogram_ix_ += num_contexts_;
      block_size_ = 0;
    } else if (block_size_ > 0) {
      // Try merging the set of histograms for the current block type with the
      // respective set of histograms for the last and second last block types.
      // Decide over the split based on the total reduction of entropy across
      // all contexts.
      std::vector<double> entropy(num_contexts_);
      std::vector<HistogramType> combined_histo(2 * num_contexts_);
      std::vector<double> combined_entropy(2 * num_contexts_);
      // diff[0]: entropy cost of merging with the last block type;
      // diff[1]: entropy cost of merging with the second last block type.
      double diff[2] = { 0.0 };
      for (int i = 0; i < num_contexts_; ++i) {
        int curr_histo_ix = curr_histogram_ix_ + i;
        entropy[i] = BitsEntropy(&(*histograms_)[curr_histo_ix].data_[0],
                                 alphabet_size_);
        for (int j = 0; j < 2; ++j) {
          int jx = j * num_contexts_ + i;
          int last_histogram_ix = last_histogram_ix_[j] + i;
          combined_histo[jx] = (*histograms_)[curr_histo_ix];
          combined_histo[jx].AddHistogram((*histograms_)[last_histogram_ix]);
          combined_entropy[jx] = BitsEntropy(
              &combined_histo[jx].data_[0], alphabet_size_);
          diff[j] += combined_entropy[jx] - entropy[i] - last_entropy_[jx];
        }
      }

      if (split_->num_types < max_block_types_ &&
          diff[0] > split_threshold_ &&
          diff[1] > split_threshold_) {
        // Create new block.
        split_->lengths[num_blocks_] = block_size_;
        split_->types[num_blocks_] = split_->num_types;
        last_histogram_ix_[1] = last_histogram_ix_[0];
        last_histogram_ix_[0] = split_->num_types * num_contexts_;
        for (int i = 0; i < num_contexts_; ++i) {
          last_entropy_[num_contexts_ + i] = last_entropy_[i];
          last_entropy_[i] = entropy[i];
        }
        ++num_blocks_;
        ++split_->num_types;
        curr_histogram_ix_ += num_contexts_;
        block_size_ = 0;
        merge_last_count_ = 0;
        target_block_size_ = min_block_size_;
      } else if (diff[1] < diff[0] - 20.0) {
        // Combine this block with second last block.
        split_->lengths[num_blocks_] = block_size_;
        split_->types[num_blocks_] = split_->types[num_blocks_ - 2];
        // The second last type becomes the "last" one; update its histograms
        // and entropies in place with the merged values.
        std::swap(last_histogram_ix_[0], last_histogram_ix_[1]);
        for (int i = 0; i < num_contexts_; ++i) {
          (*histograms_)[last_histogram_ix_[0] + i] =
              combined_histo[num_contexts_ + i];
          last_entropy_[num_contexts_ + i] = last_entropy_[i];
          last_entropy_[i] = combined_entropy[num_contexts_ + i];
          // Reset the working histograms for the next block.
          (*histograms_)[curr_histogram_ix_ + i].Clear();
        }
        ++num_blocks_;
        block_size_ = 0;
        merge_last_count_ = 0;
        target_block_size_ = min_block_size_;
      } else {
        // Combine this block with last block.
        split_->lengths[num_blocks_ - 1] += block_size_;
        for (int i = 0; i < num_contexts_; ++i) {
          (*histograms_)[last_histogram_ix_[0] + i] = combined_histo[i];
          last_entropy_[i] = combined_entropy[i];
          if (split_->num_types == 1) {
            last_entropy_[num_contexts_ + i] = last_entropy_[i];
          }
          (*histograms_)[curr_histogram_ix_ + i].Clear();
        }
        block_size_ = 0;
        // After repeated merges into the same block, grow the target size so
        // we do not re-evaluate the split decision too often.
        if (++merge_last_count_ > 1) {
          target_block_size_ += min_block_size_;
        }
      }
    }
    if (is_final) {
      // Trim the pre-sized output vectors down to what was actually used.
      (*histograms_).resize(split_->num_types * num_contexts_);
      split_->types.resize(num_blocks_);
      split_->lengths.resize(num_blocks_);
    }
  }

 private:
  static const int kMaxBlockTypes = 256;

  // Alphabet size of particular block category.
  const int alphabet_size_;
  const int num_contexts_;
  const int max_block_types_;
  // We collect at least this many symbols for each block.
  const int min_block_size_;
  // We merge histograms A and B if
  //   entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
  // where A is the current histogram and B is the histogram of the last or the
  // second last block type.
  const double split_threshold_;

  int num_blocks_;
  BlockSplit* split_;  // not owned
  std::vector<HistogramType>* histograms_;  // not owned

  // The number of symbols that we want to collect before deciding on whether
  // or not to merge the block with a previous one or emit a new block.
  int target_block_size_;
  // The number of symbols in the current histogram.
  int block_size_;
  // Offset of the current histogram.
  int curr_histogram_ix_;
  // Offset of the histograms of the previous two block types.
  int last_histogram_ix_[2];
  // Entropy of the previous two block types.
  std::vector<double> last_entropy_;
  // The number of times we merged the current block with the last one.
  int merge_last_count_;
};
|
||||
|
||||
void BuildMetaBlockGreedyWithContexts(const uint8_t* ringbuffer,
|
||||
size_t pos,
|
||||
size_t mask,
|
||||
uint8_t prev_byte,
|
||||
uint8_t prev_byte2,
|
||||
int literal_context_mode,
|
||||
int num_contexts,
|
||||
const int* static_context_map,
|
||||
const Command *commands,
|
||||
size_t n_commands,
|
||||
MetaBlockSplit* mb) {
|
||||
int num_literals = 0;
|
||||
for (int i = 0; i < n_commands; ++i) {
|
||||
num_literals += commands[i].insert_len_;
|
||||
}
|
||||
|
||||
ContextBlockSplitter<HistogramLiteral> lit_blocks(
|
||||
256, num_contexts, 512, 400.0, num_literals,
|
||||
&mb->literal_split, &mb->literal_histograms);
|
||||
BlockSplitter<HistogramCommand> cmd_blocks(
|
||||
kNumCommandPrefixes, 1024, 500.0, n_commands,
|
||||
&mb->command_split, &mb->command_histograms);
|
||||
BlockSplitter<HistogramDistance> dist_blocks(
|
||||
64, 512, 100.0, n_commands,
|
||||
&mb->distance_split, &mb->distance_histograms);
|
||||
|
||||
for (int i = 0; i < n_commands; ++i) {
|
||||
const Command cmd = commands[i];
|
||||
cmd_blocks.AddSymbol(cmd.cmd_prefix_);
|
||||
for (int j = 0; j < cmd.insert_len_; ++j) {
|
||||
int context = Context(prev_byte, prev_byte2, literal_context_mode);
|
||||
uint8_t literal = ringbuffer[pos & mask];
|
||||
lit_blocks.AddSymbol(literal, static_context_map[context]);
|
||||
prev_byte2 = prev_byte;
|
||||
prev_byte = literal;
|
||||
++pos;
|
||||
}
|
||||
pos += cmd.copy_len_;
|
||||
if (cmd.copy_len_ > 0) {
|
||||
prev_byte2 = ringbuffer[(pos - 2) & mask];
|
||||
prev_byte = ringbuffer[(pos - 1) & mask];
|
||||
if (cmd.cmd_prefix_ >= 128) {
|
||||
dist_blocks.AddSymbol(cmd.dist_prefix_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lit_blocks.FinishBlock(/* is_final = */ true);
|
||||
cmd_blocks.FinishBlock(/* is_final = */ true);
|
||||
dist_blocks.FinishBlock(/* is_final = */ true);
|
||||
|
||||
mb->literal_context_map.resize(
|
||||
mb->literal_split.num_types << kLiteralContextBits);
|
||||
for (int i = 0; i < mb->literal_split.num_types; ++i) {
|
||||
for (int j = 0; j < (1 << kLiteralContextBits); ++j) {
|
||||
mb->literal_context_map[(i << kLiteralContextBits) + j] =
|
||||
i * num_contexts + static_context_map[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void OptimizeHistograms(int num_direct_distance_codes,
|
||||
int distance_postfix_bits,
|
||||
MetaBlockSplit* mb) {
|
||||
|
|
|
@ -44,6 +44,7 @@ struct MetaBlockSplit {
|
|||
std::vector<HistogramDistance> distance_histograms;
|
||||
};
|
||||
|
||||
// Uses the slow shortest-path block splitter and does context clustering.
|
||||
void BuildMetaBlock(const uint8_t* ringbuffer,
|
||||
const size_t pos,
|
||||
const size_t mask,
|
||||
|
@ -55,6 +56,8 @@ void BuildMetaBlock(const uint8_t* ringbuffer,
|
|||
bool enable_context_modleing,
|
||||
MetaBlockSplit* mb);
|
||||
|
||||
// Uses a fast greedy block splitter that tries to merge current block with the
|
||||
// last or the second last block and does not do any context modeling.
|
||||
void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
|
||||
size_t pos,
|
||||
size_t mask,
|
||||
|
@ -62,6 +65,21 @@ void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
|
|||
size_t n_commands,
|
||||
MetaBlockSplit* mb);
|
||||
|
||||
// Uses a fast greedy block splitter that tries to merge current block with the
|
||||
// last or the second last block and uses a static context clustering which
|
||||
// is the same for all block types.
|
||||
void BuildMetaBlockGreedyWithContexts(const uint8_t* ringbuffer,
|
||||
size_t pos,
|
||||
size_t mask,
|
||||
uint8_t prev_byte,
|
||||
uint8_t prev_byte2,
|
||||
int literal_context_mode,
|
||||
int num_contexts,
|
||||
const int* static_context_map,
|
||||
const Command *commands,
|
||||
size_t n_commands,
|
||||
MetaBlockSplit* mb);
|
||||
|
||||
void OptimizeHistograms(int num_direct_distance_codes,
|
||||
int distance_postfix_bits,
|
||||
MetaBlockSplit* mb);
|
||||
|
|
Loading…
Reference in New Issue