* Update
 * decoder: better behavior after failure
 * encoder: replace "len_x_code" with delta
 * research: add experimental dictionary generator
 * python: test combing
This commit is contained in:
Eugene Kliuchnikov 2017-07-21 10:07:24 +02:00 committed by GitHub
parent 172a378deb
commit 52441069ef
13 changed files with 397 additions and 87 deletions

View File

@ -1942,7 +1942,13 @@ BrotliDecoderResult BrotliDecoderDecompressStream(
if (result != BROTLI_DECODER_SUCCESS) { /* Error, needs more input/output */ if (result != BROTLI_DECODER_SUCCESS) { /* Error, needs more input/output */
if (result == BROTLI_DECODER_NEEDS_MORE_INPUT) { if (result == BROTLI_DECODER_NEEDS_MORE_INPUT) {
if (s->ringbuffer != 0) { /* Pro-actively push output. */ if (s->ringbuffer != 0) { /* Pro-actively push output. */
WriteRingBuffer(s, available_out, next_out, total_out, BROTLI_TRUE); BrotliDecoderErrorCode intermediate_result = WriteRingBuffer(s,
available_out, next_out, total_out, BROTLI_TRUE);
/* WriteRingBuffer checks s->meta_block_remaining_len validity. */
if ((int)intermediate_result < 0) {
result = intermediate_result;
break;
}
} }
if (s->buffer_length != 0) { /* Used with internal buffer. */ if (s->buffer_length != 0) { /* Used with internal buffer. */
if (br->avail_in == 0) { /* Successfully finished read transaction. */ if (br->avail_in == 0) { /* Successfully finished read transaction. */
@ -2327,6 +2333,10 @@ void BrotliDecoderSetCustomDictionary(
} }
BROTLI_BOOL BrotliDecoderHasMoreOutput(const BrotliDecoderState* s) { BROTLI_BOOL BrotliDecoderHasMoreOutput(const BrotliDecoderState* s) {
/* After unrecoverable error remaining output is considered nonsensical. */
if ((int)s->error_code < 0) {
return BROTLI_FALSE;
}
return TO_BROTLI_BOOL( return TO_BROTLI_BOOL(
s->ringbuffer != 0 && UnwrittenBytes(s, BROTLI_FALSE) != 0); s->ringbuffer != 0 && UnwrittenBytes(s, BROTLI_FALSE) != 0);
} }
@ -2336,17 +2346,20 @@ const uint8_t* BrotliDecoderTakeOutput(BrotliDecoderState* s, size_t* size) {
size_t available_out = *size ? *size : 1u << 24; size_t available_out = *size ? *size : 1u << 24;
size_t requested_out = available_out; size_t requested_out = available_out;
BrotliDecoderErrorCode status; BrotliDecoderErrorCode status;
if (s->ringbuffer == 0) { if ((s->ringbuffer == 0) || ((int)s->error_code < 0)) {
*size = 0; *size = 0;
return 0; return 0;
} }
WrapRingBuffer(s); WrapRingBuffer(s);
status = WriteRingBuffer(s, &available_out, &result, 0, BROTLI_TRUE); status = WriteRingBuffer(s, &available_out, &result, 0, BROTLI_TRUE);
/* Either WriteRingBuffer returns those "success" codes... */
if (status == BROTLI_DECODER_SUCCESS || if (status == BROTLI_DECODER_SUCCESS ||
status == BROTLI_DECODER_NEEDS_MORE_OUTPUT) { status == BROTLI_DECODER_NEEDS_MORE_OUTPUT) {
*size = requested_out - available_out; *size = requested_out - available_out;
} else { } else {
/* This might happen if previous decoder error code was ignored. */ /* ... or stream is broken. Normally this should be caught by
BrotliDecoderDecompressStream, this is just a safeguard. */
if ((int)status < 0) SaveErrorCode(s, status);
*size = 0; *size = 0;
result = 0; result = 0;
} }

View File

@ -549,8 +549,8 @@ void BrotliZopfliCreateCommands(const size_t num_bytes,
BROTLI_BOOL is_dictionary = TO_BROTLI_BOOL(distance > max_distance); BROTLI_BOOL is_dictionary = TO_BROTLI_BOOL(distance > max_distance);
size_t dist_code = ZopfliNodeDistanceCode(next); size_t dist_code = ZopfliNodeDistanceCode(next);
InitCommand( InitCommand(&commands[i], insert_length,
&commands[i], insert_length, copy_length, len_code, dist_code); copy_length, (int)len_code - (int)copy_length, dist_code);
if (!is_dictionary && dist_code > 0) { if (!is_dictionary && dist_code > 0) {
dist_cache[3] = dist_cache[2]; dist_cache[3] = dist_cache[2];

View File

@ -38,29 +38,29 @@ static BROTLI_NOINLINE void FN(CreateBackwardReferences)(
size_t max_distance = BROTLI_MIN(size_t, position, max_backward_limit); size_t max_distance = BROTLI_MIN(size_t, position, max_backward_limit);
HasherSearchResult sr; HasherSearchResult sr;
sr.len = 0; sr.len = 0;
sr.len_x_code = 0; sr.len_code_delta = 0;
sr.distance = 0; sr.distance = 0;
sr.score = kMinScore; sr.score = kMinScore;
if (FN(FindLongestMatch)(hasher, dictionary, dictionary_hash, FN(FindLongestMatch)(hasher, dictionary, dictionary_hash, ringbuffer,
ringbuffer, ringbuffer_mask, dist_cache, ringbuffer_mask, dist_cache, position,
position, max_length, max_distance, &sr)) { max_length, max_distance, &sr);
if (sr.score > kMinScore) {
/* Found a match. Let's look for something even better ahead. */ /* Found a match. Let's look for something even better ahead. */
int delayed_backward_references_in_row = 0; int delayed_backward_references_in_row = 0;
--max_length; --max_length;
for (;; --max_length) { for (;; --max_length) {
const score_t cost_diff_lazy = 175; const score_t cost_diff_lazy = 175;
BROTLI_BOOL is_match_found;
HasherSearchResult sr2; HasherSearchResult sr2;
sr2.len = params->quality < MIN_QUALITY_FOR_EXTENSIVE_REFERENCE_SEARCH ? sr2.len = params->quality < MIN_QUALITY_FOR_EXTENSIVE_REFERENCE_SEARCH ?
BROTLI_MIN(size_t, sr.len - 1, max_length) : 0; BROTLI_MIN(size_t, sr.len - 1, max_length) : 0;
sr2.len_x_code = 0; sr2.len_code_delta = 0;
sr2.distance = 0; sr2.distance = 0;
sr2.score = kMinScore; sr2.score = kMinScore;
max_distance = BROTLI_MIN(size_t, position + 1, max_backward_limit); max_distance = BROTLI_MIN(size_t, position + 1, max_backward_limit);
is_match_found = FN(FindLongestMatch)(hasher, dictionary, FN(FindLongestMatch)(hasher, dictionary, dictionary_hash, ringbuffer,
dictionary_hash, ringbuffer, ringbuffer_mask, dist_cache, ringbuffer_mask, dist_cache, position + 1,
position + 1, max_length, max_distance, &sr2); max_length, max_distance, &sr2);
if (is_match_found && sr2.score >= sr.score + cost_diff_lazy) { if (sr2.score >= sr.score + cost_diff_lazy) {
/* Ok, let's just write one byte for now and start a match from the /* Ok, let's just write one byte for now and start a match from the
next byte. */ next byte. */
++position; ++position;
@ -88,7 +88,7 @@ static BROTLI_NOINLINE void FN(CreateBackwardReferences)(
dist_cache[0] = (int)sr.distance; dist_cache[0] = (int)sr.distance;
FN(PrepareDistanceCache)(hasher, dist_cache); FN(PrepareDistanceCache)(hasher, dist_cache);
} }
InitCommand(commands++, insert_length, sr.len, sr.len ^ sr.len_x_code, InitCommand(commands++, insert_length, sr.len, sr.len_code_delta,
distance_code); distance_code);
} }
*num_literals += insert_length; *num_literals += insert_length;

View File

@ -114,17 +114,19 @@ typedef struct Command {
/* distance_code is e.g. 0 for same-as-last short code, or 16 for offset 1. */ /* distance_code is e.g. 0 for same-as-last short code, or 16 for offset 1. */
static BROTLI_INLINE void InitCommand(Command* self, size_t insertlen, static BROTLI_INLINE void InitCommand(Command* self, size_t insertlen,
size_t copylen, size_t copylen_code, size_t distance_code) { size_t copylen, int copylen_code_delta, size_t distance_code) {
/* Don't rely on signed int representation, use honest casts. */
uint32_t delta = (uint8_t)((int8_t)copylen_code_delta);
self->insert_len_ = (uint32_t)insertlen; self->insert_len_ = (uint32_t)insertlen;
self->copy_len_ = (uint32_t)(copylen | ((copylen_code ^ copylen) << 24)); self->copy_len_ = (uint32_t)(copylen | (delta << 24));
/* The distance prefix and extra bits are stored in this Command as if /* The distance prefix and extra bits are stored in this Command as if
npostfix and ndirect were 0, they are only recomputed later after the npostfix and ndirect were 0, they are only recomputed later after the
clustering if needed. */ clustering if needed. */
PrefixEncodeCopyDistance( PrefixEncodeCopyDistance(
distance_code, 0, 0, &self->dist_prefix_, &self->dist_extra_); distance_code, 0, 0, &self->dist_prefix_, &self->dist_extra_);
GetLengthCode( GetLengthCode(
insertlen, copylen_code, TO_BROTLI_BOOL(self->dist_prefix_ == 0), insertlen, (size_t)((int)copylen + copylen_code_delta),
&self->cmd_prefix_); TO_BROTLI_BOOL(self->dist_prefix_ == 0), &self->cmd_prefix_);
} }
static BROTLI_INLINE void InitInsertCommand(Command* self, size_t insertlen) { static BROTLI_INLINE void InitInsertCommand(Command* self, size_t insertlen) {
@ -167,7 +169,8 @@ static BROTLI_INLINE uint32_t CommandCopyLen(const Command* self) {
} }
static BROTLI_INLINE uint32_t CommandCopyLenCode(const Command* self) { static BROTLI_INLINE uint32_t CommandCopyLenCode(const Command* self) {
return (self->copy_len_ & 0xFFFFFF) ^ (self->copy_len_ >> 24); int32_t delta = (int8_t)((uint8_t)(self->copy_len_ >> 24));
return (uint32_t)((int32_t)(self->copy_len_ & 0xFFFFFF) + delta);
} }
#if defined(__cplusplus) || defined(c_plusplus) #if defined(__cplusplus) || defined(c_plusplus)

View File

@ -62,9 +62,9 @@ static const uint64_t kCutoffTransforms =
typedef struct HasherSearchResult { typedef struct HasherSearchResult {
size_t len; size_t len;
size_t len_x_code; /* == len ^ len_code */
size_t distance; size_t distance;
score_t score; score_t score;
int len_code_delta; /* == len_code - len */
} HasherSearchResult; } HasherSearchResult;
/* kHashMul32 multiplier has these properties: /* kHashMul32 multiplier has these properties:
@ -178,22 +178,21 @@ static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
return BROTLI_FALSE; return BROTLI_FALSE;
} }
out->len = matchlen; out->len = matchlen;
out->len_x_code = len ^ matchlen; out->len_code_delta = (int)len - (int)matchlen;
out->distance = backward; out->distance = backward;
out->score = score; out->score = score;
return BROTLI_TRUE; return BROTLI_TRUE;
} }
static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary( static BROTLI_INLINE void SearchInStaticDictionary(
const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
HasherHandle handle, const uint8_t* data, size_t max_length, HasherHandle handle, const uint8_t* data, size_t max_length,
size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) { size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) {
size_t key; size_t key;
size_t i; size_t i;
BROTLI_BOOL is_match_found = BROTLI_FALSE;
HasherCommon* self = GetHasherCommon(handle); HasherCommon* self = GetHasherCommon(handle);
if (self->dict_num_matches < (self->dict_num_lookups >> 7)) { if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
return BROTLI_FALSE; return;
} }
key = Hash14(data) << 1; key = Hash14(data) << 1;
for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) { for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
@ -204,11 +203,9 @@ static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary(
dictionary, item, data, max_length, max_backward, out); dictionary, item, data, max_length, max_backward, out);
if (item_matches) { if (item_matches) {
self->dict_num_matches++; self->dict_num_matches++;
is_match_found = BROTLI_TRUE;
} }
} }
} }
return is_match_found;
} }
typedef struct BackwardMatch { typedef struct BackwardMatch {

View File

@ -152,8 +152,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)(
Does not look for matches longer than max_length. Does not look for matches longer than max_length.
Does not look for matches further away than max_backward. Does not look for matches further away than max_backward.
Writes the best match into |out|. Writes the best match into |out|.
Returns 1 when match is found, otherwise 0. */ |out|->score is updated only if a better match is found. */
static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, static BROTLI_INLINE void FN(FindLongestMatch)(HasherHandle handle,
const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask, const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask,
const int* BROTLI_RESTRICT distance_cache, const int* BROTLI_RESTRICT distance_cache,
@ -161,15 +161,15 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
HasherSearchResult* BROTLI_RESTRICT out) { HasherSearchResult* BROTLI_RESTRICT out) {
HashForgetfulChain* self = FN(Self)(handle); HashForgetfulChain* self = FN(Self)(handle);
const size_t cur_ix_masked = cur_ix & ring_buffer_mask; const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
BROTLI_BOOL is_match_found = BROTLI_FALSE;
/* Don't accept a short copy from far away. */ /* Don't accept a short copy from far away. */
score_t min_score = out->score;
score_t best_score = out->score; score_t best_score = out->score;
size_t best_len = out->len; size_t best_len = out->len;
size_t i; size_t i;
const size_t key = FN(HashBytes)(&data[cur_ix_masked]); const size_t key = FN(HashBytes)(&data[cur_ix_masked]);
const uint8_t tiny_hash = (uint8_t)(key); const uint8_t tiny_hash = (uint8_t)(key);
out->len = 0; out->len = 0;
out->len_x_code = 0; out->len_code_delta = 0;
/* Try last distance first. */ /* Try last distance first. */
for (i = 0; i < NUM_LAST_DISTANCES_TO_CHECK; ++i) { for (i = 0; i < NUM_LAST_DISTANCES_TO_CHECK; ++i) {
const size_t backward = (size_t)distance_cache[i]; const size_t backward = (size_t)distance_cache[i];
@ -194,7 +194,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len; out->len = best_len;
out->distance = backward; out->distance = backward;
out->score = best_score; out->score = best_score;
is_match_found = BROTLI_TRUE;
} }
} }
} }
@ -234,19 +233,17 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len; out->len = best_len;
out->distance = backward; out->distance = backward;
out->score = best_score; out->score = best_score;
is_match_found = BROTLI_TRUE;
} }
} }
} }
} }
FN(Store)(handle, data, ring_buffer_mask, cur_ix); FN(Store)(handle, data, ring_buffer_mask, cur_ix);
} }
if (!is_match_found) { if (out->score == min_score) {
is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash, SearchInStaticDictionary(dictionary, dictionary_hash,
handle, &data[cur_ix_masked], max_length, max_backward, out, handle, &data[cur_ix_masked], max_length, max_backward, out,
BROTLI_FALSE); BROTLI_FALSE);
} }
return is_match_found;
} }
#undef BANK_SIZE #undef BANK_SIZE

View File

@ -156,8 +156,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)(
Does not look for matches longer than max_length. Does not look for matches longer than max_length.
Does not look for matches further away than max_backward. Does not look for matches further away than max_backward.
Writes the best match into |out|. Writes the best match into |out|.
Returns true when match is found, otherwise false. */ |out|->score is updated only if a better match is found. */
static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, static BROTLI_INLINE void FN(FindLongestMatch)(HasherHandle handle,
const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask, const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask,
const int* BROTLI_RESTRICT distance_cache, const size_t cur_ix, const int* BROTLI_RESTRICT distance_cache, const size_t cur_ix,
@ -168,13 +168,13 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
uint16_t* num = FN(Num)(self); uint16_t* num = FN(Num)(self);
uint32_t* buckets = FN(Buckets)(self); uint32_t* buckets = FN(Buckets)(self);
const size_t cur_ix_masked = cur_ix & ring_buffer_mask; const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
BROTLI_BOOL is_match_found = BROTLI_FALSE;
/* Don't accept a short copy from far away. */ /* Don't accept a short copy from far away. */
score_t min_score = out->score;
score_t best_score = out->score; score_t best_score = out->score;
size_t best_len = out->len; size_t best_len = out->len;
size_t i; size_t i;
out->len = 0; out->len = 0;
out->len_x_code = 0; out->len_code_delta = 0;
/* Try last distance first. */ /* Try last distance first. */
for (i = 0; i < (size_t)common->params.num_last_distances_to_check; ++i) { for (i = 0; i < (size_t)common->params.num_last_distances_to_check; ++i) {
const size_t backward = (size_t)distance_cache[i]; const size_t backward = (size_t)distance_cache[i];
@ -209,7 +209,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len; out->len = best_len;
out->distance = backward; out->distance = backward;
out->score = best_score; out->score = best_score;
is_match_found = BROTLI_TRUE;
} }
} }
} }
@ -250,7 +249,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len; out->len = best_len;
out->distance = backward; out->distance = backward;
out->score = best_score; out->score = best_score;
is_match_found = BROTLI_TRUE;
} }
} }
} }
@ -258,12 +256,11 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
bucket[num[key] & self->block_mask_] = (uint32_t)cur_ix; bucket[num[key] & self->block_mask_] = (uint32_t)cur_ix;
++num[key]; ++num[key];
} }
if (!is_match_found) { if (min_score == out->score) {
is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash, SearchInStaticDictionary(dictionary, dictionary_hash,
handle, &data[cur_ix_masked], max_length, max_backward, out, handle, &data[cur_ix_masked], max_length, max_backward, out,
BROTLI_FALSE); BROTLI_FALSE);
} }
return is_match_found;
} }
#undef HashLongestMatch #undef HashLongestMatch

View File

@ -149,8 +149,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)(
Does not look for matches longer than max_length. Does not look for matches longer than max_length.
Does not look for matches further away than max_backward. Does not look for matches further away than max_backward.
Writes the best match into |out|. Writes the best match into |out|.
Returns true when match is found, otherwise false. */ |out|->score is updated only if a better match is found. */
static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, static BROTLI_INLINE void FN(FindLongestMatch)(HasherHandle handle,
const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask, const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask,
const int* BROTLI_RESTRICT distance_cache, const size_t cur_ix, const int* BROTLI_RESTRICT distance_cache, const size_t cur_ix,
@ -161,13 +161,13 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
uint16_t* num = FN(Num)(self); uint16_t* num = FN(Num)(self);
uint32_t* buckets = FN(Buckets)(self); uint32_t* buckets = FN(Buckets)(self);
const size_t cur_ix_masked = cur_ix & ring_buffer_mask; const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
BROTLI_BOOL is_match_found = BROTLI_FALSE;
/* Don't accept a short copy from far away. */ /* Don't accept a short copy from far away. */
score_t min_score = out->score;
score_t best_score = out->score; score_t best_score = out->score;
size_t best_len = out->len; size_t best_len = out->len;
size_t i; size_t i;
out->len = 0; out->len = 0;
out->len_x_code = 0; out->len_code_delta = 0;
/* Try last distance first. */ /* Try last distance first. */
for (i = 0; i < (size_t)common->params.num_last_distances_to_check; ++i) { for (i = 0; i < (size_t)common->params.num_last_distances_to_check; ++i) {
const size_t backward = (size_t)distance_cache[i]; const size_t backward = (size_t)distance_cache[i];
@ -202,7 +202,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len; out->len = best_len;
out->distance = backward; out->distance = backward;
out->score = best_score; out->score = best_score;
is_match_found = BROTLI_TRUE;
} }
} }
} }
@ -242,7 +241,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len; out->len = best_len;
out->distance = backward; out->distance = backward;
out->score = best_score; out->score = best_score;
is_match_found = BROTLI_TRUE;
} }
} }
} }
@ -250,12 +248,11 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
bucket[num[key] & self->block_mask_] = (uint32_t)cur_ix; bucket[num[key] & self->block_mask_] = (uint32_t)cur_ix;
++num[key]; ++num[key];
} }
if (!is_match_found) { if (min_score == out->score) {
is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash, SearchInStaticDictionary(dictionary, dictionary_hash,
handle, &data[cur_ix_masked], max_length, max_backward, out, handle, &data[cur_ix_masked], max_length, max_backward, out,
BROTLI_FALSE); BROTLI_FALSE);
} }
return is_match_found;
} }
#undef HashLongestMatch #undef HashLongestMatch

View File

@ -123,8 +123,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)(
Does not look for matches longer than max_length. Does not look for matches longer than max_length.
Does not look for matches further away than max_backward. Does not look for matches further away than max_backward.
Writes the best match into |out|. Writes the best match into |out|.
Returns true if match is found, otherwise false. */ |out|->score is updated only if a better match is found. */
static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)( static BROTLI_INLINE void FN(FindLongestMatch)(
HasherHandle handle, const BrotliDictionary* dictionary, HasherHandle handle, const BrotliDictionary* dictionary,
const uint16_t* dictionary_hash, const uint8_t* BROTLI_RESTRICT data, const uint16_t* dictionary_hash, const uint8_t* BROTLI_RESTRICT data,
const size_t ring_buffer_mask, const int* BROTLI_RESTRICT distance_cache, const size_t ring_buffer_mask, const int* BROTLI_RESTRICT distance_cache,
@ -135,12 +135,12 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(
const size_t cur_ix_masked = cur_ix & ring_buffer_mask; const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]); const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]);
int compare_char = data[cur_ix_masked + best_len_in]; int compare_char = data[cur_ix_masked + best_len_in];
score_t min_score = out->score;
score_t best_score = out->score; score_t best_score = out->score;
size_t best_len = best_len_in; size_t best_len = best_len_in;
size_t cached_backward = (size_t)distance_cache[0]; size_t cached_backward = (size_t)distance_cache[0];
size_t prev_ix = cur_ix - cached_backward; size_t prev_ix = cur_ix - cached_backward;
BROTLI_BOOL is_match_found = BROTLI_FALSE; out->len_code_delta = 0;
out->len_x_code = 0;
if (prev_ix < cur_ix) { if (prev_ix < cur_ix) {
prev_ix &= (uint32_t)ring_buffer_mask; prev_ix &= (uint32_t)ring_buffer_mask;
if (compare_char == data[prev_ix + best_len]) { if (compare_char == data[prev_ix + best_len]) {
@ -148,17 +148,18 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(
&data[cur_ix_masked], &data[cur_ix_masked],
max_length); max_length);
if (len >= 4) { if (len >= 4) {
best_score = BackwardReferenceScoreUsingLastDistance(len); const score_t score = BackwardReferenceScoreUsingLastDistance(len);
best_len = len; if (best_score < score) {
out->len = len; best_score = score;
out->distance = cached_backward; best_len = len;
out->score = best_score; out->len = len;
compare_char = data[cur_ix_masked + best_len]; out->distance = cached_backward;
if (BUCKET_SWEEP == 1) { out->score = best_score;
self->buckets_[key] = (uint32_t)cur_ix; compare_char = data[cur_ix_masked + best_len];
return BROTLI_TRUE; if (BUCKET_SWEEP == 1) {
} else { self->buckets_[key] = (uint32_t)cur_ix;
is_match_found = BROTLI_TRUE; return;
}
} }
} }
} }
@ -172,19 +173,22 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(
backward = cur_ix - prev_ix; backward = cur_ix - prev_ix;
prev_ix &= (uint32_t)ring_buffer_mask; prev_ix &= (uint32_t)ring_buffer_mask;
if (compare_char != data[prev_ix + best_len_in]) { if (compare_char != data[prev_ix + best_len_in]) {
return BROTLI_FALSE; return;
} }
if (BROTLI_PREDICT_FALSE(backward == 0 || backward > max_backward)) { if (BROTLI_PREDICT_FALSE(backward == 0 || backward > max_backward)) {
return BROTLI_FALSE; return;
} }
len = FindMatchLengthWithLimit(&data[prev_ix], len = FindMatchLengthWithLimit(&data[prev_ix],
&data[cur_ix_masked], &data[cur_ix_masked],
max_length); max_length);
if (len >= 4) { if (len >= 4) {
out->len = len; const score_t score = BackwardReferenceScore(len, backward);
out->distance = backward; if (best_score < score) {
out->score = BackwardReferenceScore(len, backward); out->len = len;
return BROTLI_TRUE; out->distance = backward;
out->score = score;
return;
}
} }
} else { } else {
uint32_t *bucket = self->buckets_ + key; uint32_t *bucket = self->buckets_ + key;
@ -212,18 +216,16 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(
out->distance = backward; out->distance = backward;
out->score = score; out->score = score;
compare_char = data[cur_ix_masked + best_len]; compare_char = data[cur_ix_masked + best_len];
is_match_found = BROTLI_TRUE;
} }
} }
} }
} }
if (USE_DICTIONARY && !is_match_found) { if (USE_DICTIONARY && min_score == out->score) {
is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash, SearchInStaticDictionary(dictionary, dictionary_hash,
handle, &data[cur_ix_masked], max_length, max_backward, out, handle, &data[cur_ix_masked], max_length, max_backward, out,
BROTLI_TRUE); BROTLI_TRUE);
} }
self->buckets_[key + ((cur_ix >> 3) % BUCKET_SWEEP)] = (uint32_t)cur_ix; self->buckets_[key + ((cur_ix >> 3) % BUCKET_SWEEP)] = (uint32_t)cur_ix;
return is_match_found;
} }
#undef HASH_MAP_SIZE #undef HASH_MAP_SIZE

View File

@ -681,12 +681,14 @@ static BROTLI_BOOL CloseFiles(Context* context, BROTLI_BOOL success) {
} }
} }
if (fclose(context->fin) != 0) { if (context->fin) {
if (is_ok) { if (fclose(context->fin) != 0) {
fprintf(stderr, "fclose failed [%s]: %s\n", if (is_ok) {
PrintablePath(context->current_input_path), strerror(errno)); fprintf(stderr, "fclose failed [%s]: %s\n",
PrintablePath(context->current_input_path), strerror(errno));
}
is_ok = BROTLI_FALSE;
} }
is_ok = BROTLI_FALSE;
} }
if (success && context->junk_source) { if (success && context->junk_source) {
unlink(context->current_input_path); unlink(context->current_input_path);

View File

@ -10,10 +10,12 @@ import unittest
project_dir = os.path.abspath(os.path.join(__file__, '..', '..', '..')) project_dir = os.path.abspath(os.path.join(__file__, '..', '..', '..'))
src_dir = os.path.join(project_dir, 'python')
test_dir = os.path.join(project_dir, 'tests')
PYTHON = sys.executable or 'python' PYTHON = sys.executable or 'python'
BRO = os.path.join(project_dir, 'python', 'bro.py') BRO = os.path.join(src_dir, 'bro.py')
# Get the platform/version-specific build folder. # Get the platform/version-specific build folder.
# By default, the distutils build base is in the same location as setup.py. # By default, the distutils build base is in the same location as setup.py.
@ -30,7 +32,7 @@ if 'PYTHONPATH' not in TEST_ENV:
else: else:
TEST_ENV['PYTHONPATH'] = build_dir + os.pathsep + TEST_ENV['PYTHONPATH'] TEST_ENV['PYTHONPATH'] = build_dir + os.pathsep + TEST_ENV['PYTHONPATH']
TESTDATA_DIR = os.path.join(project_dir, 'tests', 'testdata') TESTDATA_DIR = os.path.join(test_dir, 'testdata')
TESTDATA_FILES = [ TESTDATA_FILES = [
'empty', # Empty file 'empty', # Empty file

273
research/deorummolae.cc Executable file
View File

@ -0,0 +1,273 @@
#include "./deorummolae.h"
#include <array>
#include <vector>
#include "./esaxx/sais.hxx"
/* Used for quick SA-entry to file mapping. Each file is padded to size that
is a multiple of chunk size. */
#define CHUNK_SIZE 64
/* Length of substring that is considered to be covered by dictionary string. */
#define CUT_MATCH 6
/* Minimal dictionary entry size. */
#define MIN_MATCH 24
/* Non tunable definitions. */
#define CHUNK_MASK (CHUNK_SIZE - 1)
#define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6))
/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
static int popcount(uint64_t u) {
return __builtin_popcountll(u);
}
/* Condense terminators and pad file entries. */
static void rewriteText(std::vector<int>* text) {
int terminator = text->back();
int prev = terminator;
size_t to = 0;
for (size_t from = 0; from < text->size(); ++from) {
int next = text->at(from);
if (next < 256 || prev < 256) {
text->at(to++) = next;
if (next >= 256) terminator = next;
}
prev = next;
}
text->resize(to);
if (text->empty()) text->push_back(terminator);
while (text->size() & CHUNK_MASK) text->push_back(terminator);
}
/* Reenumerate terminators for smaller alphabet. */
static void remapTerminators(std::vector<int>* text, int* next_terminator) {
int prev = -1;
int x = 256;
for (size_t i = 0; i < text->size(); ++i) {
int next = text->at(i);
if (next < 256) { // Char.
// Do nothing.
} else if (prev < 256) { // Terminator after char.
next = x++;
} else { // Terminator after terminator.
next = prev;
}
text->at(i) = next;
prev = next;
}
*next_terminator = x;
}
/* Combine all file entries; create mapping position->file. */
static void buildFullText(std::vector<std::vector<int>>* data,
std::vector<int>* full_text, std::vector<size_t>* file_map,
std::vector<size_t>* file_offset, int* next_terminator) {
file_map->resize(0);
file_offset->resize(0);
full_text->resize(0);
for (size_t i = 0; i < data->size(); ++i) {
file_offset->push_back(full_text->size());
std::vector<int>& file = data->at(i);
rewriteText(&file);
full_text->insert(full_text->end(), file.begin(), file.end());
file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
}
if (false) remapTerminators(full_text, next_terminator);
}
/* Build longest-common-prefix based on suffix array and text.
TODO: borrowed -> unknown efficiency. */
static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
std::vector<int>* lcp, std::vector<int>* invese_sa) {
int size = static_cast<int>(text->size());
lcp->resize(size);
int k = 0;
lcp->at(size - 1) = 0;
for (int i = 0; i < size; ++i) {
if (invese_sa->at(i) == size - 1) {
k = 0;
continue;
}
int j = sa->at(invese_sa->at(i) + 1); // Suffix which follow i-th suffix.
while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
++k;
}
lcp->at(invese_sa->at(i)) = k;
if (k > 0) --k;
}
}
/* Isle is a range in SA with LCP not less than some value.
When we raise the LCP requirement, the isle sunks and smaller isles appear
instead. */
typedef struct {
int lcp;
int l;
int r;
Coverage coverage;
} Isle;
/* Helper routine for `cutMatch`. */
static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
std::vector<size_t>* file_map, std::vector<size_t>* file_offset,
int* next_terminator) {
size_t f = file_map->at(pos / CHUNK_SIZE);
pos -= file_offset->at(f);
std::vector<int>& file = data->at(f);
int l = (length == CUT_MATCH) ? CUT_MATCH : 1;
for (int j = 0; j < l; j++, pos++) {
if (file[pos] >= 256) continue;
if (file[pos + 1] >= 256) {
file[pos] = file[pos + 1];
} else if (pos > 0 && file[pos - 1] >= 256) {
file[pos] = file[pos - 1];
} else {
file[pos] = (*next_terminator)++;
}
}
}
/* Remove substrings of a given match from files.
Substrings are replaced with unique terminators, so next iteration SA would
not allow to cross removed areas. */
static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
std::vector<int>* sa, std::vector<int>* lcp, std::vector<int>* invese_sa,
int* next_terminator, std::vector<size_t>* file_map,
std::vector<size_t>* file_offset) {
while (length >= CUT_MATCH) {
int i = index;
while (lcp->at(i) >= length) {
i++;
poisonData(
sa->at(i), length, data, file_map, file_offset, next_terminator);
}
while (true) {
poisonData(
sa->at(index), length, data, file_map, file_offset, next_terminator);
if (index == 0 || lcp->at(index - 1) < length) break;
index--;
}
length--;
index = invese_sa->at(sa->at(index) + 1);
}
}
size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
size_t num_samples, const size_t* sample_sizes,
const uint8_t* sample_data) {
{
uint64_t tmp = 0;
if (popcount(tmp - 1u) != 64) {
fprintf(stderr, "64-bit platform is required\n");
return 0;
}
}
/* Could use 256 + '0' for easier debugging. */
int next_terminator = 256;
std::vector<std::vector<int>> data;
size_t offset = 0;
if (num_samples > MAX_FILES) num_samples = MAX_FILES;
for (size_t n = 0; n < num_samples; ++n) {
size_t next_offset = offset + sample_sizes[n];
data.push_back(
std::vector<int>(sample_data + offset, sample_data + next_offset));
offset = next_offset;
data.back().push_back(next_terminator++);
}
/* Most arrays are allocated once, and then just resized to smaller and
smaller sizes. */
std::vector<int> full_text;
std::vector<size_t> file_map;
std::vector<size_t> file_offset;
std::vector<int> sa;
std::vector<int> invese_sa;
std::vector<int> lcp;
std::vector<Isle> isles;
std::vector<char> output_data;
size_t total = 0;
size_t total_cost = 0;
size_t best_cost;
Isle best_isle;
int min_count = num_samples;
while (true) {
size_t max_match = dictionary_size_limit - total;
buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
sa.resize(full_text.size());
saisxx(full_text.data(), sa.data(), static_cast<int>(full_text.size()),
next_terminator);
invese_sa.resize(full_text.size());
for (int i = 0; i < full_text.size(); ++i) invese_sa[sa[i]] = i;
buildLcp(&full_text, &sa, &lcp, &invese_sa);
/* Do not rebuild SA/LCP, just use different selection. */
retry:
best_cost = 0;
best_isle = {0, 0, 0, {{0}}};
isles.resize(0);
isles.push_back(best_isle);
for (int i = 0; i < static_cast<int>(lcp.size()); ++i) {
int l = i;
Coverage cov = {{0}};
int f = file_map[sa[i] / CHUNK_SIZE];
cov[f >> 6] = ((uint64_t)1) << (f & 63);
while (lcp[i] < isles.back().lcp) {
Isle& top = isles.back();
top.r = i;
l = top.l;
for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
int count = 0;
for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
int effective_lcp = top.lcp;
/* Restrict (last) dictionary entry length. */
if (effective_lcp > max_match) effective_lcp = max_match;
int cost = count * effective_lcp;
if (cost > best_cost && count >= min_count &&
effective_lcp >= MIN_MATCH) {
best_cost = cost;
best_isle = top;
best_isle.lcp = effective_lcp;
}
isles.pop_back();
for (size_t x = 0; x < cov.size(); ++x) {
isles.back().coverage[x] |= cov[x];
}
}
if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
for (size_t x = 0; x < cov.size(); ++x) {
isles.back().coverage[x] |= cov[x];
}
}
/* When saturated matches do not match length restrictions, lower the
saturation requirements. */
if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
if (min_count >= 8) {
min_count = (min_count * 7) / 8;
goto retry;
}
break;
}
/* Save the entry. */
fprintf(stderr,
"Savings: %zu+%zu, dictionary: %zu+%d\n",
total_cost, best_cost, total, best_isle.lcp);
memcpy(
dictionary + total, full_text.data() + sa[best_isle.l], best_isle.lcp);
total += best_isle.lcp;
total_cost += best_cost;
cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp,
&invese_sa, &next_terminator, &file_map, &file_offset);
if (total >= dictionary_size_limit) break;
}
return total;
}

27
research/deorummolae.h Executable file
View File

@ -0,0 +1,27 @@
#ifndef BROTLI_RESEARCH_DEORUMMOLAE_H_
#define BROTLI_RESEARCH_DEORUMMOLAE_H_
#include <stddef.h>
#include <stdint.h>
/* log2(maximal number of files). Value 6 provides some speedups. */
#define LOG_MAX_FILES 6
/* Non tunable definitions. */
#define MAX_FILES (1 << LOG_MAX_FILES)
/**
* Generate a dictionary for given samples.
*
* @param dictionary storage for generated dictionary
* @param dictionary_size_limit maximal dictionary size
* @param num_samples number of samples
* @param sample_sizes array with sample sizes
* @param sample_data concatenated samples
* @return generated dictionary size
*/
size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
size_t num_samples, const size_t* sample_sizes,
const uint8_t* sample_data);
#endif // BROTLI_RESEARCH_DEORUMMOLAE_H_