diff --git a/dec/bit_reader.h b/dec/bit_reader.h
index cbd9fae..43e29ba 100644
--- a/dec/bit_reader.h
+++ b/dec/bit_reader.h
@@ -30,6 +30,11 @@ extern "C" {
 #if (defined(__x86_64__) || defined(_M_X64))
 /* This should be set to 1 only on little-endian machines. */
 #define BROTLI_USE_64_BITS 1
+#elif (defined(__arm__))
+/* TODO: __arm__ is much too broad. The following flags should only be
+   set on ARM architectures with little-endian byte order. */
+#define ARMv7
+#define BROTLI_USE_64_BITS 1
 #else
 #define BROTLI_USE_64_BITS 0
 #endif
@@ -41,10 +46,16 @@ extern "C" {
 #define UNALIGNED_COPY64(dst, src) memcpy(dst, src, 8)
 #define UNALIGNED_MOVE64(dst, src) memmove(dst, src, 8)
 
+#ifdef ARMv7
+/* ARM can shift and invert a register as part of an AND (BIC) instruction. */
+static BROTLI_INLINE uint32_t BitMask(int n) { return ~((0xffffffff) << n); }
+#else
 static const uint32_t kBitMask[BROTLI_MAX_NUM_BIT_READ] = {
   0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
   65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
 };
+static BROTLI_INLINE uint32_t BitMask(int n) { return kBitMask[n]; }
+#endif
 
 typedef struct {
 #if (BROTLI_USE_64_BITS)
@@ -91,7 +102,7 @@ static BROTLI_INLINE void BrotliSetBitPos(BrotliBitReader* const br,
                                           uint32_t val) {
 #ifdef BROTLI_DECODE_DEBUG
   uint32_t n_bits = val - br->bit_pos_;
-  const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
+  const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
   printf("[BrotliReadBits] %010d %2d val: %6x\n",
          (br->pos_ << 3) + br->bit_pos_ - 64, n_bits, bval);
 #endif
@@ -148,7 +159,7 @@ static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
     }
     br->eos_ = 1;
     /* Store 32 bytes of zero after the stream end. */
-#if (BROTLI_USE_64_BITS)
+#if (BROTLI_USE_64_BITS) && !defined(ARMv7)
     *(uint64_t*)(dst + bytes_read) = 0;
     *(uint64_t*)(dst + bytes_read + 8) = 0;
     *(uint64_t*)(dst + bytes_read + 16) = 0;
@@ -159,7 +170,7 @@ static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
   }
   if (dst == br->buf_) {
     /* Copy the head of the ringbuffer to the slack region. */
-#if (BROTLI_USE_64_BITS)
+#if (BROTLI_USE_64_BITS) && !defined(ARMv7)
     UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 32, br->buf_);
     UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 24, br->buf_ + 8);
     UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 16, br->buf_ + 16);
@@ -203,8 +214,15 @@ static BROTLI_INLINE uint32_t BrotliReadBits(
     BrotliBitReader* const br, int n_bits) {
   uint32_t val;
 #if (BROTLI_USE_64_BITS)
+#if defined(ARMv7)
+  if ((64 - br->bit_pos_) < ((uint32_t) n_bits)) {
+    BrotliFillBitWindow(br);
+  }
+  val = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
+#else
   BrotliFillBitWindow(br);
-  val = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
+  val = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
+#endif  /* defined(ARMv7) */
 #else
   /*
    * The if statement gives 2-4% speed boost on Canterbury data set with
@@ -213,8 +231,8 @@ static BROTLI_INLINE uint32_t BrotliReadBits(
   if ((32 - br->bit_pos_) < ((uint32_t) n_bits)) {
     BrotliFillBitWindow(br);
   }
-  val = (br->val_ >> br->bit_pos_) & kBitMask[n_bits];
-#endif
+  val = (br->val_ >> br->bit_pos_) & BitMask(n_bits);
+#endif  /* BROTLI_USE_64_BITS */
 #ifdef BROTLI_DECODE_DEBUG
   printf("[BrotliReadBits] %010d %2d val: %6x\n",
          (br->pos_ << 3) + br->bit_pos_ - 64, n_bits, val);
diff --git a/dec/decode.c b/dec/decode.c
index cc1bab5..fc936ad 100644
--- a/dec/decode.c
+++ b/dec/decode.c
@@ -149,6 +149,21 @@ static BROTLI_INLINE int ReadSymbol(const HuffmanCode* table,
                                     BrotliBitReader* br) {
   int nbits;
   BrotliFillBitWindow(br);
+#ifdef ARMv7
+  /* Prefetching helps, since the 64-bit val must be shifted by a variable
+     amount; the other changes are minor. */
+  uint32_t val = BrotliPrefetchBits(br);
+  table += val & HUFFMAN_TABLE_MASK;
+  nbits = table->bits;
+  if (PREDICT_FALSE(nbits > HUFFMAN_TABLE_BITS)) {
+    nbits -= HUFFMAN_TABLE_BITS;
+    br->bit_pos_ += HUFFMAN_TABLE_BITS;
+    table += table->value;
+    table += (int)(val >> HUFFMAN_TABLE_BITS) & ((1 << nbits) - 1);
+    nbits = table->bits;
+  }
+  br->bit_pos_ += nbits;
+#else
   table += (int)(br->val_ >> br->bit_pos_) & HUFFMAN_TABLE_MASK;
   if (PREDICT_FALSE(table->bits > HUFFMAN_TABLE_BITS)) {
     br->bit_pos_ += HUFFMAN_TABLE_BITS;
@@ -157,6 +172,7 @@ static BROTLI_INLINE int ReadSymbol(const HuffmanCode* table,
     table += (int)(br->val_ >> br->bit_pos_) & ((1 << nbits) - 1);
   }
   br->bit_pos_ += table->bits;
+#endif
   return table->value;
 }
 
@@ -624,6 +640,8 @@ static BROTLI_INLINE void DecodeBlockTypeWithContext(BrotliState* s,
  */
 static BROTLI_INLINE void IncrementalCopyFastPath(
     uint8_t* dst, const uint8_t* src, int len) {
+/* TODO: On ARM, UNALIGNED_MOVE64 compiles into a memcpy call, but a
+   better solution is not available yet. */
   if (src < dst) {
     while (dst - src < 8) {
       UNALIGNED_MOVE64(dst, src);
@@ -1488,6 +1506,7 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output,
 
         s->copy_src = &s->ringbuffer[(pos - s->distance) & s->ringbuffer_mask];
 
+
 #if (defined(__x86_64__) || defined(_M_X64))
         if (s->copy_src + s->copy_length <= s->ringbuffer_end &&
             s->copy_dst + s->copy_length < s->ringbuffer_end) {
@@ -1501,6 +1520,20 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output,
           s->meta_block_remaining_len -= s->copy_length;
           s->copy_length = 0;
         }
+#elif defined(ARMv7)
+        /* This version is roughly 5% faster than the x86-64 version above,
+           since UNALIGNED_COPY64 does not inline and generates memcpy calls. */
+        if (s->copy_src + s->copy_length <= s->ringbuffer_end &&
+            s->copy_dst + s->copy_length < s->ringbuffer_end) {
+          if (s->copy_length <= s->distance) {
+            memcpy(s->copy_dst, s->copy_src, s->copy_length);
+          } else {
+            IncrementalCopyFastPath(s->copy_dst, s->copy_src, s->copy_length);
+          }
+          pos += s->copy_length;
+          s->meta_block_remaining_len -= s->copy_length;
+          s->copy_length = 0;
+        }
 #endif
         /* Modifications to this loop should be reflected in
            BROTLI_STATE_BLOCK_POST_WRITE_2 case */
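---

Note on the BitMask change: on ARM the mask is computed as ~(0xffffffff << n)
instead of being loaded from kBitMask. The two forms agree for every n the bit
reader uses (the table has 25 entries, covering n = 0..24). A minimal
standalone check of that identity, not part of the patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* The 25-entry table from dec/bit_reader.h. */
static const uint32_t kBitMask[25] = {
  0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
  65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
};

/* Computed form used on ARM; the shift and the invert fold into a BIC. */
static uint32_t BitMask(int n) { return ~(0xffffffffu << n); }

int main(void) {
  int n;
  for (n = 0; n < 25; ++n) {
    assert(BitMask(n) == kBitMask[n]);  /* Identical masks for n = 0..24. */
  }
  printf("BitMask(n) == kBitMask[n] for all n in 0..24\n");
  return 0;
}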
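Note on ReadSymbol: the ARMv7 path calls BrotliPrefetchBits, which this diff
neither adds nor shows, so it presumably already exists in dec/bit_reader.h as
a helper that returns the next 32 bits at the current bit position without
consuming them. A sketch of that assumed shape, not the verified definition:

static BROTLI_INLINE uint32_t BrotliPrefetchBits(BrotliBitReader* const br) {
  /* Expose the unread bits; callers advance br->bit_pos_ themselves. */
  return (uint32_t)(br->val_ >> br->bit_pos_);
}

With that shape, the ARMv7 branch performs the variable 64-bit shift once,
keeps the low 32 bits in a register, and serves both table lookups from it.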
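Note on the copy_length <= distance test in the ARMv7 block: memcpy is only
safe when the back-reference does not overlap the destination. When
copy_length exceeds distance, an LZ77 copy must re-read bytes it has just
written so that the pattern repeats, which is the behavior
IncrementalCopyFastPath preserves. A small illustration of the required
semantics, using a hypothetical byte-wise reference copy rather than code from
this patch:

#include <stdio.h>

static void IncrementalCopy(unsigned char* dst, const unsigned char* src,
                            int len) {
  /* Byte by byte, so bytes written earlier in this copy are read again. */
  while (len-- > 0) *dst++ = *src++;
}

int main(void) {
  unsigned char buf[16] = "abc";
  /* distance 3, length 9: the copy overlaps itself and repeats "abc". */
  IncrementalCopy(buf + 3, buf, 9);
  buf[12] = '\0';
  printf("%s\n", buf);  /* prints: abcabcabcabc */
  return 0;
}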