add max_length to Python streaming decompression

This commit is contained in:
Robert Obryk 2024-09-18 15:25:06 +02:00
parent 28ce91caf6
commit eb3a31e2d3
4 changed files with 175 additions and 59 deletions

View File

@ -606,57 +606,6 @@ static PyTypeObject brotli_CompressorType = {
brotli_Compressor_new, /* tp_new */ brotli_Compressor_new, /* tp_new */
}; };
/* Decompress all of |input| through |dec| into a freshly built bytes
 * object, growing the output buffer block-by-block until the decoder
 * stops asking for more room.
 * Returns a new reference on success, or NULL on failure (the caller is
 * expected to translate NULL into a Python exception). */
static PyObject* decompress_stream(BrotliDecoderState* dec,
    uint8_t* input, size_t input_length) {
  BrotliDecoderResult result;
  size_t available_in = input_length;
  const uint8_t* next_in = input;
  size_t available_out;
  uint8_t* next_out;
  BlocksOutputBuffer buffer = {.list=NULL};
  PyObject *ret;

  /* Output is effectively unbounded: limit is PY_SSIZE_T_MAX. */
  if (BlocksOutputBuffer_InitAndGrow(&buffer, PY_SSIZE_T_MAX, &available_out, &next_out) < 0) {
    goto error;
  }

  while (1) {
    /* Release the GIL for the potentially long-running decode step. */
    Py_BEGIN_ALLOW_THREADS
    result = BrotliDecoderDecompressStream(dec,
                                           &available_in, &next_in,
                                           &available_out, &next_out, NULL);
    Py_END_ALLOW_THREADS
    if (result == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) {
      if (available_out == 0) {
        /* Current block is full: append a fresh block and retry. */
        if (BlocksOutputBuffer_Grow(&buffer, &available_out, &next_out) < 0) {
          goto error;
        }
      }
      continue;
    }
    break;
  }

  /* A decode error, or leftover input after the stream ended (trailing
   * garbage), are both treated as failure. */
  if (result == BROTLI_DECODER_RESULT_ERROR || available_in != 0) {
    goto error;
  }

  ret = BlocksOutputBuffer_Finish(&buffer, available_out);
  if (ret != NULL) {
    goto finally;
  }
  /* Deliberate fall-through: Finish failed, clean up like any error. */

error:
  BlocksOutputBuffer_OnError(&buffer);
  ret = NULL;
finally:
  return ret;
}
PyDoc_STRVAR(brotli_Decompressor_doc, PyDoc_STRVAR(brotli_Decompressor_doc,
"An object to decompress a byte string.\n" "An object to decompress a byte string.\n"
"\n" "\n"
@ -669,10 +618,14 @@ PyDoc_STRVAR(brotli_Decompressor_doc,
typedef struct { typedef struct {
PyObject_HEAD PyObject_HEAD
BrotliDecoderState* dec; BrotliDecoderState* dec;
uint8_t* unconsumed_data;
size_t unconsumed_data_length;
} brotli_Decompressor; } brotli_Decompressor;
static void brotli_Decompressor_dealloc(brotli_Decompressor* self) { static void brotli_Decompressor_dealloc(brotli_Decompressor* self) {
BrotliDecoderDestroyInstance(self->dec); BrotliDecoderDestroyInstance(self->dec);
if (self->unconsumed_data)
free(self->unconsumed_data);
#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
Py_TYPE(self)->tp_free((PyObject*)self); Py_TYPE(self)->tp_free((PyObject*)self);
#else #else
@ -688,6 +641,9 @@ static PyObject* brotli_Decompressor_new(PyTypeObject *type, PyObject *args, PyO
self->dec = BrotliDecoderCreateInstance(0, 0, 0); self->dec = BrotliDecoderCreateInstance(0, 0, 0);
} }
self->unconsumed_data = NULL;
self->unconsumed_data_length = 0;
return (PyObject *)self; return (PyObject *)self;
} }
@ -706,6 +662,79 @@ static int brotli_Decompressor_init(brotli_Decompressor *self, PyObject *args, P
return 0; return 0;
} }
/* Decompress |input| through self->dec, producing at most
 * |max_output_length| bytes of output.  If the output limit is reached
 * before all input is consumed, the unconsumed tail is copied into
 * self->unconsumed_data so that a later call with empty input can keep
 * draining the decoder.
 * Returns a new bytes object, or NULL with a Python exception implied
 * (the caller raises when it sees NULL; MemoryError is set here directly
 * for allocation failures). */
static PyObject* decompress_stream(brotli_Decompressor* self,
    uint8_t* input, size_t input_length, Py_ssize_t max_output_length) {
  BrotliDecoderResult result;
  size_t available_in = input_length;
  const uint8_t* next_in = input;
  size_t available_out;
  uint8_t* next_out;
  uint8_t* new_tail;
  BlocksOutputBuffer buffer = {.list=NULL};
  PyObject *ret;

  if (BlocksOutputBuffer_InitAndGrow(&buffer, max_output_length, &available_out, &next_out) < 0) {
    goto error;
  }

  while (1) {
    /* Release the GIL for the potentially long-running decode step. */
    Py_BEGIN_ALLOW_THREADS
    result = BrotliDecoderDecompressStream(self->dec,
                                           &available_in, &next_in,
                                           &available_out, &next_out, NULL);
    Py_END_ALLOW_THREADS
    if (result == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) {
      if (available_out == 0) {
        if (buffer.allocated == PY_SSIZE_T_MAX) {
          /* Cannot grow any further even without a caller limit. */
          PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
          goto error;
        }
        if (buffer.allocated == max_output_length) {
          /* Output length limit reached: stop decoding and stash the
           * unconsumed input below. */
          break;
        }
        if (BlocksOutputBuffer_Grow(&buffer, &available_out, &next_out) < 0) {
          goto error;
        }
      }
      continue;
    }
    if (result == BROTLI_DECODER_RESULT_ERROR || available_in != 0) {
      /* Decode error, or trailing garbage after the end of the stream.
       * Drop the input tail so the finally block does not preserve it. */
      available_in = 0;
      goto error;
    }
    break;
  }

  ret = BlocksOutputBuffer_Finish(&buffer, available_out);
  if (ret != NULL) {
    goto finally;
  }
  /* Deliberate fall-through: Finish failed, clean up like any error. */

error:
  BlocksOutputBuffer_OnError(&buffer);
  ret = NULL;
finally:
  /* Preserve any unconsumed input for the next process() call.  Error
   * paths reset available_in to 0 above, so nothing is kept for them. */
  if (available_in > 0) {
    new_tail = malloc(available_in);
    if (new_tail == NULL) {
      /* FIX: the original memcpy'd into an unchecked malloc result (UB on
       * allocation failure).  Fail the call instead of dropping input. */
      Py_XDECREF(ret);
      ret = PyErr_NoMemory();
      available_in = 0;
    } else {
      memcpy(new_tail, next_in, available_in);
    }
  } else {
    new_tail = NULL;
  }
  free(self->unconsumed_data);  /* free(NULL) is a no-op */
  self->unconsumed_data = new_tail;
  self->unconsumed_data_length = available_in;
  return ret;
}
PyDoc_STRVAR(brotli_Decompressor_process_doc, PyDoc_STRVAR(brotli_Decompressor_process_doc,
"Process \"string\" for decompression, returning a string that contains \n" "Process \"string\" for decompression, returning a string that contains \n"
"decompressed output data. This data should be concatenated to the output \n" "decompressed output data. This data should be concatenated to the output \n"
@ -713,28 +742,38 @@ PyDoc_STRVAR(brotli_Decompressor_process_doc,
"Some or all of the input may be kept in internal buffers for later \n" "Some or all of the input may be kept in internal buffers for later \n"
"processing, and the decompressed output data may be empty until enough input \n" "processing, and the decompressed output data may be empty until enough input \n"
"has been accumulated.\n" "has been accumulated.\n"
"If max_output_length is set, no more than max_output_length bytes will be\n"
"returned. If the limit is reached, further calls to process (potentially with\n"
"empty input) will continue to yield more data. If, after returning a string of\n"
"the length equal to limit, can_accept_more_data() returns False, process()\n"
"must only be called with empty input until can_accept_more_data() once again\n"
"returns True.\n"
"\n" "\n"
"Signature:\n" "Signature:\n"
" decompress(string)\n" " decompress(string, max_output_length=int)\n"
"\n" "\n"
"Args:\n" "Args:\n"
" string (bytes): The input data\n" " string (bytes): The input data\n"
"\n" "\n""Returns:\n"
"Returns:\n"
" The decompressed output data (bytes)\n" " The decompressed output data (bytes)\n"
"\n" "\n"
"Raises:\n" "Raises:\n"
" brotli.error: If decompression fails\n"); " brotli.error: If decompression fails\n");
static PyObject* brotli_Decompressor_process(brotli_Decompressor *self, PyObject *args) { static PyObject* brotli_Decompressor_process(brotli_Decompressor *self, PyObject *args, PyObject* keywds) {
PyObject* ret; PyObject* ret;
Py_buffer input; Py_buffer input;
int ok; int ok;
Py_ssize_t max_output_length = PY_SSIZE_T_MAX;
uint8_t* data;
size_t data_length;
static char* kwlist[] = { "", "max_output_length", NULL };
#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
ok = PyArg_ParseTuple(args, "y*:process", &input); ok = PyArg_ParseTupleAndKeywords(args, keywds, "y*|n:process", kwlist, &input, &max_output_length);
#else #else
ok = PyArg_ParseTuple(args, "s*:process", &input); ok = PyArg_ParseTupleAndKeywords(args, keywds, "s*|n:process", kwlist, &input, &max_output_length);
#endif #endif
if (!ok) { if (!ok) {
@ -745,7 +784,20 @@ static PyObject* brotli_Decompressor_process(brotli_Decompressor *self, PyObject
goto error; goto error;
} }
ret = decompress_stream(self->dec, (uint8_t*) input.buf, input.len); if (self->unconsumed_data_length > 0) {
if (input.len > 0) {
PyErr_SetString(BrotliError, "process called with data when accept_more_data is False");
ret = NULL;
goto finally;
}
data = self->unconsumed_data;
data_length = self->unconsumed_data_length;
} else {
data = (uint8_t*)input.buf;
data_length = input.len;
}
ret = decompress_stream(self, data, data_length, max_output_length);
if (ret != NULL) { if (ret != NULL) {
goto finally; goto finally;
} }
@ -787,13 +839,35 @@ static PyObject* brotli_Decompressor_is_finished(brotli_Decompressor *self) {
} }
} }
PyDoc_STRVAR(brotli_Decompressor_can_accept_more_data_doc,
"Checks if the decoder instance can accept more compressed data. If the\n"
"process() method on this instance of decompressor was never called with\n"
"max_output_length, this method will always return True.\n"
"\n"
"Signature:\n"
"  can_accept_more_data()\n"
"\n"
"Returns:\n"
"  True if the decoder is ready to accept more compressed data via process()\n"
"  False if the decoder needs to output some data via process(b'') before\n"
"  being provided any more compressed data\n");

/* True unless a previous process() call hit its max_output_length and
 * left unconsumed input buffered in self->unconsumed_data. */
static PyObject* brotli_Decompressor_can_accept_more_data(brotli_Decompressor* self) {
  if (self->unconsumed_data_length > 0) {
    Py_RETURN_FALSE;
  } else {
    Py_RETURN_TRUE;
  }
}
static PyMemberDef brotli_Decompressor_members[] = { static PyMemberDef brotli_Decompressor_members[] = {
{NULL} /* Sentinel */ {NULL} /* Sentinel */
}; };
static PyMethodDef brotli_Decompressor_methods[] = { static PyMethodDef brotli_Decompressor_methods[] = {
{"process", (PyCFunction)brotli_Decompressor_process, METH_VARARGS, brotli_Decompressor_process_doc}, {"process", (PyCFunction)brotli_Decompressor_process, METH_VARARGS | METH_KEYWORDS, brotli_Decompressor_process_doc},
{"is_finished", (PyCFunction)brotli_Decompressor_is_finished, METH_NOARGS, brotli_Decompressor_is_finished_doc}, {"is_finished", (PyCFunction)brotli_Decompressor_is_finished, METH_NOARGS, brotli_Decompressor_is_finished_doc},
{"can_accept_more_data", (PyCFunction)brotli_Decompressor_can_accept_more_data, METH_NOARGS, brotli_Decompressor_can_accept_more_data_doc},
{NULL} /* Sentinel */ {NULL} /* Sentinel */
}; };

View File

@ -4,6 +4,7 @@
# See file LICENSE for detail or copy at https://opensource.org/licenses/MIT # See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
import functools import functools
import os
import unittest import unittest
from . import _test_utils from . import _test_utils
@ -39,10 +40,51 @@ class TestDecompressor(_test_utils.TestCase):
out_file.write(self.decompressor.process(data)) out_file.write(self.decompressor.process(data))
self.assertTrue(self.decompressor.is_finished()) self.assertTrue(self.decompressor.is_finished())
def _decompress_with_limit(self, test_data, max_output_length):
    """Decompress ``test_data`` in 10 KiB chunks, never allowing a single
    process() call to return more than ``max_output_length`` bytes.

    Feeds new input only while can_accept_more_data() is True; otherwise
    drains buffered output with an empty process() call.
    """
    temp_uncompressed = _test_utils.get_temp_uncompressed_name(test_data)
    with open(temp_uncompressed, 'wb') as out_file:
        with open(test_data, 'rb') as in_file:
            chunk_iter = iter(functools.partial(in_file.read, 10 * 1024), b'')
            while not self.decompressor.is_finished():
                data = b''
                if self.decompressor.can_accept_more_data():
                    data = next(chunk_iter, b'')
                decompressed_data = self.decompressor.process(
                    data, max_output_length=max_output_length)
                # The output cap must be honored on every single call.
                self.assertLessEqual(len(decompressed_data), max_output_length)
                out_file.write(decompressed_data)
            # The entire compressed input must have been consumed.
            self.assertIsNone(next(chunk_iter, None))
def _test_decompress(self, test_data): def _test_decompress(self, test_data):
self._decompress(test_data) self._decompress(test_data)
self._check_decompression(test_data) self._check_decompression(test_data)
def _test_decompress_with_limit(self, test_data):
    # Use a deliberately tiny cap (20 bytes) so the output-limit path is
    # exercised many times, then verify the round-tripped file contents.
    self._decompress_with_limit(test_data, max_output_length=20)
    self._check_decompression(test_data)
def test_too_much_input(self):
    """Feeding new data while output is still pending must raise."""
    path = os.path.join(_test_utils.TESTDATA_DIR, "zerosukkanooa.compressed")
    with open(path, 'rb') as in_file:
        compressed = in_file.read()
    self.decompressor.process(compressed[:-1], max_output_length=1)
    # Sanity check of the test setup itself: the limit must have been hit,
    # leaving the decompressor unwilling to accept more input.
    self.assertFalse(self.decompressor.can_accept_more_data())
    with self.assertRaises(brotli.error):
        self.decompressor.process(compressed[-1:])
def test_changing_limit(self):
    """A stream started with max_output_length can be finished without one."""
    test_data = os.path.join(_test_utils.TESTDATA_DIR, "zerosukkanooa.compressed")
    temp_uncompressed = _test_utils.get_temp_uncompressed_name(test_data)
    with open(temp_uncompressed, 'wb') as out_file:
        with open(test_data, 'rb') as in_file:
            compressed = in_file.read()
            uncompressed = self.decompressor.process(
                compressed[:-1], max_output_length=1)
            self.assertLessEqual(len(uncompressed), 1)
            out_file.write(uncompressed)
            # Drain buffered output until the decoder accepts input again.
            while not self.decompressor.can_accept_more_data():
                out_file.write(self.decompressor.process(b''))
            # Finish the stream with no output limit at all.
            out_file.write(self.decompressor.process(compressed[-1:]))
    self._check_decompression(test_data)
def test_garbage_appended(self): def test_garbage_appended(self):
with self.assertRaises(brotli.error): with self.assertRaises(brotli.error):
self.decompressor.process(brotli.compress(b'a') + b'a') self.decompressor.process(brotli.compress(b'a') + b'a')

BIN
tests/testdata/zerosukkanooa vendored Normal file

Binary file not shown.

BIN
tests/testdata/zerosukkanooa.compressed vendored Normal file

Binary file not shown.