From 9f830e0c416c4a137f71e838764064aff5ef11b6 Mon Sep 17 00:00:00 2001 From: "Markus F.X.J. Oberhumer" Date: Fri, 13 Jan 2023 22:07:24 +0100 Subject: [PATCH] all: add zstd compression library --- .github/workflows/ci.yml | 9 +- CMakeLists.txt | 24 ++++ Makefile | 3 + src/compress.cpp | 13 +++ src/compress.h | 22 ++++ src/compress_zstd.cpp | 231 +++++++++++++++++++++++++++++++++++++++ src/conf.h | 24 +++- src/help.cpp | 9 ++ src/main.cpp | 7 +- src/util/membuffer.cpp | 2 + 10 files changed, 334 insertions(+), 10 deletions(-) create mode 100644 src/compress_zstd.cpp diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3d948f7c..36d0cfff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -259,7 +259,7 @@ jobs: git config --global core.autocrlf input git --version && bash --version git clone --depth=1 https://github.com/upx/upx-testsuite ../deps/upx-testsuite - mkdir -p -v build/$C/$B/{ucl,upx,zlib} + mkdir -p -v build/$C/$B/{ucl,upx,zlib,zstd} - name: 'Set up Developer Command Prompt' uses: ilammy/msvc-dev-cmd@v1 with: @@ -281,12 +281,17 @@ jobs: cd %BDIR%\zlib cl -MT -J -O2 -W3 -WX %DEFS% -c %H%\vendor\zlib\*.c link -lib -out:zlib.lib *.obj + @REM ===== build zstd ===== + cd %BDIR%\zstd + set s=%H%\vendor\zstd\lib + cl -MT -J -O2 -W4 -WX -DDYNAMIC_BMI2=0 -DZSTD_DISABLE_ASM %DEFS% -c %s%\common\*.c %s%\compress\*.c %s%\decompress\*.c + link -lib -out:zstd.lib *.obj @REM ===== build UPX ===== cd %BDIR%\upx set s=%H%\src cat .GITREV.txt set /p GITREV=<.GITREV.txt - cl -std:c++17 -Zc:__cplusplus -EHsc -J -O2 -W4 -WX -DUPX_VERSION_GITREV="""%GITREV%""" %DEFS% -I%H%\vendor -I%H%\vendor\boost-pfr\include -Feupx.exe %s%\*.cpp %s%\util\*.cpp %BDIR%\ucl\ucl.lib %BDIR%\zlib\zlib.lib /link setargv.obj + cl -std:c++17 -Zc:__cplusplus -EHsc -J -O2 -W4 -WX -DUPX_VERSION_GITREV="""%GITREV%""" -DWITH_ZSTD %DEFS% -I%H%\vendor -I%H%\vendor\boost-pfr\include -Feupx.exe %s%\*.cpp %s%\util\*.cpp %BDIR%\ucl\ucl.lib %BDIR%\zlib\zlib.lib 
%BDIR%\zstd\zstd.lib /link setargv.obj - name: 'Make artifact' shell: bash run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 852df126..82c8339f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,12 +6,14 @@ if(NOT IS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.git") option(UPX_CONFIG_DISABLE_GITREV "Do not compile with default Git version info." ON) option(UPX_CONFIG_DISABLE_SANITIZE "Do not compile with default sanitize options." ON) option(UPX_CONFIG_DISABLE_WERROR "Do not compile with default -Werror option." ON) + option(UPX_CONFIG_DISABLE_ZSTD "Do not compile with zstd; NOTE: zstd is WIP." ON) else() # strict config defaults for devel builds message(STATUS "upx info: strict config defaults enabled") option(UPX_CONFIG_DISABLE_GITREV "Do not compile with default Git version info." OFF) option(UPX_CONFIG_DISABLE_SANITIZE "Do not compile with default sanitize options." OFF) option(UPX_CONFIG_DISABLE_WERROR "Do not compile with default -Werror option." OFF) + option(UPX_CONFIG_DISABLE_ZSTD "Do not compile with zstd; NOTE: zstd is WIP." 
OFF) endif() # test config options (see below) @@ -109,6 +111,13 @@ list(SORT zlib_SOURCES) add_library(upx_vendor_zlib STATIC ${zlib_SOURCES}) set_property(TARGET upx_vendor_zlib PROPERTY C_STANDARD 11) +if(NOT UPX_CONFIG_DISABLE_ZSTD) +file(GLOB zstd_SOURCES "vendor/zstd/lib/*/*.c") +list(SORT zstd_SOURCES) +add_library(upx_vendor_zstd STATIC ${zstd_SOURCES}) +set_property(TARGET upx_vendor_zstd PROPERTY C_STANDARD 11) +endif() + file(GLOB upx_SOURCES "src/*.cpp" "src/util/*.cpp") list(SORT upx_SOURCES) add_executable(upx ${upx_SOURCES}) @@ -181,6 +190,17 @@ else() target_compile_options(${t} PRIVATE -Wall -Wextra -Wvla -Wno-strict-prototypes ${warn_Werror}) endif() +if(NOT UPX_CONFIG_DISABLE_ZSTD) +set(t upx_vendor_zstd) +upx_sanitize_target(${t}) +target_compile_options(${t} PRIVATE -DDYNAMIC_BMI2=0 -DZSTD_DISABLE_ASM) +if(MSVC) + target_compile_options(${t} PRIVATE -J -W4 ${warn_WX}) +else() + target_compile_options(${t} PRIVATE -Wall -Wextra -Wcast-align -Wcast-qual -Wpointer-arith -Wvla -Wwrite-strings ${warn_Werror}) +endif() +endif() + set(t upx) target_include_directories(${t} PRIVATE vendor vendor/boost-pfr/include) target_compile_definitions(${t} PRIVATE $<$:DEBUG=1>) @@ -199,6 +219,10 @@ else() -Wshadow -Wvla -Wwrite-strings ${warn_Werror} ) endif() +if(NOT UPX_CONFIG_DISABLE_ZSTD) + target_compile_definitions(${t} PRIVATE WITH_ZSTD=1) + target_link_libraries(upx upx_vendor_zstd) +endif() #*********************************************************************** # "make test" diff --git a/Makefile b/Makefile index a6dba611..f35192b2 100644 --- a/Makefile +++ b/Makefile @@ -150,3 +150,6 @@ endif ifeq ($(wildcard ./vendor/zlib/crc32.c),) $(error ERROR: missing git submodule; run 'git submodule update --init') endif +ifeq ($(wildcard ./vendor/zstd/lib/.),) + $(error ERROR: missing git submodule; run 'git submodule update --init') +endif diff --git a/src/compress.cpp b/src/compress.cpp index 50ae65d7..1c7f02bf 100644 --- a/src/compress.cpp +++ 
b/src/compress.cpp @@ -117,6 +117,11 @@ int upx_compress( const upx_bytep src, unsigned src_len, else if (M_IS_NRV2B(method) || M_IS_NRV2D(method) || M_IS_NRV2E(method)) r = upx_ucl_compress(src, src_len, dst, dst_len, cb, method, level, cconf, cresult); +#endif +#if (WITH_ZSTD) + else if (M_IS_ZSTD(method)) + r = upx_zstd_compress(src, src_len, dst, dst_len, + cb, method, level, cconf, cresult); #endif else { throwInternalError("unknown compression method"); @@ -164,6 +169,10 @@ int upx_decompress(const upx_bytep src, unsigned src_len, #if (WITH_ZLIB) else if (M_IS_DEFLATE(method)) r = upx_zlib_decompress(src, src_len, dst, dst_len, method, cresult); +#endif +#if (WITH_ZSTD) + else if (M_IS_ZSTD(method)) + r = upx_zstd_decompress(src, src_len, dst, dst_len, method, cresult); #endif else { throwInternalError("unknown decompression method"); @@ -207,6 +216,10 @@ int upx_test_overlap( const upx_bytep buf, #if (WITH_UCL) else if (M_IS_NRV2B(method) || M_IS_NRV2D(method) || M_IS_NRV2E(method)) r = upx_ucl_test_overlap(buf, tbuf, src_off, src_len, dst_len, method, cresult); +#endif +#if (WITH_ZSTD) + else if (M_IS_ZSTD(method)) + r = upx_zstd_test_overlap(buf, tbuf, src_off, src_len, dst_len, method, cresult); #endif else { throwInternalError("unknown decompression method"); diff --git a/src/compress.h b/src/compress.h index 6ac5f01d..f2c1c519 100644 --- a/src/compress.h +++ b/src/compress.h @@ -126,6 +126,28 @@ unsigned upx_zlib_crc32 (const void *buf, unsigned len, unsigned crc); #endif +#if (WITH_ZSTD) +int upx_zstd_init(void); +const char *upx_zstd_version_string(void); +int upx_zstd_compress ( const upx_bytep src, unsigned src_len, + upx_bytep dst, unsigned* dst_len, + upx_callback_p cb, + int method, int level, + const upx_compress_config_t *cconf, + upx_compress_result_t *cresult ); +int upx_zstd_decompress ( const upx_bytep src, unsigned src_len, + upx_bytep dst, unsigned* dst_len, + int method, + const upx_compress_result_t *cresult ); +int 
upx_zstd_test_overlap ( const upx_bytep buf, + const upx_bytep tbuf, + unsigned src_off, unsigned src_len, + unsigned* dst_len, + int method, + const upx_compress_result_t *cresult ); +#endif + + #endif /* already included */ /* vim:set ts=4 sw=4 et: */ diff --git a/src/compress_zstd.cpp b/src/compress_zstd.cpp new file mode 100644 index 00000000..367eef8f --- /dev/null +++ b/src/compress_zstd.cpp @@ -0,0 +1,231 @@ +/* compress_zstd.cpp -- + + This file is part of the UPX executable compressor. + + Copyright (C) 1996-2023 Markus Franz Xaver Johannes Oberhumer + All Rights Reserved. + + UPX and the UCL library are free software; you can redistribute them + and/or modify them under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. + If not, write to the Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + Markus F.X.J. 
Oberhumer
+
+ */
+
+#include "conf.h"
+
+void zstd_compress_config_t::reset() { mem_clear(this, sizeof(*this)); } // compiled even without WITH_ZSTD
+
+#if WITH_ZSTD
+#include "compress.h"
+#include "util/membuffer.h"
+#include <zstd/lib/zstd.h> // ZSTD_compress, ZSTD_decompress, ZSTD_isError, ZSTD_versionString
+#include <zstd/lib/zstd_errors.h> // ZSTD_ErrorCode, ZSTD_getErrorCode
+// NOTE(review): a third #include stood here but its header name is missing from this copy of the patch; nothing below needs it - confirm against upstream
+
+static int convert_errno_from_zstd(size_t zr) { // map a failed zstd result code to a UPX_E_* error code
+    const ZSTD_ErrorCode ze = ZSTD_getErrorCode(zr);
+    switch (ze) {
+    case ZSTD_error_memory_allocation:
+        return UPX_E_OUT_OF_MEMORY;
+    case ZSTD_error_srcSize_wrong:
+        return UPX_E_INPUT_OVERRUN;
+    case ZSTD_error_dstSize_tooSmall:
+        return UPX_E_OUTPUT_OVERRUN;
+    default:
+        break;
+    }
+    return UPX_E_ERROR; // every other zstd error code
+}
+
+/*************************************************************************
+// compress - one-shot ZSTD_compress(); TODO later: use advanced compression API for compression finetuning
+**************************************************************************/
+
+int upx_zstd_compress(const upx_bytep src, unsigned src_len, upx_bytep dst, unsigned *dst_len,
+                      upx_callback_p cb_parm, int method, int level,
+                      const upx_compress_config_t *cconf_parm, upx_compress_result_t *cresult) {
+    assert(method == M_ZSTD);
+    assert(level > 0);
+    assert(cresult != nullptr);
+    UNUSED(cb_parm); // the callback is never invoked here
+    int r = UPX_E_ERROR;
+    size_t zr;
+    const zstd_compress_config_t *const lcconf = cconf_parm ? &cconf_parm->conf_zstd : nullptr;
+    zstd_compress_result_t *const res = &cresult->result_zstd;
+
+    // TODO later: map level 1..10 to zstd-level 1..22; for now only level 10 is remapped (to 22)
+    if (level == 10)
+        level = 22;
+
+    // cconf overrides (none yet: zstd_compress_config_t only has a dummy member)
+    if (lcconf) {
+        UNUSED(lcconf);
+    }
+
+    res->dummy = 0;
+
+    zr = ZSTD_compress(dst, *dst_len, src, src_len, level); // *dst_len is the capacity of dst on entry
+    if (ZSTD_isError(zr)) {
+        *dst_len = 0; // report no output on failure; TODO ???
+        r = convert_errno_from_zstd(zr);
+        assert(r != UPX_E_OK);
+    } else {
+        assert(zr <= *dst_len);
+        *dst_len = (unsigned) zr; // compressed size
+        r = UPX_E_OK;
+    }
+
+    return r;
+}
+
+/*************************************************************************
+// decompress - one-shot ZSTD_decompress(); *dst_len is the dst capacity on entry, the decompressed size on success, 0 on error
+**************************************************************************/
+
+int upx_zstd_decompress(const upx_bytep src, unsigned src_len, upx_bytep dst, unsigned *dst_len,
+                        int method, const upx_compress_result_t *cresult) {
+    assert(method == M_ZSTD);
+    UNUSED(method);
+    UNUSED(cresult);
+    int r = UPX_E_ERROR;
+    size_t zr;
+
+    zr = ZSTD_decompress(dst, *dst_len, src, src_len);
+    if (ZSTD_isError(zr)) {
+        *dst_len = 0; // report no output on failure; TODO ???
+        r = convert_errno_from_zstd(zr);
+        assert(r != UPX_E_OK);
+    } else {
+        assert(zr <= *dst_len);
+        *dst_len = (unsigned) zr; // decompressed size
+        r = UPX_E_OK;
+    }
+
+    return r;
+}
+
+/*************************************************************************
+// test_overlap - trial-decompress inside a private buffer that holds the compressed data at offset src_off
+**************************************************************************/
+
+int upx_zstd_test_overlap(const upx_bytep buf, const upx_bytep tbuf, unsigned src_off,
+                          unsigned src_len, unsigned *dst_len, int method,
+                          const upx_compress_result_t *cresult) {
+    assert(method == M_ZSTD);
+
+    MemBuffer b(src_off + src_len);
+    memcpy(b + src_off, buf + src_off, src_len);
+    unsigned saved_dst_len = *dst_len; // the size the caller expects to get back
+    int r = upx_zstd_decompress(raw_index_bytes(b, src_off, src_len), src_len,
+                                raw_bytes(b, *dst_len), dst_len, method, cresult);
+    if (r != UPX_E_OK)
+        return r;
+    if (*dst_len != saved_dst_len)
+        return UPX_E_ERROR;
+    // NOTE: there is a very tiny possibility that decompression has
+    //   succeeded but the data is not restored correctly because of
+    //   in-place buffer overlapping, so we use an extra memcmp().
+    if (tbuf != nullptr && memcmp(tbuf, b, *dst_len) != 0)
+        return UPX_E_ERROR;
+    return UPX_E_OK;
+}
+
+/*************************************************************************
+// misc - library version check and version string
+**************************************************************************/
+
+int upx_zstd_init(void) {
+    if (strcmp(ZSTD_VERSION_STRING, ZSTD_versionString()) != 0)
+        return -2; // ZSTD_VERSION_STRING and ZSTD_versionString() disagree
+    return 0;
+}
+
+const char *upx_zstd_version_string(void) { return ZSTD_VERSION_STRING; }
+
+/*************************************************************************
+// doctest checks
+**************************************************************************/
+
+#if DEBUG && !defined(DOCTEST_CONFIG_DISABLE) && 1
+
+#include "util/membuffer.h"
+
+static bool check_zstd(const int method, const int level, const unsigned expected_c_len) { // round-trip 16 KiB of zeros
+    const unsigned u_len = 16384;
+    const unsigned c_extra = 4096;
+    MemBuffer u_buf, c_buf, d_buf;
+    unsigned c_len, d_len;
+    upx_compress_result_t cresult;
+    int r;
+
+    u_buf.alloc(u_len);
+    memset(u_buf, 0, u_len);
+    c_buf.allocForCompression(u_len, c_extra);
+    d_buf.allocForDecompression(u_len);
+
+    c_len = c_buf.getSize() - c_extra; // compressed data is written at offset c_extra
+    r = upx_zstd_compress(raw_bytes(u_buf, u_len), u_len, raw_index_bytes(c_buf, c_extra, c_len),
+                          &c_len, nullptr, method, level, NULL_cconf, &cresult);
+    if (r != 0 || c_len != expected_c_len)
+        return false;
+
+    d_len = d_buf.getSize();
+    r = upx_zstd_decompress(raw_index_bytes(c_buf, c_extra, c_len), c_len, raw_bytes(d_buf, d_len),
+                            &d_len, method, nullptr);
+    if (r != 0 || d_len != u_len || memcmp(u_buf, d_buf, u_len) != 0)
+        return false;
+
+    d_len = u_len - 1; // output capacity one byte too small: decompression must fail
+    r = upx_zstd_decompress(raw_index_bytes(c_buf, c_extra, c_len), c_len, raw_bytes(d_buf, d_len),
+                            &d_len, method, nullptr);
+    if (r == 0)
+        return false;
+
+    // TODO: rewrite Packer::findOverlapOverhead() so that we can test it here
+    // unsigned x_len = d_len;
+    // r = upx_zstd_test_overlap(c_buf, u_buf, c_extra, c_len, &x_len, method, nullptr);
+    return true;
+}
+
+TEST_CASE("compress_zstd") { // check_zstd() must yield exactly 19 compressed bytes at levels 1, 3 and 5
+    CHECK(check_zstd(M_ZSTD, 1, 19));
+    CHECK(check_zstd(M_ZSTD, 3, 19));
+    CHECK(check_zstd(M_ZSTD, 5, 19));
+}
+
+#endif // DEBUG
+
+TEST_CASE("upx_zstd_decompress") { // decode a fixed 16-byte input; check success and both overrun errors
+    typedef const upx_byte C;
+    C *c_data;
+    upx_byte d_buf[32];
+    unsigned d_len;
+    int r;
+
+    c_data = (C *) "\x28\xb5\x2f\xfd\x20\x20\x3d\x00\x00\x08\xff\x01\x00\x34\x4e\x08";
+    d_len = 32;
+    r = upx_zstd_decompress(c_data, 16, d_buf, &d_len, M_ZSTD, nullptr);
+    CHECK((r == 0 && d_len == 32));
+    r = upx_zstd_decompress(c_data, 15, d_buf, &d_len, M_ZSTD, nullptr); // input cut short by one byte
+    CHECK(r == UPX_E_INPUT_OVERRUN);
+    d_len = 31; // output capacity one byte too small
+    r = upx_zstd_decompress(c_data, 16, d_buf, &d_len, M_ZSTD, nullptr);
+    CHECK(r == UPX_E_OUTPUT_OVERRUN);
+}
+
+#endif // WITH_ZSTD
+
+/* vim:set ts=4 sw=4 et: */
diff --git a/src/conf.h b/src/conf.h
index 515a8f39..1cb4970f 100644
--- a/src/conf.h
+++ b/src/conf.h
@@ -563,6 +563,7 @@ constexpr bool string_ge(const char *a, const char *b) {
 //#define M_CL1B_LE16 13
 #define M_LZMA 14
 #define M_DEFLATE 15 /* zlib */
+#define M_ZSTD 16
 // compression methods internal usage
 #define M_ALL (-1)
 #define M_END (-2)
@@ -576,6 +577,7 @@ constexpr bool string_ge(const char *a, const char *b) {
 //#define M_IS_CL1B(x) ((x) >= M_CL1B_LE32 && (x) <= M_CL1B_LE16)
 #define M_IS_LZMA(x) (((x) & 255) == M_LZMA)
 #define M_IS_DEFLATE(x) ((x) == M_DEFLATE)
+#define M_IS_ZSTD(x) ((x) == M_ZSTD)
 
 
 // filters
@@ -672,13 +674,11 @@ struct lzma_compress_config_t
     void reset();
 };
 
-
 struct ucl_compress_config_t : public REAL_ucl_compress_config_t
 {
     void reset() { memset(this, 0xff, sizeof(*this)); }
 };
 
-
 struct zlib_compress_config_t
 {
     typedef OptVar mem_level_t; // ml
@@ -692,13 +692,20 @@ struct zlib_compress_config_t
     void reset();
 };
 
+struct zstd_compress_config_t
+{
+    unsigned dummy;
+
+    void reset();
+};
 
 struct upx_compress_config_t
 {
     lzma_compress_config_t conf_lzma;
     ucl_compress_config_t conf_ucl;
     zlib_compress_config_t conf_zlib;
-    void reset() { conf_lzma.reset(); conf_ucl.reset(); 
conf_zlib.reset(); } + zstd_compress_config_t conf_zstd; + void reset() { conf_lzma.reset(); conf_ucl.reset(); conf_zlib.reset(); conf_zstd.reset(); } }; #define NULL_cconf ((upx_compress_config_t *) nullptr) @@ -722,7 +729,6 @@ struct lzma_compress_result_t void reset() { memset(this, 0, sizeof(*this)); } }; - struct ucl_compress_result_t { ucl_uint result[16]; @@ -730,7 +736,6 @@ struct ucl_compress_result_t void reset() { memset(this, 0, sizeof(*this)); } }; - struct zlib_compress_result_t { unsigned dummy; @@ -738,6 +743,12 @@ struct zlib_compress_result_t void reset() { memset(this, 0, sizeof(*this)); } }; +struct zstd_compress_result_t +{ + unsigned dummy; + + void reset() { memset(this, 0, sizeof(*this)); } +}; struct upx_compress_result_t { @@ -748,10 +759,11 @@ struct upx_compress_result_t lzma_compress_result_t result_lzma; ucl_compress_result_t result_ucl; zlib_compress_result_t result_zlib; + zstd_compress_result_t result_zstd; void reset() { memset(this, 0, sizeof(*this)); - result_lzma.reset(); result_ucl.reset(); result_zlib.reset(); + result_lzma.reset(); result_ucl.reset(); result_zlib.reset(); result_zstd.reset(); } }; diff --git a/src/help.cpp b/src/help.cpp index eb568a81..5e85fbbd 100644 --- a/src/help.cpp +++ b/src/help.cpp @@ -411,6 +411,11 @@ void show_version(bool one_line) if (v != nullptr && v[0]) fprintf(fp, "LZMA SDK version %s\n", v); #endif +#if (WITH_ZSTD) + v = upx_zstd_version_string(); + if (v != nullptr && v[0]) + fprintf(fp, "zstd data compression library %s\n", v); +#endif #if !defined(DOCTEST_CONFIG_DISABLE) fprintf(fp, "doctest C++ testing framework version %s\n", DOCTEST_VERSION_STR); #endif @@ -424,6 +429,10 @@ void show_version(bool one_line) #if (WITH_LZMA) fprintf(fp, "Copyright (C) 1999" "-2006 Igor Pavlov\n"); #endif +#if (WITH_ZSTD) + // see vendor/zstd/LICENSE; main author is Yann Collet + fprintf(fp, "Copyright (C) 2015" "-2023 Meta Platforms, Inc. 
and affiliates\n"); +#endif #if !defined(DOCTEST_CONFIG_DISABLE) fprintf(fp, "Copyright (C) 2016" "-2021 Viktor Kirilov\n"); #endif diff --git a/src/main.cpp b/src/main.cpp index 0c56d707..9f907966 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1208,11 +1208,14 @@ int upx_main(int argc, char *argv[]) { set_term(stderr); assert(upx_lzma_init() == 0); - assert(upx_ucl_init() == 0); - assert(upx_zlib_init() == 0); #if (WITH_NRV) assert(upx_nrv_init() == 0); #endif + assert(upx_ucl_init() == 0); + assert(upx_zlib_init() == 0); +#if (WITH_ZSTD) + assert(upx_zstd_init() == 0); +#endif /* get options */ first_options(argc, argv); diff --git a/src/util/membuffer.cpp b/src/util/membuffer.cpp index d7e9cfe7..0a0db6ce 100644 --- a/src/util/membuffer.cpp +++ b/src/util/membuffer.cpp @@ -129,6 +129,8 @@ unsigned MemBuffer::getSizeForCompression(unsigned uncompressed_size, unsigned e bytes = umax(bytes, (z / 3 * (8 + 2 * (w - 8) / 1)) / 8); // NRV2E: 1 byte plus 3 bits per pair of width exceeding 7 ("ss12") bytes = umax(bytes, (z / 3 * (8 + 3 * (w - 7) / 2)) / 8); + // zstd: ZSTD_COMPRESSBOUND + bytes = umax(bytes, z + (z >> 8) + ((z < (128 << 10)) ? (((128 << 10) - z) >> 11) : 0)); // extra + 256 safety for rounding bytes = mem_size(1, bytes, extra, 256); return bytes;