diff --git a/.gitignore b/.gitignore index 61b4fa5..d722831 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,41 @@ priv *.la *~ *.swp +_build +c_src/snappy/*.d +c_src/*.d +compile_commands.json + +### Elixir ### +/_build +/cover +/deps +/doc +/.fetch +erl_crash.dump +*.ez +*.beam +/config/*.secret.exs +.elixir_ls/ + +### Elixir Patch ### + +### Erlang ### +.eunit +*.o +*.plt +.concrete/DEV_MODE + +# rebar 2.x +.rebar +rel/example_project +ebin/*.beam +deps + +# rebar 3 +.rebar3 +_build/ +_checkouts/ + +### Erlang Patch ### +rebar3.crashdump diff --git a/Makefile b/Makefile index 058a7ec..9be4a26 100644 --- a/Makefile +++ b/Makefile @@ -1,41 +1,14 @@ -REBAR?=rebar +all: snappy eunit +snappy: + ./rebar3 compile -.PHONY: all -# target: all - Makes everything -all: build - - -.PHONY: build -# target: build - Builds the project -build: - $(REBAR) compile - - -.PHONY: check -# target: check - Checks if project builds and passes all the tests -check: build eunit +eunit: + ./rebar3 eunit +check: eunit -.PHONY: clean -# target: clean - Removes build artifacts clean: - $(REBAR) clean - - -.PHONY: distclean -# target: distclean - Removes all unversioned files -distclean: clean - git clean -fxd - - -.PHONY: eunit -# target: eunit - Runs eunit test suite -eunit: - $(REBAR) eunit - + ./rebar3 clean + rm -fr priv ebin -.PHONY: help -# target: help - Prints this help -help: - @egrep "^# target:" Makefile | sed -e 's/^# target: //g' | sort diff --git a/README.md b/README.md index 0e54b59..b851eec 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,42 @@ Google snappy's official repository and bug tracking system is at: Its source is included in this project. +# changelog + +- tag `1.1.2` as tests pass against OTP-24 +- tag `otp-24` compatibility +- switch build chain to [rebar3] and [enc] requiring OTP20 or newer +- tag `snappy-1.1.8`, also first version compatible with OTP23 (thanks @skaes) +- tag `snappy-1.1.3` +- add a changelog + +[rebar3]: https://rebar3.org/ +[enc]: https://github.com/davisp/erlang-native-compiler # site -https://github.com/fdmanana/snappy-erlang-nif +https://github.com/skunkwerks/snappy-erlang-nif + +# credits + +Software is built by a few people and maintained by many. Thank-you for +all your patches! + +- Arne Ehrlich +- Bryan Chan +- Dave Cottlehuber +- Filipe David Manana +- Joshua Scott +- Kelly McLaughlin +- Louis-Philippe Gauthier +- Mikhail Uvarov +- Peter Membrey +- Piotr Nosek +- benoitc +- @dch314 +- glejeune +- Stefan Kaes # performance tests diff --git a/c_src/snappy/snappy-internal.h b/c_src/snappy/snappy-internal.h index 0653dc6..1e1c307 100644 --- a/c_src/snappy/snappy-internal.h +++ b/c_src/snappy/snappy-internal.h @@ -36,21 +36,30 @@ namespace snappy { namespace internal { +// Working memory performs a single allocation to hold all scratch space +// required for compression. class WorkingMemory { public: - WorkingMemory() : large_table_(NULL) { } - ~WorkingMemory() { delete[] large_table_; } + explicit WorkingMemory(size_t input_size); + ~WorkingMemory(); // Allocates and clears a hash table using memory in "*this", // stores the number of buckets in "*table_size" and returns a pointer to // the base of the hash table. 
- uint16* GetHashTable(size_t input_size, int* table_size); + uint16* GetHashTable(size_t fragment_size, int* table_size) const; + char* GetScratchInput() const { return input_; } + char* GetScratchOutput() const { return output_; } private: - uint16 small_table_[1<<10]; // 2KB - uint16* large_table_; // Allocated only when needed + char* mem_; // the allocated memory, never nullptr + size_t size_; // the size of the allocated memory, never 0 + uint16* table_; // the pointer to the hashtable + char* input_; // the pointer to the input scratch buffer + char* output_; // the pointer to the output scratch buffer - DISALLOW_COPY_AND_ASSIGN(WorkingMemory); + // No copying + WorkingMemory(const WorkingMemory&); + void operator=(const WorkingMemory&); }; // Flat array compression that does not emit the "uncompressed length" @@ -70,57 +79,72 @@ char* CompressFragment(const char* input, uint16* table, const int table_size); -// Return the largest n such that +// Find the largest n such that // // s1[0,n-1] == s2[0,n-1] // and n <= (s2_limit - s2). // +// Return make_pair(n, n < 8). // Does not read *s2_limit or beyond. // Does not read *(s1 + (s2_limit - s2)) or beyond. // Requires that s2_limit >= s2. // -// Separate implementation for x86_64, for speed. Uses the fact that -// x86_64 is little endian. -#if defined(ARCH_K8) -static inline int FindMatchLength(const char* s1, - const char* s2, - const char* s2_limit) { +// Separate implementation for 64-bit, little-endian cpus. +#if !defined(SNAPPY_IS_BIG_ENDIAN) && \ + (defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)) +static inline std::pair FindMatchLength(const char* s1, + const char* s2, + const char* s2_limit) { assert(s2_limit >= s2); - int matched = 0; + size_t matched = 0; + + // This block isn't necessary for correctness; we could just start looping + // immediately. As an optimization though, it is useful. It creates some not + // uncommon code paths that determine, without extra effort, whether the match + // length is less than 8. In short, we are hoping to avoid a conditional + // branch, and perhaps get better code layout from the C++ compiler. + if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) { + uint64 a1 = UNALIGNED_LOAD64(s1); + uint64 a2 = UNALIGNED_LOAD64(s2); + if (a1 != a2) { + return std::pair(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3, + true); + } else { + matched = 8; + s2 += 8; + } + } // Find out how long the match is. We loop over the data 64 bits at a // time until we find a 64-bit block that doesn't match; then we find // the first non-matching bit and use that to calculate the total // length of the match. - while (PREDICT_TRUE(s2 <= s2_limit - 8)) { + while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) { if (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) { s2 += 8; matched += 8; } else { - // On current (mid-2008) Opteron models there is a 3% more - // efficient code sequence to find the first non-matching byte. - // However, what follows is ~10% better on Intel Core 2 and newer, - // and we expect AMD's bsf instruction to improve. 
uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched); int matching_bits = Bits::FindLSBSetNonZero64(x); matched += matching_bits >> 3; - return matched; + assert(matched >= 8); + return std::pair(matched, false); } } - while (PREDICT_TRUE(s2 < s2_limit)) { + while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) { if (s1[matched] == *s2) { ++s2; ++matched; } else { - return matched; + return std::pair(matched, matched < 8); } } - return matched; + return std::pair(matched, matched < 8); } #else -static inline int FindMatchLength(const char* s1, - const char* s2, - const char* s2_limit) { +static inline std::pair FindMatchLength(const char* s1, + const char* s2, + const char* s2_limit) { // Implementation based on the x86-64 version, above. assert(s2_limit >= s2); int matched = 0; @@ -140,10 +164,67 @@ static inline int FindMatchLength(const char* s1, ++matched; } } - return matched; + return std::pair(matched, matched < 8); } #endif +// Lookup tables for decompression code. Give --snappy_dump_decompression_table +// to the unit test to recompute char_table. + +enum { + LITERAL = 0, + COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode + COPY_2_BYTE_OFFSET = 2, + COPY_4_BYTE_OFFSET = 3 +}; +static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset. + +// Data stored per entry in lookup table: +// Range Bits-used Description +// ------------------------------------ +// 1..64 0..7 Literal/copy length encoded in opcode byte +// 0..7 8..10 Copy offset encoded in opcode byte / 256 +// 0..4 11..13 Extra bytes after opcode +// +// We use eight bits for the length even though 7 would have sufficed +// because of efficiency reasons: +// (1) Extracting a byte is faster than a bit-field +// (2) It properly aligns copy offset so we do not need a <<8 +static const uint16 char_table[256] = { + 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, + 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, + 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, + 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, + 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, + 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, + 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e, + 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, + 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, + 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, + 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, + 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, + 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, + 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, + 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, + 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, + 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, + 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, + 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, + 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, + 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, + 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, + 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, + 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, + 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, + 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 
0x1034, 0x2034, + 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, + 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, + 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, + 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, + 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, + 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 +}; + } // end namespace internal } // end namespace snappy diff --git a/c_src/snappy/snappy-stubs-internal.cc b/c_src/snappy/snappy-stubs-internal.cc index 6ed3343..66ed2e9 100644 --- a/c_src/snappy/snappy-stubs-internal.cc +++ b/c_src/snappy/snappy-stubs-internal.cc @@ -33,7 +33,7 @@ namespace snappy { -void Varint::Append32(string* s, uint32 value) { +void Varint::Append32(std::string* s, uint32 value) { char buf[Varint::kMax32]; const char* p = Varint::Encode32(buf, value); s->append(buf, p - buf); diff --git a/c_src/snappy/snappy-stubs-internal.h b/c_src/snappy/snappy-stubs-internal.h index a07b6df..4854689 100644 --- a/c_src/snappy/snappy-stubs-internal.h +++ b/c_src/snappy/snappy-stubs-internal.h @@ -45,11 +45,26 @@ #include #endif -#ifdef _MSC_VER -#include -typedef SSIZE_T ssize_t; +#ifdef HAVE_UNISTD_H +#include +#endif + +#if defined(_MSC_VER) +#include +#endif // defined(_MSC_VER) + +#ifndef __has_feature +#define __has_feature(x) 0 #endif +#if __has_feature(memory_sanitizer) +#include +#define SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) \ + __msan_unpoison((address), (size)) +#else +#define SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) /* empty */ +#endif // __has_feature(memory_sanitizer) + #include "snappy-stubs-public.h" #if defined(__x86_64__) @@ -57,6 +72,14 @@ typedef SSIZE_T ssize_t; // Enable 64-bit optimized versions of some routines. #define ARCH_K8 1 +#elif defined(__ppc64__) + +#define ARCH_PPC 1 + +#elif defined(__aarch64__) + +#define ARCH_ARM 1 + #endif // Needed by OS X, among others. @@ -64,10 +87,6 @@ typedef SSIZE_T ssize_t; #define MAP_ANONYMOUS MAP_ANON #endif -// Pull in std::min, std::ostream, and the likes. This is safe because this -// header file is never used from any public header files. -using namespace std; - // The size of an array, if known at compile-time. // Will give unexpected results if used on a pointer. // We undefine it first, since some compilers already have a definition. @@ -78,11 +97,11 @@ using namespace std; // Static prediction hints. #ifdef HAVE_BUILTIN_EXPECT -#define PREDICT_FALSE(x) (__builtin_expect(x, 0)) -#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) +#define SNAPPY_PREDICT_FALSE(x) (__builtin_expect(x, 0)) +#define SNAPPY_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) #else -#define PREDICT_FALSE(x) x -#define PREDICT_TRUE(x) x +#define SNAPPY_PREDICT_FALSE(x) x +#define SNAPPY_PREDICT_TRUE(x) x #endif // This is only used for recomputing the tag byte table used during @@ -101,9 +120,10 @@ static const int64 kint64max = static_cast(0x7FFFFFFFFFFFFFFFLL); // Potentially unaligned loads and stores. -// x86 and PowerPC can simply do these loads and stores native. +// x86, PowerPC, and ARM64 can simply do these loads and stores native. 
-#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) +#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || \ + defined(__aarch64__) #define UNALIGNED_LOAD16(_p) (*reinterpret_cast(_p)) #define UNALIGNED_LOAD32(_p) (*reinterpret_cast(_p)) @@ -121,6 +141,15 @@ static const int64 kint64max = static_cast(0x7FFFFFFFFFFFFFFFLL); // sub-architectures. // // This is a mess, but there's not much we can do about it. +// +// To further complicate matters, only LDR instructions (single reads) are +// allowed to be unaligned, not LDRD (two reads) or LDM (many reads). Unless we +// explicitly tell the compiler that these accesses can be unaligned, it can and +// will combine accesses. On armcc, the way to signal this is done by accessing +// through the type (uint32 __packed *), but GCC has no such attribute +// (it ignores __attribute__((packed)) on individual variables). However, +// we can tell it that a _struct_ is unaligned, which has the same effect, +// so we do that. #elif defined(__arm__) && \ !defined(__ARM_ARCH_4__) && \ @@ -136,13 +165,41 @@ static const int64 kint64max = static_cast(0x7FFFFFFFFFFFFFFFLL); !defined(__ARM_ARCH_6ZK__) && \ !defined(__ARM_ARCH_6T2__) -#define UNALIGNED_LOAD16(_p) (*reinterpret_cast(_p)) -#define UNALIGNED_LOAD32(_p) (*reinterpret_cast(_p)) +#if __GNUC__ +#define ATTRIBUTE_PACKED __attribute__((__packed__)) +#else +#define ATTRIBUTE_PACKED +#endif -#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast(_p) = (_val)) -#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast(_p) = (_val)) +namespace base { +namespace internal { + +struct Unaligned16Struct { + uint16 value; + uint8 dummy; // To make the size non-power-of-two. +} ATTRIBUTE_PACKED; + +struct Unaligned32Struct { + uint32 value; + uint8 dummy; // To make the size non-power-of-two. +} ATTRIBUTE_PACKED; -// TODO(user): NEON supports unaligned 64-bit loads and stores. +} // namespace internal +} // namespace base + +#define UNALIGNED_LOAD16(_p) \ + ((reinterpret_cast(_p))->value) +#define UNALIGNED_LOAD32(_p) \ + ((reinterpret_cast(_p))->value) + +#define UNALIGNED_STORE16(_p, _val) \ + ((reinterpret_cast< ::snappy::base::internal::Unaligned16Struct *>(_p))->value = \ + (_val)) +#define UNALIGNED_STORE32(_p, _val) \ + ((reinterpret_cast< ::snappy::base::internal::Unaligned32Struct *>(_p))->value = \ + (_val)) + +// TODO: NEON supports unaligned 64-bit loads and stores. // See if that would be more efficient on platforms supporting it, // at least for copies. @@ -193,22 +250,8 @@ inline void UNALIGNED_STORE64(void *p, uint64 v) { #endif -// This can be more efficient than UNALIGNED_LOAD64 + UNALIGNED_STORE64 -// on some platforms, in particular ARM. -inline void UnalignedCopy64(const void *src, void *dst) { - if (sizeof(void *) == 8) { - UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src)); - } else { - const char *src_char = reinterpret_cast(src); - char *dst_char = reinterpret_cast(dst); - - UNALIGNED_STORE32(dst_char, UNALIGNED_LOAD32(src_char)); - UNALIGNED_STORE32(dst_char + 4, UNALIGNED_LOAD32(src_char + 4)); - } -} - // The following guarantees declaration of the byte swap functions. -#ifdef WORDS_BIGENDIAN +#if defined(SNAPPY_IS_BIG_ENDIAN) #ifdef HAVE_SYS_BYTEORDER_H #include @@ -265,7 +308,7 @@ inline uint64 bswap_64(uint64 x) { #endif -#endif // WORDS_BIGENDIAN +#endif // defined(SNAPPY_IS_BIG_ENDIAN) // Convert to little-endian storage, opposite of network format. 
// Convert x from host to little endian: x = LittleEndian.FromHost(x); @@ -279,7 +322,7 @@ inline uint64 bswap_64(uint64 x) { class LittleEndian { public: // Conversion functions. -#ifdef WORDS_BIGENDIAN +#if defined(SNAPPY_IS_BIG_ENDIAN) static uint16 FromHost16(uint16 x) { return bswap_16(x); } static uint16 ToHost16(uint16 x) { return bswap_16(x); } @@ -289,7 +332,7 @@ class LittleEndian { static bool IsLittleEndian() { return false; } -#else // !defined(WORDS_BIGENDIAN) +#else // !defined(SNAPPY_IS_BIG_ENDIAN) static uint16 FromHost16(uint16 x) { return x; } static uint16 ToHost16(uint16 x) { return x; } @@ -299,7 +342,7 @@ class LittleEndian { static bool IsLittleEndian() { return true; } -#endif // !defined(WORDS_BIGENDIAN) +#endif // !defined(SNAPPY_IS_BIG_ENDIAN) // Functions to do unaligned loads and stores in little-endian order. static uint16 Load16(const void *p) { @@ -322,6 +365,9 @@ class LittleEndian { // Some bit-manipulation functions. class Bits { public: + // Return floor(log2(n)) for positive integer n. + static int Log2FloorNonZero(uint32 n); + // Return floor(log2(n)) for positive integer n. Returns -1 iff n == 0. static int Log2Floor(uint32 n); @@ -329,31 +375,85 @@ class Bits { // undefined value if n == 0. FindLSBSetNonZero() is similar to ffs() except // that it's 0-indexed. static int FindLSBSetNonZero(uint32 n); + +#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) static int FindLSBSetNonZero64(uint64 n); +#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) private: - DISALLOW_COPY_AND_ASSIGN(Bits); + // No copying + Bits(const Bits&); + void operator=(const Bits&); }; #ifdef HAVE_BUILTIN_CTZ +inline int Bits::Log2FloorNonZero(uint32 n) { + assert(n != 0); + // (31 ^ x) is equivalent to (31 - x) for x in [0, 31]. An easy proof + // represents subtraction in base 2 and observes that there's no carry. + // + // GCC and Clang represent __builtin_clz on x86 as 31 ^ _bit_scan_reverse(x). + // Using "31 ^" here instead of "31 -" allows the optimizer to strip the + // function body down to _bit_scan_reverse(x). + return 31 ^ __builtin_clz(n); +} + inline int Bits::Log2Floor(uint32 n) { - return n == 0 ? -1 : 31 ^ __builtin_clz(n); + return (n == 0) ? -1 : Bits::Log2FloorNonZero(n); } inline int Bits::FindLSBSetNonZero(uint32 n) { + assert(n != 0); return __builtin_ctz(n); } +#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) inline int Bits::FindLSBSetNonZero64(uint64 n) { + assert(n != 0); return __builtin_ctzll(n); } +#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) -#else // Portable versions. +#elif defined(_MSC_VER) + +inline int Bits::Log2FloorNonZero(uint32 n) { + assert(n != 0); + unsigned long where; + _BitScanReverse(&where, n); + return static_cast(where); +} inline int Bits::Log2Floor(uint32 n) { - if (n == 0) - return -1; + unsigned long where; + if (_BitScanReverse(&where, n)) + return static_cast(where); + return -1; +} + +inline int Bits::FindLSBSetNonZero(uint32 n) { + assert(n != 0); + unsigned long where; + if (_BitScanForward(&where, n)) + return static_cast(where); + return 32; +} + +#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) +inline int Bits::FindLSBSetNonZero64(uint64 n) { + assert(n != 0); + unsigned long where; + if (_BitScanForward64(&where, n)) + return static_cast(where); + return 64; +} +#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) + +#else // Portable versions. 
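// A minimal standalone check of the Log2FloorNonZero contract described above.
// This sketch is illustrative only: it assumes a GCC/Clang toolchain (for
// __builtin_clz), uses plain uint32_t rather than snappy's uint32 typedef, and
// NaiveLog2Floor is a hypothetical reference helper, not part of the library.
#include <cassert>
#include <cstdint>

// Naive reference: shift n right until it reaches 1, counting steps.
static int NaiveLog2Floor(uint32_t n) {
  assert(n != 0);
  int log = 0;
  while (n > 1) {
    n >>= 1;
    ++log;
  }
  return log;
}

int main() {
  for (uint32_t n = 1; n < (1u << 16); ++n) {
    // "31 ^ clz(n)" equals "31 - clz(n)" because clz(n) <= 31 here, so the
    // subtraction cannot borrow; both give floor(log2(n)).
    assert((31 ^ __builtin_clz(n)) == NaiveLog2Floor(n));
  }
  return 0;
}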
+ +inline int Bits::Log2FloorNonZero(uint32 n) { + assert(n != 0); + int log = 0; uint32 value = n; for (int i = 4; i >= 0; --i) { @@ -368,7 +468,13 @@ inline int Bits::Log2Floor(uint32 n) { return log; } +inline int Bits::Log2Floor(uint32 n) { + return (n == 0) ? -1 : Bits::Log2FloorNonZero(n); +} + inline int Bits::FindLSBSetNonZero(uint32 n) { + assert(n != 0); + int rc = 31; for (int i = 4, shift = 1 << 4; i >= 0; --i) { const uint32 x = n << shift; @@ -381,8 +487,11 @@ inline int Bits::FindLSBSetNonZero(uint32 n) { return rc; } +#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) // FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero(). inline int Bits::FindLSBSetNonZero64(uint64 n) { + assert(n != 0); + const uint32 bottombits = static_cast(n); if (bottombits == 0) { // Bottom bits are zero, so scan in top bits @@ -391,6 +500,7 @@ inline int Bits::FindLSBSetNonZero64(uint64 n) { return FindLSBSetNonZero(bottombits); } } +#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) #endif // End portable versions. @@ -414,7 +524,7 @@ class Varint { static char* Encode32(char* ptr, uint32 v); // EFFECTS Appends the varint representation of "value" to "*s". - static void Append32(string* s, uint32 value); + static void Append32(std::string* s, uint32 value); }; inline const char* Varint::Parse32WithLimit(const char* p, @@ -471,7 +581,7 @@ inline char* Varint::Encode32(char* sptr, uint32 v) { // replace this function with one that resizes the string without // filling the new space with zeros (if applicable) -- // it will be non-portable but faster. -inline void STLStringResizeUninitialized(string* s, size_t new_size) { +inline void STLStringResizeUninitialized(std::string* s, size_t new_size) { s->resize(new_size); } @@ -487,7 +597,7 @@ inline void STLStringResizeUninitialized(string* s, size_t new_size) { // (http://www.open-std.org/JTC1/SC22/WG21/docs/lwg-defects.html#530) // proposes this as the method. It will officially be part of the standard // for C++0x. This should already work on all current implementations. -inline char* string_as_array(string* str) { +inline char* string_as_array(std::string* str) { return str->empty() ? NULL : &*str->begin(); } diff --git a/c_src/snappy/snappy-stubs-public.h b/c_src/snappy/snappy-stubs-public.h index aec79dc..c156ba4 100644 --- a/c_src/snappy/snappy-stubs-public.h +++ b/c_src/snappy/snappy-stubs-public.h @@ -33,11 +33,20 @@ // which is a public header. Instead, snappy-stubs-public.h is generated by // from snappy-stubs-public.h.in at configure time. 
-#ifndef UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_ -#define UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_ +#ifndef THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_ +#define THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_ +#if 1 #include +#endif + +#if 1 #include +#endif + +#if 0 +#include +#endif #define SNAPPY_MAJOR 1 #define SNAPPY_MINOR 1 @@ -49,6 +58,7 @@ namespace snappy { +#if 1 typedef int8_t int8; typedef uint8_t uint8; typedef int16_t int16; @@ -57,6 +67,16 @@ typedef int32_t int32; typedef uint32_t uint32; typedef int64_t int64; typedef uint64_t uint64; +#else +typedef signed char int8; +typedef unsigned char uint8; +typedef short int16; +typedef unsigned short uint16; +typedef int int32; +typedef unsigned int uint32; +typedef long long int64; +typedef unsigned long long uint64; +#endif typedef std::string string; @@ -64,11 +84,15 @@ typedef std::string string; TypeName(const TypeName&); \ void operator=(const TypeName&) +#if !0 +// Windows does not have an iovec type, yet the concept is universally useful. +// It is simple to define it ourselves, so we put it inside our own namespace. struct iovec { - void* iov_base; - size_t iov_len; + void* iov_base; + size_t iov_len; }; +#endif } // namespace snappy -#endif // UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_ +#endif // THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_ diff --git a/c_src/snappy/snappy.cc b/c_src/snappy/snappy.cc index 09eab49..ce1eef4 100644 --- a/c_src/snappy/snappy.cc +++ b/c_src/snappy/snappy.cc @@ -30,15 +30,58 @@ #include "snappy-internal.h" #include "snappy-sinksource.h" +#if !defined(SNAPPY_HAVE_SSSE3) +// __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD +// support between SSE2 and AVX (so SSSE3 instructions require AVX support), and +// defines __AVX__ when AVX support is available. +#if defined(__SSSE3__) || defined(__AVX__) +#define SNAPPY_HAVE_SSSE3 1 +#else +#define SNAPPY_HAVE_SSSE3 0 +#endif +#endif // !defined(SNAPPY_HAVE_SSSE3) + +#if !defined(SNAPPY_HAVE_BMI2) +// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2 +// specifically, but it does define __AVX2__ when AVX2 support is available. +// Fortunately, AVX2 was introduced in Haswell, just like BMI2. +// +// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So, +// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which +// case issuing BMI2 instructions results in a compiler error. +#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) +#define SNAPPY_HAVE_BMI2 1 +#else +#define SNAPPY_HAVE_BMI2 0 +#endif +#endif // !defined(SNAPPY_HAVE_BMI2) + +#if SNAPPY_HAVE_SSSE3 +// Please do not replace with . or with headers that assume more +// advanced SSE versions without checking with all the OWNERS. +#include +#endif + +#if SNAPPY_HAVE_BMI2 +// Please do not replace with . or with headers that assume more +// advanced SSE versions without checking with all the OWNERS. 
+#include +#endif + #include #include #include #include - namespace snappy { +using internal::COPY_1_BYTE_OFFSET; +using internal::COPY_2_BYTE_OFFSET; +using internal::LITERAL; +using internal::char_table; +using internal::kMaximumTagLength; + // Any hash function will produce a valid compressed bitstream, but a good // hash function reduces the number of collisions and thus yields better // compression for compressible input, and more speed for incompressible @@ -76,161 +119,313 @@ size_t MaxCompressedLength(size_t source_len) { return 32 + source_len + source_len/6; } -enum { - LITERAL = 0, - COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode - COPY_2_BYTE_OFFSET = 2, - COPY_4_BYTE_OFFSET = 3 -}; -static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset. - -// Copy "len" bytes from "src" to "op", one byte at a time. Used for -// handling COPY operations where the input and output regions may -// overlap. For example, suppose: -// src == "ab" -// op == src + 2 -// len == 20 -// After IncrementalCopy(src, op, len), the result will have -// eleven copies of "ab" +namespace { + +void UnalignedCopy64(const void* src, void* dst) { + char tmp[8]; + memcpy(tmp, src, 8); + memcpy(dst, tmp, 8); +} + +void UnalignedCopy128(const void* src, void* dst) { + // memcpy gets vectorized when the appropriate compiler options are used. + // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load + // and store. + char tmp[16]; + memcpy(tmp, src, 16); + memcpy(dst, tmp, 16); +} + +// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used +// for handling COPY operations where the input and output regions may overlap. +// For example, suppose: +// src == "ab" +// op == src + 2 +// op_limit == op + 20 +// After IncrementalCopySlow(src, op, op_limit), the result will have eleven +// copies of "ab" // ababababababababababab -// Note that this does not match the semantics of either memcpy() -// or memmove(). -static inline void IncrementalCopy(const char* src, char* op, ssize_t len) { - assert(len > 0); - do { +// Note that this does not match the semantics of either memcpy() or memmove(). +inline char* IncrementalCopySlow(const char* src, char* op, + char* const op_limit) { + // TODO: Remove pragma when LLVM is aware this + // function is only called in cold regions and when cold regions don't get + // vectorized or unrolled. +#ifdef __clang__ +#pragma clang loop unroll(disable) +#endif + while (op < op_limit) { *op++ = *src++; - } while (--len > 0); + } + return op_limit; } -// Equivalent to IncrementalCopy except that it can write up to ten extra -// bytes after the end of the copy, and that it is faster. -// -// The main part of this loop is a simple copy of eight bytes at a time until -// we've copied (at least) the requested amount of bytes. However, if op and -// src are less than eight bytes apart (indicating a repeating pattern of -// length < 8), we first need to expand the pattern in order to get the correct -// results. For instance, if the buffer looks like this, with the eight-byte -// and patterns marked as intervals: -// -// abxxxxxxxxxxxx -// [------] src -// [------] op -// -// a single eight-byte copy from to will repeat the pattern once, -// after which we can move two bytes without moving : -// -// ababxxxxxxxxxx -// [------] src -// [------] op -// -// and repeat the exercise until the two no longer overlap. 
-// -// This allows us to do very well in the special case of one single byte -// repeated many times, without taking a big hit for more general cases. -// -// The worst case of extra writing past the end of the match occurs when -// op - src == 1 and len == 1; the last copy will read from byte positions -// [0..7] and write to [4..11], whereas it was only supposed to write to -// position 1. Thus, ten excess bytes. +#if SNAPPY_HAVE_SSSE3 + +// This is a table of shuffle control masks that can be used as the source +// operand for PSHUFB to permute the contents of the destination XMM register +// into a repeating byte pattern. +alignas(16) const char pshufb_fill_patterns[7][16] = { + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}, + {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, + {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, + {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0}, + {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3}, + {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1}, +}; -namespace { +#endif // SNAPPY_HAVE_SSSE3 + +// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than +// IncrementalCopySlow. buf_limit is the address past the end of the writable +// region of the buffer. +inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, + char* const buf_limit) { + // Terminology: + // + // slop = buf_limit - op + // pat = op - src + // len = limit - op + assert(src < op); + assert(op <= op_limit); + assert(op_limit <= buf_limit); + // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that + // to optimize this function but we have to also handle other cases in case + // the input does not satisfy these conditions. + + size_t pattern_size = op - src; + // The cases are split into different branches to allow the branch predictor, + // FDO, and static prediction hints to work better. For each input we list the + // ratio of invocations that match each condition. + // + // input slop < 16 pat < 8 len > 16 + // ------------------------------------------ + // html|html4|cp 0% 1.01% 27.73% + // urls 0% 0.88% 14.79% + // jpg 0% 64.29% 7.14% + // pdf 0% 2.56% 58.06% + // txt[1-4] 0% 0.23% 0.97% + // pb 0% 0.96% 13.88% + // bin 0.01% 22.27% 41.17% + // + // It is very rare that we don't have enough slop for doing block copies. It + // is also rare that we need to expand a pattern. Small patterns are common + // for incompressible formats and for those we are plenty fast already. + // Lengths are normally not greater than 16 but they vary depending on the + // input. In general if we always predict len <= 16 it would be an ok + // prediction. + // + // In order to be fast we want a pattern >= 8 bytes and an unrolled loop + // copying 2x 8 bytes at a time. + + // Handle the uncommon case where pattern is less than 8 bytes. + if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) { +#if SNAPPY_HAVE_SSSE3 + // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB + // to permute the register's contents in-place into a repeating sequence of + // the first "pattern_size" bytes. + // For example, suppose: + // src == "abc" + // op == op + 3 + // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc" + // followed by one byte of slop: abcabcabcabcabca. + // + // The non-SSE fallback implementation suffers from store-forwarding stalls + // because its loads and stores partly overlap. By expanding the pattern + // in-place, we avoid the penalty. 
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) { + const __m128i shuffle_mask = _mm_load_si128( + reinterpret_cast(pshufb_fill_patterns) + + pattern_size - 1); + const __m128i pattern = _mm_shuffle_epi8( + _mm_loadl_epi64(reinterpret_cast(src)), shuffle_mask); + // Uninitialized bytes are masked out by the shuffle mask. + // TODO: remove annotation and macro defs once MSan is fixed. + SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern)); + pattern_size *= 16 / pattern_size; + char* op_end = std::min(op_limit, buf_limit - 15); + while (op < op_end) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern); + op += pattern_size; + } + if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit; + } + return IncrementalCopySlow(src, op, op_limit); +#else // !SNAPPY_HAVE_SSSE3 + // If plenty of buffer space remains, expand the pattern to at least 8 + // bytes. The way the following loop is written, we need 8 bytes of buffer + // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10 + // bytes if pattern_size is 2. Precisely encoding that is probably not + // worthwhile; instead, invoke the slow path if we cannot write 11 bytes + // (because 11 are required in the worst case). + if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) { + while (pattern_size < 8) { + UnalignedCopy64(src, op); + op += pattern_size; + pattern_size *= 2; + } + if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit; + } else { + return IncrementalCopySlow(src, op, op_limit); + } +#endif // SNAPPY_HAVE_SSSE3 + } + assert(pattern_size >= 8); + + // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single + // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe + // because expanding the pattern to at least 8 bytes guarantees that + // op - src >= 8. + // + // Typically, the op_limit is the gating factor so try to simplify the loop + // based on that. + if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) { + // There is at least one, and at most four 16-byte blocks. Writing four + // conditionals instead of a loop allows FDO to layout the code with respect + // to the actual probabilities of each length. + // TODO: Replace with loop with trip count hint. + UnalignedCopy64(src, op); + UnalignedCopy64(src + 8, op + 8); -const int kMaxIncrementCopyOverflow = 10; + if (op + 16 < op_limit) { + UnalignedCopy64(src + 16, op + 16); + UnalignedCopy64(src + 24, op + 24); + } + if (op + 32 < op_limit) { + UnalignedCopy64(src + 32, op + 32); + UnalignedCopy64(src + 40, op + 40); + } + if (op + 48 < op_limit) { + UnalignedCopy64(src + 48, op + 48); + UnalignedCopy64(src + 56, op + 56); + } + return op_limit; + } -inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) { - while (PREDICT_FALSE(op - src < 8)) { + // Fall back to doing as much as we can with the available slop in the + // buffer. This code path is relatively cold however so we save code size by + // avoiding unrolling and vectorizing. + // + // TODO: Remove pragma when when cold regions don't get vectorized + // or unrolled. +#ifdef __clang__ +#pragma clang loop unroll(disable) +#endif + for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) { UnalignedCopy64(src, op); - len -= op - src; - op += op - src; + UnalignedCopy64(src + 8, op + 8); } - while (len > 0) { + if (op >= op_limit) + return op_limit; + + // We only take this branch if we didn't have enough slop and we can do a + // single 8 byte copy. 
+ if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) { UnalignedCopy64(src, op); src += 8; op += 8; - len -= 8; } + return IncrementalCopySlow(src, op, op_limit); } } // namespace +template static inline char* EmitLiteral(char* op, const char* literal, - int len, - bool allow_fast_path) { - int n = len - 1; // Zero-length literals are disallowed - if (n < 60) { + int len) { + // The vast majority of copies are below 16 bytes, for which a + // call to memcpy is overkill. This fast path can sometimes + // copy up to 15 bytes too much, but that is okay in the + // main loop, since we have a bit to go on for both sides: + // + // - The input will always have kInputMarginBytes = 15 extra + // available bytes, as long as we're in the main loop, and + // if not, allow_fast_path = false. + // - The output will always have 32 spare bytes (see + // MaxCompressedLength). + assert(len > 0); // Zero-length literals are disallowed + int n = len - 1; + if (allow_fast_path && len <= 16) { // Fits in tag byte *op++ = LITERAL | (n << 2); - // The vast majority of copies are below 16 bytes, for which a - // call to memcpy is overkill. This fast path can sometimes - // copy up to 15 bytes too much, but that is okay in the - // main loop, since we have a bit to go on for both sides: - // - // - The input will always have kInputMarginBytes = 15 extra - // available bytes, as long as we're in the main loop, and - // if not, allow_fast_path = false. - // - The output will always have 32 spare bytes (see - // MaxCompressedLength). - if (allow_fast_path && len <= 16) { - UnalignedCopy64(literal, op); - UnalignedCopy64(literal + 8, op + 8); - return op + len; - } + UnalignedCopy128(literal, op); + return op + len; + } + + if (n < 60) { + // Fits in tag byte + *op++ = LITERAL | (n << 2); } else { - // Encode in upcoming bytes - char* base = op; - int count = 0; - op++; - while (n > 0) { - *op++ = n & 0xff; - n >>= 8; - count++; - } + int count = (Bits::Log2Floor(n) >> 3) + 1; assert(count >= 1); assert(count <= 4); - *base = LITERAL | ((59+count) << 2); + *op++ = LITERAL | ((59 + count) << 2); + // Encode in upcoming bytes. + // Write 4 bytes, though we may care about only 1 of them. The output buffer + // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds + // here and there is a memcpy of size 'len' below. + LittleEndian::Store32(op, n); + op += count; } memcpy(op, literal, len); return op + len; } -static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) { +template +static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) { assert(len <= 64); assert(len >= 4); assert(offset < 65536); + assert(len_less_than_12 == (len < 12)); - if ((len < 12) && (offset < 2048)) { - size_t len_minus_4 = len - 4; - assert(len_minus_4 < 8); // Must fit in 3 bits - *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5); + if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) { + // offset fits in 11 bits. The 3 highest go in the top of the first byte, + // and the rest go in the second byte. + *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0); *op++ = offset & 0xff; } else { - *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2); - LittleEndian::Store16(op, offset); - op += 2; + // Write 4 bytes, though we only care about 3 of them. The output buffer + // is required to have some slack, so the extra byte won't overrun it. 
+ uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8); + LittleEndian::Store32(op, u); + op += 3; } return op; } -static inline char* EmitCopy(char* op, size_t offset, int len) { - // Emit 64 byte copies but make sure to keep at least four bytes reserved - while (PREDICT_FALSE(len >= 68)) { - op = EmitCopyLessThan64(op, offset, 64); - len -= 64; - } +template +static inline char* EmitCopy(char* op, size_t offset, size_t len) { + assert(len_less_than_12 == (len < 12)); + if (len_less_than_12) { + return EmitCopyAtMost64(op, offset, len); + } else { + // A special case for len <= 64 might help, but so far measurements suggest + // it's in the noise. - // Emit an extra 60 byte copy if have too much data to fit in one copy - if (len > 64) { - op = EmitCopyLessThan64(op, offset, 60); - len -= 60; - } + // Emit 64 byte copies but make sure to keep at least four bytes reserved. + while (SNAPPY_PREDICT_FALSE(len >= 68)) { + op = EmitCopyAtMost64(op, offset, 64); + len -= 64; + } - // Emit remainder - op = EmitCopyLessThan64(op, offset, len); - return op; -} + // One or two copies will now finish the job. + if (len > 64) { + op = EmitCopyAtMost64(op, offset, 60); + len -= 60; + } + // Emit remainder. + if (len < 12) { + op = EmitCopyAtMost64(op, offset, len); + } else { + op = EmitCopyAtMost64(op, offset, len); + } + return op; + } +} bool GetUncompressedLength(const char* start, size_t n, size_t* result) { uint32 v = 0; @@ -243,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) { } } +namespace { +uint32 CalculateTableSize(uint32 input_size) { + static_assert( + kMaxHashTableSize >= kMinHashTableSize, + "kMaxHashTableSize should be greater or equal to kMinHashTableSize."); + if (input_size > kMaxHashTableSize) { + return kMaxHashTableSize; + } + if (input_size < kMinHashTableSize) { + return kMinHashTableSize; + } + // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1. + // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)). + return 2u << Bits::Log2Floor(input_size - 1); +} +} // namespace + namespace internal { -uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) { - // Use smaller hash table when input.size() is smaller, since we - // fill the table, incurring O(hash table size) overhead for - // compression, and if the input is short, we won't need that - // many hash table entries anyway. 
- assert(kMaxHashTableSize >= 256); - size_t htsize = 256; - while (htsize < kMaxHashTableSize && htsize < input_size) { - htsize <<= 1; - } +WorkingMemory::WorkingMemory(size_t input_size) { + const size_t max_fragment_size = std::min(input_size, kBlockSize); + const size_t table_size = CalculateTableSize(max_fragment_size); + size_ = table_size * sizeof(*table_) + max_fragment_size + + MaxCompressedLength(max_fragment_size); + mem_ = std::allocator().allocate(size_); + table_ = reinterpret_cast(mem_); + input_ = mem_ + table_size * sizeof(*table_); + output_ = input_ + max_fragment_size; +} - uint16* table; - if (htsize <= ARRAYSIZE(small_table_)) { - table = small_table_; - } else { - if (large_table_ == NULL) { - large_table_ = new uint16[kMaxHashTableSize]; - } - table = large_table_; - } +WorkingMemory::~WorkingMemory() { + std::allocator().deallocate(mem_, size_); +} +uint16* WorkingMemory::GetHashTable(size_t fragment_size, + int* table_size) const { + const size_t htsize = CalculateTableSize(fragment_size); + memset(table_, 0, htsize * sizeof(*table_)); *table_size = htsize; - memset(table, 0, htsize * sizeof(*table)); - return table; + return table_; } } // end namespace internal @@ -334,7 +543,7 @@ char* CompressFragment(const char* input, // "ip" is the input pointer, and "op" is the output pointer. const char* ip = input; assert(input_size <= kBlockSize); - assert((table_size & (table_size - 1)) == 0); // table must be power of two + assert((table_size & (table_size - 1)) == 0); // table must be power of two const int shift = 32 - Bits::Log2Floor(table_size); assert(static_cast(kuint32max >> shift) == table_size - 1); const char* ip_end = input + input_size; @@ -344,7 +553,7 @@ char* CompressFragment(const char* input, const char* next_emit = ip; const size_t kInputMarginBytes = 15; - if (PREDICT_TRUE(input_size >= kInputMarginBytes)) { + if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) { const char* ip_limit = input + input_size - kInputMarginBytes; for (uint32 next_hash = Hash(++ip, shift); ; ) { @@ -364,9 +573,9 @@ char* CompressFragment(const char* input, // // Heuristic match skipping: If 32 bytes are scanned with no matches // found, start looking only at every other byte. If 32 more bytes are - // scanned, look at every third byte, etc.. When a match is found, - // immediately go back to looking at every byte. This is a small loss - // (~5% performance, ~0.1% density) for compressible data due to more + // scanned (or skipped), look at every third byte, etc.. When a match is + // found, immediately go back to looking at every byte. This is a small + // loss (~5% performance, ~0.1% density) for compressible data due to more // bookkeeping, but for non-compressible data (such as JPEG) it's a huge // win since the compressor quickly "realizes" the data is incompressible // and doesn't bother looking for matches everywhere. 
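// A small standalone sketch of how the match-skipping heuristic described
// above paces hash probes via "skip >> 5". It assumes `skip` starts at 32
// (its initialization is outside the lines shown here); `probe` and `ip` are
// illustrative names, not identifiers from CompressFragment.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t skip = 32;  // assumed initial value
  uint32_t ip = 0;     // stand-in for the input offset being probed
  for (int probe = 0; probe < 96; ++probe) {
    // Stride stays 1 for the first 32 probed bytes, 2 for the next 32,
    // then 3, and so on -- matching "every other byte", "every third byte".
    uint32_t bytes_between_hash_lookups = skip >> 5;
    skip += bytes_between_hash_lookups;
    ip += bytes_between_hash_lookups;
    if (probe % 16 == 0) {
      std::printf("probe %3d: offset %4u, stride %u\n",
                  probe, ip, bytes_between_hash_lookups);
    }
  }
  return 0;
}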
@@ -382,9 +591,10 @@ char* CompressFragment(const char* input, ip = next_ip; uint32 hash = next_hash; assert(hash == Hash(ip, shift)); - uint32 bytes_between_hash_lookups = skip++ >> 5; + uint32 bytes_between_hash_lookups = skip >> 5; + skip += bytes_between_hash_lookups; next_ip = ip + bytes_between_hash_lookups; - if (PREDICT_FALSE(next_ip > ip_limit)) { + if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) { goto emit_remainder; } next_hash = Hash(next_ip, shift); @@ -393,14 +603,14 @@ char* CompressFragment(const char* input, assert(candidate < ip); table[hash] = ip - base_ip; - } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) != - UNALIGNED_LOAD32(candidate))); + } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) != + UNALIGNED_LOAD32(candidate))); // Step 2: A 4-byte match has been found. We'll later see if more // than 4 bytes match. But, prior to the match, input // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes." assert(next_emit + 16 <= ip_end); - op = EmitLiteral(op, next_emit, ip - next_emit, true); + op = EmitLiteral(op, next_emit, ip - next_emit); // Step 3: Call EmitCopy, and then see if another EmitCopy could // be our next move. Repeat until we find no match for the @@ -417,19 +627,25 @@ char* CompressFragment(const char* input, // We have a 4-byte match at ip, and no need to emit any // "literal bytes" prior to ip. const char* base = ip; - int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end); + std::pair p = + FindMatchLength(candidate + 4, ip + 4, ip_end); + size_t matched = 4 + p.first; ip += matched; size_t offset = base - candidate; assert(0 == memcmp(base, candidate, matched)); - op = EmitCopy(op, offset, matched); - // We could immediately start working at ip now, but to improve - // compression we first update table[Hash(ip - 1, ...)]. - const char* insert_tail = ip - 1; + if (p.second) { + op = EmitCopy(op, offset, matched); + } else { + op = EmitCopy(op, offset, matched); + } next_emit = ip; - if (PREDICT_FALSE(ip >= ip_limit)) { + if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) { goto emit_remainder; } - input_bytes = GetEightBytesAt(insert_tail); + // We are now looking for a 4-byte match again. We read + // table[Hash(ip, shift)] for that. To improve compression, + // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)]. + input_bytes = GetEightBytesAt(ip - 1); uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift); table[prev_hash] = ip - base_ip - 1; uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift); @@ -446,13 +662,18 @@ char* CompressFragment(const char* input, emit_remainder: // Emit the remaining bytes as a literal if (next_emit < ip_end) { - op = EmitLiteral(op, next_emit, ip_end - next_emit, false); + op = EmitLiteral(op, next_emit, + ip_end - next_emit); } return op; } } // end namespace internal +// Called back at avery compression call to trace parameters and sizes. +static inline void Report(const char *algorithm, size_t compressed_size, + size_t uncompressed_size) {} + // Signature of output types needed by decompression code. // The decompression code is templatized on a type that obeys this // signature so that we do not pay virtual function call overhead in @@ -493,162 +714,28 @@ char* CompressFragment(const char* input, // bool TryFastAppend(const char* ip, size_t available, size_t length); // }; -// ----------------------------------------------------------------------- -// Lookup table for decompression code. Generated by ComputeTable() below. 
-// ----------------------------------------------------------------------- - -// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits -static const uint32 wordmask[] = { - 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu -}; - -// Data stored per entry in lookup table: -// Range Bits-used Description -// ------------------------------------ -// 1..64 0..7 Literal/copy length encoded in opcode byte -// 0..7 8..10 Copy offset encoded in opcode byte / 256 -// 0..4 11..13 Extra bytes after opcode -// -// We use eight bits for the length even though 7 would have sufficed -// because of efficiency reasons: -// (1) Extracting a byte is faster than a bit-field -// (2) It properly aligns copy offset so we do not need a <<8 -static const uint16 char_table[256] = { - 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, - 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, - 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, - 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, - 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, - 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, - 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e, - 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, - 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, - 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, - 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, - 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, - 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, - 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, - 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, - 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, - 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, - 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, - 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, - 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, - 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, - 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, - 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, - 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, - 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, - 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034, - 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, - 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, - 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, - 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, - 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, - 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 -}; - -// In debug mode, allow optional computation of the table at startup. -// Also, check that the decompression table is correct. 
-#ifndef NDEBUG -DEFINE_bool(snappy_dump_decompression_table, false, - "If true, we print the decompression table at startup."); - -static uint16 MakeEntry(unsigned int extra, - unsigned int len, - unsigned int copy_offset) { - // Check that all of the fields fit within the allocated space - assert(extra == (extra & 0x7)); // At most 3 bits - assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits - assert(len == (len & 0x7f)); // At most 7 bits - return len | (copy_offset << 8) | (extra << 11); +static inline uint32 ExtractLowBytes(uint32 v, int n) { + assert(n >= 0); + assert(n <= 4); +#if SNAPPY_HAVE_BMI2 + return _bzhi_u32(v, 8 * n); +#else + // This needs to be wider than uint32 otherwise `mask << 32` will be + // undefined. + uint64 mask = 0xffffffff; + return v & ~(mask << (8 * n)); +#endif } -static void ComputeTable() { - uint16 dst[256]; - - // Place invalid entries in all places to detect missing initialization - int assigned = 0; - for (int i = 0; i < 256; i++) { - dst[i] = 0xffff; - } - - // Small LITERAL entries. We store (len-1) in the top 6 bits. - for (unsigned int len = 1; len <= 60; len++) { - dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0); - assigned++; - } - - // Large LITERAL entries. We use 60..63 in the high 6 bits to - // encode the number of bytes of length info that follow the opcode. - for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) { - // We set the length field in the lookup table to 1 because extra - // bytes encode len-1. - dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0); - assigned++; - } - - // COPY_1_BYTE_OFFSET. - // - // The tag byte in the compressed data stores len-4 in 3 bits, and - // offset/256 in 5 bits. offset%256 is stored in the next byte. - // - // This format is used for length in range [4..11] and offset in - // range [0..2047] - for (unsigned int len = 4; len < 12; len++) { - for (unsigned int offset = 0; offset < 2048; offset += 256) { - dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] = - MakeEntry(1, len, offset>>8); - assigned++; - } - } - - // COPY_2_BYTE_OFFSET. - // Tag contains len-1 in top 6 bits, and offset in next two bytes. - for (unsigned int len = 1; len <= 64; len++) { - dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0); - assigned++; - } - - // COPY_4_BYTE_OFFSET. - // Tag contents len-1 in top 6 bits, and offset in next four bytes. - for (unsigned int len = 1; len <= 64; len++) { - dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0); - assigned++; - } - - // Check that each entry was initialized exactly once. - if (assigned != 256) { - fprintf(stderr, "ComputeTable: assigned only %d of 256\n", assigned); - abort(); - } - for (int i = 0; i < 256; i++) { - if (dst[i] == 0xffff) { - fprintf(stderr, "ComputeTable: did not assign byte %d\n", i); - abort(); - } - } - - if (FLAGS_snappy_dump_decompression_table) { - printf("static const uint16 char_table[256] = {\n "); - for (int i = 0; i < 256; i++) { - printf("0x%04x%s", - dst[i], - ((i == 255) ? "\n" : (((i%8) == 7) ? 
",\n " : ", "))); - } - printf("};\n"); - } - - // Check that computed table matched recorded table - for (int i = 0; i < 256; i++) { - if (dst[i] != char_table[i]) { - fprintf(stderr, "ComputeTable: byte %d: computed (%x), expect (%x)\n", - i, static_cast(dst[i]), static_cast(char_table[i])); - abort(); - } - } +static inline bool LeftShiftOverflows(uint8 value, uint32 shift) { + assert(shift < 32); + static const uint8 masks[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // + 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe}; + return (value & masks[shift]) != 0; } -#endif /* !NDEBUG */ // Helper class for decompression class SnappyDecompressor { @@ -687,7 +774,7 @@ class SnappyDecompressor { } // Read the uncompressed length stored at the start of the compressed data. - // On succcess, stores the length in *result and returns true. + // On success, stores the length in *result and returns true. // On failure, returns false. bool ReadUncompressedLength(uint32* result) { assert(ip_ == NULL); // Must not have read anything yet @@ -701,7 +788,9 @@ class SnappyDecompressor { if (n == 0) return false; const unsigned char c = *(reinterpret_cast(ip)); reader_->Skip(1); - *result |= static_cast(c & 0x7f) << shift; + uint32 val = c & 0x7f; + if (LeftShiftOverflows(static_cast(val), shift)) return false; + *result |= val << shift; if (c < 128) { break; } @@ -713,9 +802,27 @@ class SnappyDecompressor { // Process the next item found in the input. // Returns true if successful, false on error or end of input. template +#if defined(__GNUC__) && defined(__x86_64__) + __attribute__((aligned(32))) +#endif void DecompressAllTags(Writer* writer) { - const char* ip = ip_; + // In x86, pad the function body to start 16 bytes later. This function has + // a couple of hotspots that are highly sensitive to alignment: we have + // observed regressions by more than 20% in some metrics just by moving the + // exact same code to a different position in the benchmark binary. + // + // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit + // the "lucky" case consistently. Unfortunately, this is a very brittle + // workaround, and future differences in code generation may reintroduce + // this regression. If you experience a big, difficult to explain, benchmark + // performance regression here, first try removing this hack. +#if defined(__GNUC__) && defined(__x86_64__) + // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions. + asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00"); + asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00"); +#endif + const char* ip = ip_; // We could have put this refill fragment only at the beginning of the loop. // However, duplicating it at the end of each branch gives the compiler more // scope to optimize the expression based on the local @@ -731,21 +838,34 @@ class SnappyDecompressor { for ( ;; ) { const unsigned char c = *(reinterpret_cast(ip++)); - if ((c & 0x3) == LITERAL) { + // Ratio of iterations that have LITERAL vs non-LITERAL for different + // inputs. 
+ // + // input LITERAL NON_LITERAL + // ----------------------------------- + // html|html4|cp 23% 77% + // urls 36% 64% + // jpg 47% 53% + // pdf 19% 81% + // txt[1-4] 25% 75% + // pb 24% 76% + // bin 24% 76% + if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) { size_t literal_length = (c >> 2) + 1u; if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) { assert(literal_length < 61); ip += literal_length; - // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend() + // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend() // will not return true unless there's already at least five spare // bytes in addition to the literal. continue; } - if (PREDICT_FALSE(literal_length >= 61)) { + if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) { // Long literal. const size_t literal_length_length = literal_length - 60; literal_length = - (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1; + ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) + + 1; ip += literal_length_length; } @@ -767,15 +887,16 @@ class SnappyDecompressor { ip += literal_length; MAYBE_REFILL(); } else { - const uint32 entry = char_table[c]; - const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11]; - const uint32 length = entry & 0xff; + const size_t entry = char_table[c]; + const size_t trailer = + ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11); + const size_t length = entry & 0xff; ip += entry >> 11; // copy_offset/256 is encoded in bits 8..10. By just fetching // those bits, we get copy_offset (since the bit-field starts at // bit 8). - const uint32 copy_offset = entry & 0x700; + const size_t copy_offset = entry & 0x700; if (!writer->AppendFromSelf(copy_offset + trailer, length)) { return; } @@ -795,10 +916,8 @@ bool SnappyDecompressor::RefillTag() { size_t n; ip = reader_->Peek(&n); peeked_ = n; - if (n == 0) { - eof_ = true; - return false; - } + eof_ = (n == 0); + if (eof_) return false; ip_limit_ = ip + n; } @@ -823,7 +942,7 @@ bool SnappyDecompressor::RefillTag() { size_t length; const char* src = reader_->Peek(&length); if (length == 0) return false; - uint32 to_add = min(needed - nbuf, length); + uint32 to_add = std::min(needed - nbuf, length); memcpy(scratch_ + nbuf, src, to_add); nbuf += to_add; reader_->Skip(to_add); @@ -852,13 +971,18 @@ static bool InternalUncompress(Source* r, Writer* writer) { SnappyDecompressor decompressor(r); uint32 uncompressed_len = 0; if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false; - return InternalUncompressAllTags(&decompressor, writer, uncompressed_len); + + return InternalUncompressAllTags(&decompressor, writer, r->Available(), + uncompressed_len); } template static bool InternalUncompressAllTags(SnappyDecompressor* decompressor, Writer* writer, + uint32 compressed_len, uint32 uncompressed_len) { + Report("snappy_uncompress", compressed_len, uncompressed_len); + writer->SetExpectedLength(uncompressed_len); // Process the entire input @@ -875,21 +999,20 @@ bool GetUncompressedLength(Source* source, uint32* result) { size_t Compress(Source* reader, Sink* writer) { size_t written = 0; size_t N = reader->Available(); + const size_t uncompressed_size = N; char ulength[Varint::kMax32]; char* p = Varint::Encode32(ulength, N); writer->Append(ulength, p-ulength); written += (p - ulength); - internal::WorkingMemory wmem; - char* scratch = NULL; - char* scratch_output = NULL; + internal::WorkingMemory wmem(N); while (N > 0) { // Get next block to compress (without copying if possible) size_t 
fragment_size; const char* fragment = reader->Peek(&fragment_size); assert(fragment_size != 0); // premature end of input - const size_t num_to_read = min(N, kBlockSize); + const size_t num_to_read = std::min(N, kBlockSize); size_t bytes_read = fragment_size; size_t pending_advance = 0; @@ -898,19 +1021,13 @@ size_t Compress(Source* reader, Sink* writer) { pending_advance = num_to_read; fragment_size = num_to_read; } else { - // Read into scratch buffer - if (scratch == NULL) { - // If this is the last iteration, we want to allocate N bytes - // of space, otherwise the max possible kBlockSize space. - // num_to_read contains exactly the correct value - scratch = new char[num_to_read]; - } + char* scratch = wmem.GetScratchInput(); memcpy(scratch, fragment, bytes_read); reader->Skip(bytes_read); while (bytes_read < num_to_read) { fragment = reader->Peek(&fragment_size); - size_t n = min(fragment_size, num_to_read - bytes_read); + size_t n = std::min(fragment_size, num_to_read - bytes_read); memcpy(scratch + bytes_read, fragment, n); bytes_read += n; reader->Skip(n); @@ -930,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) { // Need a scratch buffer for the output, in case the byte sink doesn't // have room for us directly. - if (scratch_output == NULL) { - scratch_output = new char[max_output]; - } else { - // Since we encode kBlockSize regions followed by a region - // which is <= kBlockSize in length, a previously allocated - // scratch_output[] region is big enough for this iteration. - } - char* dest = writer->GetAppendBuffer(max_output, scratch_output); - char* end = internal::CompressFragment(fragment, fragment_size, - dest, table, table_size); + + // Since we encode kBlockSize regions followed by a region + // which is <= kBlockSize in length, a previously allocated + // scratch_output[] region is big enough for this iteration. + char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput()); + char* end = internal::CompressFragment(fragment, fragment_size, dest, table, + table_size); writer->Append(dest, end - dest); written += (end - dest); @@ -947,8 +1061,7 @@ size_t Compress(Source* reader, Sink* writer) { reader->Skip(pending_advance); } - delete[] scratch; - delete[] scratch_output; + Report("snappy_compress", written, uncompressed_size); return written; } @@ -962,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) { // Writer template argument to SnappyDecompressor::DecompressAllTags(). class SnappyIOVecWriter { private: + // output_iov_end_ is set to iov + count and used to determine when + // the end of the iovs is reached. + const struct iovec* output_iov_end_; + +#if !defined(NDEBUG) const struct iovec* output_iov_; - const size_t output_iov_count_; +#endif // !defined(NDEBUG) + + // Current iov that is being written into. + const struct iovec* curr_iov_; - // We are currently writing into output_iov_[curr_iov_index_]. - size_t curr_iov_index_; + // Pointer to current iov's write location. + char* curr_iov_output_; - // Bytes written to output_iov_[curr_iov_index_] so far. - size_t curr_iov_written_; + // Remaining bytes to write into curr_iov_output. + size_t curr_iov_remaining_; // Total bytes decompressed into output_iov_ so far. size_t total_written_; @@ -977,22 +1098,24 @@ class SnappyIOVecWriter { // Maximum number of bytes that will be decompressed into output_iov_. 
size_t output_limit_; - inline char* GetIOVecPointer(size_t index, size_t offset) { - return reinterpret_cast(output_iov_[index].iov_base) + - offset; + static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) { + return reinterpret_cast(iov->iov_base) + offset; } public: // Does not take ownership of iov. iov must be valid during the // entire lifetime of the SnappyIOVecWriter. inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count) - : output_iov_(iov), - output_iov_count_(iov_count), - curr_iov_index_(0), - curr_iov_written_(0), + : output_iov_end_(iov + iov_count), +#if !defined(NDEBUG) + output_iov_(iov), +#endif // !defined(NDEBUG) + curr_iov_(iov), + curr_iov_output_(iov_count ? reinterpret_cast(iov->iov_base) + : nullptr), + curr_iov_remaining_(iov_count ? iov->iov_len : 0), total_written_(0), - output_limit_(-1) { - } + output_limit_(-1) {} inline void SetExpectedLength(size_t len) { output_limit_ = len; @@ -1007,23 +1130,25 @@ class SnappyIOVecWriter { return false; } + return AppendNoCheck(ip, len); + } + + inline bool AppendNoCheck(const char* ip, size_t len) { while (len > 0) { - assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len); - if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) { + if (curr_iov_remaining_ == 0) { // This iovec is full. Go to the next one. - if (curr_iov_index_ + 1 >= output_iov_count_) { + if (curr_iov_ + 1 >= output_iov_end_) { return false; } - curr_iov_written_ = 0; - ++curr_iov_index_; + ++curr_iov_; + curr_iov_output_ = reinterpret_cast(curr_iov_->iov_base); + curr_iov_remaining_ = curr_iov_->iov_len; } - const size_t to_write = std::min( - len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_); - memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_), - ip, - to_write); - curr_iov_written_ += to_write; + const size_t to_write = std::min(len, curr_iov_remaining_); + memcpy(curr_iov_output_, ip, to_write); + curr_iov_output_ += to_write; + curr_iov_remaining_ -= to_write; total_written_ += to_write; ip += to_write; len -= to_write; @@ -1035,12 +1160,11 @@ class SnappyIOVecWriter { inline bool TryFastAppend(const char* ip, size_t available, size_t len) { const size_t space_left = output_limit_ - total_written_; if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 && - output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) { + curr_iov_remaining_ >= 16) { // Fast path, used for the majority (about 95%) of invocations. - char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_); - UnalignedCopy64(ip, ptr); - UnalignedCopy64(ip + 8, ptr + 8); - curr_iov_written_ += len; + UnalignedCopy128(ip, curr_iov_output_); + curr_iov_output_ += len; + curr_iov_remaining_ -= len; total_written_ += len; return true; } @@ -1049,7 +1173,9 @@ class SnappyIOVecWriter { } inline bool AppendFromSelf(size_t offset, size_t len) { - if (offset > total_written_ || offset == 0) { + // See SnappyArrayWriter::AppendFromSelf for an explanation of + // the "offset - 1u" trick. + if (offset - 1u >= total_written_) { return false; } const size_t space_left = output_limit_ - total_written_; @@ -1058,8 +1184,8 @@ class SnappyIOVecWriter { } // Locate the iovec from which we need to start the copy. 
- size_t from_iov_index = curr_iov_index_; - size_t from_iov_offset = curr_iov_written_; + const iovec* from_iov = curr_iov_; + size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_; while (offset > 0) { if (from_iov_offset >= offset) { from_iov_offset -= offset; @@ -1067,46 +1193,47 @@ class SnappyIOVecWriter { } offset -= from_iov_offset; - assert(from_iov_index > 0); - --from_iov_index; - from_iov_offset = output_iov_[from_iov_index].iov_len; + --from_iov; +#if !defined(NDEBUG) + assert(from_iov >= output_iov_); +#endif // !defined(NDEBUG) + from_iov_offset = from_iov->iov_len; } // Copy bytes starting from the iovec pointed to by from_iov_index to // the current iovec. while (len > 0) { - assert(from_iov_index <= curr_iov_index_); - if (from_iov_index != curr_iov_index_) { - const size_t to_copy = std::min( - output_iov_[from_iov_index].iov_len - from_iov_offset, - len); - Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy); + assert(from_iov <= curr_iov_); + if (from_iov != curr_iov_) { + const size_t to_copy = + std::min(from_iov->iov_len - from_iov_offset, len); + AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy); len -= to_copy; if (len > 0) { - ++from_iov_index; + ++from_iov; from_iov_offset = 0; } } else { - assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len); - size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len - - curr_iov_written_, - len); + size_t to_copy = curr_iov_remaining_; if (to_copy == 0) { // This iovec is full. Go to the next one. - if (curr_iov_index_ + 1 >= output_iov_count_) { + if (curr_iov_ + 1 >= output_iov_end_) { return false; } - ++curr_iov_index_; - curr_iov_written_ = 0; + ++curr_iov_; + curr_iov_output_ = reinterpret_cast(curr_iov_->iov_base); + curr_iov_remaining_ = curr_iov_->iov_len; continue; } if (to_copy > len) { to_copy = len; } - IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset), - GetIOVecPointer(curr_iov_index_, curr_iov_written_), - to_copy); - curr_iov_written_ += to_copy; + + IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset), + curr_iov_output_, curr_iov_output_ + to_copy, + curr_iov_output_ + curr_iov_remaining_); + curr_iov_output_ += to_copy; + curr_iov_remaining_ -= to_copy; from_iov_offset += to_copy; total_written_ += to_copy; len -= to_copy; @@ -1175,8 +1302,7 @@ class SnappyArrayWriter { const size_t space_left = op_limit_ - op; if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) { // Fast path, used for the majority (about 95%) of invocations. - UnalignedCopy64(ip, op); - UnalignedCopy64(ip + 8, op + 8); + UnalignedCopy128(ip, op); op_ = op + len; return true; } else { @@ -1185,8 +1311,7 @@ class SnappyArrayWriter { } inline bool AppendFromSelf(size_t offset, size_t len) { - char* op = op_; - const size_t space_left = op_limit_ - op; + char* const op_end = op_ + len; // Check if we try to append from before the start of the buffer. // Normally this would just be a check for "produced < offset", @@ -1195,30 +1320,13 @@ class SnappyArrayWriter { // to a very big number. This is convenient, as offset==0 is another // invalid case that we also want to catch, so that we do not go // into an infinite loop. - assert(op >= base_); - size_t produced = op - base_; - if (produced <= offset - 1u) { - return false; - } - if (len <= 16 && offset >= 8 && space_left >= 16) { - // Fast path, used for the majority (70-80%) of dynamic invocations. 
- UnalignedCopy64(op - offset, op); - UnalignedCopy64(op - offset + 8, op + 8); - } else { - if (space_left >= len + kMaxIncrementCopyOverflow) { - IncrementalCopyFastPath(op - offset, op, len); - } else { - if (space_left < len) { - return false; - } - IncrementalCopy(op - offset, op, len); - } - } + if (Produced() <= offset - 1u || op_end > op_limit_) return false; + op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_); - op_ = op + len; return true; } inline size_t Produced() const { + assert(op_ >= base_); return op_ - base_; } inline void Flush() {} @@ -1234,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) { return InternalUncompress(compressed, &output); } -bool Uncompress(const char* compressed, size_t n, string* uncompressed) { +bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) { size_t ulength; if (!GetUncompressedLength(compressed, n, &ulength)) { return false; @@ -1302,9 +1410,10 @@ void RawCompress(const char* input, *compressed_length = (writer.CurrentDestination() - compressed); } -size_t Compress(const char* input, size_t input_length, string* compressed) { +size_t Compress(const char* input, size_t input_length, + std::string* compressed) { // Pre-grow the buffer to the max length of the compressed output - compressed->resize(MaxCompressedLength(input_length)); + STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length)); size_t compressed_length; RawCompress(input, input_length, string_as_array(compressed), @@ -1327,7 +1436,7 @@ class SnappyScatteredWriter { // We need random access into the data generated so far. Therefore // we keep track of all of the generated data as an array of blocks. // All of the blocks except the last have length kBlockSize. - vector blocks_; + std::vector blocks_; size_t expected_; // Total size of all fully generated blocks so far @@ -1386,8 +1495,7 @@ class SnappyScatteredWriter { if (length <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) { // Fast path, used for the majority (about 95%) of invocations. - UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip)); - UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8)); + UnalignedCopy128(ip, op); op_ptr_ = op + length; return true; } else { @@ -1396,16 +1504,14 @@ class SnappyScatteredWriter { } inline bool AppendFromSelf(size_t offset, size_t len) { + char* const op_end = op_ptr_ + len; // See SnappyArrayWriter::AppendFromSelf for an explanation of // the "offset - 1u" trick. - if (offset - 1u < op_ptr_ - op_base_) { - const size_t space_left = op_limit_ - op_ptr_; - if (space_left >= len + kMaxIncrementCopyOverflow) { - // Fast path: src and dst in current block. - IncrementalCopyFastPath(op_ptr_ - offset, op_ptr_, len); - op_ptr_ += len; - return true; - } + if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ && + op_end <= op_limit_)) { + // Fast path: src and dst in current block. 
+ op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_); + return true; } return SlowAppendFromSelf(offset, len); } @@ -1433,7 +1539,7 @@ bool SnappyScatteredWriter::SlowAppend(const char* ip, size_t len) { } // Make new block - size_t bsize = min(kBlockSize, expected_ - full_size_); + size_t bsize = std::min(kBlockSize, expected_ - full_size_); op_base_ = allocator_.Allocate(bsize); op_ptr_ = op_base_; op_limit_ = op_base_ + bsize; @@ -1489,8 +1595,8 @@ class SnappySinkAllocator { void Flush(size_t size) { size_t size_written = 0; size_t block_size; - for (size_t i = 0; i < blocks_.size(); ++i) { - block_size = min(blocks_[i].size, size - size_written); + for (int i = 0; i < blocks_.size(); ++i) { + block_size = std::min(blocks_[i].size, size - size_written); dest_->AppendAndTakeOwnership(blocks_[i].data, block_size, &SnappySinkAllocator::Deleter, NULL); size_written += block_size; @@ -1510,7 +1616,7 @@ class SnappySinkAllocator { } Sink* dest_; - vector blocks_; + std::vector blocks_; // Note: copying this object is allowed }; @@ -1535,19 +1641,21 @@ bool Uncompress(Source* compressed, Sink* uncompressed) { char* buf = uncompressed->GetAppendBufferVariable( 1, uncompressed_len, &c, 1, &allocated_size); + const size_t compressed_len = compressed->Available(); // If we can get a flat buffer, then use it, otherwise do block by block // uncompression if (allocated_size >= uncompressed_len) { SnappyArrayWriter writer(buf); - bool result = InternalUncompressAllTags( - &decompressor, &writer, uncompressed_len); + bool result = InternalUncompressAllTags(&decompressor, &writer, + compressed_len, uncompressed_len); uncompressed->Append(buf, writer.Produced()); return result; } else { SnappySinkAllocator allocator(uncompressed); SnappyScatteredWriter writer(allocator); - return InternalUncompressAllTags(&decompressor, &writer, uncompressed_len); + return InternalUncompressAllTags(&decompressor, &writer, compressed_len, + uncompressed_len); } } -} // end namespace snappy +} // namespace snappy diff --git a/c_src/snappy/snappy.h b/c_src/snappy/snappy.h index 4568db8..e9805bf 100644 --- a/c_src/snappy/snappy.h +++ b/c_src/snappy/snappy.h @@ -39,7 +39,7 @@ #ifndef THIRD_PARTY_SNAPPY_SNAPPY_H__ #define THIRD_PARTY_SNAPPY_SNAPPY_H__ -#include +#include #include #include "snappy-stubs-public.h" @@ -69,11 +69,12 @@ namespace snappy { // Higher-level string based routines (should be sufficient for most users) // ------------------------------------------------------------------------ - // Sets "*output" to the compressed version of "input[0,input_length-1]". - // Original contents of *output are lost. + // Sets "*compressed" to the compressed version of "input[0,input_length-1]". + // Original contents of *compressed are lost. // - // REQUIRES: "input[]" is not an alias of "*output". - size_t Compress(const char* input, size_t input_length, string* output); + // REQUIRES: "input[]" is not an alias of "*compressed". + size_t Compress(const char* input, size_t input_length, + std::string* compressed); // Decompresses "compressed[0,compressed_length-1]" to "*uncompressed". // Original contents of "*uncompressed" are lost. @@ -82,7 +83,7 @@ namespace snappy { // // returns false if the message is corrupted and could not be decompressed bool Uncompress(const char* compressed, size_t compressed_length, - string* uncompressed); + std::string* uncompressed); // Decompresses "compressed" to "*uncompressed". 
// @@ -193,11 +194,14 @@ namespace snappy { // Note that there might be older data around that is compressed with larger // block sizes, so the decompression code should not rely on the // non-existence of long backreferences. - static const int kBlockLog = 16; - static const size_t kBlockSize = 1 << kBlockLog; + static constexpr int kBlockLog = 16; + static constexpr size_t kBlockSize = 1 << kBlockLog; - static const int kMaxHashTableBits = 14; - static const size_t kMaxHashTableSize = 1 << kMaxHashTableBits; + static constexpr int kMinHashTableBits = 8; + static constexpr size_t kMinHashTableSize = 1 << kMinHashTableBits; + + static constexpr int kMaxHashTableBits = 14; + static constexpr size_t kMaxHashTableSize = 1 << kMaxHashTableBits; } // end namespace snappy #endif // THIRD_PARTY_SNAPPY_SNAPPY_H__ diff --git a/c_src/snappy_nif.cc b/c_src/snappy_nif.cc index 50e9018..f9d0139 100644 --- a/c_src/snappy_nif.cc +++ b/c_src/snappy_nif.cc @@ -22,6 +22,10 @@ #include "snappy/snappy.h" #include "snappy/snappy-sinksource.h" +#ifdef OTP_R13B03 +#error OTP R13B03 not supported. Upgrade to R13B04 or later. +#endif + #ifdef __cplusplus #define BEGIN_C extern "C" { #define END_C } @@ -40,7 +44,7 @@ class SnappyNifSink : public snappy::Sink void Append(const char* data, size_t n); char* GetAppendBuffer(size_t len, char* scratch); - ErlNifBinary& GetBin(); + ErlNifBinary& getBin(); private: void EnsureSize(size_t append_length); @@ -83,7 +87,7 @@ SnappyNifSink::GetAppendBuffer(size_t len, char* scratch) } ErlNifBinary& -SnappyNifSink::GetBin() +SnappyNifSink::getBin() { if(bin.size > length) { if(!enif_realloc_binary_compat(env, &bin, length)) { @@ -93,15 +97,13 @@ SnappyNifSink::GetBin() return bin; } - void SnappyNifSink::EnsureSize(size_t append_length) { + size_t sz; + if((length + append_length) > bin.size) { - size_t sz = append_length * 4; - if(sz < 8192) { - sz = 8192; - } + sz = (append_length * 4) < 8192 ? 8192 : (append_length * 4); if(!enif_realloc_binary_compat(env, &bin, bin.size + sz)) { throw std::bad_alloc(); @@ -140,7 +142,7 @@ BEGIN_C ERL_NIF_TERM -snappy_compress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) +snappy_compress_erl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary input; @@ -148,12 +150,22 @@ snappy_compress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) return enif_make_badarg(env); } + // If empty binary has been provided, return an empty binary. + // Snappy will do this in any case, so might as well skip the + // overhead... + if(input.size == 0) { + ErlNifBinary empty; + // init empty; + memset(&empty,0,sizeof(ErlNifBinary)); + return make_ok(env, enif_make_binary(env,&empty)); + } + try { snappy::ByteArraySource source(SC_PTR(input.data), input.size); SnappyNifSink sink(env); snappy::Compress(&source, &sink); - return make_ok(env, enif_make_binary(env, &sink.GetBin())); - } catch(std::bad_alloc e) { + return make_ok(env, enif_make_binary(env, &sink.getBin())); + } catch(const std::bad_alloc & e) { return make_error(env, "insufficient_memory"); } catch(...) 
{ return make_error(env, "unknown"); @@ -162,7 +174,7 @@ snappy_compress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) ERL_NIF_TERM -snappy_decompress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) +snappy_decompress_erl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary bin; ErlNifBinary ret; @@ -172,6 +184,15 @@ snappy_decompress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) return enif_make_badarg(env); } + // Check that the binary is not empty + if(bin.size == 0) { + // Snappy library cannot decompress an empty binary - although + // it will unfortunately let you compress one. If an empty binary + // has been passed - send an empty binary back. + memset(&ret,0,sizeof(ErlNifBinary)); + return make_ok(env, enif_make_binary(env,&ret)); + } + try { if(!snappy::GetUncompressedLength(SC_PTR(bin.data), bin.size, &len)) { return make_error(env, "data_not_compressed"); @@ -194,7 +215,7 @@ snappy_decompress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) ERL_NIF_TERM -snappy_uncompressed_length(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) +snappy_uncompressed_length_erl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary bin; size_t len; @@ -257,9 +278,9 @@ on_upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM info) static ErlNifFunc nif_functions[] = { - {"compress", 1, snappy_compress}, - {"decompress", 1, snappy_decompress}, - {"uncompressed_length", 1, snappy_uncompressed_length}, + {"compress", 1, snappy_compress_erl}, + {"decompress", 1, snappy_decompress_erl}, + {"uncompressed_length", 1, snappy_uncompressed_length_erl}, {"is_valid", 1, snappy_is_valid} }; diff --git a/enc b/enc new file mode 100755 index 0000000..9bcea06 Binary files /dev/null and b/enc differ diff --git a/rebar.config b/rebar.config index 3b93f8a..cb89949 100644 --- a/rebar.config +++ b/rebar.config @@ -1,12 +1,12 @@ -{require_otp_vsn, "R14|R15|R16|17|18|19|20|21|22|23|24|25"}. - {erl_opts, [debug_info, warn_unused_vars, warn_shadow_vars, warn_unused_import]}. -{port_sources, ["c_src/*.cc", - "c_src/snappy/*.cc"]}. -{port_env, [ - {"CXXFLAGS", "$CXXFLAGS -DNDEBUG"}, - {"(linux|solaris|freebsd|netbsd|openbsd|dragonfly|darwin)", - "LDFLAGS", "$LDFLAGS -lstdc++"} -]}. -{so_name, "snappy_nif.so"}. {eunit_opts, [verbose]}. + +{artifacts, ["priv/snappy_nif.so"]}. + +{port_specs, [{".*", "priv/snappy_nif.so", + ["c_src/snappy/*.cc", "c_src/*.cc"], + [{env, [{"LDFLAGS", "$LDFLAGS -lstdc++"}, + {"CXXFLAGS", "$CXXFLAGS -std=c++11"}]}]}]}. + +{pre_hooks, [{"", compile, "./enc compile"}]}. +{post_hooks, [{"", clean, "./enc clean"}]}. diff --git a/rebar.lock b/rebar.lock new file mode 100644 index 0000000..57afcca --- /dev/null +++ b/rebar.lock @@ -0,0 +1 @@ +[]. diff --git a/rebar3 b/rebar3 new file mode 100755 index 0000000..c7d4824 Binary files /dev/null and b/rebar3 differ diff --git a/src/snappy.app.src b/src/snappy.app.src index 25d37b5..965a190 100644 --- a/src/snappy.app.src +++ b/src/snappy.app.src @@ -1,7 +1,7 @@ {application, snappy, [ {description, "snappy compressor/decompressor Erlang NIF wrapper"}, - {vsn, "1.0.5"}, + {vsn, "1.1.1"}, {registered, []}, {applications, [ kernel, diff --git a/src/snappy.erl b/src/snappy.erl index 7d3d36a..ebf1587 100644 --- a/src/snappy.erl +++ b/src/snappy.erl @@ -33,24 +33,20 @@ init() -> Dir -> filename:join(Dir, "snappy_nif") end, - (catch erlang:load_nif(SoName, 0)), - case erlang:system_info(otp_release) of - "R13B03" -> true; - _ -> ok - end. + erlang:load_nif(SoName, 0). 
 
 
 compress(_IoList) ->
-    exit(snappy_nif_not_loaded).
+    erlang:nif_error(snappy_nif_not_loaded).
 
 
 decompress(_IoList) ->
-    exit(snappy_nif_not_loaded).
+    erlang:nif_error(snappy_nif_not_loaded).
 
 
 uncompressed_length(_IoList) ->
-    exit(snappy_nif_not_loaded).
+    erlang:nif_error(snappy_nif_not_loaded).
 
 
 is_valid(_IoList) ->
-    exit(snappy_nif_not_loaded).
+    erlang:nif_error(snappy_nif_not_loaded).
diff --git a/test/snappy_tests.erl b/test/snappy_tests.erl
index ac39c58..37a0fc7 100644
--- a/test/snappy_tests.erl
+++ b/test/snappy_tests.erl
@@ -72,3 +72,36 @@ decompression() ->
     ?assertEqual({ok, BigData}, snappy:decompress(Compressed3)),
     ok.
 
+check_double_decompress_doesnt_cause_segfault_test() ->
+    % Try to decompress an empty binary
+    ?assertMatch({ok, <<>>}, snappy:decompress(<<>>)),
+    % And once more...
+    ?assertMatch({ok, <<>>}, snappy:decompress(<<>>)),
+    ok.
+
+check_double_compress_doesnt_cause_segfault_test() ->
+    % Try to compress an empty binary
+    ?assertMatch({ok, <<>>}, snappy:compress(<<>>)),
+    % And once more...
+    ?assertMatch({ok, <<>>}, snappy:compress(<<>>)),
+    ok.
+
+can_compress_and_decompress_empty_binary_test() ->
+    % Try to compress an empty binary...
+    {ok, Compressed} = snappy:compress(<<>>),
+    % ...and decompress the result of the compression.
+    {ok, _Decompressed} = snappy:decompress(Compressed),
+    ok.
+
+can_compress_an_empty_binary_test() ->
+    % Try to compress an empty binary...
+    {ok, Compressed} = snappy:compress(<<>>),
+    % Check an empty binary was returned
+    ?assertMatch(<<>>, Compressed),
+    ok.
+
+can_decompress_an_empty_binary_test() ->
+    % Try to decompress an empty binary
+    {ok, Decompressed} = snappy:decompress(<<>>),
+    ?assertMatch(<<>>, Decompressed),
+    ok.
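
Note on ExtractLowBytes (snappy.cc hunks above): it replaces the old wordmask[] lookups by keeping only the low n bytes of a 32-bit load, via _bzhi_u32 when BMI2 is available. Below is a sketch of just the portable fallback with a quick self-check; it assumes a C++11 compiler and uses standard fixed-width types instead of the project's uint32 typedef, and the BMI2 branch is omitted.

#include <cassert>
#include <cstdint>

// Keep only the low n bytes (n in [0, 4]) of v. The mask is computed in 64
// bits because `mask << 32` would be undefined for a 32-bit mask when n == 4.
static uint32_t ExtractLowBytes(uint32_t v, int n) {
  assert(n >= 0 && n <= 4);
  uint64_t mask = 0xffffffff;
  return v & static_cast<uint32_t>(~(mask << (8 * n)));
}

int main() {
  assert(ExtractLowBytes(0xdeadbeef, 0) == 0x00000000);
  assert(ExtractLowBytes(0xdeadbeef, 1) == 0x000000ef);
  assert(ExtractLowBytes(0xdeadbeef, 2) == 0x0000beef);
  assert(ExtractLowBytes(0xdeadbeef, 3) == 0x00adbeef);
  assert(ExtractLowBytes(0xdeadbeef, 4) == 0xdeadbeef);
  return 0;
}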
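
Note on the LeftShiftOverflows guard: ReadUncompressedLength now rejects varint length prefixes whose continuation bytes would shift set bits past bit 31 instead of silently wrapping. The sketch below applies the same guard to a generic base-128 varint32 decoder; DecodeVarint32 is a hypothetical helper written for illustration, and the overflow test is an arithmetic reformulation equivalent (for 7-bit payload bytes) to the patch's masks[] table rather than a copy of it.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Returns true if (value << shift) would push set bits out of a uint32.
static bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
  assert(shift < 32);
  return shift != 0 && (value >> (32 - shift)) != 0;
}

// Decodes a base-128 varint32 from buf[0..len). Returns the number of bytes
// consumed, or 0 on truncated or overflowing input.
static size_t DecodeVarint32(const uint8_t* buf, size_t len, uint32_t* result) {
  uint32_t shift = 0;
  *result = 0;
  for (size_t i = 0; i < len && shift <= 28; ++i, shift += 7) {
    uint32_t byte = buf[i];
    uint32_t val = byte & 0x7f;
    if (LeftShiftOverflows(static_cast<uint8_t>(val), shift)) return 0;
    *result |= val << shift;
    if (byte < 128) return i + 1;
  }
  return 0;  // truncated, or more than five length bytes
}

int main() {
  uint32_t v = 0;
  const uint8_t good[] = {0xac, 0x02};  // 300 in base-128, little-endian
  assert(DecodeVarint32(good, sizeof(good), &v) == 2 && v == 300);

  const uint8_t too_big[] = {0xff, 0xff, 0xff, 0xff, 0x7f};  // exceeds UINT32_MAX
  assert(DecodeVarint32(too_big, sizeof(too_big), &v) == 0);
  return 0;
}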
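
Note on the "offset - 1u" trick: both SnappyArrayWriter::AppendFromSelf and SnappyIOVecWriter::AppendFromSelf in the hunks above rely on it to validate back-references. The standalone sketch below (illustrative names, not code from this patch) shows why a single unsigned comparison rejects both offset == 0 and offsets that reach before the start of the produced output.

#include <cassert>
#include <cstddef>

// A back-reference (offset, len) is valid only when 1 <= offset <= produced.
// Because the subtraction is unsigned, offset == 0 wraps around to SIZE_MAX,
// so one comparison catches both invalid cases.
static bool InvalidBackReference(size_t offset, size_t produced) {
  return offset - 1u >= produced;  // wraps for offset == 0
}

int main() {
  assert(InvalidBackReference(0, 100));     // offset 0 is never valid
  assert(InvalidBackReference(101, 100));   // points before the produced output
  assert(!InvalidBackReference(1, 100));    // smallest valid offset
  assert(!InvalidBackReference(100, 100));  // references the first byte written
  return 0;
}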
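
Note on the empty-input shortcut in snappy_nif.cc: snappy_compress_erl and snappy_decompress_erl return an empty binary built from a zeroed ErlNifBinary. A slightly more explicit alternative, shown here only as a sketch and not what the patch does (make_empty_binary is a hypothetical helper), is to ask the VM for a zero-length binary term directly:

#include <erl_nif.h>

// Build an empty Erlang binary term without going through an ErlNifBinary
// struct. enif_make_new_binary allocates `size` bytes owned by the
// environment and hands back the term; with size 0 the buffer is unused.
static ERL_NIF_TERM make_empty_binary(ErlNifEnv* env) {
  ERL_NIF_TERM term;
  (void)enif_make_new_binary(env, 0, &term);
  return term;
}

Either way the Erlang side observes {ok, <<>>} for empty input, which is exactly what the new EUnit tests assert.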