From 13d34c012b90c750845a727c53d49fd5c8abdc84 Mon Sep 17 00:00:00 2001
From: Andrea Mazzoleni <amadvance@gmail.com>
Date: Sun, 4 Jun 2017 09:54:03 +0200
Subject: [PATCH] Update libdeflate to upstream commit 1726e9e (29-May-2017)

---
 HISTORY                       |  8 ++++----
 doc/history.1                 | 10 +++++-----
 doc/history.d                 | 10 +++++-----
 doc/history.txt               | 10 +++++-----
 libdeflate/bt_matchfinder.h   | 10 ++++------
 libdeflate/deflate_compress.c | 19 ++++++++++++++-----
 libdeflate/hc_matchfinder.h   | 18 ++++++++----------
 libdeflate/matchfinder_avx2.h |  4 ++--
 libdeflate/matchfinder_neon.h |  4 ++--
 libdeflate/matchfinder_sse2.h |  4 ++--
 10 files changed, 51 insertions(+), 46 deletions(-)

diff --git a/HISTORY b/HISTORY
index 8735d8c..c99ab95 100644
--- a/HISTORY
+++ b/HISTORY
@@ -3,14 +3,14 @@
                             =======================
 
 
-ADVANCECOMP VERSION 2.0 2017/03
+ADVANCECOMP VERSION 2.0 2017/06
 ===============================
 
 * Added support for reading MNG files with depth of 1, 2, and 4 bits.
 * Added 64 bits binary for Windows.
-* Updated to libdeflate 0.7.
+* Updated to libdeflate 29-May-2017.
   From https://github.com/ebiggers/libdeflate
-  at commit a32bdb097de48e5ddffc959a58297d384b58fcaa.
+  at commit 1726e9e87fb6f98682dfdea2356d5ee58881fe7b.
 
 
 ADVANCECOMP VERSION 1.23 2016/11
@@ -32,7 +32,7 @@ ADVANCECOMP VERSION 1.21 2016/11
 * Added libdeflate support. It's the new default because it provides
   better performance and compression than 7z.
   From https://github.com/ebiggers/libdeflate
-  at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88
+  at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88.
 * Update to the latest zopfli library.
   From https://github.com/google/zopfli
   at commit 6818a0859063b946094fb6f94732836404a0d89a.
diff --git a/doc/history.1 b/doc/history.1
index fc8bc41..bc925f6 100644
--- a/doc/history.1
+++ b/doc/history.1
@@ -1,16 +1,16 @@
 .TH "History For AdvanceCOMP" 1
 .SH NAME
 advcomp \- History For AdvanceCOMP
-.SH ADVANCECOMP VERSION 2.0 2017/01 
+.SH ADVANCECOMP VERSION 2.0 2017/06 
 .PD 0
 .IP \(bu
 Added support for reading MNG files with depth of 1, 2, and 4 bits.
 .IP \(bu
 Added 64 bits binary for Windows.
 .IP \(bu
-Updated to libdeflate 0.7.
-From https://github.com/google/zopfli
-at commit a32bdb097de48e5ddffc959a58297d384b58fcaa.
+Updated to libdeflate 29\-May\-2017.
+From https://github.com/ebiggers/libdeflate
+at commit 1726e9e87fb6f98682dfdea2356d5ee58881fe7b.
 .PD
 .SH ADVANCECOMP VERSION 1.23 2016/11 
 .PD 0
@@ -29,7 +29,7 @@ builds. The new MingW compiler was disabling it by default.
 Added libdeflate support. It\'s the new default because it provides
 better performance and compression than 7z.
 From https://github.com/ebiggers/libdeflate
-at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88
+at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88.
 .IP \(bu
 Update to the latest zopfli library.
 From https://github.com/google/zopfli
diff --git a/doc/history.d b/doc/history.d
index d95e0ea..85904d5 100644
--- a/doc/history.d
+++ b/doc/history.d
@@ -1,12 +1,12 @@
 Name
 	advcomp - History For AdvanceCOMP
 
-AdvanceCOMP Version 2.0 2017/01
+AdvanceCOMP Version 2.0 2017/06
 	) Added support for reading MNG files with depth of 1, 2, and 4 bits.
 	) Added 64 bits binary for Windows.
-	) Updated to libdeflate 0.7.
-		From https://github.com/google/zopfli
-		at commit a32bdb097de48e5ddffc959a58297d384b58fcaa.
+	) Updated to libdeflate 29-May-2017.
+		From https://github.com/ebiggers/libdeflate
+		at commit 1726e9e87fb6f98682dfdea2356d5ee58881fe7b.
 
 AdvanceCOMP Version 1.23 2016/11
 	) Fixed build issue from source code due missing libdeflate header.
@@ -19,7 +19,7 @@ AdvanceCOMP Version 1.21 2016/11
 	) Added libdeflate support. It's the new default because it provides
 		better performance and compression than 7z.
 		From https://github.com/ebiggers/libdeflate
-		at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88
+		at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88.
 	) Update to the latest zopfli library.
 		From https://github.com/google/zopfli
 		at commit 6818a0859063b946094fb6f94732836404a0d89a.
diff --git a/doc/history.txt b/doc/history.txt
index 859fc78..4d096fb 100644
--- a/doc/history.txt
+++ b/doc/history.txt
@@ -3,14 +3,14 @@
                             =======================
 
 
-ADVANCECOMP VERSION 2.0 2017/01
+ADVANCECOMP VERSION 2.0 2017/06
 ===============================
 
 * Added support for reading MNG files with depth of 1, 2, and 4 bits.
 * Added 64 bits binary for Windows.
-* Updated to libdeflate 0.7.
-  From https://github.com/google/zopfli
-  at commit a32bdb097de48e5ddffc959a58297d384b58fcaa.
+* Updated to libdeflate 29-May-2017.
+  From https://github.com/ebiggers/libdeflate
+  at commit 1726e9e87fb6f98682dfdea2356d5ee58881fe7b.
 
 
 ADVANCECOMP VERSION 1.23 2016/11
@@ -32,7 +32,7 @@ ADVANCECOMP VERSION 1.21 2016/11
 * Added libdeflate support. It's the new default because it provides
   better performance and compression than 7z.
   From https://github.com/ebiggers/libdeflate
-  at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88
+  at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88.
 * Update to the latest zopfli library.
   From https://github.com/google/zopfli
   at commit 6818a0859063b946094fb6f94732836404a0d89a.
diff --git a/libdeflate/bt_matchfinder.h b/libdeflate/bt_matchfinder.h
index 5039b0a..49fc0bf 100644
--- a/libdeflate/bt_matchfinder.h
+++ b/libdeflate/bt_matchfinder.h
@@ -153,8 +153,7 @@ bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
 	const u8 *in_next = in_base + cur_pos;
 	u32 depth_remaining = max_search_depth;
 	const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
-	u32 next_seq4;
-	u32 next_seq3;
+	u32 next_hashseq;
 	u32 hash3;
 	u32 hash4;
 	s32 cur_node;
@@ -170,14 +169,13 @@ bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
 	STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
 		      BT_MATCHFINDER_HASH3_WAYS <= 2);
 
-	next_seq4 = load_u32_unaligned(in_next + 1);
-	next_seq3 = loaded_u32_to_u24(next_seq4);
+	next_hashseq = get_unaligned_le32(in_next + 1);
 
 	hash3 = next_hashes[0];
 	hash4 = next_hashes[1];
 
-	next_hashes[0] = lz_hash(next_seq3, BT_MATCHFINDER_HASH3_ORDER);
-	next_hashes[1] = lz_hash(next_seq4, BT_MATCHFINDER_HASH4_ORDER);
+	next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER);
+	next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER);
 	prefetchw(&mf->hash3_tab[next_hashes[0]]);
 	prefetchw(&mf->hash4_tab[next_hashes[1]]);
 
diff --git a/libdeflate/deflate_compress.c b/libdeflate/deflate_compress.c
index a77314b..5049b13 100644
--- a/libdeflate/deflate_compress.c
+++ b/libdeflate/deflate_compress.c
@@ -491,10 +491,19 @@ struct deflate_output_bitstream {
 	u8 *end;
 };
 
-#define MIN_OUTPUT_SIZE	(UNALIGNED_ACCESS_IS_FAST ? sizeof(bitbuf_t) : 1)
+/*
+ * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
+ * present following os->end, in order to not overrun the buffer when generating
+ * output.  When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
+ * bytes for put_unaligned_leword().  Otherwise we need only 1 byte.  However,
+ * to make the compression algorithm produce the same result on all CPU
+ * architectures (which is sometimes desirable), we have to unconditionally use
+ * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
+ */
+#define OUTPUT_END_PADDING	8
 
 /* Initialize the output bitstream.  'size' is assumed to be at least
- * MIN_OUTPUT_SIZE.  */
+ * OUTPUT_END_PADDING.  */
 static void
 deflate_init_output(struct deflate_output_bitstream *os,
 		    void *buffer, size_t size)
@@ -503,7 +512,7 @@ deflate_init_output(struct deflate_output_bitstream *os,
 	os->bitcount = 0;
 	os->begin = buffer;
 	os->next = os->begin;
-	os->end = os->begin + size - MIN_OUTPUT_SIZE;
+	os->end = os->begin + size - OUTPUT_END_PADDING;
 }
 
 /* Add some bits to the bitbuffer variable of the output bitstream.  The caller
@@ -2774,7 +2783,7 @@ libdeflate_deflate_compress(struct libdeflate_compressor *c,
 			    const void *in, size_t in_nbytes,
 			    void *out, size_t out_nbytes_avail)
 {
-	if (unlikely(out_nbytes_avail < MIN_OUTPUT_SIZE))
+	if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
 		return 0;
 
 	/* For extremely small inputs just use a single uncompressed block. */
@@ -2813,5 +2822,5 @@ libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
 	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
 	 */
 	size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
-	return (5 * max_num_blocks) + in_nbytes + 1 + MIN_OUTPUT_SIZE;
+	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
 }
diff --git a/libdeflate/hc_matchfinder.h b/libdeflate/hc_matchfinder.h
index 0def8f9..8412a6f 100644
--- a/libdeflate/hc_matchfinder.h
+++ b/libdeflate/hc_matchfinder.h
@@ -194,7 +194,7 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
 	const u8 *best_matchptr = in_next;
 	mf_pos_t cur_node3, cur_node4;
 	u32 hash3, hash4;
-	u32 next_seq3, next_seq4;
+	u32 next_hashseq;
 	u32 seq4;
 	const u8 *matchptr;
 	u32 len;
@@ -232,10 +232,9 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
 	mf->next_tab[cur_pos] = cur_node4;
 
 	/* Compute the next hash codes.  */
-	next_seq4 = load_u32_unaligned(in_next + 1);
-	next_seq3 = loaded_u32_to_u24(next_seq4);
-	next_hashes[0] = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
-	next_hashes[1] = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
+	next_hashseq = get_unaligned_le32(in_next + 1);
+	next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
+	next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
 	prefetchw(&mf->hash3_tab[next_hashes[0]]);
 	prefetchw(&mf->hash4_tab[next_hashes[1]]);
 
@@ -370,7 +369,7 @@ hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf,
 {
 	u32 cur_pos;
 	u32 hash3, hash4;
-	u32 next_seq3, next_seq4;
+	u32 next_hashseq;
 	u32 remaining = count;
 
 	if (unlikely(count + 5 > in_end - in_next))
@@ -389,10 +388,9 @@ hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf,
 		mf->next_tab[cur_pos] = mf->hash4_tab[hash4];
 		mf->hash4_tab[hash4] = cur_pos;
 
-		next_seq4 = load_u32_unaligned(++in_next);
-		next_seq3 = loaded_u32_to_u24(next_seq4);
-		hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
-		hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
+		next_hashseq = get_unaligned_le32(++in_next);
+		hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
+		hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
 		cur_pos++;
 	} while (--remaining);
 
diff --git a/libdeflate/matchfinder_avx2.h b/libdeflate/matchfinder_avx2.h
index 6187ee7..3514226 100644
--- a/libdeflate/matchfinder_avx2.h
+++ b/libdeflate/matchfinder_avx2.h
@@ -11,7 +11,7 @@ matchfinder_init_avx2(mf_pos_t *data, size_t size)
 	__m256i v, *p;
 	size_t n;
 
-	if (size % sizeof(__m256i) * 4)
+	if (size % (sizeof(__m256i) * 4) != 0)
 		return false;
 
 	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
@@ -34,7 +34,7 @@ matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
 	__m256i v, *p;
 	size_t n;
 
-	if ((size % sizeof(__m256i) * 4 != 0))
+	if (size % (sizeof(__m256i) * 4) != 0)
 		return false;
 
 	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
diff --git a/libdeflate/matchfinder_neon.h b/libdeflate/matchfinder_neon.h
index 42ec662..e2512d1 100644
--- a/libdeflate/matchfinder_neon.h
+++ b/libdeflate/matchfinder_neon.h
@@ -11,7 +11,7 @@ matchfinder_init_neon(mf_pos_t *data, size_t size)
 	int16x8_t v, *p;
 	size_t n;
 
-	if (size % sizeof(int16x8_t) * 4)
+	if (size % (sizeof(int16x8_t) * 4) != 0)
 		return false;
 
 	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
@@ -38,7 +38,7 @@ matchfinder_rebase_neon(mf_pos_t *data, size_t size)
 	int16x8_t v, *p;
 	size_t n;
 
-	if ((size % sizeof(int16x8_t) * 4 != 0))
+	if (size % (sizeof(int16x8_t) * 4) != 0)
 		return false;
 
 	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
diff --git a/libdeflate/matchfinder_sse2.h b/libdeflate/matchfinder_sse2.h
index c949602..bbed3b6 100644
--- a/libdeflate/matchfinder_sse2.h
+++ b/libdeflate/matchfinder_sse2.h
@@ -11,7 +11,7 @@ matchfinder_init_sse2(mf_pos_t *data, size_t size)
 	__m128i v, *p;
 	size_t n;
 
-	if (size % sizeof(__m128i) * 4)
+	if (size % (sizeof(__m128i) * 4) != 0)
 		return false;
 
 	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
@@ -34,7 +34,7 @@ matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
 	__m128i v, *p;
 	size_t n;
 
-	if ((size % sizeof(__m128i) * 4 != 0))
+	if (size % (sizeof(__m128i) * 4) != 0)
 		return false;
 
 	STATIC_ASSERT(sizeof(mf_pos_t) == 2);