From bcbe58ffae6ae2fdbfeb7f2d1f097eeb37bb895b Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Thu, 23 May 2024 06:20:15 +0200 Subject: [PATCH 01/22] Make compiler definitions complete --- src/sequali/function_dispatch.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/sequali/function_dispatch.h b/src/sequali/function_dispatch.h index b3a7ca5..42205d9 100644 --- a/src/sequali/function_dispatch.h +++ b/src/sequali/function_dispatch.h @@ -27,13 +27,21 @@ along with Sequali. If not, see Date: Thu, 23 May 2024 06:21:40 +0200 Subject: [PATCH 02/22] Split out compiler definitions in a separate file --- src/sequali/compiler_defs.h | 52 +++++++++++++++++++++++++++++++++ src/sequali/function_dispatch.h | 36 +---------------------- 2 files changed, 53 insertions(+), 35 deletions(-) create mode 100644 src/sequali/compiler_defs.h diff --git a/src/sequali/compiler_defs.h b/src/sequali/compiler_defs.h new file mode 100644 index 0000000..d240a4e --- /dev/null +++ b/src/sequali/compiler_defs.h @@ -0,0 +1,52 @@ +/* +Copyright (C) 2023 Leiden University Medical Center +This file is part of Sequali + +Sequali is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +Sequali is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with Sequali. If not, see (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +#else +# define GCC_AT_LEAST(major, minor) 0 +#endif + +#ifdef __clang__ +#ifdef __has_attribute +#define CLANG_COMPILER_HAS(attribute) __has_attribute(attribute) +#endif +#ifdef __has_builtin +#define CLANG_COMPILER_HAS_BUILTIN(function) __has_builtin(function) +#endif +#endif +#ifndef CLANG_COMPILER_HAS +#define CLANG_COMPILER_HAS(attribute) 0 +#endif +#ifndef CLANG_COMPILER_HAS_BUILTIN +#define CLANG_COMPILER_HAS_BUILTIN(function) 0 +#endif + +#define COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \ + (GCC_AT_LEAST(4, 8) || (CLANG_COMPILER_HAS(__target__) && \ + CLANG_COMPILER_HAS_BUILTIN(__builtin_cpu_supports))) +#define COMPILER_HAS_OPTIMIZE (GCC_AT_LEAST(4,4) || CLANG_COMPILER_HAS(optimize)) + +#if defined(__x86_64__) || defined(_M_X64) +#define BUILD_IS_X86_64 1 +#include "immintrin.h" +#else +#define BUILD_IS_X86_64 0 +#endif diff --git a/src/sequali/function_dispatch.h b/src/sequali/function_dispatch.h index 42205d9..3349558 100644 --- a/src/sequali/function_dispatch.h +++ b/src/sequali/function_dispatch.h @@ -16,41 +16,7 @@ You should have received a copy of the GNU Affero General Public License along with Sequali. 
If not, see (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) -#else -# define GCC_AT_LEAST(major, minor) 0 -#endif - -#ifdef __clang__ -#ifdef __has_attribute -#define CLANG_COMPILER_HAS(attribute) __has_attribute(attribute) -#endif -#ifdef __has_builtin -#define CLANG_COMPILER_HAS_BUILTIN(function) __has_builtin(function) -#endif -#endif -#ifndef CLANG_COMPILER_HAS -#define CLANG_COMPILER_HAS(attribute) 0 -#endif -#ifndef CLANG_COMPILER_HAS_BUILTIN -#define CLANG_COMPILER_HAS_BUILTIN(function) 0 -#endif - -#define COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \ - (GCC_AT_LEAST(4, 8) || (CLANG_COMPILER_HAS(__target__) && \ - CLANG_COMPILER_HAS_BUILTIN(__builtin_cpu_supports))) -#define COMPILER_HAS_OPTIMIZE (GCC_AT_LEAST(4,4) || CLANG_COMPILER_HAS(optimize)) - -#if defined(__x86_64__) || defined(_M_X64) -#define BUILD_IS_X86_64 1 -#include "immintrin.h" -#else -#define BUILD_IS_X86_64 0 -#endif - +#include "compiler_defs.h" #include #include #include From 448b2d8370069658a004fc6240f74ae2bf35013b Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Thu, 23 May 2024 06:29:59 +0200 Subject: [PATCH 03/22] Split out smith waterman function --- src/sequali/_seqidentmodule.c | 137 ++++++++++++++++++++-------------- 1 file changed, 81 insertions(+), 56 deletions(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index dc6804f..465a1d4 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -23,6 +23,76 @@ struct Entry { Py_ssize_t query_matches; }; +static int8_t +get_smith_waterman_matches( + const uint8_t *restrict target, + size_t target_length, + const uint8_t *restrict query, + size_t query_length, + int8_t match_score, + int8_t mismatch_penalty, + int8_t deletion_penalty, + int8_t insertion_penalty +) +{ + Py_ssize_t highest_score = 0; + Py_ssize_t most_matches = 0; + struct Entry prev_column[32]; + struct Entry new_column[32]; + memset(prev_column, 0, 32 * sizeof(struct Entry)); + memset(new_column, 0, 32 * sizeof(struct Entry)); + for (size_t i=0; i < target_length; i++) { + for (size_t j=1; j < query_length + 1; j++) { + uint8_t target_char = target[i]; + uint8_t query_char = query[j - 1]; + struct Entry prev_entry = prev_column[j-1]; + Py_ssize_t linear_score; + Py_ssize_t linear_matches; + if (target_char == query_char) { + linear_score = prev_entry.score + match_score; + linear_matches = prev_entry.query_matches + 1; + } else { + linear_score = prev_entry.score + mismatch_penalty; + linear_matches = prev_entry.query_matches; + } + struct Entry prev_ins_entry = prev_column[j]; + struct Entry prev_del_entry = new_column[j - 1]; + Py_ssize_t insertion_score = prev_ins_entry.score + insertion_penalty; + Py_ssize_t deletion_score = prev_del_entry.score + deletion_penalty; + Py_ssize_t score; + Py_ssize_t matches; + if (linear_score >= insertion_score && linear_score >= deletion_score) { + score = linear_score; + matches = linear_matches; + } else if (insertion_score >= deletion_score) { + /* When an insertion happens in the query in theory we can + match all query characeters still. So deduct one as a penalty. */ + score = insertion_score; + matches = prev_ins_entry.query_matches - 1; + } else { + /* When a deletion happens in the query, that character cannot + match anything anymore. So no need to deduct a penalty. 
*/ + score = deletion_score; + matches = prev_del_entry.query_matches; + } + if (score < 0) { + score = 0; + matches = 0; + } + new_column[j].score = score; + new_column[j].query_matches = matches; + if (score == highest_score && matches > most_matches) { + most_matches = matches; + } else if (score > highest_score) { + highest_score = score; + most_matches = matches; + } + } + memcpy(prev_column, new_column, sizeof(prev_column)); + } + return most_matches; +} + PyDoc_STRVAR(sequence_identity__doc__, "Calculate sequence identity based on a smith-waterman matrix. Only keep\n" "two columns in memory as no walk-back is needed.\n" @@ -37,7 +107,7 @@ sequence_identity(PyObject *module, PyObject *args, PyObject *kwargs) static char *format = "UU|nnnn:identify_sequence"; static char *kwnames[] = { "target", "query", "match_score", "mismatch_penalty", - "deletion_penalty", "inertion_penalty", NULL + "deletion_penalty", "insertion_penalty", NULL }; PyObject *target_obj = NULL; PyObject *query_obj = NULL; @@ -80,61 +150,16 @@ sequence_identity(PyObject *module, PyObject *args, PyObject *kwargs) ); return NULL; } - Py_ssize_t highest_score = 0; - Py_ssize_t most_matches = 0; - struct Entry prev_column[32]; - struct Entry new_column[32]; - memset(prev_column, 0, 32 * sizeof(struct Entry)); - memset(new_column, 0, 32 * sizeof(struct Entry)); - for (Py_ssize_t i=0; i < target_length; i++) { - for (Py_ssize_t j=1; j < query_length + 1; j++) { - uint8_t target_char = target[i]; - uint8_t query_char = query[j - 1]; - struct Entry prev_entry = prev_column[j-1]; - Py_ssize_t linear_score; - Py_ssize_t linear_matches; - if (target_char == query_char) { - linear_score = prev_entry.score + match_score; - linear_matches = prev_entry.query_matches + 1; - } else { - linear_score = prev_entry.score + mismatch_penalty; - linear_matches = prev_entry.query_matches; - } - struct Entry prev_ins_entry = prev_column[j]; - struct Entry prev_del_entry = new_column[j - 1]; - Py_ssize_t insertion_score = prev_ins_entry.score + insertion_penalty; - Py_ssize_t deletion_score = prev_del_entry.score + deletion_penalty; - Py_ssize_t score; - Py_ssize_t matches; - if (linear_score >= insertion_score && linear_score >= deletion_score) { - score = linear_score; - matches = linear_matches; - } else if (insertion_score >= deletion_score) { - /* When an insertion happens in the query in theory we can - match all query characeters still. So deduct one as a penalty. */ - score = insertion_score; - matches = prev_ins_entry.query_matches - 1; - } else { - /* When a deletion happens in the query, that character cannot - match anything anymore. So no need to deduct a penalty. 
*/ - score = deletion_score; - matches = prev_del_entry.query_matches; - } - if (score < 0) { - score = 0; - matches = 0; - } - new_column[j].score = score; - new_column[j].query_matches = matches; - if (score == highest_score && matches > most_matches) { - most_matches = matches; - } else if (score > highest_score) { - highest_score = score; - most_matches = matches; - } - } - memcpy(prev_column, new_column, sizeof(prev_column)); - } + Py_ssize_t most_matches = get_smith_waterman_matches( + target, + target_length, + query, + query_length, + match_score, + mismatch_penalty, + deletion_penalty, + insertion_penalty + ); double identity = (double)most_matches / (double)query_length; return PyFloat_FromDouble(identity); } From de47317bbccdfb1f3eb211e74ba909247af3734d Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Thu, 23 May 2024 06:33:37 +0200 Subject: [PATCH 04/22] Target char dereferencing out of inner loop --- src/sequali/_seqidentmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index 465a1d4..ad02a79 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -42,8 +42,8 @@ get_smith_waterman_matches( memset(prev_column, 0, 32 * sizeof(struct Entry)); memset(new_column, 0, 32 * sizeof(struct Entry)); for (size_t i=0; i < target_length; i++) { + uint8_t target_char = target[i]; for (size_t j=1; j < query_length + 1; j++) { - uint8_t target_char = target[i]; uint8_t query_char = query[j - 1]; struct Entry prev_entry = prev_column[j-1]; Py_ssize_t linear_score; From b688bf4f2492957bfa994759aa4a4ed08cb1b671 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Thu, 23 May 2024 06:43:47 +0200 Subject: [PATCH 05/22] Build skeleton for specialized implementations --- src/sequali/_seqidentmodule.c | 86 ++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index ad02a79..b7043b8 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -16,6 +16,7 @@ You should have received a copy of the GNU Affero General Public License along with Sequali. If not, see Date: Thu, 23 May 2024 10:50:06 +0200 Subject: [PATCH 06/22] Implement a reverse-diagonal avx2 approach for the Smith-Waterman alignment in sequence_identity --- src/sequali/_seqidentmodule.c | 131 ++++++++++++++++++++++++++++++++-- 1 file changed, 126 insertions(+), 5 deletions(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index b7043b8..69aa0ba 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -96,10 +96,14 @@ get_smith_waterman_matches_default( #if COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS && BUILD_IS_X86_64 __attribute__((__target__("avx2"))) +/** + * Get the smith waterman matches by an avx2 algorithm. This goes over diagonals + * rather than columns, as the diagonals are independent. 
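+ * Cells on one reverse (anti-)diagonal depend only on the two diagonals
+ * before it, never on each other, so the whole diagonal fits in a single
+ * 32-byte vector and all of its cells can be updated at once. A scalar
+ * sketch of this traversal is appended as a note at the end of this series.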
+ */ static int8_t get_smith_waterman_matches_avx2( const uint8_t *restrict target, - size_t target_length, + size_t target_length, const uint8_t *restrict query, size_t query_length, int8_t match_score, @@ -107,10 +111,127 @@ get_smith_waterman_matches_avx2( int8_t deletion_penalty, int8_t insertion_penalty ) { - return get_smith_waterman_matches_default( - target, target_length, query, query_length, match_score, - mismatch_penalty, deletion_penalty, insertion_penalty - ); + + uint8_t *padded_target = PyMem_Calloc(target_length + 62, 1); + if (padded_target == NULL) { + return -1; + } + memcpy(padded_target + 31, target, target_length); + uint8_t padded_query_store[32]; + uint8_t *padded_query = padded_query_store; + /* Pacters than target */ + memset(padded_query, 0xff, sizeof(padded_query_store)); + for (size_t i=0; i best_matches) { + best_matches = matches; + } else if (score > highest_score) { + highest_score = score; + best_matches = matches; + } + } + PyMem_Free(padded_target); + return best_matches; } static int8_t From 359dd3a6c9e22ace38839c91e5177fb0f8ce47ee Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sat, 25 May 2024 14:56:51 +0200 Subject: [PATCH 07/22] Hoist invariant storage arrays outside of the loop --- src/sequali/_seqidentmodule.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index 69aa0ba..be02d56 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -125,6 +125,11 @@ get_smith_waterman_matches_avx2( size_t index = 31 - i; padded_query[index] = query[i]; } + uint8_t prev_deletion_score_store[33]; + memset(prev_deletion_score_store, 0, 33); + uint8_t prev_deletion_matches_store[33]; + memset(prev_deletion_matches_store, 0, 33); + __m256i query_vec = _mm256_lddqu_si256((__m256i *)padded_query); __m256i max_matches = _mm256_setzero_si256(); __m256i max_score = _mm256_setzero_si256(); @@ -144,14 +149,10 @@ get_smith_waterman_matches_avx2( __m256i prev_linear_matches = prev_prev_diagonal_matches; __m256i prev_insertion_score = prev_diagonal_score; __m256i prev_insertion_matches = prev_diagonal_matches; - uint8_t prev_deletion_score_store[33]; - memset(prev_deletion_score_store, 0, 33); _mm256_storeu_si256((__m256i *)prev_deletion_score_store, prev_diagonal_score); __m256i prev_deletion_score = _mm256_lddqu_si256( (__m256i *)((uint8_t *)prev_deletion_score_store + 1)); - uint8_t prev_deletion_matches_store[33]; - memset(prev_deletion_matches_store, 0, 33); _mm256_storeu_si256( (__m256i *)prev_deletion_matches_store, prev_diagonal_matches); From 186ca002de56b113bc920ac01b5b8c49e62424ee Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 26 May 2024 07:59:07 +0200 Subject: [PATCH 08/22] Update changelog with avx2 enabled SMith-Waterman --- CHANGELOG.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 99d4437..468d031 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,11 @@ Changelog .. This document is user facing. Please word the changes in such a way .. that users understand how the changes affect the new version. +version 0.10.0-dev +------------------ ++ Speed up sequence identity calculations on AVX2 enabled CPUs by using a + reverse-diagonal parallelized version of Smith-Waterman. 
+ version 0.9.1 ----------------- + Fix an issue where the insert size metrics module would crash when no From a154d3251d2f7195f56570a697b516674ec147de Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 26 May 2024 22:06:14 +0200 Subject: [PATCH 09/22] Ensure matching is correct --- scripts/benchmark_sequence_identity.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/benchmark_sequence_identity.py b/scripts/benchmark_sequence_identity.py index c517d95..0bc2b92 100644 --- a/scripts/benchmark_sequence_identity.py +++ b/scripts/benchmark_sequence_identity.py @@ -10,5 +10,7 @@ data = json.load(f) sequence_dicts = data["overrepresented_sequences"]["overrepresented_sequences"] for seqdict in sequence_dicts: - identify_sequence_builtin(seqdict["sequence"]) - + best_match = seqdict["best_match"] + total, max, found_match = identify_sequence_builtin(seqdict["sequence"]) + if best_match != found_match: + raise ValueError(f"{best_match} != {found_match}") From 090ac20f4702a8a1682dcfe45d4e4aa5c9c98caa Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 26 May 2024 09:49:58 +0200 Subject: [PATCH 10/22] Do not temporarily store in memory but use vector manipulations exclusively for Smith-Waterman --- src/sequali/_seqidentmodule.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index be02d56..e2ea176 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -95,6 +95,25 @@ get_smith_waterman_matches_default( } #if COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS && BUILD_IS_X86_64 + +/** + * @brief Shift everything one byte down. Similar to _mm256_bslli_epi128, but + * does shift over the 128-bit lanes. + */ +__attribute__((__target__("avx2"))) +static inline __m256i _mm256_move_one_down(__m256i vec) { + /* This moves the vector one down, but the entry at position 15 will be + missing, as the shift does not go beyond the 128-bit lanes. */ + __m256i shifted_vec = _mm256_bsrli_epi128(vec, 1); + /* Rotate 64 bit integers: 1, 2, 3, 0. */ + __m256i rotated_vec = _mm256_permute4x64_epi64(vec, 0b00111001); + /* shift 7 bytes up. This brings the right byte to position 15. */ + __m256i shifted_rotated_vec = _mm256_slli_epi64(rotated_vec, 56); + __m256i masked_vec = _mm256_and_si256( + _mm256_setr_epi64x(0, -1, 0, 0), shifted_rotated_vec); + return _mm256_or_si256(shifted_vec, masked_vec); +} + __attribute__((__target__("avx2"))) /** * Get the smith waterman matches by an avx2 algorithm. 
This goes over diagonals @@ -125,10 +144,6 @@ get_smith_waterman_matches_avx2( size_t index = 31 - i; padded_query[index] = query[i]; } - uint8_t prev_deletion_score_store[33]; - memset(prev_deletion_score_store, 0, 33); - uint8_t prev_deletion_matches_store[33]; - memset(prev_deletion_matches_store, 0, 33); __m256i query_vec = _mm256_lddqu_si256((__m256i *)padded_query); __m256i max_matches = _mm256_setzero_si256(); @@ -149,16 +164,8 @@ get_smith_waterman_matches_avx2( __m256i prev_linear_matches = prev_prev_diagonal_matches; __m256i prev_insertion_score = prev_diagonal_score; __m256i prev_insertion_matches = prev_diagonal_matches; - _mm256_storeu_si256((__m256i *)prev_deletion_score_store, - prev_diagonal_score); - __m256i prev_deletion_score = _mm256_lddqu_si256( - (__m256i *)((uint8_t *)prev_deletion_score_store + 1)); - _mm256_storeu_si256( - (__m256i *)prev_deletion_matches_store, - prev_diagonal_matches); - __m256i prev_deletion_matches = _mm256_lddqu_si256( - (__m256i *)((uint8_t *)prev_deletion_matches_store + 1) - ); + __m256i prev_deletion_score = _mm256_move_one_down(prev_diagonal_score); + __m256i prev_deletion_matches = _mm256_move_one_down(prev_diagonal_matches); __m256i query_equals_target = _mm256_cmpeq_epi8(target_vec, query_vec); __m256i linear_score_if_equals = _mm256_add_epi8( From 5488d775cff4784696c088927d71efb18f398a60 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 27 May 2024 06:55:15 +0200 Subject: [PATCH 11/22] Replace two blend instructions with max instructions --- src/sequali/_seqidentmodule.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index e2ea176..769a9e0 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -196,12 +196,10 @@ get_smith_waterman_matches_avx2( deletion_greater_than_linear); __m256i insertion_greatest = _mm256_andnot_si256(deletion_greatest, insertion_greater_than_linear); __m256i indels_greatest = _mm256_or_si256(insertion_greatest, deletion_greatest); - __m256i indel_score = _mm256_blendv_epi8( - insertion_score, deletion_score, deletion_greater_than_insertion); + __m256i indel_score = _mm256_max_epi8(insertion_score, deletion_score); __m256i indel_matches = _mm256_blendv_epi8( insertion_matches, deletion_matches, deletion_greater_than_insertion); - __m256i scores = _mm256_blendv_epi8( - linear_score, indel_score, indels_greatest); + __m256i scores = _mm256_max_epi8(linear_score, indel_score); __m256i matches = _mm256_blendv_epi8( linear_matches, indel_matches, indels_greatest); __m256i zero_greater_than_scores = _mm256_cmpgt_epi8( From de6bcba136be8201f92849fbfa9a9eabece29720 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 27 May 2024 07:00:09 +0200 Subject: [PATCH 12/22] Only run the required length --- src/sequali/_seqidentmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index 769a9e0..982b9d1 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -156,7 +156,7 @@ get_smith_waterman_matches_avx2( __m256i mismatch_penalty_vec = _mm256_set1_epi8(mismatch_penalty); __m256i deletion_penalty_vec = _mm256_set1_epi8(deletion_penalty); __m256i insertion_penalty_vec = _mm256_set1_epi8(insertion_penalty); - size_t run_length = target_length + 31; + size_t run_length = target_length + query_length; for (size_t i=0; i < run_length; i++) { __m256i target_vec = _mm256_lddqu_si256((__m256i 
*)(padded_target + i)); From bb10665d0e1e43dea9d8f13e02254cb8d372cc37 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 27 May 2024 08:32:24 +0200 Subject: [PATCH 13/22] Add explanatory comments --- src/sequali/_seqidentmodule.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index 982b9d1..e184552 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -130,12 +130,16 @@ get_smith_waterman_matches_avx2( int8_t deletion_penalty, int8_t insertion_penalty ) { - + /* Since the algorithm goes over reversed diagonals, it needs to be padded + with the vector length - 1 on both sides. So vectors can be loaded + immediately rather than have a complex initialization. */ uint8_t *padded_target = PyMem_Calloc(target_length + 62, 1); if (padded_target == NULL) { return -1; } memcpy(padded_target + 31, target, target_length); + + /* Also the query needs to be fitted inside a vector.*/ uint8_t padded_query_store[32]; uint8_t *padded_query = padded_query_store; /* Pacters than target */ @@ -144,8 +148,8 @@ get_smith_waterman_matches_avx2( size_t index = 31 - i; padded_query[index] = query[i]; } - __m256i query_vec = _mm256_lddqu_si256((__m256i *)padded_query); + __m256i max_matches = _mm256_setzero_si256(); __m256i max_score = _mm256_setzero_si256(); __m256i prev_diagonal_score = _mm256_setzero_si256(); @@ -156,14 +160,21 @@ get_smith_waterman_matches_avx2( __m256i mismatch_penalty_vec = _mm256_set1_epi8(mismatch_penalty); __m256i deletion_penalty_vec = _mm256_set1_epi8(deletion_penalty); __m256i insertion_penalty_vec = _mm256_set1_epi8(insertion_penalty); + size_t run_length = target_length + query_length; for (size_t i=0; i < run_length; i++) { __m256i target_vec = _mm256_lddqu_si256((__m256i *)(padded_target + i)); + /* Normally the previous diagonals need to be loaded from arrays, but + since the query length is limited at 31, the previous diagonals can + be stored in registers instead. */ __m256i prev_linear_score = prev_prev_diagonal_score; __m256i prev_linear_matches = prev_prev_diagonal_matches; __m256i prev_insertion_score = prev_diagonal_score; __m256i prev_insertion_matches = prev_diagonal_matches; + /* Rather than loading from a +1 offset from memory, instead use + vector instructions to achieve the same effect. This is much faster + but only works since the query is at most 31 bp. */ __m256i prev_deletion_score = _mm256_move_one_down(prev_diagonal_score); __m256i prev_deletion_matches = _mm256_move_one_down(prev_diagonal_matches); @@ -188,6 +199,9 @@ get_smith_waterman_matches_avx2( __m256i insertion_matches = _mm256_sub_epi8( prev_insertion_matches, _mm256_set1_epi8(1)); + /* For calculating the scores, simple max instructions suffice. But the + matches vector needs to line up with the score vector so some + comparators and blend instructions are needed for it. 
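+           For example, when the deletion score wins the maximum in one byte
+           lane, the matches value for that lane has to come from the deletion
+           branch as well; the cmpgt/blendv pairs below keep both vectors in
+           sync.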
*/ __m256i insertion_greater_than_linear = _mm256_cmpgt_epi8(insertion_score, linear_score); __m256i deletion_greater_than_linear = _mm256_cmpgt_epi8(deletion_score, linear_score); __m256i deletion_greater_than_insertion = _mm256_cmpgt_epi8(deletion_score, insertion_score); From 1183d07e2d545c181cb9a2745dcb62e2967b5a7e Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 27 May 2024 08:33:47 +0200 Subject: [PATCH 14/22] Properly handle error condition --- src/sequali/_seqidentmodule.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index e184552..0ae7a40 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -385,6 +385,9 @@ sequence_identity(PyObject *module, PyObject *args, PyObject *kwargs) deletion_penalty, insertion_penalty ); + if (most_matches < 0) { + return PyErr_NoMemory(); + } double identity = (double)most_matches / (double)query_length; return PyFloat_FromDouble(identity); } From 14adf161d49bfd5bb4c4b10975250ba9c54b9625 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 27 May 2024 08:46:10 +0200 Subject: [PATCH 15/22] Fix pedantic compile warning --- src/sequali/_seqidentmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c index 0ae7a40..73a9a07 100644 --- a/src/sequali/_seqidentmodule.c +++ b/src/sequali/_seqidentmodule.c @@ -106,7 +106,7 @@ static inline __m256i _mm256_move_one_down(__m256i vec) { missing, as the shift does not go beyond the 128-bit lanes. */ __m256i shifted_vec = _mm256_bsrli_epi128(vec, 1); /* Rotate 64 bit integers: 1, 2, 3, 0. */ - __m256i rotated_vec = _mm256_permute4x64_epi64(vec, 0b00111001); + __m256i rotated_vec = _mm256_permute4x64_epi64(vec, 0x39); // 0b00111001 /* shift 7 bytes up. This brings the right byte to position 15. 
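   (Worked example: with input bytes x0..x31, the 64-bit left shift keeps only
   the lowest byte of each quadword, moved to its highest byte position, so
   x16 ends up at byte 15; the mask below keeps just that quadword and the OR
   drops it into the zero byte that _mm256_bsrli_epi128 left at position 15.
   The net effect of the whole helper is [x0, x1, ..., x31] -> [x1, ..., x31, 0].)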
*/ __m256i shifted_rotated_vec = _mm256_slli_epi64(rotated_vec, 56); __m256i masked_vec = _mm256_and_si256( From fbaa4029c8e7b3ab6ba5bda0ff74139539faa396 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 27 May 2024 09:11:16 +0200 Subject: [PATCH 16/22] Add extra test for full length query testing --- tests/test_sequence_identification.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_sequence_identification.py b/tests/test_sequence_identification.py index 22d0a0b..c1e4fe9 100644 --- a/tests/test_sequence_identification.py +++ b/tests/test_sequence_identification.py @@ -53,6 +53,12 @@ def test_canonical_kmers(): # GGTTGACTA # 5 out of 9 matches ("TGTTACGG", "GGTTGACTA", 5 / 9), + # 31 bp query to properly test AVX2 implementation + # AGATCGGAAGAGCACACGTCTGAACTCC--AGTCA + # ||| |||| |||| |||||||||||| || || + # AGA-CGGA--AGCAAGCGTCTGAACTCCCCAG-CA + ("AGATCGGAAGAGCACACGTCTGAACTCCAGTCA", + "AGACGGAAGCAAGCGTCTGAACTCCCCAGCA", 23 / 31) ]) def test_sequence_identity(target, query, result): assert sequence_identity(target, query) == result From 0cd82ffc9286111c4edb19f83902213c804d3900 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 27 May 2024 09:18:59 +0200 Subject: [PATCH 17/22] Also compile pedantically with clang --- tox.ini | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index d2555bc..b5dd74f 100644 --- a/tox.ini +++ b/tox.ini @@ -35,8 +35,10 @@ deps= skip_install=True setenv= CFLAGS=-Wall -Werror -Wpedantic +allowlist_externals = bash commands = - python setup.py build_ext -f + bash -c 'CC=gcc python setup.py build_ext -f' + bash -c 'CC=clang python setup.py build_ext -f' [flake8] max-line-length=88 From 0861513782244edf39ab4c75d542332f49fd0acd Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 27 May 2024 09:20:19 +0200 Subject: [PATCH 18/22] Fix clang pedantic errors --- src/sequali/function_dispatch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sequali/function_dispatch.h b/src/sequali/function_dispatch.h index 3349558..7c821ee 100644 --- a/src/sequali/function_dispatch.h +++ b/src/sequali/function_dispatch.h @@ -206,7 +206,7 @@ static int64_t sequence_to_canonical_kmer_default(uint8_t *sequence, uint64_t k) kmer <<= 8; kmer |= kchunk; } - for (i=i; i<(int64_t)k; i++) { + for (; i<(int64_t)k; i++) { size_t nuc = NUCLEOTIDE_TO_TWOBIT[sequence[i]]; all_nucs |= nuc; kmer <<= 2; @@ -378,4 +378,4 @@ static int64_t (*sequence_to_canonical_kmer)( static inline int64_t sequence_to_canonical_kmer(uint8_t *sequence, uint64_t k) { return sequence_to_canonical_kmer_default(sequence, k); } -#endif \ No newline at end of file +#endif From 34465b837a794fa1653588381ac399194b4471b8 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 27 May 2024 10:58:57 +0200 Subject: [PATCH 19/22] Stage hashes to prevent in-sequence duplication to affect overrepresented results --- CHANGELOG.rst | 4 ++ docs/module_options.rst | 9 +++-- src/sequali/_qcmodule.c | 63 +++++++++++++++++++++++++++++- src/sequali/report_modules.py | 5 ++- tests/test_sequence_duplication.py | 3 +- 5 files changed, 76 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 468d031..bf8bd84 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,10 @@ Changelog version 0.10.0-dev ------------------ ++ Overrepresented sequence analysis now only counts unique fragments per read. + Fragments that are duplicated inside the read are counted only once. 
This + prevents long stretches of genomic repeats getting higher reported + frequencies. + Speed up sequence identity calculations on AVX2 enabled CPUs by using a reverse-diagonal parallelized version of Smith-Waterman. diff --git a/docs/module_options.rst b/docs/module_options.rst index 1c2c28e..a4d2cc9 100644 --- a/docs/module_options.rst +++ b/docs/module_options.rst @@ -64,9 +64,12 @@ such the adapter sequences will always be sampled in the same frame. sequence is sampled when the length is not divisible by the size of the fragments. -Fragments are stored and counted in a hash table. When the hash table is full -only fragments that are already present will be counted. To diminish the time -spent on the module, by default 1 in 8 sequences is analysed. +For each sequence only unique fragments within the sequence are sampled, to +avoid overrepresenting common genomic repeats. +unique fragments are then stored and counted in a hash table. When the hash +table is full only fragments that are already present will be counted. +To diminish the time spent on the module, by default 1 in 8 sequences is +analysed. After the module is run, stored fragments are checked for their counts. If the count exceeds a certain threshold it is considered overrepresented. Sequali diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c index 91c4604..7c4a84a 100644 --- a/src/sequali/_qcmodule.c +++ b/src/sequali/_qcmodule.c @@ -3154,6 +3154,8 @@ typedef struct _SequenceDuplicationStruct { size_t fragment_length; uint64_t number_of_sequences; uint64_t sampled_sequences; + uint64_t staging_hash_table_size; + uint64_t *staging_hash_table; uint64_t hash_table_size; uint64_t *hashes; uint32_t *counts; @@ -3166,6 +3168,7 @@ typedef struct _SequenceDuplicationStruct { static void SequenceDuplication_dealloc(SequenceDuplication *self) { + PyMem_Free(self->staging_hash_table); PyMem_Free(self->hashes); PyMem_Free(self->counts); Py_TYPE(self)->tp_free((PyObject *)self); @@ -3232,6 +3235,8 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) self->hash_table_size = hash_table_size; self->total_fragments = 0; self->fragment_length = fragment_length; + self->staging_hash_table_size = 0; + self->staging_hash_table = NULL; self->hashes = hashes; self->counts = counts; self->sample_every = sample_every; @@ -3265,6 +3270,44 @@ Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash) } } +static int +SequenceDuplication_resize_staging(SequenceDuplication *self, uint64_t new_size) +{ + if (new_size <= self->staging_hash_table_size) { + return 0; + } + uint64_t *tmp = PyMem_Realloc(self->staging_hash_table, + new_size * sizeof(uint64_t)); + if (tmp == NULL) { + PyErr_NoMemory(); + return -1; + } + self->staging_hash_table = tmp; + self->staging_hash_table_size = new_size; + return 0; +} + +static inline void +add_to_staging(uint64_t *staging_hash_table, uint64_t staging_hash_table_size, + uint64_t hash) +{ + /* Works because size is a power of 2 */ + uint64_t hash_to_index_int = staging_hash_table_size - 1; + uint64_t index = hash & hash_to_index_int; + while (true) { + uint64_t current_entry = staging_hash_table[index]; + if (current_entry == 0) { + staging_hash_table[index] = hash; + break; + } else if (current_entry == hash) { + break; + } + index += 1; + index &= hash_to_index_int; + } + return; +} + static int SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) { @@ -3299,6 +3342,16 @@ SequenceDuplication_add_meta(SequenceDuplication *self, 
struct FastqMeta *meta) still all of the sequence is being sampled. */ Py_ssize_t total_fragments = (sequence_length + fragment_length - 1) / fragment_length; + size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5)); + size_t staging_hash_size = 1ULL << staging_hash_bits; + if (staging_hash_size > self->staging_hash_table_size) { + if (SequenceDuplication_resize_staging(self, staging_hash_size) < 0) { + return -1; + } + } + uint64_t *staging_hash_table = self->staging_hash_table; + memset(staging_hash_table, 0 , staging_hash_size * sizeof(uint64_t)); + Py_ssize_t from_mid_point_fragments = total_fragments / 2; Py_ssize_t mid_point = sequence_length - (from_mid_point_fragments * fragment_length); bool warn_unknown = false; @@ -3313,7 +3366,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) } fragments += 1; uint64_t hash = wanghash64(kmer); - Sequence_duplication_insert_hash(self, hash); + add_to_staging(staging_hash_table, staging_hash_size, hash); } // Sample back sequences @@ -3327,7 +3380,13 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) } fragments += 1; uint64_t hash = wanghash64(kmer); - Sequence_duplication_insert_hash(self, hash); + add_to_staging(staging_hash_table, staging_hash_size, hash); + } + for (size_t i=0; i str:

 The percentage shown is an estimate based on the number of occurences of the fragment in relation to the number of
- sampled sequences. This makes the assumption that a
- fragment only occurs once in each sequence.
+ sampled sequences. Fragments are only counted once per
+ sequence. Fragments that occur more than once in a sequence are
+ counted as one.
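
To illustrate the per-read de-duplication described above, here is a minimal,
self-contained sketch. It is illustrative only, not the Sequali code: the
names staging_insert and STAGING_SIZE are invented here, and the real module
sizes the staging table per read and then passes the surviving hashes to its
global count table.

/* Toy version of the per-read staging table: fragment hashes are first
 * dropped into a small power-of-two open-addressing table, so a fragment
 * repeated inside one read is only counted once. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define STAGING_SIZE 16 /* power of two, so `hash & (size - 1)` can index it */

/* Insert-if-absent with linear probing; 0 doubles as the "empty" marker,
 * as in the patch. Returns true when the hash was not staged before. */
static bool
staging_insert(uint64_t *table, size_t size, uint64_t hash)
{
    size_t mask = size - 1;
    size_t index = (size_t)hash & mask;
    while (true) {
        if (table[index] == 0) {
            table[index] = hash;
            return true;
        }
        if (table[index] == hash) {
            return false; /* fragment already seen in this read */
        }
        index = (index + 1) & mask; /* probe the next slot */
    }
}

int main(void)
{
    uint64_t staging[STAGING_SIZE] = {0};
    /* three fragment hashes from one read, one of them a within-read repeat */
    uint64_t fragment_hashes[] = {0xdeadbeefULL, 0x1234ULL, 0xdeadbeefULL};
    size_t unique = 0;
    for (size_t i = 0; i < 3; i++) {
        if (staging_insert(staging, STAGING_SIZE, fragment_hashes[i])) {
            unique += 1; /* only these would reach the global counts */
        }
    }
    printf("%zu unique fragments\n", unique); /* prints: 2 unique fragments */
    return 0;
}
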

diff --git a/tests/test_sequence_duplication.py b/tests/test_sequence_duplication.py index aecfa58..b7cee7f 100644 --- a/tests/test_sequence_duplication.py +++ b/tests/test_sequence_duplication.py @@ -146,7 +146,8 @@ def test_sequence_duplication_sampling_rate(divisor): ("GATTACAAA", {"ATC": 1, "GTA": 1, "AAA": 1}), ("GA", {}), ("GATT", {"ATC": 1, "AAT": 1}), - ("GATTACGATTAC", {"ATC": 2, "GTA": 2}), + # Fragments that are duplicated in the sequence should only be recorded once. + ("GATTACGATTAC", {"ATC": 1, "GTA": 1}), ]) def test_sequence_duplication_all_fragments(sequence, result): seqdup = SequenceDuplication(fragment_length=3, sample_every=1) From 9bcf3a48fbc161a85e4aa4066f0e73bcb10f44a4 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Jun 2024 10:15:53 +0200 Subject: [PATCH 20/22] Make overrepresented sequences table scrollable --- src/sequali/report_modules.py | 4 ++-- src/sequali/static/sequali_report.css | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/sequali/report_modules.py b/src/sequali/report_modules.py index 2a52e0d..7442f46 100644 --- a/src/sequali/report_modules.py +++ b/src/sequali/report_modules.py @@ -1462,7 +1462,7 @@ def to_html(self) -> str:
""" ) - content.write("") + content.write("
") content.write("" "" "" @@ -1479,7 +1479,7 @@ def to_html(self) -> str: """) - content.write("
countpercentagecanonical sequencereverse complemented sequence {item.most_matches / item.max_matches:.02%} {html.escape(item.best_match)}
") + content.write("") return content.getvalue() @classmethod diff --git a/src/sequali/static/sequali_report.css b/src/sequali/static/sequali_report.css index db5723c..d15c30a 100644 --- a/src/sequali/static/sequali_report.css +++ b/src/sequali/static/sequali_report.css @@ -96,3 +96,10 @@ tr:nth-child(odd) { tr:hover { background-color: darkseagreen; } + +.overrepresented_table { + /* Use a non-integer height so the last row is partially visible. This + indicates that the table is scrollable. */ + max-height: 35.75em; + overflow-y: scroll; +} From 34b0d8eb8d00497082a088742b6a5ca7085a1523 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 3 Jun 2024 08:33:15 +0200 Subject: [PATCH 21/22] Add scrollable table to the changelog --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index bf8bd84..7380188 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,8 @@ Changelog version 0.10.0-dev ------------------ ++ Make overrepresented sequences table scrollable and smaller so it is easier + to skip over when lots of entries are found. + Overrepresented sequence analysis now only counts unique fragments per read. Fragments that are duplicated inside the read are counted only once. This prevents long stretches of genomic repeats getting higher reported From 5ad7a47ecf98da09505bfcf9cd2094c85c52d3df Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 3 Jun 2024 08:41:22 +0200 Subject: [PATCH 22/22] Prepare version 0.10.0 --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7380188..74a5704 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog .. This document is user facing. Please word the changes in such a way .. that users understand how the changes affect the new version. -version 0.10.0-dev +version 0.10.0 ------------------ + Make overrepresented sequences table scrollable and smaller so it is easier to skip over when lots of entries are found.
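
Note on the reverse-diagonal traversal introduced in the AVX2 Smith-Waterman
patches of this series (PATCH 06 onward): the scalar sketch below is only an
illustration of the idea, not the Sequali implementation. It tracks scores but
not match counts, folds the insertion and deletion penalties into a single gap
value, and the function name, helper and example scores are made up for this
note. It shows why every cell on one anti-diagonal can be computed
independently, which is what lets the vectorised version handle a whole
diagonal at once.

/* Minimal scalar sketch of anti-diagonal (reverse-diagonal) Smith-Waterman. */
#include <stdio.h>
#include <string.h>

#define MAX_QUERY 32 /* mirrors the 31 bp query limit of the AVX2 version */

static int max2(int a, int b) { return a > b ? a : b; }

static int
smith_waterman_by_diagonal(const char *target, const char *query,
                           int match, int mismatch, int gap)
{
    int n = (int)strlen(target);
    int m = (int)strlen(query);
    if (m > MAX_QUERY) {
        return -1;
    }
    int prev2[MAX_QUERY + 1] = {0}; /* diagonal d - 2, indexed by query pos j */
    int prev1[MAX_QUERY + 1] = {0}; /* diagonal d - 1 */
    int curr[MAX_QUERY + 1];
    int best = 0;
    for (int d = 2; d <= n + m; d++) { /* all cells with i + j == d */
        memset(curr, 0, sizeof(curr));
        for (int j = 1; j <= m; j++) { /* cell updates are independent */
            int i = d - j;
            if (i < 1 || i > n) {
                continue;
            }
            int diag = prev2[j - 1] +
                       (target[i - 1] == query[j - 1] ? match : mismatch);
            int score = max2(diag, max2(prev1[j] + gap, prev1[j - 1] + gap));
            curr[j] = max2(0, score);
            best = max2(best, curr[j]);
        }
        memcpy(prev2, prev1, sizeof(prev2));
        memcpy(prev1, curr, sizeof(prev1));
    }
    return best;
}

int main(void)
{
    /* target/query pair taken from tests/test_sequence_identification.py */
    printf("%d\n", smith_waterman_by_diagonal("GGTTGACTA", "TGTTACGG", 1, -1, -1));
    return 0;
}

Because the query is capped at 31 bases, the whole curr diagonal fits in one
32-byte __m256i, which is the shortcut the AVX2 implementation takes.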