From 75504b0c7af48c64364f613c9cba5036b00e53db Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Tue, 18 Jul 2023 15:00:13 -0400
Subject: [PATCH 01/17] updating mask

The exclusion mask `sample_mask != 2` may have been skewing the counts/scores; changed it to `sample_mask == 0`.
---
 repo_utils/utmos_ssshtests.sh | 2 +-
 utmos/convert.py              | 3 +--
 utmos/select.py               | 6 +++---
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/repo_utils/utmos_ssshtests.sh b/repo_utils/utmos_ssshtests.sh
index 33b9016..0b08ba1 100644
--- a/repo_utils/utmos_ssshtests.sh
+++ b/repo_utils/utmos_ssshtests.sh
@@ -1,7 +1,7 @@
 
 test -e ssshtest || curl -O https://raw.githubusercontent.com/ryanlayer/ssshtest/master/ssshtest
 source ssshtest
-STOP_ON_FAIL=1
+#STOP_ON_FAIL=1
 # Work inside of the repo folder
 cd "$( dirname "${BASH_SOURCE[0]}" )"/../
 INDIR=repo_utils/test_files
diff --git a/utmos/convert.py b/utmos/convert.py
index edba497..8671c01 100644
--- a/utmos/convert.py
+++ b/utmos/convert.py
@@ -81,9 +81,8 @@ def read_vcf(in_file, lowmem=False, chunk_length=2000, no_singleton=False):
         del data["calldata/GT"]
         data["GT"] = v_count
     data["AF"] = af
-
     data["GT"] = np.packbits(data["GT"], axis=1)
-
+    
     data["stats"] = {'num_het': num_hets, 'num_hom': num_homs}
     return data
 
diff --git a/utmos/select.py b/utmos/select.py
index 46a3650..b2d9858 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -37,8 +37,8 @@ def do_sum(matrix, sample_mask):
             m_tot += row
         else:
             m_tot += row != 0
-    # mask out excluded samples
-    ex_mask = sample_mask != 2
+    # mask out excluded/used samples
+    ex_mask = sample_mask == 0
     m_sum *= ex_mask
     m_tot *= ex_mask
     return m_sum, m_tot
@@ -49,7 +49,7 @@ def calculate_scores(matrix, sample_mask, sample_weights):
     calculate the best scoring sample,
     sumfunc is the method to do matrix summation
 
-    updates sample_mask in place
+    updates sample_mask in place (shouldn't be here)
     returns tuple of:
         column index of the highest score
         new_row_count for highest score column index

From 738ad9fa2394f2ccce2ab652d7d80d1af6aea72f Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Tue, 18 Jul 2023 19:07:01 -0400
Subject: [PATCH 02/17] code clean

---
 utmos/select.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/utmos/select.py b/utmos/select.py
index b2d9858..a0ade84 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -17,7 +17,6 @@
 
 MAXMEM = 2  # in GB
 
-
 #############
 # Core code #
 #############
@@ -49,7 +48,6 @@ def calculate_scores(matrix, sample_mask, sample_weights):
     calculate the best scoring sample,
     sumfunc is the method to do matrix summation
 
-    updates sample_mask in place (shouldn't be here)
     returns tuple of:
         column index of the highest score
         new_row_count for highest score column index
@@ -60,7 +58,6 @@ def calculate_scores(matrix, sample_mask, sample_weights):
         sample_scores *= sample_weights
     use_sample = np.argmax(sample_scores)
     new_variant_count = cur_sample_count[use_sample]
-    sample_mask[use_sample] = 1
 
     return use_sample, new_variant_count
 
@@ -91,8 +88,8 @@ def greedy_select(matrix,
     matrix:              genotype data
     total_variant_count: total number of variants per-sample
     select_count:        how many samples we'll be selecting
-    variant_mask:        boolean matrix of variants where True == used
-    sample_mask:         boolean matrix of samples where True == use
+    vcf_samples:         list of sample names, lines up with sample_mask
+    sample_mask:         int matrix for samples where 0 == yet to be selected
     sample_weights:      (optional) the weights to apply to each iteration's sample.sum (len == gt_matrix.shape[0])
 
     Expects input matrices to be h5py Datasets.
@@ -106,6 +103,7 @@ def greedy_select(matrix,
         use_sample_name = vcf_samples[use_sample]
         variant_count = total_variant_count[use_sample]
         tot_captured += new_variant_count
+        sample_mask[use_sample] = 1
 
         yield [
             use_sample_name,

From 53dca2eff00dc108e5d8829cf8147cc7fa6934a9 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 13:19:35 -0400
Subject: [PATCH 03/17] fixing select lowmem and maybe af

---
 imgs/coverage.svg              |  4 +-
 imgs/pylint.svg                | 12 +++---
 repo_utils/answer_key/help.txt |  2 +-
 repo_utils/utmos_ssshtests.sh  |  2 +
 utmos/convert.py               |  2 +-
 utmos/select.py                | 71 ++++++++++++++++------------------
 6 files changed, 45 insertions(+), 48 deletions(-)

diff --git a/imgs/coverage.svg b/imgs/coverage.svg
index ea71b13..8a3cf05 100644
--- a/imgs/coverage.svg
+++ b/imgs/coverage.svg
@@ -17,7 +17,7 @@
         <text x="32.5" y="14">coverage</text>
     </g>
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
-        <text x="83.0" y="15" fill="#010101" fill-opacity=".3">90%</text>
-        <text x="82.0" y="14">90%</text>
+        <text x="83.0" y="15" fill="#010101" fill-opacity=".3">95%</text>
+        <text x="82.0" y="14">95%</text>
     </g>
 </svg>
diff --git a/imgs/pylint.svg b/imgs/pylint.svg
index c25bad4..fa27ae2 100644
--- a/imgs/pylint.svg
+++ b/imgs/pylint.svg
@@ -1,23 +1,23 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" width="68" height="20">
+<svg xmlns="http://www.w3.org/2000/svg" width="80" height="20">
     <linearGradient id="b" x2="0" y2="100%">
         <stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
         <stop offset="1" stop-opacity=".1"/>
     </linearGradient>
     <mask id="anybadge_1">
-        <rect width="68" height="20" rx="3" fill="#fff"/>
+        <rect width="80" height="20" rx="3" fill="#fff"/>
     </mask>
     <g mask="url(#anybadge_1)">
         <path fill="#555" d="M0 0h44v20H0z"/>
-        <path fill="#4C1" d="M44 0h24v20H44z"/>
-        <path fill="url(#b)" d="M0 0h68v20H0z"/>
+        <path fill="#4C1" d="M44 0h36v20H44z"/>
+        <path fill="url(#b)" d="M0 0h80v20H0z"/>
     </g>
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
         <text x="23.0" y="15" fill="#010101" fill-opacity=".3">pylint</text>
         <text x="22.0" y="14">pylint</text>
     </g>
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
-        <text x="57.0" y="15" fill="#010101" fill-opacity=".3">10</text>
-        <text x="56.0" y="14">10</text>
+        <text x="63.0" y="15" fill="#010101" fill-opacity=".3">9.97</text>
+        <text x="62.0" y="14">9.97</text>
     </g>
 </svg>
diff --git a/repo_utils/answer_key/help.txt b/repo_utils/answer_key/help.txt
index 20ad794..232f7cc 100644
--- a/repo_utils/answer_key/help.txt
+++ b/repo_utils/answer_key/help.txt
@@ -1,6 +1,6 @@
 usage: utmos [-h] CMD ...
 
-Utmos v2.0.1-dev - Maximum-coverage algorithm to select samples for validation and resequencing
+Utmos v2.1.0 - Maximum-coverage algorithm to select samples for validation and resequencing
 
     CMDs:
         convert  Extract genotypes from VCFs
diff --git a/repo_utils/utmos_ssshtests.sh b/repo_utils/utmos_ssshtests.sh
index 0b08ba1..5bec6b3 100644
--- a/repo_utils/utmos_ssshtests.sh
+++ b/repo_utils/utmos_ssshtests.sh
@@ -194,6 +194,8 @@ fi
 #                                 select lowmem
 # ------------------------------------------------------------
 
+run test_select_lm_big $ut select --maxmem 0 --lowmem $OD/tiny.hdf5 $INDIR/chunk*.jl
+
 run test_select_lm $ut select --maxmem 0 --lowmem $OD/tiny.hdf5 $INDIR/chunk2.vcf
 if [ $test_select_lm ]; then
     assert_exit_code 0
diff --git a/utmos/convert.py b/utmos/convert.py
index 8671c01..a1323de 100644
--- a/utmos/convert.py
+++ b/utmos/convert.py
@@ -82,7 +82,7 @@ def read_vcf(in_file, lowmem=False, chunk_length=2000, no_singleton=False):
         data["GT"] = v_count
     data["AF"] = af
     data["GT"] = np.packbits(data["GT"], axis=1)
-    
+
     data["stats"] = {'num_het': num_hets, 'num_hom': num_homs}
     return data
 
diff --git a/utmos/select.py b/utmos/select.py
index a0ade84..508292c 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -16,6 +16,7 @@
 from utmos.convert import read_vcf
 
 MAXMEM = 2  # in GB
+# set to 0 to skip the initial load and force the in-loop h5-to-memory copy (for test coverage)
 
 #############
 # Core code #
@@ -24,23 +25,18 @@ def do_sum(matrix, sample_mask):
     """
     Vectorized sum function
     """
-    m_sum = np.zeros(matrix.shape[1])
-    m_tot = np.zeros(matrix.shape[1])
+    m_score = np.zeros(matrix.shape[1])
+    m_count = np.zeros(matrix.shape[1])
     # skip variants already used
-    c_mask = np.where(sample_mask == 1)
+    c_mask = np.where(sample_mask == 0)
     for row in matrix:
         if row[c_mask].any():
             continue
-        m_sum += row
-        if matrix.dtype == bool:
-            m_tot += row
-        else:
-            m_tot += row != 0
+        m_score += row
+        m_count += row != 0
     # mask out excluded/used samples
-    ex_mask = sample_mask == 0
-    m_sum *= ex_mask
-    m_tot *= ex_mask
-    return m_sum, m_tot
+    m_score[sample_mask != 1] = 0
+    return m_score, m_count
 
 
 def calculate_scores(matrix, sample_mask, sample_weights):
@@ -52,12 +48,12 @@ def calculate_scores(matrix, sample_mask, sample_weights):
         column index of the highest score
         new_row_count for highest score column index
     """
-    sample_scores, cur_sample_count = do_sum(matrix, sample_mask)
+    scores, counts = do_sum(matrix, sample_mask)
     if sample_weights is not None:
         logging.debug("applying weights")
-        sample_scores *= sample_weights
-    use_sample = np.argmax(sample_scores)
-    new_variant_count = cur_sample_count[use_sample]
+        scores *= sample_weights
+    use_sample = np.argmax(scores)
+    new_variant_count = counts[use_sample]
 
     return use_sample, new_variant_count
 
@@ -89,7 +85,7 @@ def greedy_select(matrix,
     total_variant_count: total number of variants per-sample
     select_count:        how many samples we'll be selecting
     vcf_samples:         list of sample names, lines up with sample_mask
-    sample_mask:         int matrix for samples where 0 == yet to be selected
+    sample_mask:         matrix for samples where 1 == can be selected
     sample_weights:      (optional) the weights to apply to each iteration's sample.sum (len == gt_matrix.shape[0])
 
     Expects input matrices to be h5py Datasets.
@@ -103,7 +99,7 @@ def greedy_select(matrix,
         use_sample_name = vcf_samples[use_sample]
         variant_count = total_variant_count[use_sample]
         tot_captured += new_variant_count
-        sample_mask[use_sample] = 1
+        sample_mask[use_sample] = 0
 
         yield [
             use_sample_name,
@@ -120,26 +116,27 @@ def greedy_select(matrix,
         # can mem? put it in
         # need to change shape by how many we could mask
         if isinstance(matrix, h5py.Dataset):
-            n_var = num_vars - tot_captured
-            n_samp = len(sample_mask) - np.count_nonzero(sample_mask)
-            if is_memsafe((n_var, n_samp)):
+            n_var = int(num_vars - tot_captured)
+            n_samp = (sample_mask == 1).sum()
+            if is_memsafe((n_var, n_samp)) or MAXMEM == 0:
                 logging.info("Dataset small enough to hold in memory")
+                # Drop used samples
+                s_mask = sample_mask == 1
+                vcf_samples = vcf_samples[s_mask]
+                total_variant_count = total_variant_count[s_mask]
+                if sample_weights is not None:
+                    sample_weights = sample_weights[s_mask]
+                # subset samples/variants
                 n_matrix = np.zeros((n_var, n_samp), dtype=matrix.dtype)
+                inspect = np.where(sample_mask == 0)
                 m_pos = 0
-                c_mask = np.where(sample_mask == 1)
                 for row in matrix:
-                    if row[c_mask].any():
+                    if row[inspect].any():
                         continue
-                    n_matrix[m_pos] = row[sample_mask]
+                    n_matrix[m_pos] = row[s_mask]
                     m_pos += 1
-                # Drop used samples
-                sub_mask = sample_mask[sample_mask != 1]
-                vcf_samples = vcf_samples[sub_mask]
-                total_variant_count = total_variant_count[sub_mask]
-                if sample_weights is not None:
-                    sample_weights = sample_weights[sub_mask]
-                sample_mask = sample_mask[sub_mask]
                 matrix = n_matrix
+                sample_mask = np.ones(n_samp, dtype='bool')
 
 
 # deprecated for now
@@ -170,18 +167,17 @@ def run_selection(data, select_count=0.02, subset=None, exclude=None, weights=No
     else:
         vcf_samples = vcf_samples.astype(str)
 
-    # Build masks
-    # 0 - use, 1 = mask, other = skip
-    sample_mask = np.zeros(num_samples, dtype='uint8')
+    # 1 = can use, 0 = mask, 2 = exclude
+    sample_mask = np.ones(num_samples, dtype='uint8')
 
     if subset:
-        sample_mask = np.where(np.isin(vcf_samples, subset), 0, 2)
+        sample_mask = np.where(np.isin(vcf_samples, subset), 1, 2)
         logging.info("Subsetting to %d samples", len(subset))
     if exclude:
         sample_mask = np.where(np.isin(vcf_samples, exclude), 2, sample_mask)
         logging.info("Excluding %d samples", len(exclude))
     if subset and exclude:
-        remain = len(sample_mask) - np.count_nonzero(sample_mask)
+        remain = len(sample_mask) - (sample_mask == 1).sum()
         logging.info("Ending with %d samples", remain)
 
     sample_weights = None
@@ -194,7 +190,7 @@ def run_selection(data, select_count=0.02, subset=None, exclude=None, weights=No
 
     matrix = data['data']
 
-    if isinstance(data, h5py.File) and is_memsafe(matrix.shape):
+    if isinstance(data, h5py.File) and is_memsafe(matrix.shape) and MAXMEM != 0:
         logging.info("Dataset small enough to hold in memory")
         matrix = matrix[:]
 
@@ -323,7 +319,6 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False):
         logging.info("Calculating AF Matrix")
         af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0]
         ret["data"] = ret["data"] * af_arr
-
     return ret
 #pylint: enable=too-many-statements
 

From 3591469e3ff7fcccf6e2da3fe7cdefc44bdc95de Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 19 Jul 2023 17:25:17 +0000
Subject: [PATCH 04/17] Update coverage score

---
 imgs/pylint.svg | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/imgs/pylint.svg b/imgs/pylint.svg
index fa27ae2..c25bad4 100644
--- a/imgs/pylint.svg
+++ b/imgs/pylint.svg
@@ -1,23 +1,23 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" width="80" height="20">
+<svg xmlns="http://www.w3.org/2000/svg" width="68" height="20">
     <linearGradient id="b" x2="0" y2="100%">
         <stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
         <stop offset="1" stop-opacity=".1"/>
     </linearGradient>
     <mask id="anybadge_1">
-        <rect width="80" height="20" rx="3" fill="#fff"/>
+        <rect width="68" height="20" rx="3" fill="#fff"/>
     </mask>
     <g mask="url(#anybadge_1)">
         <path fill="#555" d="M0 0h44v20H0z"/>
-        <path fill="#4C1" d="M44 0h36v20H44z"/>
-        <path fill="url(#b)" d="M0 0h80v20H0z"/>
+        <path fill="#4C1" d="M44 0h24v20H44z"/>
+        <path fill="url(#b)" d="M0 0h68v20H0z"/>
     </g>
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
         <text x="23.0" y="15" fill="#010101" fill-opacity=".3">pylint</text>
         <text x="22.0" y="14">pylint</text>
     </g>
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
-        <text x="63.0" y="15" fill="#010101" fill-opacity=".3">9.97</text>
-        <text x="62.0" y="14">9.97</text>
+        <text x="57.0" y="15" fill="#010101" fill-opacity=".3">10</text>
+        <text x="56.0" y="14">10</text>
     </g>
 </svg>

From 1b2cf2e829b4c97fada98e3cd0c13f0e4bd60cd1 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 16:41:49 -0400
Subject: [PATCH 05/17] as int

---
 utmos/select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utmos/select.py b/utmos/select.py
index 508292c..4c2d34d 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -33,7 +33,7 @@ def do_sum(matrix, sample_mask):
         if row[c_mask].any():
             continue
         m_score += row
-        m_count += row != 0
+        m_count += (row != 0).astype('int')
     # mask out excluded/used samples
     m_score[sample_mask != 1] = 0
     return m_score, m_count

From aa87b6c223148b10583977bfc7ab5e6b66c9ab2f Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 16:46:52 -0400
Subject: [PATCH 06/17] debug

---
 utmos/select.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/utmos/select.py b/utmos/select.py
index 4c2d34d..f0258ea 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -33,7 +33,10 @@ def do_sum(matrix, sample_mask):
         if row[c_mask].any():
             continue
         m_score += row
+        print(row)
+        print((row != 0).astype('int'))
         m_count += (row != 0).astype('int')
+        print(m_count.max())
     # mask out excluded/used samples
     m_score[sample_mask != 1] = 0
     return m_score, m_count

From f0c8b4a77e6ec7ff97d08d99618e3ea5503947e7 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 17:06:03 -0400
Subject: [PATCH 07/17] remove debug

---
 utmos/select.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/utmos/select.py b/utmos/select.py
index f0258ea..4c2d34d 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -33,10 +33,7 @@ def do_sum(matrix, sample_mask):
         if row[c_mask].any():
             continue
         m_score += row
-        print(row)
-        print((row != 0).astype('int'))
         m_count += (row != 0).astype('int')
-        print(m_count.max())
     # mask out excluded/used samples
     m_score[sample_mask != 1] = 0
     return m_score, m_count

From 00e77318315942adf401b005080ccfb97d217eb0 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 17:07:48 -0400
Subject: [PATCH 08/17] forcing numpy version

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 18b6512..841e03d 100644
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,7 @@ def get_version(rel_path):
     },
     install_requires=[
         "truvari>=3.5.0",
+        "numpy>=1.23.3",
         "scikit-allel==1.3.5",
         "h5py==3.7.0",
     ],

From b4d9eecf29e2cc24c13a525dbd3f4e51e16e6d45 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 17:15:31 -0400
Subject: [PATCH 09/17] force types

---
 utmos/select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utmos/select.py b/utmos/select.py
index 4c2d34d..0e53567 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -26,7 +26,7 @@ def do_sum(matrix, sample_mask):
     Vectorized sum function
     """
     m_score = np.zeros(matrix.shape[1])
-    m_count = np.zeros(matrix.shape[1])
+    m_count = np.zeros(matrix.shape[1], dtype='int')
     # skip variants already used
     c_mask = np.where(sample_mask == 0)
     for row in matrix:

From 95e7b9a80d2cf05e240ee06f741e389d3936bad9 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 17:16:07 -0400
Subject: [PATCH 10/17] remove setup numpy

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 841e03d..18b6512 100644
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,6 @@ def get_version(rel_path):
     },
     install_requires=[
         "truvari>=3.5.0",
-        "numpy>=1.23.3",
         "scikit-allel==1.3.5",
         "h5py==3.7.0",
     ],

From e5f555e44e0041ea89bea277090cd0962cabc996 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 17:20:48 -0400
Subject: [PATCH 11/17] af float upstream

---
 utmos/select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utmos/select.py b/utmos/select.py
index 0e53567..958bb7e 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -318,7 +318,7 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False):
     if calc_af:
         logging.info("Calculating AF Matrix")
         af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0]
-        ret["data"] = ret["data"] * af_arr
+        ret["data"] = ret["data"].astype(float) * af_arr
     return ret
 #pylint: enable=too-many-statements
 

From f0aba093613b2238c794df6ae7148ff8a74ccdb6 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 17:31:17 -0400
Subject: [PATCH 12/17] no convert doubling

---
 utmos/select.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/utmos/select.py b/utmos/select.py
index 958bb7e..9b57ea3 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -29,17 +29,17 @@ def do_sum(matrix, sample_mask):
     m_count = np.zeros(matrix.shape[1], dtype='int')
     # skip variants already used
     c_mask = np.where(sample_mask == 0)
-    for row in matrix:
+    for row, af in zip(matrix, afs):
         if row[c_mask].any():
             continue
         m_score += row
         m_count += (row != 0).astype('int')
     # mask out excluded/used samples
-    m_score[sample_mask != 1] = 0
-    return m_score, m_count
+    m_count[sample_mask != 1] = 0
+    return m_count
 
 
-def calculate_scores(matrix, sample_mask, sample_weights):
+def calculate_scores(matrix, sample_mask, allele_freqs=None, sample_weights=None):
     """
     calculate the best scoring sample,
     sumfunc is the method to do matrix summation
@@ -48,7 +48,11 @@ def calculate_scores(matrix, sample_mask, sample_weights):
         column index of the highest score
         new_row_count for highest score column index
     """
-    scores, counts = do_sum(matrix, sample_mask)
+    counts = do_sum(matrix, sample_mask)
+    scores = counts if allele_freqs is None and sample_weights is None else np.copy(counts)
+    if allele_freqs is not None:
+        scores *= allele_freqs
+            
     if sample_weights is not None:
         logging.debug("applying weights")
         scores *= sample_weights
@@ -318,7 +322,7 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False):
     if calc_af:
         logging.info("Calculating AF Matrix")
         af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0]
-        ret["data"] = ret["data"].astype(float) * af_arr
+        ret["data"] = ret["data"] * af_arr #ret["data"].astype(float) * af_arr
     return ret
 #pylint: enable=too-many-statements
 

From a8f2254f6b990ca1115eb00e7c46edc710d964ab Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 17:32:50 -0400
Subject: [PATCH 13/17] removing

---
 utmos/select.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/utmos/select.py b/utmos/select.py
index 9b57ea3..2089163 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -29,17 +29,17 @@ def do_sum(matrix, sample_mask):
     m_count = np.zeros(matrix.shape[1], dtype='int')
     # skip variants already used
     c_mask = np.where(sample_mask == 0)
-    for row, af in zip(matrix, afs):
+    for row in matrix:
         if row[c_mask].any():
             continue
         m_score += row
         m_count += (row != 0).astype('int')
     # mask out excluded/used samples
     m_count[sample_mask != 1] = 0
-    return m_count
+    return m_score, m_count
 
 
-def calculate_scores(matrix, sample_mask, allele_freqs=None, sample_weights=None):
+def calculate_scores(matrix, sample_mask, sample_weights=None):
     """
     calculate the best scoring sample,
     sumfunc is the method to do matrix summation
@@ -48,11 +48,7 @@ def calculate_scores(matrix, sample_mask, allele_freqs=None, sample_weights=None
         column index of the highest score
         new_row_count for highest score column index
     """
-    counts = do_sum(matrix, sample_mask)
-    scores = counts if allele_freqs is None and sample_weights is None else np.copy(counts)
-    if allele_freqs is not None:
-        scores *= allele_freqs
-            
+    scores, counts = do_sum(matrix, sample_mask)
     if sample_weights is not None:
         logging.debug("applying weights")
         scores *= sample_weights

From 72a6d16bf1ff1a6738ffaaeae13453b052feeb58 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 17:35:30 -0400
Subject: [PATCH 14/17] fixing score

---
 utmos/select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utmos/select.py b/utmos/select.py
index 2089163..e6979f4 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -35,7 +35,7 @@ def do_sum(matrix, sample_mask):
         m_score += row
         m_count += (row != 0).astype('int')
     # mask out excluded/used samples
-    m_count[sample_mask != 1] = 0
+    m_score[sample_mask != 1] = 0
     return m_score, m_count
 
 

From 6f3655df6df10d6f6f3660fccae3f3248b80a03f Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Wed, 19 Jul 2023 17:37:50 -0400
Subject: [PATCH 15/17] af type on unpack

---
 utmos/select.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/utmos/select.py b/utmos/select.py
index e6979f4..a28be65 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -273,7 +273,8 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False):
         if samples is None:
             samples = dat['samples'].astype('S')
 
-        upack = np.unpackbits(dat['GT'], axis=1, count=len(dat['samples'])).astype(bool)
+        m_type = float if calc_af else bool
+        upack = np.unpackbits(dat['GT'], axis=1, count=len(dat['samples'])).astype(m_type)
         uninf_filter = upack.any(axis=1)
         logging.debug("fitering %d uninformative variants", (~uninf_filter).sum())
 
@@ -318,7 +319,7 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False):
     if calc_af:
         logging.info("Calculating AF Matrix")
         af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0]
-        ret["data"] = ret["data"] * af_arr #ret["data"].astype(float) * af_arr
+        ret["data"] *= af_arr
     return ret
 #pylint: enable=too-many-statements
 

From 95adb676e69945ef9bc0316d459f2758b4a894bf Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Thu, 20 Jul 2023 00:53:08 -0400
Subject: [PATCH 16/17] multiallelic safeguards

---
 imgs/coverage.svg |  4 ++--
 utmos/convert.py  |  3 ++-
 utmos/select.py   | 46 ++++++++++++++++++++++------------------------
 3 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/imgs/coverage.svg b/imgs/coverage.svg
index 8a3cf05..9d63d78 100644
--- a/imgs/coverage.svg
+++ b/imgs/coverage.svg
@@ -17,7 +17,7 @@
         <text x="32.5" y="14">coverage</text>
     </g>
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
-        <text x="83.0" y="15" fill="#010101" fill-opacity=".3">95%</text>
-        <text x="82.0" y="14">95%</text>
+        <text x="83.0" y="15" fill="#010101" fill-opacity=".3">94%</text>
+        <text x="82.0" y="14">94%</text>
     </g>
 </svg>
diff --git a/utmos/convert.py b/utmos/convert.py
index a1323de..e39a368 100644
--- a/utmos/convert.py
+++ b/utmos/convert.py
@@ -71,7 +71,8 @@ def read_vcf(in_file, lowmem=False, chunk_length=2000, no_singleton=False):
     v_count = is_het | is_hom
 
     logging.info("Calculating AFs")
-    af = gts.count_alleles().to_frequencies()[:, 1]
+    # Use maximum non-reference allele frequency
+    af = gts.count_alleles().to_frequencies()[:, 1:].max(axis=1)
     # Needs to be reshaped for future multiplications
     af = af.reshape(af.shape[0], 1)
 
diff --git a/utmos/select.py b/utmos/select.py
index a28be65..8f4c525 100644
--- a/utmos/select.py
+++ b/utmos/select.py
@@ -21,24 +21,6 @@
 #############
 # Core code #
 #############
-def do_sum(matrix, sample_mask):
-    """
-    Vectorized sum function
-    """
-    m_score = np.zeros(matrix.shape[1])
-    m_count = np.zeros(matrix.shape[1], dtype='int')
-    # skip variants already used
-    c_mask = np.where(sample_mask == 0)
-    for row in matrix:
-        if row[c_mask].any():
-            continue
-        m_score += row
-        m_count += (row != 0).astype('int')
-    # mask out excluded/used samples
-    m_score[sample_mask != 1] = 0
-    return m_score, m_count
-
-
 def calculate_scores(matrix, sample_mask, sample_weights=None):
     """
     calculate the best scoring sample,
@@ -48,13 +30,26 @@ def calculate_scores(matrix, sample_mask, sample_weights=None):
         column index of the highest score
         new_row_count for highest score column index
     """
-    scores, counts = do_sum(matrix, sample_mask)
+    scores = np.zeros(matrix.shape[1])
+    counts = np.zeros(matrix.shape[1], dtype='int')
+    # skip variants already used
+    c_mask = np.where(sample_mask == 0)
+    for row in matrix:
+        if row[c_mask].any():
+            continue
+        scores += row
+        counts += (row != 0).astype('int')
+    # mask out excluded/used samples
+    scores[sample_mask != 1] = 0
+
     if sample_weights is not None:
         logging.debug("applying weights")
         scores *= sample_weights
     use_sample = np.argmax(scores)
     new_variant_count = counts[use_sample]
-
+    # Backwards compatibility for data without max-alt-af convert
+    if scores[use_sample] == 0:
+        return None, None
     return use_sample, new_variant_count
 
 
@@ -95,7 +90,10 @@ def greedy_select(matrix,
     tot_captured = 0
     for _ in range(select_count):
         use_sample, new_variant_count = calculate_scores(matrix, sample_mask, sample_weights)
-
+        if use_sample is None:
+            # Backwards compatibility for data without max-alt-af convert
+            logging.warning("Ran out of new variants (multi-allelics)")
+            break
         use_sample_name = vcf_samples[use_sample]
         variant_count = total_variant_count[use_sample]
         tot_captured += new_variant_count
@@ -273,8 +271,8 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False):
         if samples is None:
             samples = dat['samples'].astype('S')
 
-        m_type = float if calc_af else bool
-        upack = np.unpackbits(dat['GT'], axis=1, count=len(dat['samples'])).astype(m_type)
+        #m_type = float if calc_af else bool
+        upack = np.unpackbits(dat['GT'], axis=1, count=len(dat['samples'])).astype(bool)
         uninf_filter = upack.any(axis=1)
         logging.debug("fitering %d uninformative variants", (~uninf_filter).sum())
 
@@ -319,7 +317,7 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False):
     if calc_af:
         logging.info("Calculating AF Matrix")
         af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0]
-        ret["data"] *= af_arr
+        ret["data"] = ret["data"] * af_arr
     return ret
 #pylint: enable=too-many-statements
 

From f10c1ad74be09934f58621e84ffc336909a797a6 Mon Sep 17 00:00:00 2001
From: Adam English <ACEnglish@gmail.com>
Date: Thu, 20 Jul 2023 16:29:49 -0400
Subject: [PATCH 17/17] version bump

For multi-allelic sites, `--af` now uses the maximum alternate allele frequency at the site instead of the frequency of the first alternate allele (allele index 1).
---
 README.md                      | 5 +++++
 repo_utils/answer_key/help.txt | 2 +-
 utmos/__init__.py              | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e0d0d3b..78c0518 100644
--- a/README.md
+++ b/README.md
@@ -159,6 +159,11 @@ the concatenation/conversion work. Simply provide a single `in_file` of `file.hd
 parameter `--lowmem file.hdf5`. Note that if you create an hdf5 file with `select --af`, it will hold the AF weighted
 matrix and can only be reused with `select --af`.
 
+## Multi-allelic VCFs
+When running `select --af`, variant positions are weighted by their allele frequency. For multi-allelic VCFs, the site is
+weighted by the largest alternate allele frequency observed. If this is not the desired behavior, split multi-allelics
+in the VCF with `bcftools norm -N -m-any`.
+
 ## Performace metrics
 Running on a 2013 Mac Pro and using chr22 from 1kgp genotype  
 `ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502//ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz`
diff --git a/repo_utils/answer_key/help.txt b/repo_utils/answer_key/help.txt
index 232f7cc..be90b19 100644
--- a/repo_utils/answer_key/help.txt
+++ b/repo_utils/answer_key/help.txt
@@ -1,6 +1,6 @@
 usage: utmos [-h] CMD ...
 
-Utmos v2.1.0 - Maximum-coverage algorithm to select samples for validation and resequencing
+Utmos v2.2.0 - Maximum-coverage algorithm to select samples for validation and resequencing
 
     CMDs:
         convert  Extract genotypes from VCFs
diff --git a/utmos/__init__.py b/utmos/__init__.py
index 3fd8dcc..bb5646c 100644
--- a/utmos/__init__.py
+++ b/utmos/__init__.py
@@ -2,4 +2,4 @@
 Utmos - a reimplementation of SVColector
 """
 
-__version__ = '2.1.0'
+__version__ = '2.2.0'