From 75504b0c7af48c64364f613c9cba5036b00e53db Mon Sep 17 00:00:00 2001 From: Adam English Date: Tue, 18 Jul 2023 15:00:13 -0400 Subject: [PATCH 01/17] updating mask != 2 might have been messing up the count/scores, changed to == 0 --- repo_utils/utmos_ssshtests.sh | 2 +- utmos/convert.py | 3 +-- utmos/select.py | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/repo_utils/utmos_ssshtests.sh b/repo_utils/utmos_ssshtests.sh index 33b9016..0b08ba1 100644 --- a/repo_utils/utmos_ssshtests.sh +++ b/repo_utils/utmos_ssshtests.sh @@ -1,7 +1,7 @@ test -e ssshtest || curl -O https://raw.githubusercontent.com/ryanlayer/ssshtest/master/ssshtest source ssshtest -STOP_ON_FAIL=1 +#STOP_ON_FAIL=1 # Work inside of the repo folder cd "$( dirname "${BASH_SOURCE[0]}" )"/../ INDIR=repo_utils/test_files diff --git a/utmos/convert.py b/utmos/convert.py index edba497..8671c01 100644 --- a/utmos/convert.py +++ b/utmos/convert.py @@ -81,9 +81,8 @@ def read_vcf(in_file, lowmem=False, chunk_length=2000, no_singleton=False): del data["calldata/GT"] data["GT"] = v_count data["AF"] = af - data["GT"] = np.packbits(data["GT"], axis=1) - + data["stats"] = {'num_het': num_hets, 'num_hom': num_homs} return data diff --git a/utmos/select.py b/utmos/select.py index 46a3650..b2d9858 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -37,8 +37,8 @@ def do_sum(matrix, sample_mask): m_tot += row else: m_tot += row != 0 - # mask out excluded samples - ex_mask = sample_mask != 2 + # mask out excluded/used samples + ex_mask = sample_mask == 0 m_sum *= ex_mask m_tot *= ex_mask return m_sum, m_tot @@ -49,7 +49,7 @@ def calculate_scores(matrix, sample_mask, sample_weights): calculate the best scoring sample, sumfunc is the method to do matrix summation - updates sample_mask in place + updates sample_mask in place (shouldn't be here) returns tuple of: column index of the highest score new_row_count for highest score column index From 738ad9fa2394f2ccce2ab652d7d80d1af6aea72f Mon Sep 17 00:00:00 2001 From: Adam English Date: Tue, 18 Jul 2023 19:07:01 -0400 Subject: [PATCH 02/17] code clean --- utmos/select.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/utmos/select.py b/utmos/select.py index b2d9858..a0ade84 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -17,7 +17,6 @@ MAXMEM = 2 # in GB - ############# # Core code # ############# @@ -49,7 +48,6 @@ def calculate_scores(matrix, sample_mask, sample_weights): calculate the best scoring sample, sumfunc is the method to do matrix summation - updates sample_mask in place (shouldn't be here) returns tuple of: column index of the highest score new_row_count for highest score column index @@ -60,7 +58,6 @@ def calculate_scores(matrix, sample_mask, sample_weights): sample_scores *= sample_weights use_sample = np.argmax(sample_scores) new_variant_count = cur_sample_count[use_sample] - sample_mask[use_sample] = 1 return use_sample, new_variant_count @@ -91,8 +88,8 @@ def greedy_select(matrix, matrix: genotype data total_variant_count: total number of variants per-sample select_count: how many samples we'll be selecting - variant_mask: boolean matrix of variants where True == used - sample_mask: boolean matrix of samples where True == use + vcf_samples: list of sample names, lines up with sample_mask + sample_mask: int matrix for samples where 0 == yet to be selected sample_weights: (optional) the weights to apply to each iteration's sample.sum (len == gt_matrix.shape[0]) Expects input matrices to be h5py Datasets. @@ -106,6 +103,7 @@ def greedy_select(matrix, use_sample_name = vcf_samples[use_sample] variant_count = total_variant_count[use_sample] tot_captured += new_variant_count + sample_mask[use_sample] = 1 yield [ use_sample_name, From 53dca2eff00dc108e5d8829cf8147cc7fa6934a9 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 13:19:35 -0400 Subject: [PATCH 03/17] fixing select lowmem and maybe af --- imgs/coverage.svg | 4 +- imgs/pylint.svg | 12 +++--- repo_utils/answer_key/help.txt | 2 +- repo_utils/utmos_ssshtests.sh | 2 + utmos/convert.py | 2 +- utmos/select.py | 71 ++++++++++++++++------------------ 6 files changed, 45 insertions(+), 48 deletions(-) diff --git a/imgs/coverage.svg b/imgs/coverage.svg index ea71b13..8a3cf05 100644 --- a/imgs/coverage.svg +++ b/imgs/coverage.svg @@ -17,7 +17,7 @@ coverage - 90% - 90% + 95% + 95% diff --git a/imgs/pylint.svg b/imgs/pylint.svg index c25bad4..fa27ae2 100644 --- a/imgs/pylint.svg +++ b/imgs/pylint.svg @@ -1,23 +1,23 @@ - + - + - - + + pylint pylint - 10 - 10 + 9.97 + 9.97 diff --git a/repo_utils/answer_key/help.txt b/repo_utils/answer_key/help.txt index 20ad794..232f7cc 100644 --- a/repo_utils/answer_key/help.txt +++ b/repo_utils/answer_key/help.txt @@ -1,6 +1,6 @@ usage: utmos [-h] CMD ... -Utmos v2.0.1-dev - Maximum-coverage algorithm to select samples for validation and resequencing +Utmos v2.1.0 - Maximum-coverage algorithm to select samples for validation and resequencing CMDs: convert Extract genotypes from VCFs diff --git a/repo_utils/utmos_ssshtests.sh b/repo_utils/utmos_ssshtests.sh index 0b08ba1..5bec6b3 100644 --- a/repo_utils/utmos_ssshtests.sh +++ b/repo_utils/utmos_ssshtests.sh @@ -194,6 +194,8 @@ fi # select lowmem # ------------------------------------------------------------ +run test_select_lm_big $ut select --maxmem 0 --lowmem $OD/tiny.hdf5 $INDIR/chunk*.jl + run test_select_lm $ut select --maxmem 0 --lowmem $OD/tiny.hdf5 $INDIR/chunk2.vcf if [ $test_select_lm ]; then assert_exit_code 0 diff --git a/utmos/convert.py b/utmos/convert.py index 8671c01..a1323de 100644 --- a/utmos/convert.py +++ b/utmos/convert.py @@ -82,7 +82,7 @@ def read_vcf(in_file, lowmem=False, chunk_length=2000, no_singleton=False): data["GT"] = v_count data["AF"] = af data["GT"] = np.packbits(data["GT"], axis=1) - + data["stats"] = {'num_het': num_hets, 'num_hom': num_homs} return data diff --git a/utmos/select.py b/utmos/select.py index a0ade84..508292c 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -16,6 +16,7 @@ from utmos.convert import read_vcf MAXMEM = 2 # in GB +# set to 0 to h5 once and then pull in memory for test coverage ############# # Core code # @@ -24,23 +25,18 @@ def do_sum(matrix, sample_mask): """ Vectorized sum function """ - m_sum = np.zeros(matrix.shape[1]) - m_tot = np.zeros(matrix.shape[1]) + m_score = np.zeros(matrix.shape[1]) + m_count = np.zeros(matrix.shape[1]) # skip variants already used - c_mask = np.where(sample_mask == 1) + c_mask = np.where(sample_mask == 0) for row in matrix: if row[c_mask].any(): continue - m_sum += row - if matrix.dtype == bool: - m_tot += row - else: - m_tot += row != 0 + m_score += row + m_count += row != 0 # mask out excluded/used samples - ex_mask = sample_mask == 0 - m_sum *= ex_mask - m_tot *= ex_mask - return m_sum, m_tot + m_score[sample_mask != 1] = 0 + return m_score, m_count def calculate_scores(matrix, sample_mask, sample_weights): @@ -52,12 +48,12 @@ def calculate_scores(matrix, sample_mask, sample_weights): column index of the highest score new_row_count for highest score column index """ - sample_scores, cur_sample_count = do_sum(matrix, sample_mask) + scores, counts = do_sum(matrix, sample_mask) if sample_weights is not None: logging.debug("applying weights") - sample_scores *= sample_weights - use_sample = np.argmax(sample_scores) - new_variant_count = cur_sample_count[use_sample] + scores *= sample_weights + use_sample = np.argmax(scores) + new_variant_count = counts[use_sample] return use_sample, new_variant_count @@ -89,7 +85,7 @@ def greedy_select(matrix, total_variant_count: total number of variants per-sample select_count: how many samples we'll be selecting vcf_samples: list of sample names, lines up with sample_mask - sample_mask: int matrix for samples where 0 == yet to be selected + sample_mask: matrix for samples where 1 == can be selected sample_weights: (optional) the weights to apply to each iteration's sample.sum (len == gt_matrix.shape[0]) Expects input matrices to be h5py Datasets. @@ -103,7 +99,7 @@ def greedy_select(matrix, use_sample_name = vcf_samples[use_sample] variant_count = total_variant_count[use_sample] tot_captured += new_variant_count - sample_mask[use_sample] = 1 + sample_mask[use_sample] = 0 yield [ use_sample_name, @@ -120,26 +116,27 @@ def greedy_select(matrix, # can mem? put it in # need to change shape by how many we could mask if isinstance(matrix, h5py.Dataset): - n_var = num_vars - tot_captured - n_samp = len(sample_mask) - np.count_nonzero(sample_mask) - if is_memsafe((n_var, n_samp)): + n_var = int(num_vars - tot_captured) + n_samp = (sample_mask == 1).sum() + if is_memsafe((n_var, n_samp)) or MAXMEM == 0: logging.info("Dataset small enough to hold in memory") + # Drop used samples + s_mask = sample_mask == 1 + vcf_samples = vcf_samples[s_mask] + total_variant_count = total_variant_count[s_mask] + if sample_weights is not None: + sample_weights = sample_weights[s_mask] + # subset samples/variants n_matrix = np.zeros((n_var, n_samp), dtype=matrix.dtype) + inspect = np.where(sample_mask == 0) m_pos = 0 - c_mask = np.where(sample_mask == 1) for row in matrix: - if row[c_mask].any(): + if row[inspect].any(): continue - n_matrix[m_pos] = row[sample_mask] + n_matrix[m_pos] = row[s_mask] m_pos += 1 - # Drop used samples - sub_mask = sample_mask[sample_mask != 1] - vcf_samples = vcf_samples[sub_mask] - total_variant_count = total_variant_count[sub_mask] - if sample_weights is not None: - sample_weights = sample_weights[sub_mask] - sample_mask = sample_mask[sub_mask] matrix = n_matrix + sample_mask = np.ones(n_samp, dtype='bool') # deprecated for now @@ -170,18 +167,17 @@ def run_selection(data, select_count=0.02, subset=None, exclude=None, weights=No else: vcf_samples = vcf_samples.astype(str) - # Build masks - # 0 - use, 1 = mask, other = skip - sample_mask = np.zeros(num_samples, dtype='uint8') + # 1 = can use, 0 = mask, 2 = exclude + sample_mask = np.ones(num_samples, dtype='uint8') if subset: - sample_mask = np.where(np.isin(vcf_samples, subset), 0, 2) + sample_mask = np.where(np.isin(vcf_samples, subset), 1, 2) logging.info("Subsetting to %d samples", len(subset)) if exclude: sample_mask = np.where(np.isin(vcf_samples, exclude), 2, sample_mask) logging.info("Excluding %d samples", len(exclude)) if subset and exclude: - remain = len(sample_mask) - np.count_nonzero(sample_mask) + remain = len(sample_mask) - (sample_mask == 1).sum() logging.info("Ending with %d samples", remain) sample_weights = None @@ -194,7 +190,7 @@ def run_selection(data, select_count=0.02, subset=None, exclude=None, weights=No matrix = data['data'] - if isinstance(data, h5py.File) and is_memsafe(matrix.shape): + if isinstance(data, h5py.File) and is_memsafe(matrix.shape) and MAXMEM != 0: logging.info("Dataset small enough to hold in memory") matrix = matrix[:] @@ -323,7 +319,6 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False): logging.info("Calculating AF Matrix") af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0] ret["data"] = ret["data"] * af_arr - return ret #pylint: enable=too-many-statements From 3591469e3ff7fcccf6e2da3fe7cdefc44bdc95de Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 19 Jul 2023 17:25:17 +0000 Subject: [PATCH 04/17] Update coverage score --- imgs/pylint.svg | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/imgs/pylint.svg b/imgs/pylint.svg index fa27ae2..c25bad4 100644 --- a/imgs/pylint.svg +++ b/imgs/pylint.svg @@ -1,23 +1,23 @@ - + - + - - + + pylint pylint - 9.97 - 9.97 + 10 + 10 From 1b2cf2e829b4c97fada98e3cd0c13f0e4bd60cd1 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 16:41:49 -0400 Subject: [PATCH 05/17] as int --- utmos/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utmos/select.py b/utmos/select.py index 508292c..4c2d34d 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -33,7 +33,7 @@ def do_sum(matrix, sample_mask): if row[c_mask].any(): continue m_score += row - m_count += row != 0 + m_count += (row != 0).astype('int') # mask out excluded/used samples m_score[sample_mask != 1] = 0 return m_score, m_count From aa87b6c223148b10583977bfc7ab5e6b66c9ab2f Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 16:46:52 -0400 Subject: [PATCH 06/17] debug --- utmos/select.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utmos/select.py b/utmos/select.py index 4c2d34d..f0258ea 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -33,7 +33,10 @@ def do_sum(matrix, sample_mask): if row[c_mask].any(): continue m_score += row + print(row) + print((row != 0).astype('int')) m_count += (row != 0).astype('int') + print(m_count.max()) # mask out excluded/used samples m_score[sample_mask != 1] = 0 return m_score, m_count From f0c8b4a77e6ec7ff97d08d99618e3ea5503947e7 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 17:06:03 -0400 Subject: [PATCH 07/17] remove debug --- utmos/select.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/utmos/select.py b/utmos/select.py index f0258ea..4c2d34d 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -33,10 +33,7 @@ def do_sum(matrix, sample_mask): if row[c_mask].any(): continue m_score += row - print(row) - print((row != 0).astype('int')) m_count += (row != 0).astype('int') - print(m_count.max()) # mask out excluded/used samples m_score[sample_mask != 1] = 0 return m_score, m_count From 00e77318315942adf401b005080ccfb97d217eb0 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 17:07:48 -0400 Subject: [PATCH 08/17] forcing numpy version --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 18b6512..841e03d 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ def get_version(rel_path): }, install_requires=[ "truvari>=3.5.0", + "numpy>=1.23.3", "scikit-allel==1.3.5", "h5py==3.7.0", ], From b4d9eecf29e2cc24c13a525dbd3f4e51e16e6d45 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 17:15:31 -0400 Subject: [PATCH 09/17] force types --- utmos/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utmos/select.py b/utmos/select.py index 4c2d34d..0e53567 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -26,7 +26,7 @@ def do_sum(matrix, sample_mask): Vectorized sum function """ m_score = np.zeros(matrix.shape[1]) - m_count = np.zeros(matrix.shape[1]) + m_count = np.zeros(matrix.shape[1], dtype='int') # skip variants already used c_mask = np.where(sample_mask == 0) for row in matrix: From 95e7b9a80d2cf05e240ee06f741e389d3936bad9 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 17:16:07 -0400 Subject: [PATCH 10/17] remove setup numpy --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 841e03d..18b6512 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,6 @@ def get_version(rel_path): }, install_requires=[ "truvari>=3.5.0", - "numpy>=1.23.3", "scikit-allel==1.3.5", "h5py==3.7.0", ], From e5f555e44e0041ea89bea277090cd0962cabc996 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 17:20:48 -0400 Subject: [PATCH 11/17] af float upstream --- utmos/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utmos/select.py b/utmos/select.py index 0e53567..958bb7e 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -318,7 +318,7 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False): if calc_af: logging.info("Calculating AF Matrix") af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0] - ret["data"] = ret["data"] * af_arr + ret["data"] = ret["data"].astype(float) * af_arr return ret #pylint: enable=too-many-statements From f0aba093613b2238c794df6ae7148ff8a74ccdb6 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 17:31:17 -0400 Subject: [PATCH 12/17] no convert doubling --- utmos/select.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/utmos/select.py b/utmos/select.py index 958bb7e..9b57ea3 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -29,17 +29,17 @@ def do_sum(matrix, sample_mask): m_count = np.zeros(matrix.shape[1], dtype='int') # skip variants already used c_mask = np.where(sample_mask == 0) - for row in matrix: + for row, af in zip(matrix, afs): if row[c_mask].any(): continue m_score += row m_count += (row != 0).astype('int') # mask out excluded/used samples - m_score[sample_mask != 1] = 0 - return m_score, m_count + m_count[sample_mask != 1] = 0 + return m_count -def calculate_scores(matrix, sample_mask, sample_weights): +def calculate_scores(matrix, sample_mask, allele_freqs=None, sample_weights=None): """ calculate the best scoring sample, sumfunc is the method to do matrix summation @@ -48,7 +48,11 @@ def calculate_scores(matrix, sample_mask, sample_weights): column index of the highest score new_row_count for highest score column index """ - scores, counts = do_sum(matrix, sample_mask) + counts = do_sum(matrix, sample_mask) + scores = counts if allele_freqs is None and sample_weights is None else np.copy(counts) + if allele_freqs is not None: + scores *= allele_freqs + if sample_weights is not None: logging.debug("applying weights") scores *= sample_weights @@ -318,7 +322,7 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False): if calc_af: logging.info("Calculating AF Matrix") af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0] - ret["data"] = ret["data"].astype(float) * af_arr + ret["data"] = ret["data"] * af_arr #ret["data"].astype(float) * af_arr return ret #pylint: enable=too-many-statements From a8f2254f6b990ca1115eb00e7c46edc710d964ab Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 17:32:50 -0400 Subject: [PATCH 13/17] removing --- utmos/select.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/utmos/select.py b/utmos/select.py index 9b57ea3..2089163 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -29,17 +29,17 @@ def do_sum(matrix, sample_mask): m_count = np.zeros(matrix.shape[1], dtype='int') # skip variants already used c_mask = np.where(sample_mask == 0) - for row, af in zip(matrix, afs): + for row in matrix: if row[c_mask].any(): continue m_score += row m_count += (row != 0).astype('int') # mask out excluded/used samples m_count[sample_mask != 1] = 0 - return m_count + return m_score, m_count -def calculate_scores(matrix, sample_mask, allele_freqs=None, sample_weights=None): +def calculate_scores(matrix, sample_mask, sample_weights=None): """ calculate the best scoring sample, sumfunc is the method to do matrix summation @@ -48,11 +48,7 @@ def calculate_scores(matrix, sample_mask, allele_freqs=None, sample_weights=None column index of the highest score new_row_count for highest score column index """ - counts = do_sum(matrix, sample_mask) - scores = counts if allele_freqs is None and sample_weights is None else np.copy(counts) - if allele_freqs is not None: - scores *= allele_freqs - + scores, counts = do_sum(matrix, sample_mask) if sample_weights is not None: logging.debug("applying weights") scores *= sample_weights From 72a6d16bf1ff1a6738ffaaeae13453b052feeb58 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 17:35:30 -0400 Subject: [PATCH 14/17] fixing score --- utmos/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utmos/select.py b/utmos/select.py index 2089163..e6979f4 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -35,7 +35,7 @@ def do_sum(matrix, sample_mask): m_score += row m_count += (row != 0).astype('int') # mask out excluded/used samples - m_count[sample_mask != 1] = 0 + m_score[sample_mask != 1] = 0 return m_score, m_count From 6f3655df6df10d6f6f3660fccae3f3248b80a03f Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 19 Jul 2023 17:37:50 -0400 Subject: [PATCH 15/17] af type on unpack --- utmos/select.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utmos/select.py b/utmos/select.py index e6979f4..a28be65 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -273,7 +273,8 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False): if samples is None: samples = dat['samples'].astype('S') - upack = np.unpackbits(dat['GT'], axis=1, count=len(dat['samples'])).astype(bool) + m_type = float if calc_af else bool + upack = np.unpackbits(dat['GT'], axis=1, count=len(dat['samples'])).astype(m_type) uninf_filter = upack.any(axis=1) logging.debug("fitering %d uninformative variants", (~uninf_filter).sum()) @@ -318,7 +319,7 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False): if calc_af: logging.info("Calculating AF Matrix") af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0] - ret["data"] = ret["data"] * af_arr #ret["data"].astype(float) * af_arr + ret["data"] *= af_arr return ret #pylint: enable=too-many-statements From 95adb676e69945ef9bc0316d459f2758b4a894bf Mon Sep 17 00:00:00 2001 From: Adam English Date: Thu, 20 Jul 2023 00:53:08 -0400 Subject: [PATCH 16/17] multiallelic safeguards --- imgs/coverage.svg | 4 ++-- utmos/convert.py | 3 ++- utmos/select.py | 46 ++++++++++++++++++++++------------------------ 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/imgs/coverage.svg b/imgs/coverage.svg index 8a3cf05..9d63d78 100644 --- a/imgs/coverage.svg +++ b/imgs/coverage.svg @@ -17,7 +17,7 @@ coverage - 95% - 95% + 94% + 94% diff --git a/utmos/convert.py b/utmos/convert.py index a1323de..e39a368 100644 --- a/utmos/convert.py +++ b/utmos/convert.py @@ -71,7 +71,8 @@ def read_vcf(in_file, lowmem=False, chunk_length=2000, no_singleton=False): v_count = is_het | is_hom logging.info("Calculating AFs") - af = gts.count_alleles().to_frequencies()[:, 1] + # Use maximum non-reference allele frequency + af = gts.count_alleles().to_frequencies()[:, 1:].max(axis=1) # Needs to be reshaped for future multiplications af = af.reshape(af.shape[0], 1) diff --git a/utmos/select.py b/utmos/select.py index a28be65..8f4c525 100644 --- a/utmos/select.py +++ b/utmos/select.py @@ -21,24 +21,6 @@ ############# # Core code # ############# -def do_sum(matrix, sample_mask): - """ - Vectorized sum function - """ - m_score = np.zeros(matrix.shape[1]) - m_count = np.zeros(matrix.shape[1], dtype='int') - # skip variants already used - c_mask = np.where(sample_mask == 0) - for row in matrix: - if row[c_mask].any(): - continue - m_score += row - m_count += (row != 0).astype('int') - # mask out excluded/used samples - m_score[sample_mask != 1] = 0 - return m_score, m_count - - def calculate_scores(matrix, sample_mask, sample_weights=None): """ calculate the best scoring sample, @@ -48,13 +30,26 @@ def calculate_scores(matrix, sample_mask, sample_weights=None): column index of the highest score new_row_count for highest score column index """ - scores, counts = do_sum(matrix, sample_mask) + scores = np.zeros(matrix.shape[1]) + counts = np.zeros(matrix.shape[1], dtype='int') + # skip variants already used + c_mask = np.where(sample_mask == 0) + for row in matrix: + if row[c_mask].any(): + continue + scores += row + counts += (row != 0).astype('int') + # mask out excluded/used samples + scores[sample_mask != 1] = 0 + if sample_weights is not None: logging.debug("applying weights") scores *= sample_weights use_sample = np.argmax(scores) new_variant_count = counts[use_sample] - + # Backwards compatibility for data without max-alt-af convert + if scores[use_sample] == 0: + return None, None return use_sample, new_variant_count @@ -95,7 +90,10 @@ def greedy_select(matrix, tot_captured = 0 for _ in range(select_count): use_sample, new_variant_count = calculate_scores(matrix, sample_mask, sample_weights) - + if use_sample is None: + # Backwards compatibility for data without max-alt-af convert + logging.warning("Ran out of new variants (multi-allelics)") + break use_sample_name = vcf_samples[use_sample] variant_count = total_variant_count[use_sample] tot_captured += new_variant_count @@ -273,8 +271,8 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False): if samples is None: samples = dat['samples'].astype('S') - m_type = float if calc_af else bool - upack = np.unpackbits(dat['GT'], axis=1, count=len(dat['samples'])).astype(m_type) + #m_type = float if calc_af else bool + upack = np.unpackbits(dat['GT'], axis=1, count=len(dat['samples'])).astype(bool) uninf_filter = upack.any(axis=1) logging.debug("fitering %d uninformative variants", (~uninf_filter).sum()) @@ -319,7 +317,7 @@ def load_files(in_files, lowmem=None, buffer=32768, calc_af=False): if calc_af: logging.info("Calculating AF Matrix") af_arr = np.concatenate(af_parts) if len(af_parts) > 1 else af_parts[0] - ret["data"] *= af_arr + ret["data"] = ret["data"] * af_arr return ret #pylint: enable=too-many-statements From f10c1ad74be09934f58621e84ffc336909a797a6 Mon Sep 17 00:00:00 2001 From: Adam English Date: Thu, 20 Jul 2023 16:29:49 -0400 Subject: [PATCH 17/17] version bump multi-allelic with `--af` now uses max alt AF at a site instead of the AF of GT==1 --- README.md | 5 +++++ repo_utils/answer_key/help.txt | 2 +- utmos/__init__.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e0d0d3b..78c0518 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,11 @@ the concatenation/conversion work. Simply provide a single `in_file` of `file.hd parameter `--lowmem file.hdf5`. Note that if you create an hdf5 file with `select --af`, it will hold the AF weighted matrix and can only be reused with `select --af`. +## Multi-allelic VCFs +When running `select --af`, variant positions are weighed by their allele frequency. For multi-allelic VCFs, the site is +weighed by the largest allele frequency observed. If this is not the desired behavior, split multi-allelics in the VCF +with `bcftools norm -N -m-any`. + ## Performace metrics Running on a 2013 Mac Pro and using chr22 from 1kgp genotype `ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502//ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz` diff --git a/repo_utils/answer_key/help.txt b/repo_utils/answer_key/help.txt index 232f7cc..be90b19 100644 --- a/repo_utils/answer_key/help.txt +++ b/repo_utils/answer_key/help.txt @@ -1,6 +1,6 @@ usage: utmos [-h] CMD ... -Utmos v2.1.0 - Maximum-coverage algorithm to select samples for validation and resequencing +Utmos v2.2.0 - Maximum-coverage algorithm to select samples for validation and resequencing CMDs: convert Extract genotypes from VCFs diff --git a/utmos/__init__.py b/utmos/__init__.py index 3fd8dcc..bb5646c 100644 --- a/utmos/__init__.py +++ b/utmos/__init__.py @@ -2,4 +2,4 @@ Utmos - a reimplementation of SVColector """ -__version__ = '2.1.0' +__version__ = '2.2.0'