From 4c07e11f26c8d8f654f8f10097381d9e57c1497e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 4 Apr 2024 22:58:48 +0000 Subject: [PATCH] Deployed 1a964f9 with MkDocs version: 1.5.3 --- .nojekyll | 0 404.html | 624 ++ api/AnnotationMatrix/index.html | 3086 +++++ api/GWADataLoader/index.html | 7347 ++++++++++++ api/GenotypeMatrix/index.html | 7845 +++++++++++++ api/LDMatrix/index.html | 9918 +++++++++++++++++ api/SampleTable/index.html | 3727 +++++++ api/SumstatsTable/index.html | 5647 ++++++++++ api/overview/index.html | 896 ++ api/parsers/annotation_parsers/index.html | 1499 +++ api/parsers/misc_parsers/index.html | 1128 ++ api/parsers/plink_parsers/index.html | 1018 ++ api/parsers/sumstats_parsers/index.html | 3446 ++++++ api/plot/gwa/index.html | 1211 ++ api/plot/ld/index.html | 889 ++ .../AnnotatedPhenotypeSimulator/index.html | 1621 +++ .../MultiCohortPhenotypeSimulator/index.html | 1254 +++ api/simulation/PhenotypeSimulator/index.html | 3184 ++++++ api/stats/gwa/utils/index.html | 1742 +++ api/stats/h2/ldsc/index.html | 1138 ++ api/stats/ld/estimator/index.html | 4125 +++++++ api/stats/ld/utils/index.html | 2518 +++++ api/stats/score/utils/index.html | 1041 ++ api/stats/transforms/genotype/index.html | 835 ++ api/stats/transforms/phenotype/index.html | 1418 +++ api/stats/variant/utils/index.html | 1078 ++ api/utils/compute_utils/index.html | 1053 ++ api/utils/data_utils/index.html | 801 ++ api/utils/executors/index.html | 1436 +++ api/utils/model_utils/index.html | 2274 ++++ api/utils/system_utils/index.html | 1453 +++ assets/_mkdocstrings.css | 114 + assets/images/favicon.png | Bin 0 -> 1870 bytes assets/javascripts/bundle.1e8ae164.min.js | 29 + assets/javascripts/bundle.1e8ae164.min.js.map | 7 + assets/javascripts/lunr/min/lunr.ar.min.js | 1 + assets/javascripts/lunr/min/lunr.da.min.js | 18 + assets/javascripts/lunr/min/lunr.de.min.js | 18 + assets/javascripts/lunr/min/lunr.du.min.js | 
18 + assets/javascripts/lunr/min/lunr.el.min.js | 1 + assets/javascripts/lunr/min/lunr.es.min.js | 18 + assets/javascripts/lunr/min/lunr.fi.min.js | 18 + assets/javascripts/lunr/min/lunr.fr.min.js | 18 + assets/javascripts/lunr/min/lunr.he.min.js | 1 + assets/javascripts/lunr/min/lunr.hi.min.js | 1 + assets/javascripts/lunr/min/lunr.hu.min.js | 18 + assets/javascripts/lunr/min/lunr.hy.min.js | 1 + assets/javascripts/lunr/min/lunr.it.min.js | 18 + assets/javascripts/lunr/min/lunr.ja.min.js | 1 + assets/javascripts/lunr/min/lunr.jp.min.js | 1 + assets/javascripts/lunr/min/lunr.kn.min.js | 1 + assets/javascripts/lunr/min/lunr.ko.min.js | 1 + assets/javascripts/lunr/min/lunr.multi.min.js | 1 + assets/javascripts/lunr/min/lunr.nl.min.js | 18 + assets/javascripts/lunr/min/lunr.no.min.js | 18 + assets/javascripts/lunr/min/lunr.pt.min.js | 18 + assets/javascripts/lunr/min/lunr.ro.min.js | 18 + assets/javascripts/lunr/min/lunr.ru.min.js | 18 + assets/javascripts/lunr/min/lunr.sa.min.js | 1 + .../lunr/min/lunr.stemmer.support.min.js | 1 + assets/javascripts/lunr/min/lunr.sv.min.js | 18 + assets/javascripts/lunr/min/lunr.ta.min.js | 1 + assets/javascripts/lunr/min/lunr.te.min.js | 1 + assets/javascripts/lunr/min/lunr.th.min.js | 1 + assets/javascripts/lunr/min/lunr.tr.min.js | 18 + assets/javascripts/lunr/min/lunr.vi.min.js | 1 + assets/javascripts/lunr/min/lunr.zh.min.js | 1 + assets/javascripts/lunr/tinyseg.js | 206 + assets/javascripts/lunr/wordcut.js | 6708 +++++++++++ .../workers/search.b8dbb3d2.min.js | 42 + .../workers/search.b8dbb3d2.min.js.map | 7 + assets/stylesheets/main.bcfcd587.min.css | 1 + assets/stylesheets/main.bcfcd587.min.css.map | 1 + assets/stylesheets/palette.06af60db.min.css | 1 + .../stylesheets/palette.06af60db.min.css.map | 1 + citation/index.html | 784 ++ commandline/magenpy_ld/index.html | 837 ++ commandline/magenpy_simulate/index.html | 823 ++ commandline/overview/index.html | 723 ++ faq/index.html | 707 ++ features/index.html | 1242 +++ 
getting_started/index.html | 812 ++ index.html | 809 ++ installation/index.html | 920 ++ objects.inv | Bin 0 -> 3746 bytes search/search_index.json | 1 + sitemap.xml | 3 + sitemap.xml.gz | Bin 0 -> 127 bytes tutorials/overview/index.html | 707 ++ 89 files changed, 91024 insertions(+) create mode 100644 .nojekyll create mode 100644 404.html create mode 100644 api/AnnotationMatrix/index.html create mode 100644 api/GWADataLoader/index.html create mode 100644 api/GenotypeMatrix/index.html create mode 100644 api/LDMatrix/index.html create mode 100644 api/SampleTable/index.html create mode 100644 api/SumstatsTable/index.html create mode 100644 api/overview/index.html create mode 100644 api/parsers/annotation_parsers/index.html create mode 100644 api/parsers/misc_parsers/index.html create mode 100644 api/parsers/plink_parsers/index.html create mode 100644 api/parsers/sumstats_parsers/index.html create mode 100644 api/plot/gwa/index.html create mode 100644 api/plot/ld/index.html create mode 100644 api/simulation/AnnotatedPhenotypeSimulator/index.html create mode 100644 api/simulation/MultiCohortPhenotypeSimulator/index.html create mode 100644 api/simulation/PhenotypeSimulator/index.html create mode 100644 api/stats/gwa/utils/index.html create mode 100644 api/stats/h2/ldsc/index.html create mode 100644 api/stats/ld/estimator/index.html create mode 100644 api/stats/ld/utils/index.html create mode 100644 api/stats/score/utils/index.html create mode 100644 api/stats/transforms/genotype/index.html create mode 100644 api/stats/transforms/phenotype/index.html create mode 100644 api/stats/variant/utils/index.html create mode 100644 api/utils/compute_utils/index.html create mode 100644 api/utils/data_utils/index.html create mode 100644 api/utils/executors/index.html create mode 100644 api/utils/model_utils/index.html create mode 100644 api/utils/system_utils/index.html create mode 100644 assets/_mkdocstrings.css create mode 100644 assets/images/favicon.png create mode 100644 
assets/javascripts/bundle.1e8ae164.min.js create mode 100644 assets/javascripts/bundle.1e8ae164.min.js.map create mode 100644 assets/javascripts/lunr/min/lunr.ar.min.js create mode 100644 assets/javascripts/lunr/min/lunr.da.min.js create mode 100644 assets/javascripts/lunr/min/lunr.de.min.js create mode 100644 assets/javascripts/lunr/min/lunr.du.min.js create mode 100644 assets/javascripts/lunr/min/lunr.el.min.js create mode 100644 assets/javascripts/lunr/min/lunr.es.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.he.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hu.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hy.min.js create mode 100644 assets/javascripts/lunr/min/lunr.it.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ja.min.js create mode 100644 assets/javascripts/lunr/min/lunr.jp.min.js create mode 100644 assets/javascripts/lunr/min/lunr.kn.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ko.min.js create mode 100644 assets/javascripts/lunr/min/lunr.multi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.nl.min.js create mode 100644 assets/javascripts/lunr/min/lunr.no.min.js create mode 100644 assets/javascripts/lunr/min/lunr.pt.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ro.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ru.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sa.min.js create mode 100644 assets/javascripts/lunr/min/lunr.stemmer.support.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sv.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ta.min.js create mode 100644 assets/javascripts/lunr/min/lunr.te.min.js create mode 100644 assets/javascripts/lunr/min/lunr.th.min.js create mode 100644 assets/javascripts/lunr/min/lunr.tr.min.js create mode 
100644 assets/javascripts/lunr/min/lunr.vi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.zh.min.js create mode 100644 assets/javascripts/lunr/tinyseg.js create mode 100644 assets/javascripts/lunr/wordcut.js create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js.map create mode 100644 assets/stylesheets/main.bcfcd587.min.css create mode 100644 assets/stylesheets/main.bcfcd587.min.css.map create mode 100644 assets/stylesheets/palette.06af60db.min.css create mode 100644 assets/stylesheets/palette.06af60db.min.css.map create mode 100644 citation/index.html create mode 100644 commandline/magenpy_ld/index.html create mode 100644 commandline/magenpy_simulate/index.html create mode 100644 commandline/overview/index.html create mode 100644 faq/index.html create mode 100644 features/index.html create mode 100644 getting_started/index.html create mode 100644 index.html create mode 100644 installation/index.html create mode 100644 objects.inv create mode 100644 search/search_index.json create mode 100644 sitemap.xml create mode 100644 sitemap.xml.gz create mode 100644 tutorials/overview/index.html diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/404.html b/404.html new file mode 100644 index 0000000..51e86ea --- /dev/null +++ b/404.html @@ -0,0 +1,624 @@ + + + +
+ + + + + + + + + + + + + + +
+ Bases: object
A wrapper class for handling annotation matrices, which are essentially tables of +features for each variant in the genome. These features include information such as +whether the variant is in coding regions, enhancers, etc. It can also include continuous +features derived from experimental assays or other sources.
+The purpose of this class is to present a unified and consistent interface for handling +annotations across different tools and applications. It should be able to read and write +annotation matrices in different formats, filter annotations, and perform basic operations +on the annotation matrix. It should also allow users to define new custom annotations +that can be used for downstream statistical genetics applications.
+ + + +Attributes:
+Name | +Type | +Description | +
---|---|---|
table |
+ + | +
+
+
+ A pandas dataframe containing the annotation information. + |
+
_annotations |
+ + | +
+
+
+ A list or array of column names to consider as annotations. If not provided, will be inferred heuristically, though we recommend that the user specify this information. + |
+
magenpy/AnnotationMatrix.py
4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 |
|
annotations
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The list of annotation names or IDs in the annotation matrix. + |
+
binary_annotations
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A list of binary (0/1) annotations in the annotation matrix. + |
+
chromosome
+
+
+ property
+
+
+¶A convenience method to get the chromosome if there is only one chromosome in the annotation matrix.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The chromosome number if there is only one chromosome in the annotation matrix. Otherwise, None. + |
+
chromosomes
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The list of unique chromosomes in the annotation matrix. + |
+
n_annotations
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of annotations in the annotation matrix. + |
+
n_snps
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants in the annotation matrix. + |
+
shape
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The dimensions of the annotation matrix (number of variants x number of annotations). + |
+
snps
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The list of SNP rsIDs in the annotation matrix. + |
+
__init__(annotation_table=None, annotations=None)
+
+¶Initialize an AnnotationMatrix object
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
annotation_table |
+ + | +
+
+
+ A pandas dataframe containing the annotation information. + |
+
+ None
+ |
+
annotations |
+ + | +
+
+
+ A list or array of columns to consider as annotations. If not provided, will be inferred heuristically, though we recommend that the user specify this information. + |
+
+ None
+ |
+
magenpy/AnnotationMatrix.py
add_annotation(annot_vec, annotation_name)
+
+¶Add an annotation vector or list to the AnnotationMatrix object.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
annot_vec |
+ + | +
+
+
+ A vector/list/Series containing the annotation information for each SNP in the AnnotationMatrix. For now, it's the responsibility of the user to make sure that the annotation list or vector is sorted properly. + |
+ + required + | +
annotation_name |
+ + | +
+
+
+ The name of the annotation to create. Make sure the name is not already in the matrix! + |
+ + required + | +
magenpy/AnnotationMatrix.py
add_annotation_from_bed(bed_file, annotation_name)
+
+¶Add an annotation to the AnnotationMatrix from a BED file that lists +the range of coordinates associated with that annotation (e.g. coding regions, enhancers, etc.). +The BED file has to adhere to the format specified by, +https://uswest.ensembl.org/info/website/upload/bed.html +with the first three columns being:
+CHR StartCoordinate EndCoordinate ...
+Note
+This implementation is quite slow at the moment. May need to find more efficient +ways to do the merge over list of ranges.
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bed_file |
+ + | +
+
+
+ The path to the BED file containing the annotation coordinates. + |
+ + required + | +
annotation_name |
+ + | +
+
+
+ The name of the annotation to create. Make sure the name is not already in the matrix! + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ AssertionError
+ |
+
+
+
+ If the annotation name is already in the matrix. + |
+
magenpy/AnnotationMatrix.py
filter_annotations(keep_annotations)
+
+¶Filter the list of annotations in the matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_annotations |
+ + | +
+
+
+ A list or array of annotations to keep. + |
+ + required + | +
magenpy/AnnotationMatrix.py
filter_snps(extract_snps=None, extract_file=None)
+
+¶Filter variants from the annotation matrix. User must specify +either a list of variants to extract or the path to a file +with the list of variants to extract.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
extract_snps |
+ + | +
+
+
+ A list or array of SNP IDs to keep in the annotation matrix. + |
+
+ None
+ |
+
extract_file |
+ + | +
+
+
+ The path to a file with the list of variants to extract. + |
+
+ None
+ |
+
magenpy/AnnotationMatrix.py
from_file(annot_file, annot_format='magenpy', annot_parser=None, **parse_kwargs)
+
+
+ classmethod
+
+
+¶Initialize an AnnotationMatrix object from a file.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
annot_file |
+ + | +
+
+
+ The path to the annotation file. + |
+ + required + | +
annot_format |
+ + | +
+
+
+ The format of the annotation file. For now, we mainly support annotation files in the |
+
+ 'magenpy'
+ |
+
annot_parser |
+ + | +
+
+
+ An |
+
+ None
+ |
+
parse_kwargs |
+ + | +
+
+
+ arguments for the pandas |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ An instance of the |
+
magenpy/AnnotationMatrix.py
get_binary_annotation_index(bin_annot)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bin_annot |
+ + | +
+
+
+ The name of the binary annotation for which to fetch the relevant variants. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The indices of all variants that belong to binary annotation |
+
magenpy/AnnotationMatrix.py
split_by_chromosome()
+
+¶Split the annotation matrix by chromosome.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A dictionary of |
+
magenpy/AnnotationMatrix.py
to_file(output_path, col_subset=None, compress=True, **to_csv_kwargs)
+
+¶A convenience method to write the annotation matrix to a file.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
output_path |
+ + | +
+
+
+ The path and prefix to the file where to write the annotation matrix. + |
+ + required + | +
col_subset |
+ + | +
+
+
+ A subset of the columns to write to file. + |
+
+ None
+ |
+
compress |
+ + | +
+
+
+ Whether to compress the output file (default: True). + |
+
+ True
+ |
+
to_csv_kwargs |
+ + | +
+
+
+ Key-word arguments to the pandas csv writer. + |
+
+ {}
+ |
+
magenpy/AnnotationMatrix.py
values(add_intercept=False)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
add_intercept |
+ + | +
+
+
+ Adds a base annotation corresponding to the intercept. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The annotation matrix as a numpy matrix. + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ If no annotations are defined in the table. + |
+
magenpy/AnnotationMatrix.py
+ Bases: object
A class to load and manage multiple data sources for genetic association studies. +This class is designed to handle genotype matrices, summary statistics, LD matrices, +and annotation matrices. It also provides functionalities to filter samples and/or SNPs, +harmonize data sources, and compute LD matrices. This is all done in order to facilitate +downstream statistical genetics analyses that require multiple data sources to be aligned +and harmonized. The use cases include:
+Attributes:
+Name | +Type | +Description | +
---|---|---|
genotype |
+
+ Union[Dict[int, GenotypeMatrix], None]
+ |
+
+
+
+ A dictionary of |
+
sample_table |
+
+ Union[SampleTable, None]
+ |
+
+
+
+ A |
+
phenotype_likelihood |
+
+ str
+ |
+
+
+
+ The likelihood of the phenotype (e.g. |
+
ld |
+
+ Union[Dict[int, LDMatrix], None]
+ |
+
+
+
+ A dictionary of |
+
sumstats_table |
+
+ Union[Dict[int, SumstatsTable], None]
+ |
+
+
+
+ A dictionary of |
+
annotation |
+
+ Union[Dict[int, AnnotationMatrix], None]
+ |
+
+
+
+ A dictionary of |
+
backend |
+ + | +
+
+
+ The backend software used for the computation. Currently, supports |
+
temp_dir |
+ + | +
+
+
+ The temporary directory where we store intermediate files (if necessary). + |
+
output_dir |
+ + | +
+
+
+ The output directory where we store the results of the computation. + |
+
magenpy/GWADataLoader.py
21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 + 100 + 101 + 102 + 103 + 104 + 105 + 106 + 107 + 108 + 109 + 110 + 111 + 112 + 113 + 114 + 115 + 116 + 117 + 118 + 119 + 120 + 121 + 122 + 123 + 124 + 125 + 126 + 127 + 128 + 129 + 130 + 131 + 132 + 133 + 134 + 135 + 136 + 137 + 138 + 139 + 140 + 141 + 142 + 143 + 144 + 145 + 146 + 147 + 148 + 149 + 150 + 151 + 152 + 153 + 154 + 155 + 156 + 157 + 158 + 159 + 160 + 161 + 162 + 163 + 164 + 165 + 166 + 167 + 168 + 169 + 170 + 171 + 172 + 173 + 174 + 175 + 176 + 177 + 178 + 179 + 180 + 181 + 182 + 183 + 184 + 185 + 186 + 187 + 188 + 189 + 190 + 191 + 192 + 193 + 194 + 195 + 196 + 197 + 198 + 199 + 200 + 201 + 202 + 203 + 204 + 205 + 206 + 207 + 208 + 209 + 210 + 211 + 212 + 213 + 214 + 215 + 216 + 217 + 218 + 219 + 220 + 221 + 222 + 223 + 224 + 225 + 226 + 227 + 228 + 229 + 230 + 231 + 232 + 233 + 234 + 235 + 236 + 237 + 238 + 239 + 240 + 241 + 242 + 243 + 244 + 245 + 246 + 247 + 248 + 249 + 250 + 251 + 252 + 253 + 254 + 255 + 256 + 257 + 258 + 259 + 260 + 261 + 262 + 263 + 264 + 265 + 266 + 267 + 268 + 269 + 270 + 271 + 272 + 273 + 274 + 275 + 276 + 277 + 278 + 279 + 280 + 281 + 282 + 283 + 284 + 285 + 286 + 287 + 288 + 289 + 290 + 291 + 292 + 293 + 294 + 295 + 296 + 297 + 298 + 299 + 300 + 301 + 302 + 303 + 304 + 305 + 306 + 307 + 308 + 309 + 310 + 311 + 312 + 313 + 314 + 315 + 316 + 317 + 318 + 319 + 320 + 321 + 322 + 323 + 324 + 325 + 326 + 327 + 328 + 329 + 330 + 331 + 332 + 333 + 334 + 335 + 336 + 337 + 338 + 339 + 340 + 341 + 342 + 343 + 344 + 345 + 346 + 347 + 348 + 349 + 350 + 351 + 352 + 353 + 354 + 355 + 356 + 357 + 358 + 359 + 360 + 361 + 362 + 363 + 364 + 365 + 366 + 
367 + 368 + 369 + 370 + 371 + 372 + 373 + 374 + 375 + 376 + 377 + 378 + 379 + 380 + 381 + 382 + 383 + 384 + 385 + 386 + 387 + 388 + 389 + 390 + 391 + 392 + 393 + 394 + 395 + 396 + 397 + 398 + 399 + 400 + 401 + 402 + 403 + 404 + 405 + 406 + 407 + 408 + 409 + 410 + 411 + 412 + 413 + 414 + 415 + 416 + 417 + 418 + 419 + 420 + 421 + 422 + 423 + 424 + 425 + 426 + 427 + 428 + 429 + 430 + 431 + 432 + 433 + 434 + 435 + 436 + 437 + 438 + 439 + 440 + 441 + 442 + 443 + 444 + 445 + 446 + 447 + 448 + 449 + 450 + 451 + 452 + 453 + 454 + 455 + 456 + 457 + 458 + 459 + 460 + 461 + 462 + 463 + 464 + 465 + 466 + 467 + 468 + 469 + 470 + 471 + 472 + 473 + 474 + 475 + 476 + 477 + 478 + 479 + 480 + 481 + 482 + 483 + 484 + 485 + 486 + 487 + 488 + 489 + 490 + 491 + 492 + 493 + 494 + 495 + 496 + 497 + 498 + 499 + 500 + 501 + 502 + 503 + 504 + 505 + 506 + 507 + 508 + 509 + 510 + 511 + 512 + 513 + 514 + 515 + 516 + 517 + 518 + 519 + 520 + 521 + 522 + 523 + 524 + 525 + 526 + 527 + 528 + 529 + 530 + 531 + 532 + 533 + 534 + 535 + 536 + 537 + 538 + 539 + 540 + 541 + 542 + 543 + 544 + 545 + 546 + 547 + 548 + 549 + 550 + 551 + 552 + 553 + 554 + 555 + 556 + 557 + 558 + 559 + 560 + 561 + 562 + 563 + 564 + 565 + 566 + 567 + 568 + 569 + 570 + 571 + 572 + 573 + 574 + 575 + 576 + 577 + 578 + 579 + 580 + 581 + 582 + 583 + 584 + 585 + 586 + 587 + 588 + 589 + 590 + 591 + 592 + 593 + 594 + 595 + 596 + 597 + 598 + 599 + 600 + 601 + 602 + 603 + 604 + 605 + 606 + 607 + 608 + 609 + 610 + 611 + 612 + 613 + 614 + 615 + 616 + 617 + 618 + 619 + 620 + 621 + 622 + 623 + 624 + 625 + 626 + 627 + 628 + 629 + 630 + 631 + 632 + 633 + 634 + 635 + 636 + 637 + 638 + 639 + 640 + 641 + 642 + 643 + 644 + 645 + 646 + 647 + 648 + 649 + 650 + 651 + 652 + 653 + 654 + 655 + 656 + 657 + 658 + 659 + 660 + 661 + 662 + 663 + 664 + 665 + 666 + 667 + 668 + 669 + 670 + 671 + 672 + 673 + 674 + 675 + 676 + 677 + 678 + 679 + 680 + 681 + 682 + 683 + 684 + 685 + 686 + 687 + 688 + 689 + 690 + 691 + 692 + 693 + 694 + 695 + 696 + 697 + 698 + 699 + 
700 + 701 + 702 + 703 + 704 + 705 + 706 + 707 + 708 + 709 + 710 + 711 + 712 + 713 + 714 + 715 + 716 + 717 + 718 + 719 + 720 + 721 + 722 + 723 + 724 + 725 + 726 + 727 + 728 + 729 + 730 + 731 + 732 + 733 + 734 + 735 + 736 + 737 + 738 + 739 + 740 + 741 + 742 + 743 + 744 + 745 + 746 + 747 + 748 + 749 + 750 + 751 + 752 + 753 + 754 + 755 + 756 + 757 + 758 + 759 + 760 + 761 + 762 + 763 + 764 + 765 + 766 + 767 + 768 + 769 + 770 + 771 + 772 + 773 + 774 + 775 + 776 + 777 + 778 + 779 + 780 + 781 + 782 + 783 + 784 + 785 + 786 + 787 + 788 + 789 + 790 + 791 + 792 + 793 + 794 + 795 + 796 + 797 + 798 + 799 + 800 + 801 + 802 + 803 + 804 + 805 + 806 + 807 + 808 + 809 + 810 + 811 + 812 + 813 + 814 + 815 + 816 + 817 + 818 + 819 + 820 + 821 + 822 + 823 + 824 + 825 + 826 + 827 + 828 + 829 + 830 + 831 + 832 + 833 + 834 + 835 + 836 + 837 + 838 + 839 + 840 + 841 + 842 + 843 + 844 + 845 + 846 + 847 + 848 + 849 + 850 + 851 + 852 + 853 + 854 + 855 + 856 + 857 + 858 + 859 + 860 + 861 + 862 + 863 + 864 + 865 + 866 + 867 + 868 + 869 + 870 + 871 + 872 + 873 + 874 + 875 + 876 + 877 + 878 + 879 + 880 + 881 + 882 + 883 + 884 + 885 + 886 + 887 + 888 + 889 + 890 + 891 + 892 + 893 + 894 + 895 + 896 + 897 + 898 + 899 + 900 + 901 + 902 + 903 + 904 + 905 + 906 + 907 + 908 + 909 + 910 + 911 + 912 + 913 + 914 + 915 + 916 + 917 + 918 + 919 + 920 + 921 + 922 + 923 + 924 + 925 + 926 + 927 + 928 + 929 + 930 + 931 + 932 + 933 + 934 + 935 + 936 + 937 + 938 + 939 + 940 + 941 + 942 + 943 + 944 + 945 + 946 + 947 + 948 + 949 + 950 + 951 + 952 + 953 + 954 + 955 + 956 + 957 + 958 + 959 + 960 + 961 + 962 + 963 + 964 + 965 + 966 + 967 + 968 + 969 + 970 + 971 + 972 + 973 + 974 + 975 + 976 + 977 + 978 + 979 + 980 + 981 + 982 + 983 + 984 + 985 + 986 + 987 + 988 + 989 + 990 + 991 + 992 + 993 + 994 + 995 + 996 + 997 + 998 + 999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 +1031 +1032 
+1033 +1034 +1035 +1036 +1037 +1038 +1039 +1040 +1041 +1042 +1043 +1044 +1045 +1046 +1047 +1048 +1049 +1050 +1051 |
|
chromosomes
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The list of chromosomes that were loaded to |
+
m
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants in the harmonized data sources. + |
+
n
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of samples in the genotype matrix. + |
+
n_annotations
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of annotations included in the annotation matrices. + |
+
n_snps
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants in the harmonized data sources. + |
+
sample_size
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of samples in the genotype matrix. + |
+
samples
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The list of samples retained in the sample table. + |
+
shapes
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A dictionary where the key is the chromosome number and the value is the number of variants on that chromosome. + |
+
snps
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ dict
+ |
+
+
+
+ The list of SNP rsIDs retained in each chromosome. + |
+
__init__(bed_files=None, phenotype_file=None, covariates_file=None, keep_samples=None, keep_file=None, extract_snps=None, extract_file=None, min_maf=None, min_mac=None, drop_duplicated=True, phenotype_likelihood='gaussian', sumstats_files=None, sumstats_format='magenpy', ld_store_files=None, annotation_files=None, annotation_format='magenpy', backend='xarray', temp_dir='temp', output_dir='output', verbose=True, threads=1)
+
+¶Initialize the GWADataLoader
object with the data sources required for
+downstream statistical genetics analyses.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bed_files |
+ + | +
+
+
+ The path to the BED file(s). You may use a wildcard here to read files for multiple chromosomes. + |
+
+ None
+ |
+
phenotype_file |
+ + | +
+
+
+ The path to the phenotype file. (Default: tab-separated file with |
+
+ None
+ |
+
covariates_file |
+ + | +
+
+
+ The path to the covariates file. (Default: tab-separated file starting with the |
+
+ None
+ |
+
keep_samples |
+ + | +
+
+
+ A vector or list of sample IDs to keep when filtering the genotype matrix. + |
+
+ None
+ |
+
keep_file |
+ + | +
+
+
+ A path to a plink-style keep file to select a subset of individuals. + |
+
+ None
+ |
+
extract_snps |
+ + | +
+
+
+ A vector or list of SNP IDs to keep when filtering the genotype matrix. + |
+
+ None
+ |
+
extract_file |
+ + | +
+
+
+ A path to a plink-style extract file to select a subset of SNPs. + |
+
+ None
+ |
+
min_maf |
+ + | +
+
+
+ The minimum minor allele frequency cutoff. + |
+
+ None
+ |
+
min_mac |
+ + | +
+
+
+ The minimum minor allele count cutoff. + |
+
+ None
+ |
+
drop_duplicated |
+ + | +
+
+
+ If True, drop SNPs with duplicated rsID. + |
+
+ True
+ |
+
phenotype_likelihood |
+ + | +
+
+
+ The likelihood of the phenotype (e.g. |
+
+ 'gaussian'
+ |
+
sumstats_files |
+ + | +
+
+
+ The path to the summary statistics file(s). The path may be a wildcard. + |
+
+ None
+ |
+
sumstats_format |
+ + | +
+
+
+ The format for the summary statistics. Currently supports the following formats: |
+
+ 'magenpy'
+ |
+
ld_store_files |
+ + | +
+
+
+ The path to the LD matrices. This may be a wildcard to accommodate reading data for multiple chromosomes. + |
+
+ None
+ |
+
annotation_files |
+ + | +
+
+
+ The path to the annotation file(s). The path may contain a wildcard. + |
+
+ None
+ |
+
annotation_format |
+ + | +
+
+
+ The format for the annotation files. Currently, supports the following formats: |
+
+ 'magenpy'
+ |
+
backend |
+ + | +
+
+
+ The backend software used for computations with the genotype matrix. Currently, supports |
+
+ 'xarray'
+ |
+
temp_dir |
+ + | +
+
+
+ The temporary directory where to store intermediate files. + |
+
+ 'temp'
+ |
+
output_dir |
+ + | +
+
+
+ The output directory where to store the results of the computation. + |
+
+ 'output'
+ |
+
verbose |
+ + | +
+
+
+ Verbosity of the information printed to standard output. + |
+
+ True
+ |
+
threads |
+ + | +
+
+
+ The number of threads to use for computations. + |
+
+ 1
+ |
+
magenpy/GWADataLoader.py
46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 |
|
align_with(other_gdls, axis='SNP', how='inner')
+
+¶Align the GWADataLoader
object with other GDL objects to have the same
+set of SNPs or samples. This utility method is meant to enable the user to
+align multiple data sources for downstream analyses.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
other_gdls |
+ + | +
+
+
+ A |
+ + required + | +
axis |
+ + | +
+
+
+ The axis on which to perform the alignment (can be |
+
+ 'SNP'
+ |
+
how |
+ + | +
+
+
+ The type of join to perform across the datasets. For now, we support an inner join sort of operation. Warning: this is experimental for now; we would like to add more features here in the near future. + |
+
+ 'inner'
+ |
+
magenpy/GWADataLoader.py
974 + 975 + 976 + 977 + 978 + 979 + 980 + 981 + 982 + 983 + 984 + 985 + 986 + 987 + 988 + 989 + 990 + 991 + 992 + 993 + 994 + 995 + 996 + 997 + 998 + 999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 |
|
cleanup()
+
+¶Clean up all temporary files and directories
+ +magenpy/GWADataLoader.py
compute_ld(estimator, output_dir, dtype='int16', compressor_name='lz4', compression_level=5, **ld_kwargs)
+
+¶Compute the Linkage-Disequilibrium (LD) matrix or SNP-by-SNP Pearson
+correlation matrix between genetic variants. This function only considers correlations
+between SNPs on the same chromosome. This is a utility function that calls the
+.compute_ld()
method of the GenotypeMatrix
objects associated with
+GWADataLoader.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
estimator |
+ + | +
+
+
+ The estimator for the LD matrix. We currently support 4 different estimators: |
+ + required + | +
output_dir |
+ + | +
+
+
+ The output directory where the Zarr array containing the entries of the LD matrix will be stored. + |
+ + required + | +
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix (supported data types are float32, float64 and integer quantized data types int8 and int16). + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compression algorithm to use for the LD matrix. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use for the entries of the LD matrix (1-9). + |
+
+ 5
+ |
+
ld_kwargs |
+ + | +
+
+
+ keyword arguments for the various LD estimators. Consult the implementations of |
+
+ {}
+ |
+
magenpy/GWADataLoader.py
filter_samples(keep_samples=None, keep_file=None)
+
+¶Filter samples from the samples table. User must specify +either a list of samples to keep or the path to a file +with the list of samples to keep.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_samples |
+ + | +
+
+
+ A list or array of sample IDs to keep. + |
+
+ None
+ |
+
keep_file |
+ + | +
+
+
+ The path to a file with the list of samples to keep. + |
+
+ None
+ |
+
magenpy/GWADataLoader.py
filter_snps(extract_snps=None, extract_file=None, chromosome=None)
+
+¶Filter the SNP set from all the GWADataLoader objects.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
extract_snps |
+ + | +
+
+
+ A list or array of SNP rsIDs to keep. + |
+
+ None
+ |
+
extract_file |
+ + | +
+
+
+ A path to a plink-style file with SNP rsIDs to keep. + |
+
+ None
+ |
+
chromosome |
+ + | +
+
+
+ Chromosome number. If specified, applies the filter to that chromosome only. + |
+
+ None
+ |
+
magenpy/GWADataLoader.py
get_ld_matrices()
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The LD matrices computed for each chromosome. + |
+
harmonize_data()
+
+¶This method ensures that the data sources (reference genotype, +LD matrices, summary statistics, annotations) are all aligned in terms of the +set of variants that they operate on as well as the designation of the effect allele for +each variant.
+Note
+This method is called automatically during the initialization of the GWADataLoader
object.
+However, if you read or manipulate the data sources after initialization,
+you may need to call this method again to ensure that the data sources remain aligned.
magenpy/GWADataLoader.py
663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 |
|
load_ld()
+
+¶A utility method to load the LD matrices to memory from on-disk storage.
+ + +perform_gwas(**gwa_kwargs)
+
+¶Perform genome-wide association testing of all variants against the phenotype.
+This is a utility function that calls the .perform_gwas()
method of the
+GenotypeMatrix
objects associated with GWADataLoader.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
gwa_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to the GWA functions. Consult stats.gwa.utils for relevant keyword arguments for each backend. + |
+
+ {}
+ |
+
magenpy/GWADataLoader.py
predict(beta=None)
+
+¶Predict the phenotype for the genotyped samples using the provided effect size
+estimates beta
. For quantitative traits, this is equivalent to performing
+linear scoring. For binary phenotypes, we transform the output using the probit link function.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
beta |
+ + | +
+
+
+ A dictionary where the keys are the chromosome numbers and the values are a vector of effect sizes for each variant on that chromosome. If the betas are not provided, we use the marginal betas by default (if those are available). + |
+
+ None
+ |
+
magenpy/GWADataLoader.py
read_annotations(annot_path, annot_format='magenpy', parser=None, **parse_kwargs)
+
+¶Read the annotation matrix from file. Annotations are a set of features associated
+with each SNP and are generally represented in table format.
+Consult the documentation for AnnotationMatrix
for more details.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
annot_path |
+ + | +
+
+
+ The path to the annotation file(s). The path may contain a wildcard. + |
+ + required + | +
annot_format |
+ + | +
+
+
+ The format for the annotation files. Currently supports the following formats: |
+
+ 'magenpy'
+ |
+
parser |
+ + | +
+
+
+ If the annotation file does not follow any of the formats above, you can create your own parser by inheriting from the base |
+
+ None
+ |
+
parse_kwargs |
+ + | +
+
+
+ keyword arguments for the parser. These are mainly parameters that will be passed to |
+
+ {}
+ |
+
magenpy/GWADataLoader.py
read_covariates(covariates_file, **read_csv_kwargs)
+
+¶Read the covariates file and integrate it with the sample tables and genotype matrices.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
covariates_file |
+ + | +
+
+
+ The path to the covariates file (Default: tab-separated file starting with the |
+ + required + | +
read_csv_kwargs |
+ + | +
+
+
+ keyword arguments for the |
+
+ {}
+ |
+
magenpy/GWADataLoader.py
read_genotypes(bed_paths, keep_samples=None, keep_file=None, extract_snps=None, extract_file=None, min_maf=None, min_mac=1, drop_duplicated=True)
+
+¶Read the genotype matrix and/or associated metadata from plink's BED file format.
+Consult the documentation for GenotypeMatrix
for more details.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bed_paths |
+ + | +
+
+
+ The path to the BED file(s). You may use a wildcard here to read files for multiple chromosomes. + |
+ + required + | +
keep_samples |
+ + | +
+
+
+ A vector or list of sample IDs to keep when filtering the genotype matrix. + |
+
+ None
+ |
+
keep_file |
+ + | +
+
+
+ A path to a plink-style file containing sample IDs to keep. + |
+
+ None
+ |
+
extract_snps |
+ + | +
+
+
+ A vector or list of SNP IDs to keep when filtering the genotype matrix. + |
+
+ None
+ |
+
extract_file |
+ + | +
+
+
+ A path to a plink-style file containing SNP IDs to keep. + |
+
+ None
+ |
+
min_maf |
+ + | +
+
+
+ The minimum minor allele frequency cutoff. + |
+
+ None
+ |
+
min_mac |
+ + | +
+
+
+ The minimum minor allele count cutoff. + |
+
+ 1
+ |
+
drop_duplicated |
+ + | +
+
+
+ If True, drop SNPs with duplicated rsID. + |
+
+ True
+ |
+
magenpy/GWADataLoader.py
374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 |
|
read_ld(ld_store_paths)
+
+¶Read the LD matrix files stored on-disk in Zarr array format.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ld_store_paths |
+ + | +
+
+
+ The path to the LD matrices. This may be a wildcard to accommodate reading data for multiple chromosomes. + |
+ + required + | +
magenpy/GWADataLoader.py
read_phenotype(phenotype_file, drop_na=True, **read_csv_kwargs)
+
+¶Read the phenotype file and integrate it with the sample tables and genotype matrices.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
phenotype_file |
+ + | +
+
+
+ The path to the phenotype file (Default: tab-separated file with |
+ + required + | +
drop_na |
+ + | +
+
+
+ Drop samples with missing phenotype information. + |
+
+ True
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ keyword arguments for the |
+
+ {}
+ |
+
magenpy/GWADataLoader.py
read_summary_statistics(sumstats_path, sumstats_format='magenpy', parser=None, drop_duplicated=True, **parse_kwargs)
+
+¶Read GWAS summary statistics file(s) and parse them to SumstatsTable
objects.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sumstats_path |
+ + | +
+
+
+ The path to the summary statistics file(s). The path may be a wildcard. + |
+ + required + | +
sumstats_format |
+ + | +
+
+
+ The format for the summary statistics. Currently supports the following formats: |
+
+ 'magenpy'
+ |
+
parser |
+ + | +
+
+
+ If the summary statistics file does not follow any of the formats above, you can create your own parser by inheriting from the base |
+
+ None
+ |
+
drop_duplicated |
+ + | +
+
+
+ Drop SNPs with duplicated rsIDs. + |
+
+ True
+ |
+
parse_kwargs |
+ + | +
+
+
+ keyword arguments for the parser. These are mainly parameters that will be passed to |
+
+ {}
+ |
+
magenpy/GWADataLoader.py
505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 |
|
release_ld()
+
+¶A utility function to release the LD matrices from memory.
+ + +score(beta=None, standardize_genotype=False)
+
+¶Perform linear scoring, i.e. multiply the genotype matrix by the vector of effect sizes, beta
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
beta |
+ + | +
+
+
+ A dictionary where the keys are the chromosome numbers and the values are a vector of effect sizes for each variant on that chromosome. If the betas are not provided, we use the marginal betas by default (if those are available). + |
+
+ None
+ |
+
standardize_genotype |
+ + | +
+
+
+ If True, standardize the genotype matrix before scoring. + |
+
+ False
+ |
+
magenpy/GWADataLoader.py
set_phenotype(new_phenotype, phenotype_likelihood=None)
+
+¶A convenience method to update the phenotype column for the samples.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
new_phenotype |
+ + | +
+
+
+ A vector or list of phenotype values. + |
+ + required + | +
phenotype_likelihood |
+ + | +
+
+
+ The phenotype likelihood (e.g. |
+
+ None
+ |
+
magenpy/GWADataLoader.py
split_by_chromosome()
+
+¶A utility method to split a GWADataLoader object by chromosome ID, such that
+we would have one GWADataLoader
object per chromosome. The method returns a dictionary
+where the key is the chromosome number and the value is the GWADataLoader
object corresponding
+to that chromosome only.
magenpy/GWADataLoader.py
split_by_samples(proportions=None, groups=None, keep_original=True)
+
+¶Split the GWADataLoader
object by samples, if genotype or sample data
+is available. The user must provide a list or proportion of samples in each split,
+and the method will return a list of GWADataLoader
objects with only the samples
+designated for each split. This may be a useful utility for training/testing split or some
+other downstream tasks.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
proportions |
+ + | +
+
+
+ A list with the proportion of samples in each split. Must add to 1. + |
+
+ None
+ |
+
groups |
+ + | +
+
+
+ A list of lists containing the sample IDs in each split. + |
+
+ None
+ |
+
keep_original |
+ + | +
+
+
+ If True, keep the original |
+
+ True
+ |
+
magenpy/GWADataLoader.py
sync_sample_tables()
+
+¶A utility method to sync the sample tables of the
+GenotypeMatrix
objects with the sample table under
+the GWADataLoader
object. This is especially important
+when setting new phenotypes (from the simulators) or reading
+covariates files, etc.
magenpy/GWADataLoader.py
to_individual_table()
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A plink-style dataframe of individual IDs, in the form of Family ID (FID) and Individual ID (IID). + |
+
to_phenotype_table()
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A plink-style dataframe with each individual's Family ID (FID), Individual ID (IID), and phenotype value. + |
+
to_snp_table(col_subset=None, per_chromosome=False)
+
+¶Get a dataframe of SNP data for all variants +across different chromosomes.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_subset |
+ + | +
+
+
+ The subset of columns to obtain. + |
+
+ None
+ |
+
per_chromosome |
+ + | +
+
+
+ If True, returns a dictionary where the key is the chromosome number and the value is the SNP table per chromosome. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A dataframe (or dictionary of dataframes) of SNP data. + |
+
magenpy/GWADataLoader.py
to_summary_statistics_table(col_subset=None, per_chromosome=False)
+
+¶Get a dataframe of the GWAS summary statistics for all variants +across different chromosomes.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_subset |
+ + | +
+
+
+ The subset of columns (or summary statistics) to obtain. + |
+
+ None
+ |
+
per_chromosome |
+ + | +
+
+
+ If True, returns a dictionary where the key is the chromosome number and the value is the summary statistics table per chromosome. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A dataframe (or dictionary of dataframes) of summary statistics. + |
+
magenpy/GWADataLoader.py
GenotypeMatrix
+
+
+¶
+ Bases: object
A class to represent a genotype matrix. The genotype matrix is a matrix +where the rows represent samples and the columns represent genetic variants. +In general, genotype matrices are assumed to reside on disk and this class +provides a convenient interface to interact with and perform computations +on the genotype matrix.
+Currently, we assume that the genotype matrix is stored using plink's BED +file format, with associated tables for the samples (i.e. FAM file) and genetic +variants (i.e. BIM file). Classes that inherit from this generic class support +various backends to access and perform computations on this genotype data.
+See Also
+* [xarrayGenotypeMatrix][magenpy.GenotypeMatrix.xarrayGenotypeMatrix]
+* [plinkBEDGenotypeMatrix][magenpy.GenotypeMatrix.plinkBEDGenotypeMatrix]
+
Attributes:
+Name | +Type | +Description | +
---|---|---|
sample_table |
+
+ Union[DataFrame, SampleTable, None]
+ |
+
+
+
+ A table containing information about the samples in the genotype matrix (initially read from the FAM file). + |
+
snp_table |
+
+ Union[DataFrame, None]
+ |
+
+
+
+ A table containing information about the genetic variants in the genotype matrix (initially read from the BIM file). + |
+
bed_file |
+ + | +
+
+
+ The path to the plink BED file containing the genotype matrix. + |
+
_genome_build |
+ + | +
+
+
+ The genome build or assembly under which the SNP coordinates are defined. + |
+
temp_dir |
+ + | +
+
+
+ The directory where temporary files will be stored (if needed). + |
+
cleanup_dir_list |
+ + | +
+
+
+ A list of directories to clean up after execution. + |
+
threads |
+ + | +
+
+
+ The number of threads to use for parallel computations. + |
+
magenpy/GenotypeMatrix.py
9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 
+409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 |
|
a1
+
+
+ property
+
+
+¶See Also
+ +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The effect allele |
+
a2
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The reference allele |
+
alt_allele
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The effect allele |
+
bp_pos
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The basepair position for the genetic variants in the genotype matrix. + |
+
chromosome
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The chromosome associated with the variants in the genotype matrix. Note: This is a convenience method that assumes that the genotype matrix contains variants from a single chromosome. If there are multiple chromosomes, the method will return |
+
chromosomes
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The unique set of chromosomes comprising the genotype matrix. + |
+
cm_pos
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The position of genetic variants in the genotype matrix in units of centimorgans (cM). + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ If the genetic distance is not set in the genotype file. + |
+
effect_allele
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The effect allele |
+
genome_build
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The genome build or assembly under which the SNP coordinates are defined. + |
+
m
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants in the genotype matrix. + |
+
maf
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The minor allele frequency (MAF) of each variant in the genotype matrix. + |
+
maf_var
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The variance in minor allele frequency (MAF) of each variant in the genotype matrix. + |
+
n
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The sample size or number of individuals in the genotype matrix. + |
+
n_per_snp
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ Sample size per genetic variant (accounting for potential missing values). + |
+
n_snps
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants in the genotype matrix. + |
+
ref_allele
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The reference allele |
+
sample_size
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The sample size or number of individuals in the genotype matrix. + |
+
samples
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ An array of sample IDs in the genotype matrix. + |
+
shape
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The shape of the genotype matrix. Rows correspond to the number of samples and columns to the number of SNPs. + |
+
snps
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The SNP rsIDs for variants in the genotype matrix. + |
+
__init__(sample_table=None, snp_table=None, temp_dir='temp', bed_file=None, genome_build=None, threads=1, **kwargs)
+
+¶Initialize a GenotypeMatrix object.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sample_table |
+
+ Union[DataFrame, SampleTable, None]
+ |
+
+
+
+ A table containing information about the samples in the genotype matrix. + |
+
+ None
+ |
+
snp_table |
+
+ Union[DataFrame, None]
+ |
+
+
+
+ A table containing information about the genetic variants in the genotype matrix. + |
+
+ None
+ |
+
temp_dir |
+
+ str
+ |
+
+
+
+ The directory where temporary files will be stored (if needed). + |
+
+ 'temp'
+ |
+
bed_file |
+
+ str
+ |
+
+
+
+ The path to the plink BED file containing the genotype matrix. + |
+
+ None
+ |
+
genome_build |
+ + | +
+
+
+ The genome build or assembly under which the SNP coordinates are defined. + |
+
+ None
+ |
+
threads |
+ + | +
+
+
+ The number of threads to use for parallel computations. + |
+
+ 1
+ |
+
kwargs |
+ + | +
+
+
+ Additional keyword arguments. + |
+
+ {}
+ |
+
magenpy/GenotypeMatrix.py
cleanup()
+
+¶Clean up all temporary files and directories
+ + +compute_allele_frequency()
+
+¶Compute the allele frequency of each variant or SNP in the genotype matrix.
+ + + +Raises:
+Type | +Description | +
---|---|
+ NotImplementedError
+ |
+
+
+
+ If the method is not implemented in the subclass. + |
+
magenpy/GenotypeMatrix.py
compute_ld(estimator, output_dir, dtype='int16', compressor_name='lz4', compression_level=5, **ld_kwargs)
+
+¶Compute the Linkage-Disequilibrium (LD) or SNP-by-SNP correlation matrix +for the variants defined in the genotype matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
estimator |
+ + | +
+
+
+ The estimator for the LD matrix. We currently support 4 different estimators: |
+ + required + | +
output_dir |
+ + | +
+
+
+ The output directory where the Zarr array containing the entries of the LD matrix will be stored. + |
+ + required + | +
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix (supported data types are float32, float64 and integer quantized data types int8 and int16). + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor to use for the Zarr array. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level for the Zarr array (1-9). + |
+
+ 5
+ |
+
ld_kwargs |
+ + | +
+
+
+ keyword arguments for the various LD estimators. Consult the implementations of |
+
+ {}
+ |
+
magenpy/GenotypeMatrix.py
compute_sample_size_per_snp()
+
+¶Compute the sample size for each variant in the genotype matrix, accounting for +potential missing values.
+ + + +Raises:
+Type | +Description | +
---|---|
+ NotImplementedError
+ |
+
+
+
+ If the method is not implemented in the subclass. + |
+
magenpy/GenotypeMatrix.py
drop_duplicated_snps()
+
+¶A convenience method to drop variants with duplicated SNP rsIDs.
+ +magenpy/GenotypeMatrix.py
estimate_memory_allocation(dtype=np.float32)
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ An estimate of the memory allocation for the genotype matrix in megabytes. + |
+
filter_by_allele_frequency(min_maf=None, min_mac=1)
+
+¶Filter variants by minimum minor allele frequency or allele count cutoffs.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
min_maf |
+ + | +
+
+
+ Minimum minor allele frequency + |
+
+ None
+ |
+
min_mac |
+ + | +
+
+
+ Minimum minor allele count (1 by default) + |
+
+ 1
+ |
+
magenpy/GenotypeMatrix.py
filter_samples(keep_samples=None, keep_file=None)
+
+¶Filter samples from the genotype matrix. User must specify +either a list of samples to keep or the path to a plink-style file +with the list of samples to keep.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_samples |
+ + | +
+
+
+ A list (or array) of sample IDs to keep in the genotype matrix. + |
+
+ None
+ |
+
keep_file |
+ + | +
+
+
+ The path to a file with the list of samples to keep. + |
+
+ None
+ |
+
magenpy/GenotypeMatrix.py
filter_snps(extract_snps=None, extract_file=None)
+
+¶Filter variants from the genotype matrix. User must specify +either a list of variants to extract or the path to a plink-style file +with the list of variants to extract.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
extract_snps |
+ + | +
+
+
+ A list (or array) of SNP IDs to keep in the genotype matrix. + |
+
+ None
+ |
+
extract_file |
+ + | +
+
+
+ The path to a file with the list of variants to extract. + |
+
+ None
+ |
+
magenpy/GenotypeMatrix.py
from_file(file_path, temp_dir='temp', **kwargs)
+
+
+ classmethod
+
+
+¶Initialize a genotype matrix object by passing a file path + other keyword arguments.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file_path |
+
+ str
+ |
+
+
+
+ The path to the plink BED file. + |
+ + required + | +
temp_dir |
+
+ str
+ |
+
+
+
+ The directory where temporary files will be stored. + |
+
+ 'temp'
+ |
+
kwargs |
+ + | +
+
+
+ Additional keyword arguments. + |
+
+ {}
+ |
+
magenpy/GenotypeMatrix.py
get_snp_attribute(attr)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
attr |
+ + | +
+
+
+ The name of the attribute to extract from the SNP table. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The values of a specific attribute for each variant in the genotype matrix. + |
+
magenpy/GenotypeMatrix.py
get_snp_table(col_subset=None)
+
+¶A convenience method to extract SNP-related information from the genotype matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_subset |
+ + | +
+
+
+ A list of columns to extract from the SNP table. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A |
+
magenpy/GenotypeMatrix.py
perform_gwas(**gwa_kwargs)
+
+¶Perform genome-wide association testing of all variants against the phenotype.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
gwa_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to the GWA functions. Consult |
+
+ {}
+ |
+
Raises:
+Type | +Description | +
---|---|
+ NotImplementedError
+ |
+
+
+
+ If the method is not implemented in the subclass. + |
+
magenpy/GenotypeMatrix.py
score(beta, standardize_genotype=False)
+
+¶Perform linear scoring, i.e. multiply the genotype matrix by the vector of effect sizes, beta
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
beta |
+ + | +
+
+
+ A vector of effect sizes for each variant in the genotype matrix. + |
+ + required + | +
standardize_genotype |
+ + | +
+
+
+ If True, standardize the genotype matrix when computing the score. + |
+
+ False
+ |
+
magenpy/GenotypeMatrix.py
set_sample_table(sample_table)
+
+¶A convenience method to set the sample table for the genotype matrix. +This may be useful for syncing sample tables across different genotype matrices +corresponding to different chromosomes or genomic regions.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sample_table |
+ + | +
+
+
+ An instance of SampleTable or a pandas dataframe containing information about the samples in the genotype matrix. + |
+ + required + | +
magenpy/GenotypeMatrix.py
split_by_chromosome()
+
+¶Split the genotype matrix by chromosome, so that we would
+have a separate GenotypeMatrix
object for each chromosome.
+This method returns a dictionary where the key is the chromosome number
+and the value is an object of GenotypeMatrix
for that chromosome.
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A dictionary of |
+
magenpy/GenotypeMatrix.py
plinkBEDGenotypeMatrix
+
+
+¶
+ Bases: GenotypeMatrix
A class that defines methods and interfaces for interacting with genotype matrices
+using plink2
software. This class provides a convenient interface to perform various
+computations on genotype matrices stored in the plink BED format.
This class inherits all the attributes of the GenotypeMatrix
class.
magenpy/GenotypeMatrix.py
783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 +822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 |
|
__init__(sample_table=None, snp_table=None, temp_dir='temp', bed_file=None, genome_build=None, threads=1)
+
+¶Initialize a plinkBEDGenotypeMatrix
object.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sample_table |
+ + | +
+
+
+ A table containing information about the samples in the genotype matrix. + |
+
+ None
+ |
+
snp_table |
+ + | +
+
+
+ A table containing information about the genetic variants in the genotype matrix. + |
+
+ None
+ |
+
temp_dir |
+ + | +
+
+
+ The directory where temporary files will be stored (if needed). + |
+
+ 'temp'
+ |
+
bed_file |
+ + | +
+
+
+ The path to the plink BED file containing the genotype matrix. + |
+
+ None
+ |
+
genome_build |
+ + | +
+
+
+ The genome build or assembly under which the SNP coordinates are defined. + |
+
+ None
+ |
+
threads |
+ + | +
+
+
+ The number of threads to use for parallel computations. + |
+
+ 1
+ |
+
magenpy/GenotypeMatrix.py
compute_allele_frequency()
+
+¶Compute the allele frequency of each variant or SNP in the genotype matrix.
+This method calls specialized functions that, in turn, call plink2
to compute
+allele frequency.
magenpy/GenotypeMatrix.py
compute_sample_size_per_snp()
+
+¶Compute the sample size for each variant in the genotype matrix, accounting for +potential missing values.
+This method calls specialized functions that, in turn, call plink2
to compute sample
+size per variant.
magenpy/GenotypeMatrix.py
from_file(file_path, temp_dir='temp', **kwargs)
+
+
+ classmethod
+
+
+¶A convenience method to create a plinkBEDGenotypeMatrix
object by
+ providing a path to a PLINK BED file.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file_path |
+ + | +
+
+
+ The path to the plink BED file. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ The directory where temporary files will be stored. + |
+
+ 'temp'
+ |
+
kwargs |
+ + | +
+
+
+ Additional keyword arguments. + |
+
+ {}
+ |
+
magenpy/GenotypeMatrix.py
perform_gwas(**gwa_kwargs)
+
+¶Perform genome-wide association testing of all variants against the phenotype.
+This method calls specialized functions that, in turn, call plink2
to perform
+the association testing.
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A Summary statistics table containing the results of the association testing. + |
+
magenpy/GenotypeMatrix.py
score(beta, standardize_genotype=False)
+
+¶Perform linear scoring on the genotype matrix. This function takes a vector (or matrix) of +effect sizes and returns the matrix-vector or matrix-matrix product of the genotype matrix +multiplied by the effect sizes.
+This can be used for polygenic score calculation or projecting the genotype matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
beta |
+ + | +
+
+
+ A vector or matrix of effect sizes for each variant in the genotype matrix. + |
+ + required + | +
standardize_genotype |
+ + | +
+
+
+ If True, standardize the genotype when computing the polygenic score. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The polygenic score (PGS) for each sample in the genotype matrix. + |
+
magenpy/GenotypeMatrix.py
split_by_chromosome()
+
+¶Split the genotype matrix by chromosome.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A dictionary of |
+
magenpy/GenotypeMatrix.py
xarrayGenotypeMatrix
+
+
+¶
+ Bases: GenotypeMatrix
A class that defines methods and interfaces for interacting with genotype matrices
+using the xarray
library. In particular, the class leverages functionality provided by
+the pandas-plink
package to represent on-disk genotype matrices as chunked multidimensional
+arrays that can be queried and manipulated efficiently and in parallel.
This class inherits all the attributes of the GenotypeMatrix
class.
Attributes:
+Name | +Type | +Description | +
---|---|---|
xr_mat |
+ + | +
+
+
+ The |
+
magenpy/GenotypeMatrix.py
550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 |
|
__init__(sample_table=None, snp_table=None, bed_file=None, temp_dir='temp', xr_mat=None, genome_build=None, threads=1)
+
+¶Initialize an xarrayGenotypeMatrix object.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sample_table |
+ + | +
+
+
+ A table containing information about the samples in the genotype matrix. + |
+
+ None
+ |
+
snp_table |
+ + | +
+
+
+ A table containing information about the genetic variants in the genotype matrix. + |
+
+ None
+ |
+
bed_file |
+ + | +
+
+
+ The path to the plink BED file containing the genotype matrix. + |
+
+ None
+ |
+
temp_dir |
+ + | +
+
+
+ The directory where temporary files will be stored (if needed). + |
+
+ 'temp'
+ |
+
xr_mat |
+ + | +
+
+
+ The xarray object representing the genotype matrix. + |
+
+ None
+ |
+
genome_build |
+ + | +
+
+
+ The genome build or assembly under which the SNP coordinates are defined. + |
+
+ None
+ |
+
threads |
+ + | +
+
+
+ The number of threads to use for parallel computations. + |
+
+ 1
+ |
+
magenpy/GenotypeMatrix.py
compute_allele_frequency()
+
+¶A convenience method that calls specialized utility functions that +compute the allele frequency of each variant or SNP in the genotype matrix.
+ +magenpy/GenotypeMatrix.py
compute_sample_size_per_snp()
+
+¶A convenience method that calls specialized utility functions that compute +the sample size for each variant in the genotype matrix, accounting for +potential missing values.
+ +magenpy/GenotypeMatrix.py
filter_samples(keep_samples=None, keep_file=None)
+
+¶Filter samples from the genotype matrix. +User must specify either a list of samples to keep or the path to a file with the list of samples to keep.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_samples |
+ + | +
+
+
+ A list (or array) of sample IDs to keep in the genotype matrix. + |
+
+ None
+ |
+
keep_file |
+ + | +
+
+
+ The path to a file with the list of samples to keep. + |
+
+ None
+ |
+
magenpy/GenotypeMatrix.py
filter_snps(extract_snps=None, extract_file=None)
+
+¶Filter variants from the genotype matrix. User must specify either a list of variants to +extract or the path to a file with the list of variants to extract.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
extract_snps |
+ + | +
+
+
+ A list or array of SNP rsIDs to keep in the genotype matrix. + |
+
+ None
+ |
+
extract_file |
+ + | +
+
+
+ The path to a file with the list of variants to extract. + |
+
+ None
+ |
+
magenpy/GenotypeMatrix.py
from_file(file_path, temp_dir='temp', **kwargs)
+
+
+ classmethod
+
+
+¶Create a GenotypeMatrix object using a PLINK BED file with the help
+of the data structures defined in pandas_plink
. The genotype matrix
+will be represented implicitly in an xarray
object, and we will use it
+to perform various computations. This method is a utility function to
+construct the genotype matrix object from a plink BED file.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file_path |
+ + | +
+
+
+ Path to the plink BED file. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ The directory where the temporary files will be stored. + |
+
+ 'temp'
+ |
+
kwargs |
+ + | +
+
+
+ Additional keyword arguments. + |
+
+ {}
+ |
+
magenpy/GenotypeMatrix.py
perform_gwas(**gwa_kwargs)
+
+¶A convenience method that calls specialized utility functions that perform +genome-wide association testing of all variants against the phenotype.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A Summary statistics table containing the results of the association testing. + |
+
magenpy/GenotypeMatrix.py
score(beta, standardize_genotype=False, skip_na=True)
+
+¶Perform linear scoring on the genotype matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
beta |
+ + | +
+
+
+ A vector or matrix of effect sizes for each variant in the genotype matrix. + |
+ + required + | +
standardize_genotype |
+ + | +
+
+
+ If True, standardize the genotype when computing the polygenic score. + |
+
+ False
+ |
+
skip_na |
+ + | +
+
+
+ If True, skip missing values when computing the polygenic score. + |
+
+ True
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The polygenic score (PGS) for each sample in the genotype matrix. + |
+
magenpy/GenotypeMatrix.py
set_sample_table(sample_table)
+
+¶A convenience method to set the sample table for the genotype matrix. +This is useful for cases when we need to sync the sample table across chromosomes.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sample_table |
+ + | +
+
+
+ An instance of SampleTable or a pandas dataframe containing information about the samples in the genotype matrix. + |
+ + required + | +
magenpy/GenotypeMatrix.py
split_by_chromosome()
+
+¶Split the genotype matrix by chromosome.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A dictionary of |
+
magenpy/GenotypeMatrix.py
to_csr(dtype=np.int8)
+
+¶Convert the genotype matrix to a scipy sparse CSR matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
dtype |
+ + | +
+
+
+ The data type of the scipy array. Default: Int8 + |
+
+ int8
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A |
+
magenpy/GenotypeMatrix.py
to_numpy(dtype=np.int8)
+
+¶Convert the genotype matrix to a numpy array.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
dtype |
+ + | +
+
+
+ The data type of the numpy array. Default: Int8 + |
+
+ int8
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A numpy array representation of the genotype matrix. + |
+
magenpy/GenotypeMatrix.py
+ Bases: object
A class that represents Linkage-Disequilibrium (LD) matrices, which record
+the SNP-by-SNP pairwise correlations in a sample of genetic data. The class
+provides various functionalities for initializing, storing, loading, and
+performing computations with LD matrices. The LD matrices are stored in a
+hierarchical format using the Zarr
library, which allows for efficient
+storage and retrieval of the data.
The class provides the following functionalities:
+LDMatrix
object from plink's LD table files.LDMatrix
object from a sparse CSR matrix.LDMatrix
object from a Zarr array store.The Zarr hierarchy is structured as follows:
+chr_22.zarr
: The Zarr group.matrix
: The subgroup containing the data of the LD matrix in Scipy Sparse CSR matrix format.data
: The array containing the non-zero entries of the LD matrix.indptr
: The array containing the index pointers for the CSR matrix.metadata
: The subgroup containing the metadata for variants included in the LD matrix.snps
: The array containing the SNP rsIDs.a1
: The array containing the alternative alleles.a2
: The array containing the reference alleles.maf
: The array containing the minor allele frequencies.bp
: The array containing the base pair positions.cm
: The array containing the centi Morgan positions.ldscore
: The array containing the LD scores.attrs
: A JSON-style metadata object containing general information about how the LD matrix
+was calculated, including the chromosome number, sample size, genome build, LD estimator,
+and estimator properties.Attributes:
+Name | +Type | +Description | +
---|---|---|
_zg |
+ + | +
+
+
+ The Zarr group object that stores the LD matrix and its metadata. + |
+
_mat |
+ + | +
+
+
+ The in-memory CSR matrix object. + |
+
in_memory |
+ + | +
+
+
+ A boolean flag indicating whether the LD matrix is in memory. + |
+
is_symmetric |
+ + | +
+
+
+ A boolean flag indicating whether the LD matrix is symmetric. + |
+
index |
+ + | +
+
+
+ An integer index for the current SNP in the LD matrix (useful for iterators). + |
+
_mask |
+ + | +
+
+
+ A boolean mask for filtering the LD matrix. + |
+
magenpy/LDMatrix.py
|
|
a1
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The alternative alleles of the variants included in the LD matrix. + |
+
a2
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The reference alleles of the variants included in the LD matrix. + |
+
bp_position
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The base pair position of each SNP in the LD matrix. + |
+
chromosome
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The chromosome for which this LD matrix was calculated. + |
+
chunk_size
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The chunk size for the data array of the LD matrix. + |
+
chunks
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The chunks for the data array of the LD matrix. + |
+
cm_position
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The centimorgan (cM) position of each variant in the LD matrix. + |
+
compressor
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The |
+
csr_matrix
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The in-memory CSR matrix object. Note: if the LD matrix is not in memory, it will be loaded using default settings. + |
+
data
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The |
+
dtype
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The data type for the entries of the |
+
estimator_properties
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The properties of the LD estimator used to compute the LD matrix. + |
+
genome_build
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The genome build based on which the base pair coordinates are defined. + |
+
indices
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The column indices of the non-zero elements of the sparse, CSR representation of the LD matrix. + |
+
indptr
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The index pointers |
+
ld_boundaries
+
+
+ property
+
+
+¶The LD boundaries associated with each variant.
+The LD boundaries are defined as the index of the leftmost neighbor
+(lower boundary) and the rightmost neighbor (upper boundary) for each variant.
+If the LD matrix is upper triangular, then the boundaries for variant i
go from i + 1
to i + k_i
,
+where k_i
is the number of neighbors that SNP i
is in LD with.
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A matrix of shape |
+
ld_estimator
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The LD estimator used to compute the LD matrix. Examples include: |
+
ld_score
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The LD score of each variant in the LD matrix. + |
+
maf
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The minor allele frequency (MAF) of the alternative allele (A1) in the LD matrix. + |
+
n_neighbors
+
+
+ property
+
+
+¶The number of variants in the LD window for each SNP.
+See Also
+Note
+This includes the variant itself if the matrix is in memory and is symmetric.
+n_snps
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants in the LD matrix. If the matrix is loaded and filtered, we return the number of variants remaining after applying the filter. + |
+
row_indices
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The row indices of the non-zero elements of the sparse, CSR representation of the LD matrix + |
+
sample_size
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The sample size used to compute the LD matrix. + |
+
shape
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The shape of the square LD matrix. + |
+
snps
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ rsIDs of the variants included in the LD matrix. + |
+
store
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The Zarr group store object. + |
+
stored_dtype
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The data type for the stored entries of |
+
stored_n_snps
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants stored in the LD matrix (irrespective of any masks / filters). + |
+
stored_shape
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The shape of the stored LD matrix (irrespective of any masks / filters). + |
+
window_size
+
+
+ property
+
+
+¶See Also
+Note
+This includes the variant itself if the matrix is in memory and is symmetric.
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants in the LD window for each SNP. + |
+
zarr_group
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The Zarr group object that stores the LD matrix and its metadata. + |
+
__init__(zarr_group, symmetric=False)
+
+¶Initialize an LDMatrix
object from a Zarr group store.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
zarr_group |
+ + | +
+
+
+ The Zarr group object that stores the LD matrix. + |
+ + required + | +
symmetric |
+ + | +
+
+
+ A boolean flag indicating whether to represent the LD matrix as symmetric. + |
+
+ False
+ |
+
magenpy/LDMatrix.py
__iter__()
+
+¶TODO: Add a flag to allow for chunked iterator, with limited memory footprint.
+ + +compute_ld_scores(annotation_matrix=None, corrected=True, chunk_size=10000)
+
+¶Computes the LD scores for variants in the LD matrix. LD Scores are defined +as the sum of the squared pairwise Pearson Correlation coefficient between the focal SNP and +all its neighboring SNPs. See Bulik-Sullivan et al. (2015) for details.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
annotation_matrix |
+ + | +
+
+
+ A matrix of annotations for each variant for which to aggregate the LD scores. + |
+
+ None
+ |
+
corrected |
+ + | +
+
+
+ Use the sample-size corrected estimator for the squared Pearson correlation coefficient. See Bulik-Sullivan et al. (2015). + |
+
+ True
+ |
+
chunk_size |
+ + | +
+
+
+ Specify the number of rows (i.e. SNPs) to compute the LD scores for simultaneously. Smaller chunk sizes should require less memory resources. If set to None, we compute LD scores for all SNPs in the LD matrix in one go. + |
+
+ 10000
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ An array of LD scores for each variant in the LD matrix. + |
+
magenpy/LDMatrix.py
871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 |
|
dot(vec)
+
+¶Multiply the LD matrix with an input vector vec
.
See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The product of the LD matrix with the input vector. + |
+
magenpy/LDMatrix.py
estimate_uncompressed_size(dtype=None)
+
+¶Provide an estimate of the size of the uncompressed LD matrix in megabytes (MB). +This is only a rough estimate. Depending on how the LD matrix is loaded, the actual size +may be much larger than this estimate.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The estimated size of the uncompressed LD matrix in MB. + |
+
magenpy/LDMatrix.py
filter_snps(extract_snps=None, extract_file=None)
+
+¶Filter the LDMatrix to keep a subset of variants. This mainly sets +the mask for the LD matrix, which is used to hide/remove some SNPs from the LD matrix, +without altering the stored objects on-disk.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
extract_snps |
+ + | +
+
+
+ A list or array of SNP rsIDs to keep. + |
+
+ None
+ |
+
extract_file |
+ + | +
+
+
+ A plink-style file containing the SNP rsIDs to keep. + |
+
+ None
+ |
+
magenpy/LDMatrix.py
from_csr(csr_mat, store_path, overwrite=False, dtype='int16', compressor_name='lz4', compression_level=5)
+
+
+ classmethod
+
+
+¶Initialize an LDMatrix object from a sparse CSR matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
csr_mat |
+ + | +
+
+
+ The sparse CSR matrix. + |
+ + required + | +
store_path |
+ + | +
+
+
+ The path to the Zarr LD store where the data will be stored. + |
+ + required + | +
overwrite |
+ + | +
+
+
+ If True, it overwrites the LD store at |
+
+ False
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix (supported data types are float32, float64 and integer quantized data types int8 and int16). + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor or compression algorithm to use with Zarr. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use with the compressor (1-9). + |
+
+ 5
+ |
+
magenpy/LDMatrix.py
from_dense_zarr_matrix(dense_zarr, ld_boundaries, store_path, overwrite=False, delete_original=False, dtype='int16', compressor_name='lz4', compression_level=5)
+
+
+ classmethod
+
+
+¶Initialize a new LD matrix object using a Zarr array object. This method is +useful for converting a dense LD matrix computed using Dask (or other distributed computing +software) to a sparse or banded one.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
dense_zarr |
+ + | +
+
+
+ The path to the dense Zarr array object. + |
+ + required + | +
ld_boundaries |
+ + | +
+
+
+ The LD boundaries for each SNP in the LD matrix (delineates the indices of the leftmost and rightmost neighbors of each SNP). + |
+ + required + | +
store_path |
+ + | +
+
+
+ The path where to store the new LD matrix. + |
+ + required + | +
overwrite |
+ + | +
+
+
+ If True, it overwrites the LD store at |
+
+ False
+ |
+
delete_original |
+ + | +
+
+
+ If True, it deletes the original dense LD matrix. + |
+
+ False
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix (supported data types are float32, float64 and integer quantized data types int8 and int16). + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor or compression algorithm to use with Zarr. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use with the compressor (1-9). + |
+
+ 5
+ |
+
magenpy/LDMatrix.py
252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 |
|
from_dir(ld_store_path)
+
+
+ classmethod
+
+
+¶Initialize an LDMatrix
object from a Zarr array store.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ld_store_path |
+ + | +
+
+
+ The path to the Zarr array store on the filesystem. See also: from_path. + |
+ + required + | +
magenpy/LDMatrix.py
from_path(ld_store_path)
+
+
+ classmethod
+
+
+¶Initialize an LDMatrix
object from a pre-computed Zarr group store.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ld_store_path |
+ + | +
+
+
+ The path to the Zarr array store on the filesystem. See also: from_dir. + |
+ + required + | +
magenpy/LDMatrix.py
from_plink_table(plink_ld_file, snps, store_path, pandas_chunksize=None, overwrite=False, dtype='int16', compressor_name='lz4', compression_level=5)
+
+
+ classmethod
+
+
+¶Construct a Zarr LD matrix using output tables from plink1.9. +This class method takes the following inputs:
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
plink_ld_file |
+ + | +
+
+
+ The path to the plink LD table file. + |
+ + required + | +
snps |
+ + | +
+
+
+ An iterable containing the list of SNPs in the LD matrix. + |
+ + required + | +
store_path |
+ + | +
+
+
+ The path to the Zarr LD store. + |
+ + required + | +
pandas_chunksize |
+ + | +
+
+
+ If the LD table is large, provide chunk size (i.e. number of rows to process at each step) to keep memory footprint manageable. + |
+
+ None
+ |
+
overwrite |
+ + | +
+
+
+ If True, it overwrites the LD store at |
+
+ False
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix (supported data types are float32, float64 and integer quantized data types int8 and int16). + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor or compression algorithm to use with Zarr. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use with the compressor (1-9). + |
+
+ 5
+ |
+
magenpy/LDMatrix.py
164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 |
|
from_ragged_zarr_matrix(ragged_zarr, store_path, overwrite=False, delete_original=False, dtype='int16', compressor_name='lz4', compression_level=5)
+
+
+ classmethod
+
+
+¶Initialize a new LD matrix object using a Zarr array object +conforming to the old LD Matrix format from magenpy v<=0.0.12.
+This utility function will also copy some of the stored attributes +associated with the matrix in the old format.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ragged_zarr |
+ + | +
+
+
+ The path to the ragged Zarr array object. + |
+ + required + | +
store_path |
+ + | +
+
+
+ The path where to store the new LD matrix. + |
+ + required + | +
overwrite |
+ + | +
+
+
+ If True, it overwrites the LD store at |
+
+ False
+ |
+
delete_original |
+ + | +
+
+
+ If True, it deletes the original ragged LD matrix. + |
+
+ False
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix (supported data types are float32, float64 and integer quantized data types int8 and int16). + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor or compression algorithm to use with Zarr. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use with the compressor (1-9). + |
+
+ 5
+ |
+
magenpy/LDMatrix.py
350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 |
|
get_mask()
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The mask (a boolean flag array) used to hide/remove some SNPs from the LD matrix. + |
+
get_metadata(key, apply_mask=True)
+
+¶Get the metadata associated with each variant in the LD matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
key |
+ + | +
+
+
+ The key for the metadata item. + |
+ + required + | +
apply_mask |
+ + | +
+
+
+ If True, apply the mask (e.g. filter) to the metadata. + |
+
+ True
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The metadata item for each variant in the LD matrix. + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ if the metadata item is not set. + |
+
magenpy/LDMatrix.py
get_row(index, return_indices=False)
+
+¶Extract a single row from the LD matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
index |
+ + | +
+
+
+ The index of the row to extract. + |
+ + required + | +
return_indices |
+ + | +
+
+
+ If True, return the indices of the non-zero elements of that row. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The requested row of the LD matrix. + |
+
magenpy/LDMatrix.py
get_store_attr(attr)
+
+¶Get the attribute or metadata attr
associated with the LD matrix.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
attr |
+ + | +
+
+
+ The attribute name. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The value for the attribute. + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ if the attribute is not set. + |
+
magenpy/LDMatrix.py
load(force_reload=False, return_symmetric=True, fill_diag=True, dtype=None)
+
+¶Load the LD matrix from on-disk storage in the form of Zarr arrays to memory, +in the form of sparse CSR matrices.
+See Also
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
force_reload |
+ + | +
+
+
+ If True, it will reload the data even if it is already in memory. + |
+
+ False
+ |
+
return_symmetric |
+ + | +
+
+
+ If True, return a full symmetric representation of the LD matrix. + |
+
+ True
+ |
+
fill_diag |
+ + | +
+
+
+ If True, fill the diagonal elements of the LD matrix with ones. + |
+
+ True
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The LD matrix as a sparse CSR matrix. + |
+
magenpy/LDMatrix.py
1295 +1296 +1297 +1298 +1299 +1300 +1301 +1302 +1303 +1304 +1305 +1306 +1307 +1308 +1309 +1310 +1311 +1312 +1313 +1314 +1315 +1316 +1317 +1318 +1319 +1320 +1321 +1322 +1323 +1324 +1325 +1326 +1327 +1328 +1329 +1330 +1331 +1332 +1333 +1334 +1335 +1336 +1337 +1338 +1339 +1340 +1341 +1342 +1343 +1344 +1345 +1346 +1347 +1348 +1349 |
|
load_rows(start_row=None, end_row=None, return_symmetric=False, fill_diag=False, keep_shape=True, dtype=None)
+
+¶A utility function to allow for loading a subset of the LD matrix.
+By specifying start_row
and end_row
, the user can process or inspect small
+blocks of the LD matrix without loading the whole thing into memory.
TODO: Consider using low_memory_load
internally to avoid reconstructing the indices
array.
Note
+This method does not perform any filtering on the stored data.
+To access the LD matrix with filtering, use .load()
or low_memory_load
.
See Also
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
start_row |
+ + | +
+
+
+ The start row to load to memory + |
+
+ None
+ |
+
end_row |
+ + | +
+
+
+ The end row (not inclusive) to load to memory + |
+
+ None
+ |
+
return_symmetric |
+ + | +
+
+
+ If True, return a full symmetric representation of the LD matrix. + |
+
+ False
+ |
+
fill_diag |
+ + | +
+
+
+ If True, fill the diagonal of the LD matrix with ones. + |
+
+ False
+ |
+
keep_shape |
+ + | +
+
+
+ If True, return the LD matrix with the same shape as the original. Here, entries that are outside the requested start_row:end_row region will be zeroed out. + |
+
+ True
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The requested sub-matrix of the LD matrix. + |
+
magenpy/LDMatrix.py
1145 +1146 +1147 +1148 +1149 +1150 +1151 +1152 +1153 +1154 +1155 +1156 +1157 +1158 +1159 +1160 +1161 +1162 +1163 +1164 +1165 +1166 +1167 +1168 +1169 +1170 +1171 +1172 +1173 +1174 +1175 +1176 +1177 +1178 +1179 +1180 +1181 +1182 +1183 +1184 +1185 +1186 +1187 +1188 +1189 +1190 +1191 +1192 +1193 +1194 +1195 +1196 +1197 +1198 +1199 +1200 +1201 +1202 +1203 +1204 +1205 +1206 +1207 +1208 +1209 +1210 +1211 +1212 +1213 +1214 +1215 +1216 +1217 +1218 +1219 +1220 +1221 +1222 +1223 +1224 +1225 +1226 +1227 +1228 +1229 +1230 +1231 +1232 +1233 +1234 +1235 +1236 +1237 +1238 +1239 +1240 +1241 +1242 +1243 +1244 +1245 +1246 +1247 +1248 +1249 +1250 +1251 +1252 +1253 +1254 +1255 +1256 +1257 +1258 +1259 +1260 +1261 +1262 +1263 +1264 +1265 +1266 +1267 +1268 +1269 +1270 +1271 +1272 +1273 +1274 +1275 +1276 +1277 +1278 +1279 +1280 +1281 +1282 +1283 +1284 +1285 +1286 +1287 +1288 +1289 +1290 +1291 +1292 +1293 |
|
low_memory_load(dtype=None)
+
+¶A utility method to load the LD matrix in low-memory mode.
+The method will load the entries of the upper triangular portion of the matrix,
+perform filtering based on the mask (if set), and return the filtered data
+and index pointer (indptr
) arrays.
This is useful for some applications, such as the low_memory
version of
+the viprs
method, because it avoids reconstructing the indices
array for the CSR matrix,
+which can potentially be a very long array of large integers.
Note
+The method, by construction, does not support loading the full symmetric matrix. If
+that's the goal, use the .load()
or .load_rows()
methods.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A tuple of the data and index pointer arrays for the LD matrix. + |
+
magenpy/LDMatrix.py
1080 +1081 +1082 +1083 +1084 +1085 +1086 +1087 +1088 +1089 +1090 +1091 +1092 +1093 +1094 +1095 +1096 +1097 +1098 +1099 +1100 +1101 +1102 +1103 +1104 +1105 +1106 +1107 +1108 +1109 +1110 +1111 +1112 +1113 +1114 +1115 +1116 +1117 +1118 +1119 +1120 +1121 +1122 +1123 +1124 +1125 +1126 +1127 +1128 +1129 +1130 +1131 +1132 +1133 +1134 +1135 +1136 +1137 +1138 +1139 +1140 +1141 +1142 +1143 |
|
multiply(vec)
+
+¶Multiply the LD matrix with an input vector vec
.
See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The product of the LD matrix with the input vector. + |
+
magenpy/LDMatrix.py
release()
+
+¶set_mask(mask)
+
+¶Set the mask (a boolean array) to hide/remove some SNPs from the LD matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
mask |
+ + | +
+
+
+ An array of indices or boolean mask for SNPs to retain. + |
+ + required + | +
magenpy/LDMatrix.py
set_metadata(key, value, overwrite=False)
+
+¶Set the metadata field associated with variants in the LD matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
key |
+ + | +
+
+
+ The key for the metadata item. + |
+ + required + | +
value |
+ + | +
+
+
+ The value for the metadata item (an array with the same length as the number of variants). + |
+ + required + | +
overwrite |
+ + | +
+
+
+ If True, overwrite the metadata item if it already exists. + |
+
+ False
+ |
+
magenpy/LDMatrix.py
set_store_attr(attr, value)
+
+¶Set the attribute attr
associated with the LD matrix. This is used
+to set high-level information, such as information about the sample from which
+the matrix was computed, the LD estimator used, its properties, etc.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
attr |
+ + | +
+
+
+ The attribute name. + |
+ + required + | +
value |
+ + | +
+
+
+ The value for the attribute. + |
+ + required + | +
magenpy/LDMatrix.py
to_snp_table(col_subset=None)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_subset |
+ + | +
+
+
+ The subset of columns to add to the table. If None, it returns all available columns. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A |
+
magenpy/LDMatrix.py
update_rows_inplace(new_csr, start_row=None, end_row=None)
+
+¶A utility function to perform partial updates to a subset of rows in the
+LD matrix. The function takes a new CSR matrix and, optionally, a start
+and end row delimiting the chunk of the LD matrix to update with the new_csr
.
Note
+Current implementation assumes that the update does not change the sparsity +structure of the original matrix. Updating the matrix with new sparsity structure +is a harder problem that we will try to tackle later on.
+Note
+Current implementation assumes new_csr
is upper triangular.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
new_csr |
+ + | +
+
+
+ A sparse CSR matrix ( |
+ + required + | +
start_row |
+ + | +
+
+
+ The start row for the chunk to update. + |
+
+ None
+ |
+
end_row |
+ + | +
+
+
+ The end row for the chunk to update. + |
+
+ None
+ |
+
Raises:
+Type | +Description | +
---|---|
+ AssertionError
+ |
+
+
+
+ if the column dimension of |
+
magenpy/LDMatrix.py
validate_ld_matrix()
+
+¶Checks that the LDMatrix
object has correct structure and
+checks its contents for validity.
Specifically, we check that: +* The dimensions of the matrix and its associated attributes are matching. +* The masking is working properly.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ True if the matrix has the correct structure. + |
+
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ if the matrix is not valid. + |
+
magenpy/LDMatrix.py
+ Bases: object
A class to represent sample (individual) information and attributes in
+the context of a genotype matrix. The sample table is a wrapper around
+a pandas.DataFrame
object that contains the sample information. The
+table provides methods to read and write sample information from/to
+disk, filter samples, perform checks/validation, and extract specific columns
+from the table.
Attributes:
+Name | +Type | +Description | +
---|---|---|
table |
+
+ Union[DataFrame, None]
+ |
+
+
+
+ The sample table as a pandas |
+
_phenotype_likelihood |
+
+ Union[str, None]
+ |
+
+
+
+ The likelihood of the phenotype values (if present). + |
+
_covariate_cols |
+ + | +
+
+
+ The names or IDs of covariates that are present in the sample table. + |
+
magenpy/SampleTable.py
7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 |
|
covariates
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The column names for the covariates stored in the sample table. + |
+
fid
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The family ID of each individual in the sample table. + |
+
iid
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The individual ID of each individual in the sample table. + |
+
n
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The sample size (number of individuals) in the sample table. + |
+
original_index
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The original index of each individual in the sample table (before applying any filters). + |
+
phenotype
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The phenotype column from the sample table. + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ If the phenotype is not set. + |
+
phenotype_likelihood
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The phenotype likelihood family. + |
+
sample_size
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The sample size (number of individuals) in the sample table. + |
+
shape
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The shape of the sample table (mainly sample size) as a tuple (n,). + |
+
__init__(table=None, phenotype_likelihood=None)
+
+¶Initialize the sample table object.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
table |
+
+ Union[DataFrame, None]
+ |
+
+
+
+ A pandas DataFrame with the sample information. + |
+
+ None
+ |
+
phenotype_likelihood |
+
+ Union[str, None]
+ |
+
+
+
+ The likelihood of the phenotype values. + |
+
+ None
+ |
+
magenpy/SampleTable.py
filter_samples(keep_samples=None, keep_file=None)
+
+¶Filter samples from the samples table. User must specify +either a list of samples to keep or the path to a file +with the list of samples to keep.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_samples |
+ + | +
+
+
+ A list (or array) of sample IDs to keep. + |
+
+ None
+ |
+
keep_file |
+ + | +
+
+
+ The path to a file with the list of samples to keep. + |
+
+ None
+ |
+
magenpy/SampleTable.py
from_covariate_file(covar_file, **read_csv_kwargs)
+
+
+ classmethod
+
+
+¶Initialize a sample table from a file of covariates.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
covar_file |
+ + | +
+
+
+ The path to the covariates file. + |
+ + required + | +
read_csv_kwargs |
+ + | +
+
+
+ keyword arguments to pass to the |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A |
+
magenpy/SampleTable.py
from_fam_file(fam_file)
+
+
+ classmethod
+
+
+¶Initialize a sample table object from a path to PLINK FAM file.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fam_file |
+ + | +
+
+
+ The path to the FAM file. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A |
+
magenpy/SampleTable.py
from_phenotype_file(phenotype_file, filter_na=True, **read_csv_kwargs)
+
+
+ classmethod
+
+
+¶Initialize a sample table from a phenotype file.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
phenotype_file |
+ + | +
+
+
+ The path to the phenotype file. + |
+ + required + | +
filter_na |
+ + | +
+
+
+ Filter samples with missing phenotype values (Default: True). + |
+
+ True
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ keyword arguments to pass to the |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A |
+
magenpy/SampleTable.py
get_covariates(covar_subset=None)
+
+¶Get the covariates associated with each individual in the sample table as a matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
covar_subset |
+ + | +
+
+
+ A subset of the covariate names or IDs to include in the matrix. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A numpy array with the covariate values. + |
+
magenpy/SampleTable.py
get_covariates_table(covar_subset=None)
+
+¶Get a table of covariates associated with each individual in the +sample table. The table will be formatted as (FID, IID, covar1, covar2, ...).
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
covar_subset |
+ + | +
+
+
+ A subset of the covariate names or IDs to include in the table. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A pandas DataFrame with the covariate information. + |
+
magenpy/SampleTable.py
get_individual_table()
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A table of individual IDs (FID, IID) present in the sample table. + |
+
get_phenotype_table()
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A table of individual IDs and phenotype values (FID IID phenotype) in the sample table. + |
+
magenpy/SampleTable.py
post_check_phenotype()
+
+¶Apply some simple heuristics to check the phenotype values +provided by the user and infer the phenotype likelihood (if feasible).
+ + + +Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ If the phenotype values could not be matched with the inferred phenotype likelihood. + |
+
magenpy/SampleTable.py
read_covariates_file(covar_file, **read_csv_kwargs)
+
+¶Read the covariates file from the provided path. The expected format is Family ID (FID
),
+Individual ID (IID
) and the remaining columns are assumed to be covariates. You may adjust
+the parsing configurations with keyword arguments that will be passed to pandas.read_csv
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
covar_file |
+ + | +
+
+
+ The path to the covariates file. + |
+ + required + | +
read_csv_kwargs |
+ + | +
+
+
+ keyword arguments to pass to the |
+
+ {}
+ |
+
magenpy/SampleTable.py
read_phenotype_file(phenotype_file, drop_na=True, **read_csv_kwargs)
+
+¶Read the phenotype file from disk. The expected format is Family ID (FID
),
+Individual ID (IID
) and the phenotype column phenotype
. You may adjust
+the parsing configurations with keyword arguments that will be passed to pandas.read_csv
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
phenotype_file |
+ + | +
+
+
+ The path to the phenotype file. + |
+ + required + | +
drop_na |
+ + | +
+
+
+ Drop samples whose phenotype value is missing (Default: True). + |
+
+ True
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ keyword arguments to pass to the |
+
+ {}
+ |
+
magenpy/SampleTable.py
set_phenotype(phenotype, phenotype_likelihood=None)
+
+¶Update the phenotype in the sample table using the provided values.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
phenotype |
+ + | +
+
+
+ The new phenotype values, represented by a numpy array or Iterable. + |
+ + required + | +
phenotype_likelihood |
+ + | +
+
+
+ The likelihood of the phenotype values. + |
+
+ None
+ |
+
magenpy/SampleTable.py
to_file(output_file, col_subset=None, **to_csv_kwargs)
+
+¶Write the contents of the sample table to file.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
output_file |
+ + | +
+
+
+ The path to the file where to write the sample table. + |
+ + required + | +
col_subset |
+ + | +
+
+
+ A subset of the columns to write to file. + |
+
+ None
+ |
+
to_csv_kwargs |
+ + | +
+
+
+ keyword arguments to pass to the |
+
+ {}
+ |
+
magenpy/SampleTable.py
to_table(col_subset=None)
+
+¶Get the sample table as a pandas DataFrame.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_subset |
+ + | +
+
+
+ A subset of the columns to include in the table. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A pandas DataFrame with the sample information. + |
+
magenpy/SampleTable.py
+ Bases: object
A wrapper class for representing the summary statistics obtained from +Genome-wide Association Studies (GWAS). GWAS software tools publish their +results in the form of summary statistics, which include the SNP rsIDs, +the effect/reference alleles tested, the marginal effect sizes (BETA), +the standard errors (SE), the Z-scores, the p-values, etc.
+This class provides a convenient way to access/manipulate/harmonize these summary statistics +across various formats. Particularly, given the heterogeneity in summary statistics +formats, this class provides a common interface to access these statistics +in a consistent manner. The class also supports computing some derived statistics +from the summary statistics, such as the pseudo-correlation between the SNP and the +phenotype, the Chi-squared statistics, etc.
+ + + +Attributes:
+Name | +Type | +Description | +
---|---|---|
table |
+
+ DataFrame
+ |
+
+
+
+ A pandas DataFrame containing the summary statistics. + |
+
magenpy/SumstatsTable.py
8 + 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 
+408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 |
|
a1
+
+
+ property
+
+
+¶See Also
+ +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The alternative or effect allele for each variant in the summary statistics table. + |
+
a2
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The reference allele for each variant in the summary statistics table. + |
+
alt_allele
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The alternative or effect allele for each variant in the summary statistics table. + |
+
beta_hat
+
+
+ property
+
+
+¶See Also
+ +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The marginal beta from the association test of each variant on the phenotype. + |
+
bp_pos
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The base pair position for each variant in the summary statistics table. + |
+
chromosome
+
+
+ property
+
+
+¶A convenience method to return the chromosome number if there is only one chromosome in the summary statistics. +If multiple chromosomes are present, it returns None.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The chromosome number if there is only one chromosome in the summary statistics. + |
+
chromosomes
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The unique chromosomes in the summary statistics table. + |
+
effect_allele
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The alternative or effect allele for each variant in the summary statistics table. + |
+
effect_sign
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The sign for the effect size (1 for positive effect, -1 for negative effect) of each genetic variant on the phenotype. + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ If the sign could not be inferred from available data. + |
+
log10_p_value
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The negative log10 of the p-value (-log10(p_value)) of association test of each variant on the phenotype. + |
+
m
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants in the summary statistics table. + |
+
maf
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The minor allele frequency for each variant in the summary statistics table. + |
+
maf_var
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The variance of the minor allele frequency for each variant in the summary statistics table. + |
+
marginal_beta
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The marginal beta from the association test of each variant on the phenotype. + |
+
n
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The sample size for the association test of each variant in the summary statistics table. + |
+
n_per_snp
+
+
+ property
+
+
+¶n_snps
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of variants in the summary statistics table. + |
+
odds_ratio
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The odds ratio from the association test of each variant on case-control phenotypes. + |
+
p_value
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The p-value from the association test of each variant on the phenotype. + |
+
pval
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The p-value from the association test of each variant on the phenotype. + |
+
ref_allele
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The reference allele for each variant in the summary statistics table. + |
+
se
+
+
+ property
+
+
+¶See Also
+ +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The standard error from the association test of each variant on the phenotype. + |
+
shape
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The shape of the summary statistics table. + |
+
snps
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The rsIDs associated with each variant in the summary statistics table. + |
+
standard_error
+
+
+ property
+
+
+¶See Also
+Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The standard error from the association test of each variant on the phenotype. + |
+
standardized_marginal_beta
+
+
+ property
+
+
+¶Get the marginal BETAs assuming that both the genotype matrix +and the phenotype vector are standardized column-wise to have mean zero and variance 1. +In some contexts, this is also known as the per-SNP correlation or +pseudo-correlation with the phenotype.
+See Also
+ +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The standardized marginal beta from the association test of each variant on the phenotype. + |
+
z_score
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The Z-score from the association test of each SNP on the phenotype. + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ If the Z-score statistic is not available and could not be inferred from available data. + |
+
__init__(ss_table)
+
+¶Initialize the summary statistics table.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ss_table |
+
+ DataFrame
+ |
+
+
+
+ A pandas DataFrame containing the summary statistics. See also: from_file. + |
+ + required + | +
magenpy/SumstatsTable.py
drop_duplicates()
+
+¶Drop variants with duplicated rsIDs from the summary statistics table.
+ + +filter_by_allele_frequency(min_maf=None, min_mac=None)
+
+¶Filter variants in the summary statistics table by minimum minor allele frequency or allele count
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
min_maf |
+ + | +
+
+
+ Minimum minor allele frequency + |
+
+ None
+ |
+
min_mac |
+ + | +
+
+
+ Minimum minor allele count + |
+
+ None
+ |
+
magenpy/SumstatsTable.py
filter_snps(extract_snps=None, extract_file=None, extract_index=None)
+
+¶Filter the summary statistics table to keep a subset of SNPs.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
extract_snps |
+ + | +
+
+
+ A list or array of SNP IDs to keep. + |
+
+ None
+ |
+
extract_file |
+ + | +
+
+
+ A plink-style file containing the SNP IDs to keep. + |
+
+ None
+ |
+
extract_index |
+ + | +
+
+
+ A list or array of the indices of SNPs to retain. + |
+
+ None
+ |
+
magenpy/SumstatsTable.py
from_file(sumstats_file, sumstats_format=None, parser=None, **parse_kwargs)
+
+
+ classmethod
+
+
+¶Initialize a summary statistics table from file. The user must provide either
+the format for the summary statistics file or the parser object
+(see parsers.sumstats_parsers
).
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sumstats_file |
+ + | +
+
+
+ The path to the summary statistics file. + |
+ + required + | +
sumstats_format |
+ + | +
+
+
+ The format for the summary statistics file. Currently, we support the following summary statistics formats: |
+
+ None
+ |
+
parser |
+ + | +
+
+
+ An instance of SumstatsParser parser, implements basic parsing/conversion functionalities. + |
+
+ None
+ |
+
parse_kwargs |
+ + | +
+
+
+ arguments for the pandas |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A |
+
magenpy/SumstatsTable.py
get_chisq_statistic()
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The Chi-Squared statistic from the association test of each variant on the phenotype. + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ If the Chi-Squared statistic is not available and could not be inferred from available data. + |
+
magenpy/SumstatsTable.py
get_col(col_name)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+ + | +
+
+
+ The name of the column to extract. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The column associated with |
+
magenpy/SumstatsTable.py
get_snp_pseudo_corr()
+
+¶Computes the pseudo-correlation coefficient (standardized beta) between the SNP and +the phenotype (X_j^T y / N) from GWAS summary statistics.
+This method uses Equation 15 in Mak et al. 2017
+$$
+\beta_j = \frac{z_j}{\sqrt{n - 1 + z_j^2}}
+$$
+
Where z_j
is the marginal GWAS Z-score.
See Also
+ +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The pseudo-correlation coefficient between the SNP and the phenotype. + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ If the Z-scores are not available or the sample size is not available. + |
+
magenpy/SumstatsTable.py
get_yy_per_snp()
+
+¶Computes the quantity (y'y)_j/n_j following SBayesR (Lloyd-Jones 2019) and Yang et al. (2012).
+(y'y)_j/n_j is defined as the empirical variance for continuous phenotypes and may be estimated +from GWAS summary statistics by re-arranging the equation for the +squared standard error:
+$$
+SE(b_j)^2 = (Var(y) - Var(x_j)*b_j^2) / (Var(x_j)*n_j)
+$$
+
Which gives the following estimate:
+$$
+(y'y)_j / n_j = (n_j - 2)*SE(b_j)^2 + b_j^2
+$$
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The quantity (y'y)_j/n_j for each SNP in the summary statistics table. + |
+
Raises:
+Type | +Description | +
---|---|
+ KeyError
+ |
+
+
+
+ If the marginal betas, standard errors or sample sizes are not available. + |
+
magenpy/SumstatsTable.py
infer_a2(reference_table, allow_na=False)
+
+¶Infer the reference allele A2 (if not present in the SumstatsTable) +from a reference table. Make sure that the reference table contains the SNP ID, +the reference allele A2 and the alternative (i.e. effect) allele A1. It is the +user's responsibility to make sure that the reference table matches the summary +statistics in terms of the specification of reference vs. alternative. They are +allowed to be flipped, but they have to be consistent across the two tables.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
reference_table |
+ + | +
+
+
+ A pandas table containing the following columns at least: |
+ + required + | +
allow_na |
+ + | +
+
+
+ If True, allow the reference allele to be missing from the final result. + |
+
+ False
+ |
+
magenpy/SumstatsTable.py
match(reference_table, correct_flips=True)
+
+¶Match the summary statistics table with a reference table, +correcting for potential flips in the effect alleles.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
reference_table |
+ + | +
+
+
+ The SNP table to use as a reference. Must be a pandas table with at least three columns: SNP, A1, A2. + |
+ + required + | +
correct_flips |
+ + | +
+
+
+ If True, correct the direction of effect size estimates if the effect allele is reversed. + |
+
+ True
+ |
+
magenpy/SumstatsTable.py
set_sample_size(n)
+
+¶Set the sample size for each variant in the summary table. +This can be useful when the overall sample size from the GWAS analysis is available, +but not on a per-SNP basis.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
n |
+ + | +
+
+
+ A scalar or array of sample sizes for each variant. + |
+ + required + | +
magenpy/SumstatsTable.py
split_by_chromosome(snps_per_chrom=None)
+
+¶Split the summary statistics table by chromosome, so that we would
+have a separate SumstatsTable
object for each chromosome.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
snps_per_chrom |
+ + | +
+
+
+ A dictionary where the keys are the chromosome number and the value is an array or list of SNPs on that chromosome. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A dictionary where the keys are the chromosome number and the value is a |
+
magenpy/SumstatsTable.py
to_file(output_file, col_subset=None, **to_csv_kwargs)
+
+¶A convenience method to write the summary statistics table to file.
+TODO: Add a format argument to this method and allow the user to output summary statistics +according to supported formats (e.g. COJO, plink, fastGWA, etc.).
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
output_file |
+ + | +
+
+
+ The path to the file where to write the summary statistics. + |
+ + required + | +
col_subset |
+ + | +
+
+
+ A subset of the columns to write to file. + |
+
+ None
+ |
+
to_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' |
+
+ {}
+ |
+
magenpy/SumstatsTable.py
to_table(col_subset=None)
+
+¶A convenience method to extract the summary statistics table or subsets of it.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_subset |
+ + | +
+
+
+ A list corresponding to a subset of columns to return. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A pandas DataFrame containing the summary statistics with the requested column subset. + |
+
magenpy/SumstatsTable.py
plink
.AnnotationMatrixParser
+
+
+¶
+ Bases: object
A generic annotation matrix parser class.
+ +magenpy/parsers/annotation_parsers.py
__init__(col_name_converter=None, **read_csv_kwargs)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary mapping column names in the original table to magenpy's column names for the various SNP features in the annotation matrix. + |
+
+ None
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' |
+
+ {}
+ |
+
magenpy/parsers/annotation_parsers.py
parse(annotation_file, drop_na=True)
+
+¶Parse the annotation matrix file
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
annotation_file |
+ + | +
+
+
+ The path to the annotation file. + |
+ + required + | +
drop_na |
+ + | +
+
+
+ Drop any entries with missing values. + |
+
+ True
+ |
+
magenpy/parsers/annotation_parsers.py
LDSCAnnotationMatrixParser
+
+
+¶
+ Bases: AnnotationMatrixParser
magenpy/parsers/annotation_parsers.py
__init__(col_name_converter=None, **read_csv_kwargs)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary mapping column names in the original table to magenpy's column names for the various SNP features in the annotation matrix. + |
+
+ None
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
+ {}
+ |
+
magenpy/parsers/annotation_parsers.py
parse(annotation_file, drop_na=True)
+
+¶Parse the annotation matrix file
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
annotation_file |
+ + | +
+
+
+ The path to the annotation file. + |
+ + required + | +
drop_na |
+ + | +
+
+
+ Drop any entries with missing values. + |
+
+ True
+ |
+
magenpy/parsers/annotation_parsers.py
parse_annotation_bed_file(annot_bed_file)
+
+¶Parse an annotation bed file in the format specified by Ensembl: +https://uswest.ensembl.org/info/website/upload/bed.html
+The file contains 3-12 columns, starting with Chromosome, start_coordinate, end_coordinate, etc. +After reading the raw file, we let pandas infer whether the file has a header or not and we +standardize the names of the first 3 columns and convert the chromosome column into an integer.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
annot_bed_file |
+
+ str
+ |
+
+
+
+ The path to the annotation BED file. + |
+ + required + | +
magenpy/parsers/annotation_parsers.py
parse_cluster_assignment_file(cluster_assignment_file)
+
+¶Parses a file that maps each individual in the sample table to a cluster, +and returns the pandas dataframe. The expected file should be whitespace delimited +and contain three columns: FID, IID, and Cluster
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
cluster_assignment_file |
+
+ str
+ |
+
+
+
+ The path to the cluster assignment file. + |
+ + required + | +
magenpy/parsers/misc_parsers.py
parse_ld_block_data(ldb_file_path)
+
+¶This function takes a path to a file with the LD blocks +and returns a dictionary with the chromosome ID and a list of the +start and end positions for the blocks in that chromosome. +The parser assumes that the LD block files have the ldetect format: +https://bitbucket.org/nygcresearch/ldetect-data/src/master/
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ldb_file_path |
+
+ str
+ |
+
+
+
+ The path (or URL) to the LD blocks file + |
+ + required + | +
magenpy/parsers/misc_parsers.py
read_sample_filter_file(filename)
+
+¶Read a plink-style file listing sample IDs. +The file should not have a header, should be tab-separated, and should have two +columns corresponding to Family ID (FID) and Individual ID (IID). +You may also pass a file with a single column of Individual IDs instead.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
filename |
+
+ str
+ |
+
+
+
+ The path to the file containing the sample IDs + |
+ + required + | +
magenpy/parsers/misc_parsers.py
read_snp_filter_file(filename, snp_id_col=0)
+
+¶Read plink-style file listing variant IDs. +The file should not have a header and only has a single column.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
filename |
+
+ str
+ |
+
+
+
+ The path to the file containing the SNP IDs + |
+ + required + | +
snp_id_col |
+
+ int
+ |
+
+
+
+ The column index containing the SNP IDs + |
+
+ 0
+ |
+
magenpy/parsers/misc_parsers.py
parse_bim_file(plink_bfile)
+
+¶From the plink documentation: +https://www.cog-genomics.org/plink/1.9/formats#bim
+A text file with no header line, and one line per variant with the following six fields:
+
+- Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name
+- Variant identifier
+- Position in morgans or centimorgans (safe to use dummy value of '0')
+- Base-pair coordinate (1-based; limited to 2^31-2)
+- Allele 1 (corresponding to clear bits in .bed; usually minor)
+- Allele 2 (corresponding to set bits in .bed; usually major)
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
plink_bfile |
+
+ str
+ |
+
+
+
+ The path to the plink bfile (with or without the extension). + |
+ + required + | +
magenpy/parsers/plink_parsers.py
parse_fam_file(plink_bfile)
+
+¶From the plink documentation: +https://www.cog-genomics.org/plink/1.9/formats#fam
+A text file with no header line, and one line per sample with the following six fields:
+
+- Family ID ('FID')
+- Within-family ID ('IID'; cannot be '0')
+- Within-family ID of father ('0' if father isn't in dataset)
+- Within-family ID of mother ('0' if mother isn't in dataset)
+- Sex code ('1' = male, '2' = female, '0' = unknown)
+- Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control)
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
plink_bfile |
+
+ str
+ |
+
+
+
+ The path to the plink bfile (with or without the extension). + |
+ + required + | +
magenpy/parsers/plink_parsers.py
COJOSSParser
+
+
+¶
+ Bases: SumstatsParser
A specialized class for parsing GWAS summary statistics files generated by the COJO
software.
See Also
+ +Attributes:
+Name | +Type | +Description | +
---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary mapping column names in the original table to magenpy's column names. + |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
magenpy/parsers/sumstats_parsers.py
__init__(col_name_converter=None, **read_csv_kwargs)
+
+¶Initialize the COJO summary statistics parser.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary/string mapping column names in the original table to magenpy's column names for the various summary statistics. If a string, it should be a comma-separated list of key-value pairs (e.g. 'rsid=SNP,pos=POS'). + |
+
+ None
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
+ {}
+ |
+
magenpy/parsers/sumstats_parsers.py
FastGWASSParser
+
+
+¶
+ Bases: SumstatsParser
A specialized class for parsing GWAS summary statistics files generated by the FastGWA
software.
See Also
+ +Attributes:
+Name | +Type | +Description | +
---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary mapping column names in the original table to magenpy's column names. + |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
magenpy/parsers/sumstats_parsers.py
__init__(col_name_converter=None, **read_csv_kwargs)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary/string mapping column names in the original table to magenpy's column names for the various summary statistics. If a string, it should be a comma-separated list of key-value pairs (e.g. 'rsid=SNP,pos=POS'). + |
+
+ None
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
+ {}
+ |
+
magenpy/parsers/sumstats_parsers.py
Plink1SSParser
+
+
+¶
+ Bases: SumstatsParser
A specialized class for parsing GWAS summary statistics files generated by plink1.9
.
See Also
+ +Attributes:
+Name | +Type | +Description | +
---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary mapping column names in the original table to magenpy's column names. + |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
magenpy/parsers/sumstats_parsers.py
__init__(col_name_converter=None, **read_csv_kwargs)
+
+¶Initialize the plink1.9
summary statistics parser.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary/string mapping column names in the original table to magenpy's column names for the various summary statistics. If a string, it should be a comma-separated list of key-value pairs (e.g. 'rsid=SNP,pos=POS'). + |
+
+ None
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
+ {}
+ |
+
magenpy/parsers/sumstats_parsers.py
Plink2SSParser
+
+
+¶
+ Bases: SumstatsParser
A specialized class for parsing GWAS summary statistics files generated by plink2
.
See Also
+ +Attributes:
+Name | +Type | +Description | +
---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary mapping column names in the original table to magenpy's column names. + |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
magenpy/parsers/sumstats_parsers.py
79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 |
|
__init__(col_name_converter=None, **read_csv_kwargs)
+
+¶Initialize the plink2
summary statistics parser.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary/string mapping column names in the original table to magenpy's column names for the various summary statistics. If a string, it should be a comma-separated list of key-value pairs (e.g. 'rsid=SNP,pos=POS'). + |
+
+ None
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
+ {}
+ |
+
magenpy/parsers/sumstats_parsers.py
parse(file_name, drop_na=True)
+
+¶Parse a summary statistics file.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file_name |
+ + | +
+
+
+ The path to the summary statistics file. + |
+ + required + | +
drop_na |
+ + | +
+
+
+ Drop any entries with missing values. + |
+
+ True
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A pandas DataFrame containing the parsed summary statistics. + |
+
magenpy/parsers/sumstats_parsers.py
SSFParser
+
+
+¶
+ Bases: SumstatsParser
A specialized class for parsing GWAS summary statistics that are formatted according
+ to the standardized summary statistics format adopted by the GWAS Catalog. This format is
+ sometimes denoted as GWAS-SSF
.
Reference and details: +https://github.com/EBISPOT/gwas-summary-statistics-standard
+See Also
+ +Attributes:
+Name | +Type | +Description | +
---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary mapping column names in the original table to magenpy's column names. + |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
magenpy/parsers/sumstats_parsers.py
__init__(col_name_converter=None, **read_csv_kwargs)
+
+¶Initialize the standardized summary statistics parser.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary/string mapping column names in the original table to magenpy's column names for the various summary statistics. If a string, it should be a comma-separated list of key-value pairs (e.g. 'rsid=SNP,pos=POS'). + |
+
+ None
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
+ {}
+ |
+
magenpy/parsers/sumstats_parsers.py
SaigeSSParser
+
+
+¶
+ Bases: SumstatsParser
A specialized class for parsing GWAS summary statistics files generated by the SAIGE
software.
+Reference and details:
+https://saigegit.github.io/SAIGE-doc/docs/single_step2.html
TODO: Ensure that the column names are correct across different trait types +and the inference of the sample size is correct.
+See Also
+ +Attributes:
+Name | +Type | +Description | +
---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary mapping column names in the original table to magenpy's column names. + |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
magenpy/parsers/sumstats_parsers.py
322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 |
|
__init__(col_name_converter=None, **read_csv_kwargs)
+
+¶Initialize the SAIGE
summary statistics parser.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary/string mapping column names in the original table to magenpy's column names for the various summary statistics. If a string, it should be a comma-separated list of key-value pairs (e.g. 'rsid=SNP,pos=POS'). + |
+
+ None
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
+ {}
+ |
+
magenpy/parsers/sumstats_parsers.py
parse(file_name, drop_na=True)
+
+¶Parse the summary statistics file.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file_name |
+ + | +
+
+
+ The path to the summary statistics file. + |
+ + required + | +
drop_na |
+ + | +
+
+
+ Drop any entries with missing values. + |
+
+ True
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A pandas DataFrame containing the parsed summary statistics. + |
+
magenpy/parsers/sumstats_parsers.py
SumstatsParser
+
+
+¶
+ Bases: object
A wrapper class for parsing summary statistics files that are written by statistical genetics software +for Genome-wide Association testing. A common challenge is the fact that different software tools +output summary statistics in different formats and with different column names. Thus, this class +provides a common interface for parsing summary statistics files from different software tools +and aims to make this process as seamless as possible.
+The class is designed to be extensible, so that users can easily add new parsers for different software tools.
+See Also
+ +Attributes:
+Name | +Type | +Description | +
---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary mapping column names in the original table to magenpy's column names. + |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
magenpy/parsers/sumstats_parsers.py
__init__(col_name_converter=None, **read_csv_kwargs)
+
+¶Initialize the summary statistics parser.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_converter |
+ + | +
+
+
+ A dictionary/string mapping column names in the original table to magenpy's column names for the various summary statistics. If a string, it should be a comma-separated list of key-value pairs (e.g. 'rsid=SNP,pos=POS'). + |
+
+ None
+ |
+
read_csv_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to pandas' read_csv + |
+
+ {}
+ |
+
magenpy/parsers/sumstats_parsers.py
parse(file_name, drop_na=True)
+
+¶Parse a summary statistics file.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file_name |
+ + | +
+
+
+ The path to the summary statistics file. + |
+ + required + | +
drop_na |
+ + | +
+
+
+ If True, drop any entries with missing values. + |
+
+ True
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A pandas DataFrame containing the parsed summary statistics. + |
+
magenpy/parsers/sumstats_parsers.py
manhattan(input_data, y=None, y_label=None, chrom_sep_color='#f0f0f0', snp_color='#808080', snp_marker='o', snp_alpha=0.3, add_bonf_line=True, bonf_line_color='#b06a7a')
+
+¶Generate a Manhattan plot where the x-axis is the genomic position (in BP) +and the y-axis is the -log10(p-value) or some other statistic of the user's choice.
+TODO: Add functionality to highlight certain SNPs or markers on the plot.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
input_data |
+
+ Union[GWADataLoader, SumstatsTable]
+ |
+
+
+
+ An instance of |
+ + required + | +
y |
+ + | +
+
+
+ An optional vector of values to plot on the y-axis. If not provided, the -log10(p-value) will be plotted by default. + |
+
+ None
+ |
+
y_label |
+ + | +
+
+
+ A label for the quantity or statistic that will be plotted on the y-axis. + |
+
+ None
+ |
+
chrom_sep_color |
+ + | +
+
+
+ The color for the chromosome separator block. + |
+
+ '#f0f0f0'
+ |
+
snp_color |
+ + | +
+
+
+ The color of the dots on the Manhattan plot. + |
+
+ '#808080'
+ |
+
snp_marker |
+ + | +
+
+
+ The shape of the marker on the Manhattan plot. + |
+
+ 'o'
+ |
+
snp_alpha |
+ + | +
+
+
+ The opacity level for the markers. + |
+
+ 0.3
+ |
+
add_bonf_line |
+ + | +
+
+
+ If True, add a line indicating the Bonferroni significance threshold. + |
+
+ True
+ |
+
bonf_line_color |
+ + | +
+
+
+ The color of the Bonferroni significance threshold line. + |
+
+ '#b06a7a'
+ |
+
magenpy/plot/gwa.py
8 + 9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 |
|
qq_plot(input_data, statistic='p_value')
+
+¶Generate a quantile-quantile (QQ) plot for the GWAS summary statistics. +The function supports plotting QQ plots for the -log10(p-values) as well as +the z-score (if available).
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
input_data |
+
+ Union[GWADataLoader, SumstatsTable]
+ |
+
+
+
+ An instance of |
+ + required + | +
statistic |
+ + | +
+
+
+ The statistic to generate the QQ plot for. We currently support |
+
+ 'p_value'
+ |
+
magenpy/plot/gwa.py
plot_ld_matrix(ldm, row_subset=None, display='full', cmap='OrRd', include_colorbar=True)
+
+¶Plot a heatmap representing the LD matrix or portions of it.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ldm |
+
+ LDMatrix
+ |
+
+
+
+ An instance of |
+ + required + | +
row_subset |
+ + | +
+
+
+ A boolean or integer index array for the subset of rows/columns to extract from the LD matrix. + |
+
+ None
+ |
+
display |
+ + | +
+
+
+ A string indicating what part of the matrix to display. Can be 'full', 'upper', 'lower'. If upper, only the upper triangle of the matrix will be displayed. If lower, only the lower triangle will be displayed. + |
+
+ 'full'
+ |
+
cmap |
+ + | +
+
+
+ The color map for the LD matrix plot. + |
+
+ 'OrRd'
+ |
+
include_colorbar |
+ + | +
+
+
+ If True, include a colorbar in the plot. + |
+
+ True
+ |
+
magenpy/plot/ld.py
AnnotatedPhenotypeSimulator
+
+
+¶
+ Bases: PhenotypeSimulator
Simulate complex traits by incorporating genomic functional +annotations into the mixture densities that govern the effect size +of each variant on the trait.
+Warning
+This code is experimental and needs much further validation.
+magenpy/simulation/AnnotatedPhenotypeSimulator.py
6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 |
|
__init__(bed_files, **kwargs)
+
+¶Create an instance of the AnnotatedPhenotypeSimulator class.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bed_files |
+ + | +
+
+
+ A list of BED files that contain the genotype data. + |
+ + required + | +
kwargs |
+ + | +
+
+
+ Additional keyword arguments for the PhenotypeSimulator class. + |
+
+ {}
+ |
+
magenpy/simulation/AnnotatedPhenotypeSimulator.py
get_heritability_enrichment()
+
+¶Estimate the enrichment of heritability per annotation.
+ +magenpy/simulation/AnnotatedPhenotypeSimulator.py
set_per_snp_heritability()
+
+¶Set the per-SNP heritability values using the annotation weights.
+ +magenpy/simulation/AnnotatedPhenotypeSimulator.py
set_per_snp_mixture_probability()
+
+¶Set the per-SNP mixture probabilities using the annotation weights.
+ +magenpy/simulation/AnnotatedPhenotypeSimulator.py
set_w_h2(w_h2)
+
+¶Set the annotation weights for the per SNP heritability
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
w_h2 |
+ + | +
+
+
+ A vector of weights for each annotation. + |
+ + required + | +
magenpy/simulation/AnnotatedPhenotypeSimulator.py
set_w_pi(w_pi)
+
+¶Set the annotation weights for the per SNP causal probability
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
w_pi |
+ + | +
+
+
+ A vector of weights for each annotation. + |
+ + required + | +
magenpy/simulation/AnnotatedPhenotypeSimulator.py
simulate_w_h2(enrichment=None)
+
+¶Simulate the annotation weights for the per-SNP heritability
+ + +simulate_w_pi(enrichment=None)
+
+¶Simulate the annotation weights for the per-SNP causal probability
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
enrichment |
+ + | +
+
+
+ A dictionary of enrichment values where the key is the annotation and the value is the enrichment + |
+
+ None
+ |
+
magenpy/simulation/AnnotatedPhenotypeSimulator.py
MultiCohortPhenotypeSimulator
+
+
+¶
+ Bases: GWADataLoader
A module for simulating GWAS data for separate cohorts or clusters of the data. +This includes scenarios such as multi-population or multi-ethnic datasets, or +datasets that can be stratified by a discrete variable.
+Warning
+This code is experimental and needs much further validation.
+magenpy/simulation/MultiCohortPhenotypeSimulator.py
8 + 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 |
|
__init__(bed_files, cluster_assignments_file, prop_shared_causal=1.0, rho=1.0, **kwargs)
+
+¶Simulate phenotypes using the linear additive model while accounting +for heterogeneous genetic architectures across cohorts.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bed_files |
+ + | +
+
+
+ A path (or list of paths) to PLINK BED files. + |
+ + required + | +
cluster_assignments_file |
+ + | +
+
+
+ A file mapping each sample in the BED files to their corresponding cohort or cluster. + |
+ + required + | +
prop_shared_causal |
+ + | +
+
+
+ Proportion of causal variants that are shared across clusters. + |
+
+ 1.0
+ |
+
rho |
+ + | +
+
+
+ The correlation coefficient for the effect size across clusters. + |
+
+ 1.0
+ |
+
magenpy/simulation/MultiCohortPhenotypeSimulator.py
PhenotypeSimulator
+
+
+¶
+ Bases: GWADataLoader
A wrapper class that supports simulating complex traits with a variety of +genetic architectures and heritability values, using the standard linear model. The +basic implementation supports simulating effect sizes from a sparse Gaussian mixture density, +allowing some variants to have larger effects than others. The class also supports simulating +binary phenotypes (case-control) by thresholding the continuous phenotype at a specified threshold.
+To be concrete, the generative model for the simulation is as follows:
+1) Simulate the mixture assignment for each variant based on the mixing proportions pi
.
+2) Simulate the effect sizes for each variant from the corresponding Gaussian density that they were assigned.
+3) Compute the polygenic score for each individual based on the simulated effect sizes.
+4) Simulate the residual component of the phenotype, in such a way that the total heritability is preserved.
See Also
+ +Attributes:
+Name | +Type | +Description | +
---|---|---|
pi |
+ + | +
+
+
+ The mixing proportions for the Gaussian mixture density. + |
+
h2 |
+ + | +
+
+
+ The trait SNP heritability, or proportion of variance explained by SNPs. + |
+
d |
+ + | +
+
+
+ The variance multipliers for each component of the Gaussian mixture density. + |
+
prevalence |
+ + | +
+
+
+ The (disease) prevalence for binary (case-control) phenotypes. + |
+
per_snp_h2 |
+ + | +
+
+
+ The per-SNP heritability for each variant in the dataset. + |
+
per_snp_pi |
+ + | +
+
+
+ The per-SNP mixing proportions for each variant in the dataset. + |
+
beta |
+ + | +
+
+
+ The effect sizes for each variant in the dataset. + |
+
mixture_assignment |
+ + | +
+
+
+ The assignment of each variant to a mixture component. + |
+
magenpy/simulation/PhenotypeSimulator.py
8 + 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 |
|
n_components
+
+
+ property
+
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The number of Gaussian mixture components for the effect size distribution. + |
+
__init__(bed_files, h2=0.2, pi=0.1, d=(0.0, 1.0), prevalence=0.15, **kwargs)
+
+¶Initialize the PhenotypeSimulator object with the necessary parameters.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bed_files |
+ + | +
+
+
+ A path (or list of paths) to PLINK BED files containing the genotype information. + |
+ + required + | +
h2 |
+ + | +
+
+
+ The trait SNP heritability, or proportion of variance explained by SNPs. + |
+
+ 0.2
+ |
+
pi |
+ + | +
+
+
+ The mixing proportions for the mixture of Gaussians (our model for the distribution of effect sizes). If a float is provided, it is converted to a tuple (1-pi, pi), where pi is the proportion of causal variants. + |
+
+ 0.1
+ |
+
d |
+ + | +
+
+
+ The variance multipliers for each component of the Gaussian mixture density. The default (0.0, 1.0) gives the first component a multiplier of 0 and the second component a multiplier of 1. + |
+
+ (0.0, 1.0)
+ |
+
prevalence |
+ + | +
+
+
+ The (disease) prevalence for binary (case-control) phenotypes. + |
+
+ 0.15
+ |
+
magenpy/simulation/PhenotypeSimulator.py
get_causal_status()
+
+¶Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A dictionary where the keys are the chromosome numbers and the values are binary vectors indicating which SNPs are causal for the simulated phenotype. + |
+
Raises:
+Type | +Description | +
---|---|
+ AssertionError
+ |
+
+
+
+ If the mixture assignment is not set. + |
+
magenpy/simulation/PhenotypeSimulator.py
set_beta(new_beta)
+
+¶Set the variant effect sizes (beta) according to user-provided dictionary.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
new_beta |
+ + | +
+
+
+ A dictionary where the keys are the chromosomes and the values are the beta (effect size) for each SNP on that chromosome. + |
+ + required + | +
magenpy/simulation/PhenotypeSimulator.py
set_causal_snps(causal_snps)
+
+¶A utility method to set the causal variants in the simulation based on an array or
+list of SNPs specified by the user. The method takes an iterable (e.g. list or array) of causal_snps
+and then creates a new mixture assignment object where only the causal_snps
+contribute to the phenotype.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
causal_snps |
+ + | +
+
+
+ A list or array of SNP rsIDs. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ If all mixture components are causal. + |
+
magenpy/simulation/PhenotypeSimulator.py
set_h2(new_h2)
+
+¶Set the total heritability (proportion of additive variance due to SNPs) for the trait
+ + +set_mixture_assignment(new_assignment)
+
+¶Set the mixture assignments according to user-provided dictionary. The mixture +assignment indicates which mixture component the effect size of a particular +variant comes from.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
new_assignment |
+ + | +
+
+
+ A dictionary where the keys are the chromosomes and the values are the mixture assignment for each SNP on that chromosome. + |
+ + required + | +
magenpy/simulation/PhenotypeSimulator.py
set_per_snp_heritability()
+
+¶Set the per-SNP heritability (effect size variance) for each variant in the dataset. +This is a convenience method that may come in handy for more flexible generative models.
+ +magenpy/simulation/PhenotypeSimulator.py
set_per_snp_mixture_probability()
+
+¶Set the per-SNP mixing proportions for each variant in the dataset. +This is a convenience method that may come in handy for more flexible generative models.
+ +magenpy/simulation/PhenotypeSimulator.py
set_pi(new_pi)
+
+¶Set the mixture proportions (proportion of variants in each +Gaussian mixture component).
+ + +simulate(reset_beta=True, reset_mixture_assignment=True, perform_gwas=False)
+
+¶A convenience method to simulate all the components of the generative model. +Specifically, the simulation follows the standard linear model, where the phenotype is +dependent on the genotype + environmental components that are assumed to be uncorrelated:
+Y = XB + e
Where Y
is the vector of phenotypes, X
is the genotype matrix, B
is the vector of effect sizes,
+and e
represents the residual effects. The generative model proceeds by:
1) Drawing the effect sizes beta
from a Gaussian mixture density.
+ 2) Drawing the residual effect from an isotropic Gaussian density.
+ 3) Setting the phenotype according to the equation above.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
reset_beta |
+ + | +
+
+
+ If True, reset the effect sizes by drawing new ones from the prior density. + |
+
+ True
+ |
+
reset_mixture_assignment |
+ + | +
+
+
+ If True, reset the assignment of SNPs to mixture components. Set to False if you'd like to keep the same configuration of causal SNPs. + |
+
+ True
+ |
+
perform_gwas |
+ + | +
+
+
+ If True, automatically perform genome-wide association on the newly simulated phenotype. + |
+
+ False
+ |
+
magenpy/simulation/PhenotypeSimulator.py
simulate_beta()
+
+¶Simulate the causal effect size for variants included +in the dataset. Here, the variant effect size is drawn from +a Gaussian density with mean zero and scale given by +the square root of the per-SNP heritability.
+ +magenpy/simulation/PhenotypeSimulator.py
simulate_mixture_assignment()
+
+¶Simulate assigning SNPs to the various mixture components
+with probabilities given by mixing proportions pi
.
magenpy/simulation/PhenotypeSimulator.py
simulate_phenotype()
+
+¶Simulate complex phenotypes for the samples present in the genotype matrix, given their
+genotype information and fixed effect sizes beta
that were simulated in previous steps.
Given the simulated effect sizes, the phenotype is generated as follows:
+Y = XB + e
Where Y
is the vector of phenotypes, X
is the genotype matrix, B
is the vector of effect sizes,
+and e
represents the residual effects.
magenpy/simulation/PhenotypeSimulator.py
to_true_beta_table(per_chromosome=False)
+
+¶Export the simulated true effect sizes and causal status into a pandas dataframe.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
per_chromosome |
+ + | +
+
+
+ If True, return a dictionary of tables for each chromosome separately. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A pandas DataFrame with the true effect sizes and causal status for each variant. + |
+
magenpy/simulation/PhenotypeSimulator.py
inflation_factor(sumstats_input)
+
+¶Compute the genomic control (GC) inflation factor (also known as lambda) +from GWAS summary statistics.
+The inflation factor can be used to detect and correct inflation in the test statistics.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sumstats_input |
+
+ Union[GWADataLoader, SumstatsTable, array]
+ |
+
+
+
+ The input can be one of three classes of objects: A GWADataLoader object, a SumstatsTable object, or a numpy array of chi-squared statistics to compute the inflation factor. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The inflation factor (lambda) computed from the chi-squared statistics. + |
+
magenpy/stats/gwa/utils.py
perform_gwa_plink1p9(genotype_matrix, temp_dir='temp', **phenotype_transform_kwargs)
+
+¶Perform genome-wide association testing using plink 1.9. +This function takes a GenotypeMatrix object and GWAS-related flags and +calls plink to perform GWA on the genotype and phenotype data referenced +by the GenotypeMatrix object.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ A plinkBEDGenotypeMatrix object. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ Path to a directory where we keep intermediate temporary files from plink. + |
+
+ 'temp'
+ |
+
phenotype_transform_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to the |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A SumstatsTable object containing the summary statistics from the association tests. + |
+
magenpy/stats/gwa/utils.py
140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 |
|
perform_gwa_plink2(genotype_matrix, temp_dir='temp', **phenotype_transform_kwargs)
+
+¶Perform genome-wide association testing using plink 2.0. +This function takes a GenotypeMatrix object and GWAS-related flags and +calls plink to perform GWA on the genotype and phenotype data referenced +by the GenotypeMatrix object.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ A plinkBEDGenotypeMatrix object. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ Path to a directory where we keep intermediate temporary files from plink. + |
+
+ 'temp'
+ |
+
phenotype_transform_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to the |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A SumstatsTable object containing the summary statistics from the association tests. + |
+
magenpy/stats/gwa/utils.py
36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 |
|
perform_gwa_xarray(genotype_matrix, standardize_genotype=False, **phenotype_transform_kwargs)
+
+¶Perform genome-wide association testing using xarray and the PyData ecosystem.
+This function takes a GenotypeMatrix object and gwas-related flags and
+performs (simple) GWA on the genotype and phenotype data referenced
+by the GenotypeMatrix object. This function only implements GWA testing for
+continuous phenotypes. For other functionality (e.g. case-control GWAS),
+please use plink
as a backend or consult other GWAS software (e.g. GCTA or REGENIE).
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ A GenotypeMatrix object. + |
+ + required + | +
standardize_genotype |
+ + | +
+
+
+ If True, the genotype matrix will be standardized such that the columns (i.e. SNPs) have zero mean and unit variance. + |
+
+ False
+ |
+
phenotype_transform_kwargs |
+ + | +
+
+
+ Keyword arguments to pass to the |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A SumstatsTable object containing the summary statistics from the association tests. + |
+
magenpy/stats/gwa/utils.py
244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 |
|
LDSCRegression
+
+
+¶
+ Bases: object
Perform LD Score Regression using the jackknife method.
+ +magenpy/stats/h2/ldsc.py
__init__(gdl, n_blocks=200, max_chisq=None)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
gdl |
+
+ GWADataLoader
+ |
+
+
+
+ An instance of GWADataLoader + |
+ + required + | +
n_blocks |
+ + | +
+
+
+ The number of blocks to use for the jackknife method. + |
+
+ 200
+ |
+
max_chisq |
+ + | +
+
+
+ The maximum Chi-Squared statistic to consider. + |
+
+ None
+ |
+
magenpy/stats/h2/ldsc.py
fit()
+
+¶Perform LD Score Regression estimation using the jackknife method.
+ + + +Raises:
+Type | +Description | +
---|---|
+ NotImplementedError
+ |
+
+
+
+ If method is not implemented. + |
+
simple_ldsc(gdl)
+
+¶Provides an estimate of SNP heritability from summary statistics using +a simplified version of the LD Score Regression framework: +E[X_j^2] = h^2*l_j + int +where the response X_j^2 is the Chi-Squared statistic for SNP j, +the explanatory variable l_j is its LD score, and int is the regression intercept.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
gdl |
+
+ GWADataLoader
+ |
+
+
+
+ An instance of |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The estimated SNP heritability. + |
+
magenpy/stats/h2/ldsc.py
BlockLD
+
+
+¶
+ Bases: SampleLD
A wrapper class to facilitate computing block-based Linkage-Disequilibrium (LD) matrices. +Block-based LD matrices are a way to reduce the memory requirements of the LD matrix by +computing the pairwise correlation coefficients only between SNPs that are within the same LD block.
+LD blocks can be inferred by external software tools, such as LDetect
of Berisa and Pickrell (2016):
Berisa T, Pickrell JK. Approximately independent linkage disequilibrium blocks in human populations. +Bioinformatics. 2016 Jan 15;32(2):283-5. doi: 10.1093/bioinformatics/btv546. +Epub 2015 Sep 22. PMID: 26395773; PMCID: PMC4731402.
+The BlockLD
estimator requires the LD blocks to be provided as input. The LD blocks are a Bx2 matrix
+where B is the number of blocks and the columns are the start and end of each block, respectively.
See Also
+ +Attributes:
+Name | +Type | +Description | +
---|---|---|
genotype_matrix |
+ + | +
+
+
+ The genotype matrix, an instance of |
+
ld_blocks |
+ + | +
+
+
+ The LD blocks, a Bx2 matrix where B is the number of blocks and the columns are the start and end of each block, respectively. + |
+
magenpy/stats/ld/estimator.py
407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 |
|
__init__(genotype_matrix, ld_blocks=None, ld_blocks_file=None)
+
+¶Initialize the block-based LD estimator with a genotype matrix and LD blocks.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ The genotype matrix, an instance of |
+ + required + | +
ld_blocks |
+ + | +
+
+
+ The LD blocks, a Bx2 matrix where B is the number of blocks and the columns are the start and end of each block, respectively. + |
+
+ None
+ |
+
ld_blocks_file |
+ + | +
+
+
+ The path to the LD blocks file + |
+
+ None
+ |
+
magenpy/stats/ld/estimator.py
compute(output_dir, temp_dir='temp', overwrite=True, delete_original=True, dtype='int16', compressor_name='lz4', compression_level=5)
+
+¶Compute the block-based LD matrix and store in Zarr array format.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
output_dir |
+ + | +
+
+
+ The path where to store the resulting LD matrix. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ A temporary directory to store intermediate files and results. + |
+
+ 'temp'
+ |
+
overwrite |
+ + | +
+
+
+ If True, overwrite any existing LD matrices in |
+
+ True
+ |
+
delete_original |
+ + | +
+
+
+ If True, deletes dense or intermediate LD matrices generated along the way. + |
+
+ True
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix. + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor to use for the LD matrix. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use for the LD matrix (1-9). + |
+
+ 5
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ An instance of |
+
magenpy/stats/ld/estimator.py
compute_ld_boundaries()
+
+¶Compute the per-SNP Linkage-Disequilibrium (LD) boundaries for the block-based estimator.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A 2xM matrix of LD boundaries. + |
+
magenpy/stats/ld/estimator.py
SampleLD
+
+
+¶
+ Bases: object
A basic wrapper class to facilitate computing Linkage-Disequilibrium (LD) matrices.
+Linkage-Disequilibrium (LD) is a measure of the SNP-by-SNP pairwise correlation between +genetic variants in a population. LD tends to decay with genomic distance, and the rate +of decay is influenced by many factors. Therefore, LD matrices are often diagonally-dominant.
+This class SampleLD
provides a basic interface to compute sample correlation coefficient between
+ all variants defined in a genotype matrix. The resulting LD matrix is a square and dense matrix.
For sparse LD matrices, consider using the WindowedLD
, ShrinkageLD
or BlockLD
estimators instead.
See Also + * WindowedLD + * ShrinkageLD + * BlockLD
+Attributes: genotype_matrix: The genotype matrix, an instance of GenotypeMatrix
or its children.
magenpy/stats/ld/estimator.py
4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 |
|
__init__(genotype_matrix)
+
+¶Initialize the LD estimator with a genotype matrix.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ The genotype matrix, an instance of |
+ + required + | +
magenpy/stats/ld/estimator.py
compute(output_dir, temp_dir='temp', overwrite=True, delete_original=True, dtype='int16', compressor_name='lz4', compression_level=5)
+
+¶A utility method to compute the LD matrix and store in Zarr array format. +This method computes the LD matrix, stores it in Zarr array format, sets its attributes, +and performs simple validation at the end.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
output_dir |
+ + | +
+
+
+ The path where to store the resulting LD matrix. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ A temporary directory to store intermediate files and results. + |
+
+ 'temp'
+ |
+
overwrite |
+ + | +
+
+
+ If True, overwrite any existing LD matrices in |
+
+ True
+ |
+
delete_original |
+ + | +
+
+
+ If True, deletes dense or intermediate LD matrices generated along the way. + |
+
+ True
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix (supported data types are float32, float64 and integer quantized data types int8 and int16). + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor to use for the LD matrix. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use for the LD matrix (1-9). + |
+
+ 5
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ An instance of |
+
magenpy/stats/ld/estimator.py
59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 |
|
compute_ld_boundaries()
+
+¶Compute the Linkage-Disequilibrium (LD) boundaries. LD boundaries define the window +for which we compute the correlation coefficient between the focal SNP and all other SNPs in +the genome. Typically, this window is local, since the LD decays exponentially with +genomic distance.
+The LD boundaries are a 2xM matrix, where M is the number of SNPs on the chromosome. +The first row contains the start position for the window and the second row contains +the end position.
+For the sample LD matrix, we simply take the entire square matrix as our window, +so the start position is 0 and end position is M for all SNPs.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A 2xM matrix of LD boundaries. + |
+
magenpy/stats/ld/estimator.py
ShrinkageLD
+
+
+¶
+ Bases: SampleLD
A wrapper class to facilitate computing shrinkage-based Linkage-Disequilibrium (LD) matrices. +Shrinkage LD matrices are a way to reduce noise in the LD matrix by shrinking the off-diagonal pairwise +correlation coefficients towards zero. This is useful for reducing the noise in the LD matrix and +improving the quality of downstream analyses.
+The shrinkage estimator implemented uses the shrinking procedure derived in:
+Wen X, Stephens M. USING LINEAR PREDICTORS TO IMPUTE ALLELE FREQUENCIES FROM SUMMARY OR POOLED GENOTYPE DATA. +Ann Appl Stat. 2010 Sep;4(3):1158-1182. doi: 10.1214/10-aoas338. PMID: 21479081; PMCID: PMC3072818.
+Computing the shrinkage intensity requires specifying the effective population size (Ne) and the sample size +used to infer the genetic map. In addition, it requires specifying a threshold below which the LD is set to zero.
+See Also
+Attributes:
+Name | +Type | +Description | +
---|---|---|
genotype_matrix |
+ + | +
+
+
+ The genotype matrix, an instance of |
+
genetic_map_ne |
+ + | +
+
+
+ The effective population size (Ne) from which the genetic map is derived. + |
+
genetic_map_sample_size |
+ + | +
+
+
+ The sample size used to infer the genetic map. + |
+
threshold |
+ + | +
+
+
+ The shrinkage cutoff below which the LD is set to zero. + |
+
magenpy/stats/ld/estimator.py
284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 |
|
__init__(genotype_matrix, genetic_map_ne, genetic_map_sample_size, threshold=0.001)
+
+¶Initialize the shrinkage LD estimator with a genotype matrix and shrinkage parameters.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ The genotype matrix, an instance of |
+ + required + | +
genetic_map_ne |
+ + | +
+
+
+ The effective population size (Ne) from which the genetic map is derived. + |
+ + required + | +
genetic_map_sample_size |
+ + | +
+
+
+ The sample size used to infer the genetic map. + |
+ + required + | +
threshold |
+ + | +
+
+
+ The shrinkage cutoff below which the LD is set to zero. + |
+
+ 0.001
+ |
+
magenpy/stats/ld/estimator.py
compute(output_dir, temp_dir='temp', overwrite=True, delete_original=True, dtype='int16', compressor_name='lz4', compression_level=5, chunk_size=1000)
+
+¶TODO: Add a mechanism to either automatically adjust the shrinkage threshold depending on the +float precision (dtype) or purge trailing zero entries that got quantized to zero. For example, +if we select a shrinkage threshold of 1e-3 with (int8), then we will have a lot of +trailing zeros stored in the resulting LD matrix. It's better if we got rid of those zeros to +minimize storage requirements and computation time.
+Note
+LD Scores are computed before applying shrinkage.
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
output_dir |
+ + | +
+
+
+ The path where to store the resulting LD matrix. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ A temporary directory to store intermediate files and results. + |
+
+ 'temp'
+ |
+
overwrite |
+ + | +
+
+
+ If True, overwrite any existing LD matrices in |
+
+ True
+ |
+
delete_original |
+ + | +
+
+
+ If True, deletes dense or intermediate LD matrices generated along the way. + |
+
+ True
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix. + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor to use for the LD matrix. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use for the LD matrix (1-9). + |
+
+ 5
+ |
+
chunk_size |
+ + | +
+
+
+ An optional parameter that sets the maximum number of rows processed simultaneously. The smaller the |
+
+ 1000
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ An instance of |
+
magenpy/stats/ld/estimator.py
344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 |
|
compute_ld_boundaries()
+
+¶Compute the shrinkage-based Linkage-Disequilibrium (LD) boundaries.
+ + + +Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A 2xM matrix of LD boundaries. + |
+
magenpy/stats/ld/estimator.py
WindowedLD
+
+
+¶
+ Bases: SampleLD
A wrapper class to facilitate computing windowed Linkage-Disequilibrium (LD) matrices. +Windowed LD matrices only record pairwise correlations between variants that are within a certain +distance of each other along the chromosome. This is useful for reducing the memory requirements +and noise in the LD matrix.
+The WindowedLD
estimator supports a variety of ways for defining the window size:
window_size
: The number of neighboring SNPs to consider on each side when computing LD.
kb_window_size
: The maximum distance in kilobases to consider when computing LD.
cm_window_size
: The maximum distance in centimorgans to consider when computing LD.
The LD boundaries computed here are the intersection of the windows defined by the window size around
+each SNP (window_size
), the window size in kilobases (kb_window_size
), and the window size in centimorgans
+(cm_window_size
).
See Also
+Attributes:
+Name | +Type | +Description | +
---|---|---|
genotype_matrix |
+ + | +
+
+
+ The genotype matrix, an instance of |
+
window_size |
+ + | +
+
+
+ The number of neighboring SNPs to consider on each side when computing LD. + |
+
kb_window_size |
+ + | +
+
+
+ The maximum distance in kilobases to consider when computing LD. + |
+
cm_window_size |
+ + | +
+
+
+ The maximum distance in centi Morgan to consider when computing LD. + |
+
magenpy/stats/ld/estimator.py
139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 |
|
__init__(genotype_matrix, window_size=None, kb_window_size=None, cm_window_size=None)
+
+¶Initialize the windowed LD estimator with a genotype matrix and window size parameters.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ The genotype matrix, an instance of |
+ + required + | +
window_size |
+ + | +
+
+
+ The number of neighboring SNPs to consider on each side when computing LD. + |
+
+ None
+ |
+
kb_window_size |
+ + | +
+
+
+ The maximum distance in kilobases to consider when computing LD. + |
+
+ None
+ |
+
cm_window_size |
+ + | +
+
+
+ The maximum distance in centi Morgan to consider when computing LD. + |
+
+ None
+ |
+
magenpy/stats/ld/estimator.py
compute(output_dir, temp_dir='temp', overwrite=True, delete_original=True, dtype='int16', compressor_name='lz4', compression_level=5)
+
+¶Compute the windowed LD matrix and store in Zarr array format.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
output_dir |
+ + | +
+
+
+ The path where to store the resulting LD matrix. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ A temporary directory to store intermediate files and results. + |
+
+ 'temp'
+ |
+
overwrite |
+ + | +
+
+
+ If True, overwrite any existing LD matrices in |
+
+ True
+ |
+
delete_original |
+ + | +
+
+
+ If True, deletes dense or intermediate LD matrices generated along the way. + |
+
+ True
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix. + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor to use for the LD matrix. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use for the LD matrix (1-9). + |
+
+ 5
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ An instance of |
+
magenpy/stats/ld/estimator.py
compute_ld_boundaries()
+
+¶Compute the windowed Linkage-Disequilibrium (LD) boundaries.
+The LD boundaries computed here are the intersection of the windows defined by
+the window size around each SNP (window_size
), the window size in kilobases (kb_window_size
),
+and the window size in centimorgans (cm_window_size
).
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A 2xM matrix of LD boundaries. + |
+
magenpy/stats/ld/estimator.py
clump_snps(ldm, statistic=None, rsq_threshold=0.9, extract=True, sort_key=None)
+
+¶This function takes an LDMatrix object and clumps SNPs based
+on the statistic
vector (usually p-value) and the provided r-squared threshold.
+If two SNPs have an r-squared greater than the threshold,
+the SNP with the higher statistic
value is excluded.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ldm |
+ + | +
+
+
+ An LDMatrix object + |
+ + required + | +
statistic |
+ + | +
+
+
+ A vector of statistics (e.g. p-values) for each SNP that will determine which SNPs to discard. + |
+
+ None
+ |
+
rsq_threshold |
+ + | +
+
+
+ The r^2 threshold to use for filtering variants. + |
+
+ 0.9
+ |
+
extract |
+ + | +
+
+
+ If True, return remaining SNPs. If False, return removed SNPs. + |
+
+ True
+ |
+
sort_key |
+ + | +
+
+
+ The key function for the sorting algorithm that will decide how to sort the |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A list of SNP rsIDs that are left after clumping (or discarded if |
+
magenpy/stats/ld/utils.py
compute_ld_plink1p9(genotype_matrix, ld_boundaries, output_dir, temp_dir='temp', overwrite=True, dtype='int16', compressor_name='lz4', compression_level=5)
+
+¶Compute LD matrices using plink 1.9.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ A plinkBEDGenotypeMatrix object + |
+ + required + | +
ld_boundaries |
+ + | +
+
+
+ An array of LD boundaries for every SNP + |
+ + required + | +
output_dir |
+ + | +
+
+
+ The output directory for the final LD matrix file (after processing). + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ A temporary directory to store intermediate files (e.g. files created for and by plink). + |
+
+ 'temp'
+ |
+
overwrite |
+ + | +
+
+
+ If True, it overwrites any LD matrices in |
+
+ True
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix (supported data types are float32, float64 and integer quantized data types int8 and int16). + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor to use for the Zarr arrays. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use for the Zarr arrays (1-9). + |
+
+ 5
+ |
+
magenpy/stats/ld/utils.py
253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 |
|
compute_ld_xarray(genotype_matrix, ld_boundaries, output_dir, temp_dir='temp', overwrite=True, delete_original=True, dtype='int16', compressor_name='lz4', compression_level=5)
+
+¶Compute the Linkage Disequilibrium matrix or snp-by-snp
+correlation matrix assuming that the genotypes are represented
+by xarray
or dask
-like matrix objects. This function computes the
+entire X'X/N and stores the result on-disk in Zarr arrays. Then, we call the utilities
+from the LDMatrix
class to sparsify the dense matrix according to the parameters
+specified by the ld_boundaries
matrix.
NOTE: We don't recommend using this for large-scale genotype matrices.
+Use compute_ld_plink1p9
instead if you have plink installed on your system.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ An |
+ + required + | +
ld_boundaries |
+ + | +
+
+
+ An array of LD boundaries for every SNP + |
+ + required + | +
output_dir |
+ + | +
+
+
+ The output directory for the final LD matrix file. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ A temporary directory where to store intermediate results. + |
+
+ 'temp'
+ |
+
overwrite |
+ + | +
+
+
+ If True, overwrites LD matrices in |
+
+ True
+ |
+
delete_original |
+ + | +
+
+
+ If True, it deletes the original dense matrix after generating the sparse alternative. + |
+
+ True
+ |
+
dtype |
+ + | +
+
+
+ The data type for the entries of the LD matrix (supported data types are float32, float64 and integer quantized data types int8 and int16). + |
+
+ 'int16'
+ |
+
compressor_name |
+ + | +
+
+
+ The name of the compressor to use for the Zarr arrays. + |
+
+ 'lz4'
+ |
+
compression_level |
+ + | +
+
+
+ The compression level to use for the Zarr arrays (1-9). + |
+
+ 5
+ |
+
magenpy/stats/ld/utils.py
379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 |
|
delete_ld_store(ld_mat)
+
+¶Delete the LD store from disk.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ld_mat |
+ + | +
+
+
+ An LDMatrix object + |
+ + required + | +
estimate_rows_per_chunk(rows, cols, dtype='int16', mem_size=128)
+
+¶Estimate the number of rows per chunk for matrices conditional on the desired size of the chunk in MB. +The estimator takes as input the number of rows, columns, data type, and projected size of the chunk in memory.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
rows |
+ + | +
+
+
+ Total number of rows in the matrix. + |
+ + required + | +
cols |
+ + | +
+
+
+ Total number of columns. If sparse matrix with uneven columns, provide average column size. + |
+ + required + | +
dtype |
+ + | +
+
+
+ The data type for the matrix entries. + |
+
+ 'int16'
+ |
+
mem_size |
+ + | +
+
+
+ Size of the chunk in memory (MB) + |
+
+ 128
+ |
+
magenpy/stats/ld/utils.py
expand_snps(seed_snps, ldm, rsq_threshold=0.9)
+
+¶Given an initial set of SNPs, expand the set by adding +"neighbors" whose squared correlation with the seed SNPs is higher than +a user-specified threshold.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
seed_snps |
+ + | +
+
+
+ An iterable containing initial set of SNP rsIDs. + |
+ + required + | +
ldm |
+ + | +
+
+
+ An |
+ + required + | +
rsq_threshold |
+ + | +
+
+
+ The r^2 threshold to use for including variants. + |
+
+ 0.9
+ |
+
magenpy/stats/ld/utils.py
move_ld_store(z_arr, target_path, overwrite=True)
+
+¶Move an LD store from its current path to the target_path
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
z_arr |
+ + | +
+
+
+ An LDMatrix object + |
+ + required + | +
target_path |
+ + | +
+
+
+ The target path where to move the LD store + |
+ + required + | +
overwrite |
+ + | +
+
+
+ If True, overwrites the target path if it exists. + |
+
+ True
+ |
+
magenpy/stats/ld/utils.py
shrink_ld_matrix(ld_mat_obj, cm_pos, maf_var, genmap_ne, genmap_sample_size, shrinkage_cutoff=0.001, phased_haplotype=False, chunk_size=1000)
+
+¶Shrink the entries of the LD matrix using the shrinkage estimator +described in Lloyd-Jones (2019) and Wen and Stephens (2010). The estimator +is also implemented in the RSS software by Xiang Zhu:
+https://github.com/stephenslab/rss/blob/master/misc/get_corr.R
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ld_mat_obj |
+ + | +
+
+
+ An |
+ + required + | +
cm_pos |
+ + | +
+
+
+ The position of each variant in the LD matrix in centiMorgans (cM). + |
+ + required + | +
maf_var |
+ + | +
+
+
+ A vector of the variance in minor allele frequency (MAF) for each SNP in the LD matrix. Should be equivalent to 2 * p_j * (1 - p_j), where p_j is the MAF of SNP j. + |
+ + required + | +
genmap_ne |
+ + | +
+
+
+ The effective population size for the genetic map. + |
+ + required + | +
genmap_sample_size |
+ + | +
+
+
+ The sample size used to estimate the genetic map. + |
+ + required + | +
shrinkage_cutoff |
+ + | +
+
+
+ The cutoff value below which we assume that the shrinkage factor is zero. + |
+
+ 0.001
+ |
+
phased_haplotype |
+ + | +
+
+
+ A flag indicating whether the LD was calculated from phased haplotypes. + |
+
+ False
+ |
+
chunk_size |
+ + | +
+
+
+ An optional parameter that sets the maximum number of rows processed simultaneously. The smaller the |
+
+ 1000
+ |
+
magenpy/stats/ld/utils.py
127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 |
|
score_plink2(genotype_matrix, betas, standardize_genotype=False, temp_dir='temp')
+
+¶Perform linear scoring using PLINK2. +This function takes a genotype matrix object encapsulating and referencing +plink BED files as well as a matrix of effect sizes (betas) and performs +linear scoring of the form:
+y = X * betas
+This is useful for computing polygenic scores (PGS). The function supports
+a matrix of beta
values, in which case the function returns a matrix of
+PGS values, one for each column of beta
. For example, if there are 10 sets
+of betas, the function will compute 10 polygenic scores for each individual represented
+in the genotype matrix X
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ An instance of |
+ + required + | +
betas |
+ + | +
+
+
+ A matrix of effect sizes (betas). + |
+ + required + | +
standardize_genotype |
+ + | +
+
+
+ If True, standardize the genotype to have mean zero and unit variance before scoring. + |
+
+ False
+ |
+
temp_dir |
+ + | +
+
+
+ The directory where the temporary files will be stored. + |
+
+ 'temp'
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A numpy array of polygenic scores. + |
+
magenpy/stats/score/utils.py
6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 |
|
standardize(g_mat, fill_na=True)
+
+¶Standardize the genotype matrix, such that the columns (i.e. snps) +have zero mean and unit variance.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
g_mat |
+ + | +
+
+
+ A two-dimensional matrix (numpy, dask, xarray, etc.) where the rows are samples (individuals) and the columns are genetic variants. + |
+ + required + | +
fill_na |
+ + | +
+
+
+ If true, fill the missing values with zero after standardizing. + |
+
+ True
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The standardized genotype matrix. + |
+
magenpy/stats/transforms/genotype.py
adjust_for_covariates(phenotype, covariates)
+
+¶This function takes a phenotype vector and a matrix of covariates +and applies covariate correction on the phenotype. Concretely, +this involves fitting a linear model where the response is the +phenotype and the predictors are the covariates and then returning +the residuals.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
phenotype |
+ + | +
+
+
+ A vector of continuous or quantitative phenotypes. + |
+ + required + | +
covariates |
+ + | +
+
+
+ A matrix where each row corresponds to an individual and each column corresponds to a covariate (e.g. age, sex, PCs, etc.). The function returns the residuals of the linear model fit. + |
+ + required + | +
magenpy/stats/transforms/phenotype.py
chained_transform(sample_table, adjust_covariates=False, standardize_phenotype=False, rint_phenotype=False, outlier_sigma_threshold=None, transform_order=('standardize', 'covariate_adjust', 'rint', 'outlier_removal'))
+
+¶Apply a chain of transformations to the phenotype vector.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sample_table |
+ + | +
+
+
+ An instance of SampleTable that contains phenotype information and other covariates about the samples in the dataset. + |
+ + required + | +
adjust_covariates |
+ + | +
+
+
+ If true, regress out the covariates from the phenotype. By default, we regress out all the covariates present in the SampleTable. + |
+
+ False
+ |
+
standardize_phenotype |
+ + | +
+
+
+ If true, standardize the phenotype. + |
+
+ False
+ |
+
rint_phenotype |
+ + | +
+
+
+ If true, apply Rank-based inverse normal transform. + |
+
+ False
+ |
+
outlier_sigma_threshold |
+ + | +
+
+
+ The multiple of standard deviations or sigmas after which we consider the phenotypic value an outlier. + |
+
+ None
+ |
+
transform_order |
+ + | +
+
+
+ A tuple specifying the order in which to apply the transformations. By default, the order is standardize, covariate_adjust, rint, and outlier_removal. + |
+
+ ('standardize', 'covariate_adjust', 'rint', 'outlier_removal')
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The transformed phenotype vector and a boolean mask indicating the samples that were not removed. + |
+
magenpy/stats/transforms/phenotype.py
detect_outliers(phenotype, sigma_threshold=5)
+
+¶Detect samples with outlier phenotype values.
+This function takes a vector of quantitative phenotypes,
+computes the z-score for every individual, and returns a
+boolean vector indicating whether individual i has phenotype value
+within the specified standard deviations sigma_threshold
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
phenotype |
+ + | +
+
+
+ A numpy vector of continuous or quantitative phenotypes. + |
+ + required + | +
sigma_threshold |
+ + | +
+
+
+ The multiple of standard deviations or sigmas after which we consider the phenotypic value an outlier. + |
+
+ 5
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A boolean array indicating, for each individual, whether the phenotype value is within sigma_threshold standard deviations of the mean (i.e. not an outlier). + |
+
magenpy/stats/transforms/phenotype.py
rint(phenotype, offset=3.0 / 8)
+
+¶Apply Rank-based inverse normal transform on the phenotype.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
phenotype |
+ + | +
+
+
+ A vector of continuous or quantitative phenotypes. + |
+ + required + | +
offset |
+ + | +
+
+
+ The offset to use in the INT transformation (Blom's offset by default). + |
+
+ 3.0 / 8
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The RINT-transformed phenotype. + |
+
magenpy/stats/transforms/phenotype.py
standardize(phenotype)
+
+¶Standardize the phenotype vector to have mean zero and unit variance
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
phenotype |
+ + | +
+
+
+ A numpy vector of continuous or quantitative phenotypes. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The standardized phenotype array. + |
+
magenpy/stats/transforms/phenotype.py
compute_allele_frequency_plink2(genotype_matrix, temp_dir='temp')
+
+¶Compute the allele frequency for each SNP in the genotype matrix using PLINK2.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ A GenotypeMatrix object. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ The temporary directory where to store intermediate files. + |
+
+ 'temp'
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A numpy array of allele frequencies. + |
+
magenpy/stats/variant/utils.py
compute_sample_size_per_snp_plink2(genotype_matrix, temp_dir='temp')
+
+¶Compute the sample size per SNP in the genotype matrix using PLINK2.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genotype_matrix |
+ + | +
+
+
+ A GenotypeMatrix object. + |
+ + required + | +
temp_dir |
+ + | +
+
+
+ The temporary directory where to store intermediate files. + |
+
+ 'temp'
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ A numpy array of sample sizes per SNP. + |
+
magenpy/stats/variant/utils.py
generate_slice_dictionary(vec)
+
+¶This utility function takes a sorted vector (e.g. numpy array), +identifies the unique elements and generates a dictionary of slices +delineating the start and end positions of each element in the vector.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
vec |
+ + | +
+
+
+ A numpy array + |
+ + required + | +
magenpy/utils/compute_utils.py
intersect_arrays(arr1, arr2, return_index=False)
+
+¶This utility function takes two arrays and returns the shared +elements (intersection) between them. If return_index is set to True, +it returns the index of shared elements in the first array.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
arr1 |
+ + | +
+
+
+ The first array + |
+ + required + | +
arr2 |
+ + | +
+
+
+ The second array + |
+ + required + | +
return_index |
+ + | +
+
+
+ Return the index of shared elements in the first array + |
+
+ False
+ |
+
magenpy/utils/compute_utils.py
iterable(arg)
+
+¶Check if an object is iterable (but not a string).
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
arg |
+ + | +
+
+
+ A python object. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ True if the object is iterable, False otherwise. + |
+
magenpy/utils/compute_utils.py
tgp_eur_data_path()
+
+¶Return the path of the attached 1000G genotype data for +European samples (N=378) and a subset of chromosome 22 (p=15938)
+ +magenpy/utils/data_utils.py
ukb_height_sumstats_path()
+
+¶Return the path of the attached GWAS summary statistics file +for standing height. The file contains summary statistics for +HapMap3 variants on CHR22 and is a snapshot of the summary statistics +published on the fastGWA database: +https://yanglab.westlake.edu.cn/data/fastgwa_data/UKB/50.v1.1.fastGWA.gz
+ +magenpy/utils/data_utils.py
plink1Executor
+
+
+¶
+ Bases: object
A wrapper class for interfacing with the plink1.9
command line tool.
magenpy/utils/executors.py
__init__(threads='auto', verbose=True)
+
+¶Initialize the plink1.9 executor
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
threads |
+
+ int | str
+ |
+
+
+
+ The number of threads to use for computations. If set to 'auto', the number of available CPUs will be used. + |
+
+ 'auto'
+ |
+
verbose |
+
+ bool
+ |
+
+
+
+ Whether to print the output of the command + |
+
+ True
+ |
+
magenpy/utils/executors.py
execute(cmd)
+
+¶Execute a plink command
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
cmd |
+
+ list of strings
+ |
+
+
+
+ The flags to pass to plink. For example, ['--bfile', 'file', '--out', 'output'] + |
+ + required + | +
magenpy/utils/executors.py
plink2Executor
+
+
+¶
+ Bases: object
A wrapper class for interfacing with the plink2
command line tool.
magenpy/utils/executors.py
__init__(threads='auto', verbose=True)
+
+¶Initialize the plink2 executor
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
threads |
+
+ int | str
+ |
+
+
+
+ The number of threads to use for computations. If set to 'auto', the number of available CPUs will be used. + |
+
+ 'auto'
+ |
+
verbose |
+
+ bool
+ |
+
+
+
+ Whether to print the output of the command + |
+
+ True
+ |
+
magenpy/utils/executors.py
execute(cmd)
+
+¶Execute a plink2
command
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
cmd |
+
+ list of strings
+ |
+
+
+
+ The flags to pass to plink2. For example, ['--bfile', 'file', '--out', 'output'] + |
+ + required + | +
magenpy/utils/executors.py
dequantize(ints, float_dtype=np.float32)
+
+¶Dequantize integers to the specified floating point type. +NOTE: Assumes original floats are in the range [-1, 1].
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ints |
+ + | +
+
+
+ A numpy array of integers + |
+ + required + | +
float_dtype |
+ + | +
+
+
+ The floating point type to dequantize to. + |
+
+ float32
+ |
+
magenpy/utils/model_utils.py
get_shared_distance_matrix(tree, tips=None)
+
+¶This function takes a Biopython tree and returns the +shared distance matrix, i.e. for a pair of clades or populations, +time to most recent common ancestor of the pair minus the time of +the most recent common ancestor (MRCA).
+ +magenpy/utils/model_utils.py
identify_mismatched_snps(gdl, chrom=None, n_iter=10, G=100, p_dentist_threshold=5e-08, p_gwas_threshold=0.01, rsq_threshold=0.95, max_removed_per_iter=0.005)
+
+¶This function implements a simple quality control procedure +that checks that the GWAS summary statistics (Z-scores) +are consistent with the LD reference panel. This is done +using a simplified version of the framework outlined in the DENTIST paper:
+Improved analyses of GWAS summary statistics by reducing data heterogeneity and errors +Chen et al. 2021
+Compared to DENTIST, the simplifications we make are:
+ - For each SNP, we sample one neighboring SNP at a time and compute the T statistic
+ using that neighbor's information. The benefit of this is that we don't need to
+ invert any matrices, so it's a fast operation to run.
+ - To arrive at a more robust estimate, we sample up to k
neighbors and average
+ the T-statistic across those k
neighbors.
NOTE: May need to re-implement this to apply some of the constraints genome-wide +rather than on a per-chromosome basis.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
gdl |
+ + | +
+
+
+ A |
+ + required + | +
chrom |
+ + | +
+
+
+ Perform checking only on chromosome |
+
+ None
+ |
+
n_iter |
+ + | +
+
+
+ Number of iterations + |
+
+ 10
+ |
+
G |
+ + | +
+
+
+ The number of neighboring SNPs to sample (default: 100) + |
+
+ 100
+ |
+
p_dentist_threshold |
+ + | +
+
+
+ The Bonferroni-corrected P-value threshold (default: 5e-8) + |
+
+ 5e-08
+ |
+
p_gwas_threshold |
+ + | +
+
+
+ The nominal GWAS P-value threshold for partitioning variants (default: 1e-2) + |
+
+ 0.01
+ |
+
rsq_threshold |
+ + | +
+
+
+ The R^2 threshold to select neighbors (neighbor's squared correlation coefficient must be less than specified threshold). + |
+
+ 0.95
+ |
+
max_removed_per_iter |
+ + | +
+
+
+ The maximum proportion of variants removed in each iteration + |
+
+ 0.005
+ |
+
magenpy/utils/model_utils.py
169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 |
|
match_chromosomes(chrom_1, chrom_2, check_patterns=('chr_', 'chr:', 'chr'), return_both=False)
+
+¶Given two lists of chromosome IDs, this function returns the
+chromosomes that are common to both lists. By default, the returned chromosomes
+follow the data type and order of the first list. If return_both
is set to True,
+the function returns the common chromosomes in both lists.
The function also accounts for common ways to encode chromosomes, such as +chr18, chr_18, 18, etc.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
chrom_1 |
+ + | +
+
+
+ A list or numpy array of chromosome IDs + |
+ + required + | +
chrom_2 |
+ + | +
+
+
+ A list or numpy array of chromosome IDs + |
+ + required + | +
check_patterns |
+ + | +
+
+
+ A list of patterns to check for and replace in the chromosome IDs + |
+
+ ('chr_', 'chr:', 'chr')
+ |
+
return_both |
+ + | +
+
+
+ If True, return the common chromosomes in both lists + |
+
+ False
+ |
+
magenpy/utils/model_utils.py
merge_snp_tables(ref_table, alt_table, how='inner', on='auto', signed_statistics=('BETA', 'STD_BETA', 'Z'), drop_duplicates=True, correct_flips=True, return_ref_indices=False, return_alt_indices=False)
+
+¶This function takes a reference SNP table with at least 3 columns ('SNP', 'A1', A2
)
+and matches it with an alternative table that also has these 3 columns defined. In the most recent
+implementation, we allow users to merge on any set of columns that they wish by specifying the on
+parameter. For example, instead of SNP
, the user can join the SNP tables on CHR
and POS
, the
+chromosome number and base pair position of the SNP.
The manner in which the join operation takes place depends on the how
argument.
+Currently, the function supports inner
and left
joins.
The function removes duplicates if drop_duplicates
parameter is set to True
If correct_flips
is set to True, the function will correct summary statistics in
+the alternative table alt_table
(e.g. BETA, MAF) based on whether the A1 alleles agree between the two tables.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ref_table |
+ + | +
+
+
+ The reference table (pandas dataframe). + |
+ + required + | +
alt_table |
+ + | +
+
+
+ The alternative table (pandas dataframe) + |
+ + required + | +
how |
+ + | +
+
+
+
|
+
+ 'inner'
+ |
+
on |
+ + | +
+
+
+ Which columns to use as anchors when merging. By default, we automatically infer which columns to use, but the user can specify this directly. When |
+
+ 'auto'
+ |
+
signed_statistics |
+ + | +
+
+
+ The columns with signed statistics to flip if |
+
+ ('BETA', 'STD_BETA', 'Z')
+ |
+
drop_duplicates |
+ + | +
+
+
+ Drop duplicate SNPs + |
+
+ True
+ |
+
correct_flips |
+ + | +
+
+
+ Correct SNP summary statistics that depend on status of alternative allele + |
+
+ True
+ |
+
return_ref_indices |
+ + | +
+
+
+ Return the indices of the remaining entries in the reference table before merging. + |
+
+ False
+ |
+
return_alt_indices |
+ + | +
+
+
+ Return the indices of the remaining entries in the alternative table before merging. + |
+
+ False
+ |
+
magenpy/utils/model_utils.py
52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 |
|
multinomial_rvs(n, p)
+
+¶Copied from Warren Weckesser: +https://stackoverflow.com/a/55830796
+Sample from the multinomial distribution with multiple p vectors.
+The return value has the same shape as p.
+ +magenpy/utils/model_utils.py
quantize(floats, int_dtype=np.int8)
+
+¶Quantize floating point numbers to the specified integer type. +NOTE: Assumes that the floats are in the range [-1, 1].
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
floats |
+ + | +
+
+
+ A numpy array of floats + |
+ + required + | +
int_dtype |
+ + | +
+
+
+ The integer type to quantize to. + |
+
+ int8
+ |
+
magenpy/utils/model_utils.py
tree_to_rho(tree, min_corr)
+
+¶This function takes a Biopython tree and a minimum correlation +parameter and returns the correlation matrix for the effect sizes +across populations.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
tree |
+ + | +
+
+
+ a Biopython Phylo object + |
+ + required + | +
min_corr |
+ + | +
+
+
+ minimum correlation + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+
+ |
+
magenpy/utils/model_utils.py
available_cpu()
+
+¶delete_temp_files(prefix)
+
+¶Delete temporary files with the given prefix
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
prefix |
+ + | +
+
+
+ A string with the prefix of the temporary files to delete. + |
+ + required + | +
magenpy/utils/system_utils.py
get_filenames(path, extension=None)
+
+¶Obtain valid and full path names given the provided path
or prefix and extensions.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
path |
+ + | +
+
+
+ A string with the path prefix or full path. + |
+ + required + | +
extension |
+ + | +
+
+
+ The extension for the class of files to search for. + |
+
+ None
+ |
+
magenpy/utils/system_utils.py
get_memory_usage()
+
+¶Get the memory usage of the current process in megabytes (MB).
+ + +is_cmd_tool(name)
+
+¶Check whether name
is on PATH and marked as executable.
+From: https://stackoverflow.com/a/34177358
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
name |
+ + | +
+
+
+ A string with the name of the command-line tool. + |
+ + required + | +
magenpy/utils/system_utils.py
is_path_writable(path)
+
+¶Check whether the user has write-access to the provided path
.
+This function supports checking for nested directories (i.e.,
+we iterate upwards until finding a parent directory that currently
+exists, and we check the write-access of that directory).
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
path |
+ + | +
+
+
+ A string with the path to check. + |
+ + required + | +
magenpy/utils/system_utils.py
makedir(dirs)
+
+¶Create directories on the filesystem, recursively.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
dirs |
+ + | +
+
+
+ A string or list of strings with the paths to create. + |
+ + required + | +
magenpy/utils/system_utils.py
run_shell_script(cmd)
+
+¶Run the shell script given the command prompt in cmd
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
cmd |
+ + | +
+
+
+ A string with the shell command to run. + |
+ + required + | +
magenpy/utils/system_utils.py
valid_url(path)
+
+¶Check whether the provided path
is a valid URL.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
path |
+ + | +
+
+
+ A string with the URL to check. + |
+ + required + | +
magenpy/utils/system_utils.py
{"use strict";/*!
+ * escape-html
+ * Copyright(c) 2012-2013 TJ Holowaychuk
+ * Copyright(c) 2015 Andreas Lubbe
+ * Copyright(c) 2015 Tiancheng "Timothy" Gu
+ * MIT Licensed
+ */var Va=/["'&<>]/;qn.exports=za;function za(e){var t=""+e,r=Va.exec(t);if(!r)return t;var o,n="",i=0,a=0;for(i=r.index;i