# This file contains the manifest of SparkSQL functions provided by Glow. The definitions in this
# file are used to generate Scala and Python APIs.
#
# The top level names are groups of functions. These groups primarily matter for documentation. Each
# group contains an ordered list of functions. Most of the parameters are self-evident.
#
# Potentially confusing function options are:
# - expr_class: The class name of the expression in Glow that implements the function. This value is
# *not* user-visible. It's used only for client generation.
# - exclude_python: If true, the function is not exposed via the Python API.
# - exclude_scala: If true, the function is not exposed via the Scala API.
#
# Potentially confusing argument options are:
# - type: The argument's type. This can currently be one of [str, double, lambda1, lambda2]. These
# types do not correspond to any real programming language; they are just placeholders used for
# client generation. More types can be added as needed. If an argument does not have an explicit
# type, it is assumed to be of Spark SQL Column type.
# - is_var_args: If true, the parameter can be repeated. Only the last argument in the args list
# can be var args.
# - is_optional: If true, the parameter can be omitted. Only the last argument in the args list can
# be optional.
#
# Warning: Spark version changes can modify the expected output. Known changes:
# - PYSPARK_ROW_FIELD_SORTING_ENABLED was introduced in Spark 3.0, changing the schema ordering
#
# For more details about how the definitions in this file are eventually turned into clients,
# consult the generator script at project/render_template.py.
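#
# For illustration only, a hypothetical entry that exercises most of these options could look
# like the sketch below; the group, function, class, and argument names are made up and do not
# correspond to any real Glow function:
#
#   my_group:
#     functions:
#       - name: my_function
#         doc: One-line description that appears in the generated docs.
#         since: 1.0.0
#         expr_class: io.projectglow.sql.expressions.MyFunctionExpr
#         examples:
#           python: |
#             >>> df.select(glow.my_function('input_col', 'an option', 0.5)).collect()
#         args:
#           - name: input
#             doc: A Spark SQL Column (no explicit type)
#           - name: option
#             doc: A string option
#             type: str
#           - name: threshold
#             doc: An optional double; must be the last argument in the list
#             type: double
#             is_optional: true
#         returns: A description of the result column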
complex_type_manipulation:
functions:
- name: add_struct_fields
doc: Adds fields to a struct.
since: 0.3.0
examples:
python: |
>>> df = spark.createDataFrame([Row(struct=Row(a=1))])
>>> df.select(glow.add_struct_fields('struct', lit('b'), lit(2)).alias('struct')).collect()
[Row(struct=Row(a=1, b=2))]
expr_class: io.projectglow.sql.expressions.AddStructFields
args:
- name: struct
doc: The struct to which fields will be added
- name: fields
doc: The new fields to add. The arguments must alternate between string-typed literal field names and field values.
is_var_args: true
returns: A struct consisting of the input struct and the added fields
- name: array_summary_stats
      doc: Computes the minimum, maximum, mean, and standard deviation for an array of numerics.
since: 0.3.0
examples:
python: |
>>> df = spark.createDataFrame([Row(arr=[1, 2, 3])])
>>> df.select(glow.expand_struct(glow.array_summary_stats('arr'))).collect()
[Row(mean=2.0, stdDev=1.0, min=1.0, max=3.0)]
expr_class: io.projectglow.sql.expressions.ArrayStatsSummary
args:
- name: arr
doc: An array of any numeric type
returns: A struct containing double ``mean``, ``stdDev``, ``min``, and ``max`` fields
- name: array_to_dense_vector
doc: Converts an array of numerics into a ``spark.ml`` ``DenseVector``.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.ArrayToDenseVector
examples:
python: |
>>> from pyspark.ml.linalg import DenseVector
>>> df = spark.createDataFrame([Row(arr=[1, 2, 3])])
>>> df.select(glow.array_to_dense_vector('arr').alias('v')).collect()
[Row(v=DenseVector([1.0, 2.0, 3.0]))]
args:
- name: arr
doc: The array of numerics
returns: A ``spark.ml`` ``DenseVector``
- name: array_to_sparse_vector
doc: Converts an array of numerics into a ``spark.ml`` ``SparseVector``.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.ArrayToSparseVector
examples:
python: |
>>> from pyspark.ml.linalg import SparseVector
>>> df = spark.createDataFrame([Row(arr=[1, 0, 2, 0, 3, 0])])
>>> df.select(glow.array_to_sparse_vector('arr').alias('v')).collect()
[Row(v=SparseVector(6, {0: 1.0, 2: 2.0, 4: 3.0}))]
args:
- name: arr
doc: The array of numerics
returns: A ``spark.ml`` ``SparseVector``
- name: expand_struct
doc: Promotes fields of a nested struct to top-level columns similar to using ``struct.*`` from SQL, but can be used in more contexts.
since: 0.3.0
examples:
python: |
>>> df = spark.createDataFrame([Row(struct=Row(a=1, b=2))])
>>> df.select(glow.expand_struct(col('struct'))).collect()
[Row(a=1, b=2)]
expr_class: io.projectglow.sql.expressions.ExpandStruct
args:
- name: struct
doc: The struct to expand
returns: Columns corresponding to fields of the input struct
- name: explode_matrix
doc: Explodes a ``spark.ml`` ``Matrix`` (sparse or dense) into multiple arrays, one per row of the matrix.
since: 0.3.0
examples:
python: |
>>> from pyspark.ml.linalg import DenseMatrix
>>> m = DenseMatrix(numRows=3, numCols=2, values=[1, 2, 3, 4, 5, 6])
>>> df = spark.createDataFrame([Row(matrix=m)])
>>> df.select(glow.explode_matrix('matrix').alias('row')).collect()
[Row(row=[1.0, 4.0]), Row(row=[2.0, 5.0]), Row(row=[3.0, 6.0])]
expr_class: io.projectglow.sql.expressions.ExplodeMatrix
args:
- name: matrix
          doc: The ``spark.ml`` ``Matrix`` to explode
returns: An array column in which each row is a row of the input matrix
- name: subset_struct
doc: Selects fields from a struct.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.SubsetStruct
examples:
python: |
>>> df = spark.createDataFrame([Row(struct=Row(a=1, b=2, c=3))])
>>> df.select(glow.subset_struct('struct', 'a', 'c').alias('struct')).collect()
[Row(struct=Row(a=1, c=3))]
args:
- name: struct
doc: Struct from which to select fields
- name: fields
doc: Fields to select
type: str
is_var_args: true
returns: A struct containing only the indicated fields
- name: vector_to_array
doc: Converts a ``spark.ml`` ``Vector`` (sparse or dense) to an array of doubles.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.VectorToArray
examples:
python: |
>>> from pyspark.ml.linalg import DenseVector, SparseVector
>>> df = spark.createDataFrame([Row(v=SparseVector(3, {0: 1.0, 2: 2.0})), Row(v=DenseVector([3.0, 4.0]))])
>>> df.select(glow.vector_to_array('v').alias('arr')).collect()
[Row(arr=[1.0, 0.0, 2.0]), Row(arr=[3.0, 4.0])]
args:
- name: vector
doc: Vector to convert
returns: An array of doubles
etl:
functions:
- name: hard_calls
doc: Converts an array of probabilities to hard calls. The probabilities are assumed to be diploid. See :ref:`variant-data-transformations`
for more details.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.HardCalls
examples:
python: |
>>> df = spark.createDataFrame([Row(probs=[0.95, 0.05, 0.0])])
>>> df.select(glow.hard_calls('probs', numAlts=lit(1), phased=lit(False)).alias('calls')).collect()
[Row(calls=[0, 0])]
>>> df = spark.createDataFrame([Row(probs=[0.05, 0.95, 0.0])])
>>> df.select(glow.hard_calls('probs', numAlts=lit(1), phased=lit(False)).alias('calls')).collect()
[Row(calls=[0, 1])]
>>> # Use the threshold parameter to change the minimum probability required for a call
>>> df = spark.createDataFrame([Row(probs=[0.05, 0.95, 0.0])])
>>> df.select(glow.hard_calls('probs', numAlts=lit(1), phased=lit(False), threshold=0.99).alias('calls')).collect()
[Row(calls=[-1, -1])]
args:
- name: probabilities
doc: The array of probabilities to convert
- name: numAlts
doc: The number of alternate alleles
- name: phased
          doc: Whether the probabilities are phased. If phased, we expect ``2 * numAlts`` values
            in the probabilities array. If unphased, we expect one probability per possible genotype.
- name: threshold
doc: The minimum probability to make a call. If no probability falls into the range
            of ``[0, 1 - threshold]`` or ``[threshold, 1]``, a no-call (represented by ``-1`` values) will be emitted.
If not provided, this parameter defaults to ``0.9``.
type: double
is_optional: true
returns: An array of hard calls
- name: lift_over_coordinates
doc: Performs liftover for the coordinates of a variant. To perform liftover of alleles and add additional metadata, see :ref:`liftover`.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.LiftOverCoordinatesExpr
examples:
python: |
>>> df = spark.read.format('vcf').load('test-data/liftover/unlifted.test.vcf').where('start = 18210071')
>>> chain_file = 'test-data/liftover/hg38ToHg19.over.chain.gz'
>>> reference_file = 'test-data/liftover/hg19.chr20.fa.gz'
>>> df.select('contigName', 'start', 'end').head()
Row(contigName='chr20', start=18210071, end=18210072)
>>> lifted_df = df.select(glow.expand_struct(glow.lift_over_coordinates('contigName', 'start', 'end', chain_file)))
>>> lifted_df.head()
Row(contigName='chr20', start=18190715, end=18190716)
args:
- name: contigName
doc: The current contig name
- name: start
doc: The current start
- name: end
doc: The current end
- name: chainFile
doc: Location of the chain file on each node in the cluster
type: str
- name: minMatchRatio
doc: Minimum fraction of bases that must remap to do liftover successfully. If not provided, defaults to ``0.95``.
type: double
is_optional: true
returns: A struct containing ``contigName``, ``start``, and ``end`` fields after liftover
- name: normalize_variant
      doc: |
        Normalizes the variant with a behavior similar to vt normalize or bcftools norm.
        Creates a StructType column including the normalized ``start``, ``end``, ``referenceAllele`` and
        ``alternateAlleles`` fields (whether they are changed or unchanged as the result of
        normalization) as well as a StructType field called ``normalizationStatus`` that
        contains the following fields:

          ``changed``: A boolean field indicating whether the variant data was changed as a result of normalization

          ``errorMessage``: An error message in case the attempt at normalizing the row hit an error. In this case, the ``changed`` field will be set to ``false``. If no errors occur, this field will be ``null``.

        In case of an error, the ``start``, ``end``, ``referenceAllele`` and ``alternateAlleles`` fields in the generated struct will be ``null``.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.NormalizeVariantExpr
examples:
python: |
>>> df = spark.read.format('vcf').load('test-data/variantsplitternormalizer-test/test_left_align_hg38_altered.vcf')
>>> ref_genome = 'test-data/variantsplitternormalizer-test/Homo_sapiens_assembly38.20.21_altered.fasta'
>>> df.select('contigName', 'start', 'end', 'referenceAllele', 'alternateAlleles').head()
Row(contigName='chr20', start=400, end=401, referenceAllele='G', alternateAlleles=['GATCTTCCCTCTTTTCTAATATAAACACATAAAGCTCTGTTTCCTTCTAGGTAACTGGTTTGAG'])
>>> normalized_df = df.select('contigName', glow.expand_struct(glow.normalize_variant('contigName', 'start', 'end', 'referenceAllele', 'alternateAlleles', ref_genome)))
>>> normalized_df.head()
Row(contigName='chr20', start=268, end=269, referenceAllele='A', alternateAlleles=['ATTTGAGATCTTCCCTCTTTTCTAATATAAACACATAAAGCTCTGTTTCCTTCTAGGTAACTGG'], normalizationStatus=Row(changed=True, errorMessage=None))
args:
- name: contigName
doc: The current contig name
- name: start
doc: The current start
- name: end
doc: The current end
- name: refAllele
doc: The current reference allele
- name: altAlleles
doc: The current array of alternate alleles
- name: refGenomePathString
doc: |
A path to the reference genome ``.fasta`` file. The ``.fasta`` file must be accompanied with a ``.fai`` index file in the same folder.
type: str
returns: A struct as explained above
- name: mean_substitute
doc: Substitutes the missing values of a numeric array using the mean of the non-missing values. Any values that
are NaN, null or equal to the missing value parameter are considered missing. See
:ref:`variant-data-transformations` for more details.
since: 0.4.0
expr_class: io.projectglow.sql.expressions.MeanSubstitute
examples:
python: |
>>> df = spark.createDataFrame([Row(unsubstituted_values=[float('nan'), None, 0.0, 1.0, 2.0, 3.0, 4.0])])
>>> df.select(glow.mean_substitute('unsubstituted_values', lit(0.0)).alias('substituted_values')).collect()
[Row(substituted_values=[2.5, 2.5, 2.5, 1.0, 2.0, 3.0, 4.0])]
>>> df = spark.createDataFrame([Row(unsubstituted_values=[0, 1, 2, 3, -1, None])])
>>> df.select(glow.mean_substitute('unsubstituted_values').alias('substituted_values')).collect()
[Row(substituted_values=[0.0, 1.0, 2.0, 3.0, 1.5, 1.5])]
args:
- name: array
doc: A numeric array that may contain missing values
- name: missingValue
doc: A value that should be considered missing. If not provided, this parameter defaults to ``-1``.
is_optional: true
returns: A numeric array with substituted missing values
quality_control:
functions:
- name: call_summary_stats
doc: Computes call summary statistics for an array of genotype structs. See :ref:`variant-qc` for more details.
since: 0.3.0
examples:
python: |
>>> schema = 'genotypes: array<struct<calls: array<int>>>'
>>> df = spark.createDataFrame([Row(genotypes=[Row(calls=[0, 0]), Row(calls=[1, 0]), Row(calls=[1, 1])])], schema)
>>> df.select(glow.expand_struct(glow.call_summary_stats('genotypes'))).collect()
[Row(callRate=1.0, nCalled=3, nUncalled=0, nHet=1, nHomozygous=[1, 1], nNonRef=2, nAllelesCalled=6, alleleCounts=[3, 3], alleleFrequencies=[0.5, 0.5])]
expr_class: io.projectglow.sql.expressions.CallStats
args:
- name: genotypes
doc: The array of genotype structs with ``calls`` field
returns: A struct containing ``callRate``, ``nCalled``, ``nUncalled``, ``nHet``, ``nHomozygous``, ``nNonRef``, ``nAllelesCalled``, ``alleleCounts``, ``alleleFrequencies`` fields. See :ref:`variant-qc`.
- name: dp_summary_stats
doc: Computes summary statistics for the depth field from an array of genotype structs. See :ref:`variant-qc`.
examples:
python: |
>>> df = spark.createDataFrame([Row(genotypes=[Row(depth=1), Row(depth=2), Row(depth=3)])], 'genotypes: array<struct<depth: int>>')
>>> df.select(glow.expand_struct(glow.dp_summary_stats('genotypes'))).collect()
[Row(mean=2.0, stdDev=1.0, min=1.0, max=3.0)]
expr_class: io.projectglow.sql.expressions.DpSummaryStats
since: 0.3.0
args:
- name: genotypes
doc: An array of genotype structs with ``depth`` field
returns: A struct containing ``mean``, ``stdDev``, ``min``, and ``max`` of genotype depths
- name: hardy_weinberg
      doc: Computes statistics relating to the Hardy-Weinberg equilibrium. See :ref:`variant-qc` for more details.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.HardyWeinberg
examples:
python: |
>>> genotypes = [
... Row(calls=[1, 1]),
... Row(calls=[1, 0]),
... Row(calls=[0, 0])]
>>> df = spark.createDataFrame([Row(genotypes=genotypes)], 'genotypes: array<struct<calls: array<int>>>')
>>> df.select(glow.expand_struct(glow.hardy_weinberg('genotypes'))).collect()
[Row(hetFreqHwe=0.6, pValueHwe=0.7)]
args:
- name: genotypes
doc: The array of genotype structs with ``calls`` field
returns: A struct containing two fields, ``hetFreqHwe`` (the expected heterozygous frequency according to Hardy-Weinberg equilibrium) and ``pValueHwe`` (the associated p-value)
- name: gq_summary_stats
doc: Computes summary statistics about the genotype quality field for an array of genotype structs. See :ref:`variant-qc`.
since: 0.3.0
examples:
python: |
>>> genotypes = [
... Row(conditionalQuality=1),
... Row(conditionalQuality=2),
... Row(conditionalQuality=3)]
>>> df = spark.createDataFrame([Row(genotypes=genotypes)], 'genotypes: array<struct<conditionalQuality: int>>')
>>> df.select(glow.expand_struct(glow.gq_summary_stats('genotypes'))).collect()
[Row(mean=2.0, stdDev=1.0, min=1.0, max=3.0)]
expr_class: io.projectglow.sql.expressions.GqSummaryStats
args:
- name: genotypes
doc: The array of genotype structs with ``conditionalQuality`` field
returns: A struct containing ``mean``, ``stdDev``, ``min``, and ``max`` of genotype qualities
- name: sample_call_summary_stats
doc: Computes per-sample call summary statistics. See :ref:`sample-qc` for more details.
since: 0.3.0
examples:
python: |
>>> sites = [
... {'refAllele': 'C', 'alternateAlleles': ['G'], 'genotypes': [{'sampleId': 'NA12878', 'calls': [0, 0]}]},
... {'refAllele': 'A', 'alternateAlleles': ['G'], 'genotypes': [{'sampleId': 'NA12878', 'calls': [1, 1]}]},
... {'refAllele': 'AG', 'alternateAlleles': ['A'], 'genotypes': [{'sampleId': 'NA12878', 'calls': [1, 0]}]}]
>>> df = spark.createDataFrame(sites, 'refAllele: string, alternateAlleles: array<string>, genotypes: array<struct<sampleId: string, calls: array<int>>>')
>>> df.select(glow.sample_call_summary_stats('genotypes', 'refAllele', 'alternateAlleles').alias('stats')).collect()
[Row(stats=[Row(sampleId='NA12878', callRate=1.0, nCalled=3, nUncalled=0, nHomRef=1, nHet=1, nHomVar=1, nSnp=2, nInsertion=0, nDeletion=1, nTransition=2, nTransversion=0, nSpanningDeletion=0, rTiTv=inf, rInsertionDeletion=0.0, rHetHomVar=1.0)])]
expr_class: io.projectglow.sql.expressions.CallSummaryStats
args:
- name: genotypes
doc: An array of genotype structs with ``calls`` field
- name: refAllele
doc: The reference allele
- name: alternateAlleles
doc: An array of alternate alleles
returns: A struct containing ``sampleId``, ``callRate``, ``nCalled``, ``nUncalled``, ``nHomRef``, ``nHet``, ``nHomVar``, ``nSnp``, ``nInsertion``, ``nDeletion``, ``nTransition``, ``nTransversion``, ``nSpanningDeletion``, ``rTiTv``, ``rInsertionDeletion``, ``rHetHomVar`` fields. See :ref:`sample-qc`.
- name: sample_dp_summary_stats
doc: Computes per-sample summary statistics about the depth field in an array of genotype structs.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.SampleDpSummaryStatistics
examples:
python: |
>>> sites = [
... {'genotypes': [{'sampleId': 'NA12878', 'depth': 1}]},
... {'genotypes': [{'sampleId': 'NA12878', 'depth': 2}]},
... {'genotypes': [{'sampleId': 'NA12878', 'depth': 3}]}]
>>> df = spark.createDataFrame(sites, 'genotypes: array<struct<depth: int, sampleId: string>>')
>>> df.select(glow.sample_dp_summary_stats('genotypes').alias('stats')).collect()
[Row(stats=[Row(sampleId='NA12878', mean=2.0, stdDev=1.0, min=1.0, max=3.0)])]
args:
- name: genotypes
doc: An array of genotype structs with ``depth`` field
      returns: An array of structs where each struct contains ``mean``, ``stdDev``, ``min``, and ``max`` of the genotype depths for a sample. If ``sampleId`` is present in a genotype, it will be propagated to the resulting struct as an extra field.
- name: sample_gq_summary_stats
doc: Computes per-sample summary statistics about the genotype quality field in an array of genotype structs.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.SampleGqSummaryStatistics
examples:
python: |
>>> sites = [
... Row(genotypes=[Row(sampleId='NA12878', conditionalQuality=1)]),
... Row(genotypes=[Row(sampleId='NA12878', conditionalQuality=2)]),
... Row(genotypes=[Row(sampleId='NA12878', conditionalQuality=3)])]
>>> df = spark.createDataFrame(sites, 'genotypes: array<struct<sampleId: string, conditionalQuality: int>>')
>>> df.select(glow.sample_gq_summary_stats('genotypes').alias('stats')).collect()
[Row(stats=[Row(sampleId='NA12878', mean=2.0, stdDev=1.0, min=1.0, max=3.0)])]
args:
- name: genotypes
doc: An array of genotype structs with ``conditionalQuality`` field
      returns: An array of structs where each struct contains ``mean``, ``stdDev``, ``min``, and ``max`` of the genotype qualities for a sample. If ``sampleId`` is present in a genotype, it will be propagated to the resulting struct as an extra field.
- name: assert_true_or_error
doc: Asserts a boolean condition is true.
since: 0.5.0
expr_class: io.projectglow.sql.expressions.AssertTrueOrError
exclude_python: true
exclude_scala: true
examples:
python: |
>>> df = spark.createDataFrame([Row(v=1), Row(v=1)])
>>> df.select(glow.assert_true_or_error(df.v == 1, 'the value is not one!').alias('is_one')).collect()
[Row(is_one=None), Row(is_one=None)]
args:
- name: condition
doc: Boolean condition to check
- name: errMsg
doc: Error message if condition fails
type: str
returns: Null if true, or throws an exception if not true
- name: array_quantile
      doc: Computes the specified quantile of an array of numeric values.
since: 2.1.0
expr_class: io.projectglow.sql.expressions.ArrayQuantile
args:
- name: arr
doc: An array of numeric values
- name: quantile
doc: The desired quantile
type: double
- name: is_sorted
doc: If true, the input array is assumed to already be sorted
is_optional: true
examples:
python: |
>>> df = spark.createDataFrame([Row(arr=[1, 2, 3, 4, 5])])
>>> df.select(glow.array_quantile(df.arr, 0.7).alias('p70')).collect()
[Row(p70=3.8)]
gwas_functions:
functions:
- name: linear_regression_gwas
doc: Performs a linear regression association test optimized for performance in a GWAS setting.
See :ref:`linear-regression` for details.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.LinearRegressionExpr
examples:
python: |
>>> from pyspark.ml.linalg import DenseMatrix
>>> phenotypes = [2, 3, 4]
>>> genotypes = [0, 1, 2]
>>> covariates = DenseMatrix(numRows=3, numCols=1, values=[1, 1, 1])
>>> df = spark.createDataFrame([Row(genotypes=genotypes, phenotypes=phenotypes, covariates=covariates)])
>>> df.select(glow.expand_struct(glow.linear_regression_gwas('genotypes', 'phenotypes', 'covariates'))).collect()
[Row(beta=0.9999999999999998, standardError=1.4901161193847656e-08, pValue=9.486373847239922e-09)]
args:
- name: genotypes
doc: A numeric array of genotypes
- name: phenotypes
doc: A numeric array of phenotypes
- name: covariates
doc: A ``spark.ml`` ``Matrix`` of covariates
returns: A struct containing ``beta``, ``standardError``, and ``pValue`` fields. See :ref:`linear-regression`.
- name: logistic_regression_gwas
doc: Performs a logistic regression association test optimized for performance in a GWAS setting.
See :ref:`logistic-regression` for more details.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.LogisticRegressionExpr
examples:
python: |
>>> from pyspark.ml.linalg import DenseMatrix
>>> phenotypes = [1, 0, 0, 1, 1]
>>> genotypes = [0, 0, 1, 2, 2]
>>> covariates = DenseMatrix(numRows=5, numCols=1, values=[1, 1, 1, 1, 1])
>>> offset = [1, 0, 1, 0, 1]
>>> df = spark.createDataFrame([Row(genotypes=genotypes, phenotypes=phenotypes, covariates=covariates, offset=offset)])
>>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'Firth'))).collect()
[Row(beta=0.7418, oddsRatio=2.0999, waldConfidenceInterval=[0.2509, 17.5690], pValue=0.3952)]
>>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'LRT'))).collect()
[Row(beta=1.1658, oddsRatio=3.2087, waldConfidenceInterval=[0.2970, 34.6567], pValue=0.2943)]
>>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'Firth', 'offset'))).collect()
[Row(beta=0.8024, oddsRatio=2.2310, waldConfidenceInterval=[0.2540, 19.5903], pValue=0.3754)]
>>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'LRT', 'offset'))).collect()
[Row(beta=1.1996, oddsRatio=3.3188, waldConfidenceInterval=[0.3071, 35.8638], pValue=0.2857)]
args:
- name: genotypes
          doc: A numeric array of genotypes
- name: phenotypes
doc: A double array of phenotype values
- name: covariates
doc: A ``spark.ml`` ``Matrix`` of covariates
- name: test
doc: Which logistic regression test to use. Can be ``LRT`` or ``Firth``
type: str
- name: offset
doc: An optional double array of offset values. The offset vector is added with coefficient 1 to the linear predictor term X*b.
is_optional: true
returns: A struct containing ``beta``, ``oddsRatio``, ``waldConfidenceInterval``, and ``pValue`` fields. See :ref:`logistic-regression`.
- name: genotype_states
      doc: Gets the number of alternate alleles for an array of genotype structs. Returns ``-1`` if there are any ``-1`` values (no-calls) in the calls array.
since: 0.3.0
expr_class: io.projectglow.sql.expressions.GenotypeStates
examples:
python: |
>>> genotypes = [
... Row(calls=[1, 1]),
... Row(calls=[1, 0]),
... Row(calls=[0, 0]),
... Row(calls=[-1, -1])]
>>> df = spark.createDataFrame([Row(genotypes=genotypes)], 'genotypes: array<struct<calls: array<int>>>')
>>> df.select(glow.genotype_states('genotypes').alias('states')).collect()
[Row(states=[2, 1, 0, -1])]
args:
- name: genotypes
doc: An array of genotype structs with ``calls`` field
returns: An array of integers containing the number of alternate alleles in each call array
# These functions are intended to only be called from Glow code. They are available in SQL but not in the Scala
# or Python APIs.
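#
# As a minimal sketch (assuming Glow has been registered on the Spark session via
# glow.register), such a function can still be reached from SQL, for example:
#
#   import glow
#   spark = glow.register(spark)
#   spark.sql('SELECT comb(5, 2) AS n_choose_k').show()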
private:
functions:
- name: comb
      doc: Computes the binomial coefficient (``n`` choose ``k``)
since: 2.1.0
expr_class: io.projectglow.sql.expressions.Comb
args:
- name: n
          doc: The total number of items (``n``)
- name: k
          doc: The number of items to choose (``k``)