diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e5eb28e5..474bf4cc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -54,6 +54,7 @@ jobs: - name: Check docs links run: (cd docs && make linkcheck) + continue-on-error: true spark-tests: diff --git a/README.md b/README.md index 24079f79..31bd5e38 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ conda env update -f python/environment.yml Start an sbt shell using the `sbt` command. -> **FYI**: The following SBT projects are built on Spark 3.5.1/Scala 2.12.15 by default. To change the Spark version and +> **FYI**: The following SBT projects are built on Spark 3.5.1/Scala 2.12.19 by default. To change the Spark version and Scala version, set the environment variables `SPARK_VERSION` and `SCALA_VERSION`. To compile the main code: diff --git a/build.sbt b/build.sbt index 3a435662..f086a9ad 100644 --- a/build.sbt +++ b/build.sbt @@ -8,7 +8,7 @@ import sbt.librarymanagement.ModuleID import sbt.nio.Keys._ // Scala version used by DBR 13.3 LTS and 14.0 -lazy val scala212 = "2.12.15" +lazy val scala212 = "2.12.19" lazy val scala213 = "2.13.12" lazy val spark3 = "3.5.1" diff --git a/levels_ridge_regression_tutorial.ipynb b/levels_ridge_regression_tutorial.ipynb index 35e84897..c4232a3b 100644 --- a/levels_ridge_regression_tutorial.ipynb +++ b/levels_ridge_regression_tutorial.ipynb @@ -2,8 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "922c085d-0cda-44ff-be65-268ca3b3d8b8", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "import pandas as pd\n", @@ -17,8 +25,16 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0951d9a7-4c87-48f6-a256-8e6161657994", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "spark = SparkSession.builder.appName('levels').getOrCreate()\n", @@ -27,8 +43,16 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "12c91c53-ef6b-411c-8200-98be4e8b9adf", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "test_data_root = '/Users/leland.barnard/glow/glow-wgr/test-data/levels/ridge-regression' #path to glow levels test data" @@ -36,7 +60,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "287406ae-65ed-4a3c-ab3d-9236525f7429", + "showTitle": false, + "title": "" + } + }, "source": [ "We need three objects to get started:\n", "* A Spark DataFrame representing the block genotype matrix\n", @@ -46,8 +78,16 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "abd84cb6-e98c-498e-88af-ded9ee95ebdd", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "blockdf_lvl0 = spark.read.parquet(f'{test_data_root}/blockedGT.snappy.parquet') #block genotype matrix\n", @@ -57,7 +97,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "098333d9-a84b-4886-9a93-400a1866e8bd", + "showTitle": false, + "title": "" + } + }, "source": [ "#### The block genotype matrix as a DataFrame\n", "If we imagine the block genotype matrix conceptually, we think of an *NxM* matrix *X* where each row *n* represents an individual sample, each column *m* represents a variant, and each cell *(n, m)* contains a genotype value for sample *n* at variant *m*. We then imagine laying a coarse grid on top of this matrix such that matrix cells within the same coarse grid cell are all assigned to the same block *x*. Each block *x* is indexed by a sample block ID (corresponding to a list of rows belonging to the block) and a header block ID (corresponding to a list of columns belonging to the block). The sample block IDs are generally just integers 0 through the number of sample blocks. The header block IDs are strings of the form 'chr_C_block_B', which refers to the Bth block on chromosome C. The Spark DataFrame representing this block matrix can be thought of as the transpose of each block *xT* all stacked one atop another. Each row represents the values from a particular column from *X*, for the samples corresponding to a particular sample block. The fields in the DataFrame are:\n", @@ -74,33 +122,32 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---------------+----+--------------------+--------------------+-------------+------------+---------+------------------+------------------+\n", - "| header|size| indices| values| header_block|sample_block| position| mu| sig|\n", - "+---------------+----+--------------------+--------------------+-------------+------------+---------+------------------+------------------+\n", - "|2:231414300:T:C| 9| [0, 1, 6, 7, 8]|[1.0, 1.0, 1.0, 1...|chr_2_block_6| 7|231414300|0.8686868686868686|0.6730002176294544|\n", - "|2:231414300:T:C| 10|[0, 1, 2, 3, 4, 5...|[2.0, 1.0, 1.0, 1...|chr_2_block_6| 1|231414300|0.8686868686868686|0.6730002176294544|\n", - "|2:231414300:T:C| 12|[1, 3, 4, 5, 7, 8...|[2.0, 1.0, 1.0, 1...|chr_2_block_6| 8|231414300|0.8686868686868686|0.6730002176294544|\n", - "|2:231414300:T:C| 13|[0, 1, 2, 3, 4, 5...|[2.0, 1.0, 2.0, 1...|chr_2_block_6| 9|231414300|0.8686868686868686|0.6730002176294544|\n", - "+---------------+----+--------------------+--------------------+-------------+------------+---------+------------------+------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "19d318c5-6513-4c21-8ba2-0f49311e4493", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl0.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a2f3933a-0d72-45f3-b594-0eba2ffb0e29", + "showTitle": false, + "title": "" + } + }, "source": [ "#### The sample block mapping\n", "This is a comparitively simple key-value store where each key is a sample block ID and each value is a list of sample IDs contained in that sample block. As a Spark DataFrame, this is represented as a two column DataFrame with the following fields:\n", @@ -110,33 +157,32 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+--------------------+\n", - "|sample_block| sample_ids|\n", - "+------------+--------------------+\n", - "| 3|[1008962444, 1035...|\n", - "| 9|[1083737921, 1041...|\n", - "| 7|[1048623585, 1030...|\n", - "| 1|[1073111137, 1082...|\n", - "+------------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0d110459-be02-4e04-b570-afbfcbf8213c", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "indexdf.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "125bff86-49dc-41e2-a058-e03565311355", + "showTitle": false, + "title": "" + } + }, "source": [ "#### The phenotype data\n", "The phenotype data is represented as a Pandas DataFrame indexed by the sample ID. Each column represents a single phenotype, and it is assumed that there are no missing phenotype values, and that the phenotypes mean centered at 0." @@ -144,105 +190,32 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sim100sim92sim58sim16
sample_id
1042204109-0.905058-1.171217-1.437376-1.703535
1035505158-0.616539-0.411283-0.206027-0.000770
1008166305-0.946014-0.482639-0.0192630.444112
1068805020-1.155375-0.660005-0.1646340.330736
1095012035-1.024889-0.4921790.0405300.573240
\n", - "
" - ], - "text/plain": [ - " sim100 sim92 sim58 sim16\n", - "sample_id \n", - "1042204109 -0.905058 -1.171217 -1.437376 -1.703535\n", - "1035505158 -0.616539 -0.411283 -0.206027 -0.000770\n", - "1008166305 -0.946014 -0.482639 -0.019263 0.444112\n", - "1068805020 -1.155375 -0.660005 -0.164634 0.330736\n", - "1095012035 -1.024889 -0.492179 0.040530 0.573240" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "66f73276-8f93-43fb-8464-d1fe48c4f688", + "showTitle": false, + "title": "" + } + }, + "outputs": [], "source": [ "labeldf.head()" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bc02343a-0353-487a-b68b-8943e102ae40", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Reducer model fitting\n", "The first step in the fitting procedure is to apply a dimensionality reduction to the block matrix *X* using the `RidgeReducer`. This is accomplished by fitting multiple ridge models within each block *x* and producing a new block matrix where each column represents the prediction of one ridge model applied within one block. This approach to model building is generally referred to as **stacking**. We will call the block genotype matrix we started with the **level 0** matrix in the stack *X0*, and the output of the ridge reduction step the **level 1** matrix *X1*. The `RidgeReducer` class is used for this step, which is initiallized with a list of ridge regularization values (referred to here as alpha). Since ridge models are indexed by these alpha values, the `RidgeReducer` will generate one ridge model per value of alpha provided, which in turn will produce one column per block in *X0*, so the final dimensions of matrix *X1* will be *Nx(LxK)*, where *L* is the number of header blocks in *X0* and *K* is the number of alpha values provided to the `RidgeReducer`. In practice, we can estimate a span of alpha values in a reasonable order of magnitude based on guesses at the heritability of the phenotype we are fitting, but here we will just pick some values." @@ -250,8 +223,16 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "05aca85c-23b3-421b-8080-a829dcc66271", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "alphas_lvl0 = np.logspace(2, 5, 10)\n", @@ -260,51 +241,63 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c37cf140-97f9-4e10-9106-b34301c19d54", + "showTitle": false, + "title": "" + } + }, "source": [ "When the `RidgeReducer` is initialized, it will assign names to the provided alphas and store them in a dict accessible as `RidgeReducer.alphas`. This is mostly just to give an easily readable and sortable name to the models produced for each ridge value." ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'alpha_0': 100.0,\n", - " 'alpha_1': 215.44346900318845,\n", - " 'alpha_2': 464.15888336127773,\n", - " 'alpha_3': 1000.0,\n", - " 'alpha_4': 2154.4346900318824,\n", - " 'alpha_5': 4641.588833612777,\n", - " 'alpha_6': 10000.0,\n", - " 'alpha_7': 21544.346900318822,\n", - " 'alpha_8': 46415.888336127726,\n", - " 'alpha_9': 100000.0}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dbc31936-dadc-4996-bf08-035a8fb59dc8", + "showTitle": false, + "title": "" + } + }, + "outputs": [], "source": [ "stack_lvl0.alphas" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8b242244-0b6a-4b4b-a7c0-f6c4f131ac2c", + "showTitle": false, + "title": "" + } + }, "source": [ "The `RidgeReducer.fit(blockdf, labeldf, indexdf)` method generates a Spark DataFrame representing the model that we can use to reduce *X0* to *X1*." ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6b1d6749-19b5-4ad7-ae4e-493fa42faacd", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "modeldf_lvl0 = stack_lvl0.fit(blockdf_lvl0, labeldf, indexdf)" @@ -312,7 +305,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7ea1248f-bfe1-4ec3-92f7-272dd9e4e194", + "showTitle": false, + "title": "" + } + }, "source": [ "In explicit terms, the reduction of a block *x0* from *X0* to the corresponding block *x1* from *X1* is accomplished by the matrix multiplication *x0 * B = x1*, where *B* is a coefficient matrix of size *mxK*, where *m* is the number of columns in block *x0* and *K* is the number of alpha values used in the reduction. As an added wrinkle, if the ridge reduction is being performed against multiple phenotypes at once, each phenotype will have its own *B*, and for convenience we panel these next to each other in the output into a single matrix, so *B* in that case has dimensions *mx(K*P)* where *P* is the number of phenotypes. Each matrix *B* is specific to a particular block in *X0*, so the Spark DataFrame produced by the `RidgeReducer` can be thought of all of as the matrices *B* from all of the blocks stacked one atop another. The fields in the model DataFrame are:\n", "* header_block: An ID assigned to the block *x0* corresponding to the coefficients in this row.\n", @@ -325,33 +326,32 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------------+------------+---------------+---------+--------------------+--------------------+--------------------+\n", - "| header_block|sample_block| header| position| alphas| labels| coefficients|\n", - "+-------------+------------+---------------+---------+--------------------+--------------------+--------------------+\n", - "|chr_3_block_8| 0|3:160741710:G:A|160741710|[alpha_0, alpha_1...|[sim100, sim100, ...|[0.07462677364336...|\n", - "|chr_3_block_8| 0|3:175345110:C:T|175345110|[alpha_0, alpha_1...|[sim100, sim100, ...|[0.07834053929928...|\n", - "|chr_3_block_8| 0|3:183469890:A:G|183469890|[alpha_0, alpha_1...|[sim100, sim100, ...|[0.02152237814164...|\n", - "|chr_3_block_8| 0|3:195047160:C:T|195047160|[alpha_0, alpha_1...|[sim100, sim100, ...|[0.01153728383795...|\n", - "+-------------+------------+---------------+---------+--------------------+--------------------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1e70a537-0417-4a24-9dc8-97b9cd72db76", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "modeldf_lvl0.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dd5e8481-773c-4d9c-99af-64cb9ffe43f3", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Reducer transformation\n", "After fitting, the `RidgeReducer.transform(blockdf, labeldf, modeldf)` method can be used to generate `X1` from `X0`." @@ -359,8 +359,16 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "02bd487b-4d0d-4e77-b6fd-151bacc59eba", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "blockdf_lvl1 = stack_lvl0.transform(blockdf_lvl0, labeldf, modeldf_lvl0)" @@ -368,7 +376,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "94b0e0f4-b0e8-45d0-9d50-abed3fd252ca", + "showTitle": false, + "title": "" + } + }, "source": [ "The output of the transformation is closely analogous to the block matrix DataFrame we started with. The main difference is that, rather than representing a single block matrix, it really represents multiple block matrices, with one such matrix per label (phenotype). Comparing the schema of this block matrix DataFrame (`blockdf_lvl1`) with the DataFrame we started with (`blockdf_lvl0`), the new columns are:\n", "* alpha: This is the name of the alpha value used in fitting the model that produced the values in this row.\n", @@ -377,66 +393,64 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+------+\n", - "| header|size| values|header_block|sample_block|position| mu| sig| alpha| label|\n", - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+------+\n", - "|chr_3_block_8_alp...| 13|[0.08337895454032...| chr_3| 0| 80| 0.04148112816674154|0.19099426058493266|alpha_0|sim100|\n", - "|chr_3_block_8_alp...| 13|[0.04796003873174...| chr_3| 0| 81| 0.02402075708176127|0.11316256614620662|alpha_1|sim100|\n", - "|chr_3_block_8_alp...| 13|[0.02504256254617...| chr_3| 0| 82|0.012596289114544081|0.06030642726717367|alpha_2|sim100|\n", - "|chr_3_block_8_alp...| 13|[0.01234023662311...| chr_3| 0| 83|0.006221371128544...|0.03006776034645892|alpha_3|sim100|\n", - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "90e303b1-09fe-4546-b4f1-b61b726d9521", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl1.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2fffc641-a2cd-4785-a5f0-b04b4e46c9b2", + "showTitle": false, + "title": "" + } + }, "source": [ "The headers in the *X1* block matrix are derived from a combination of the source block in *X0*, the alpha value used in fitting the ridge model, and the label they were fit with. These headers are assigned to header blocks that correspond to the chromosome of the source block in *X0*." ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------------------------------+------------+\n", - "|header |header_block|\n", - "+----------------------------------+------------+\n", - "|chr_3_block_8_alpha_0_label_sim100|chr_3 |\n", - "|chr_3_block_8_alpha_1_label_sim100|chr_3 |\n", - "|chr_3_block_8_alpha_2_label_sim100|chr_3 |\n", - "|chr_3_block_8_alpha_3_label_sim100|chr_3 |\n", - "+----------------------------------+------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5571df67-1419-40a8-b700-d8e63b8c3486", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl1.select('header', 'header_block').show(4, False)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "47174c5e-3dfd-453a-89d4-96276974a4fb", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Regression fitting\n", "The block matrix *X1* can be used to fit a final predictive model that can generate phenotype predictions *y_hat* using the `RidgeRegression` class. As with the `RidgeReducer` class, this class is initialized with a list of alpha values." @@ -444,8 +458,16 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5e0bcef0-4e89-4390-8a05-b5c3495d1846", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "alphas_lvl1 = np.logspace(1, 4, 10)\n", @@ -454,8 +476,16 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e32c5923-dfcd-44e3-bebd-5920cddf3c53", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "modeldf_lvl1_est, cvdf_lvl1 = estimator_lvl1.fit(blockdf_lvl1, labeldf, indexdf)" @@ -463,47 +493,62 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6e5936c1-2770-495f-b50c-dfd3f535a96d", + "showTitle": false, + "title": "" + } + }, "source": [ "The `RidgeRegression.fit(blockdf, labeldf, indexdf)` works in much the same way as the `RidgeReducer.fit(blockdf, labeldf, indexdf)` method, except that it returns two DataFrames:" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f6c42ccd-843d-4011-a1c9-6105dc996900", + "showTitle": false, + "title": "" + } + }, "source": [ "A model DataFrame analogous to the model DataFrame provided by the `RidgeReducer`. An important difference is that the header block ID for all rows will be 'all', indicating that all headers from all blocks have been used in a single fit, rather than fitting within blocks." ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "|header_block|sample_block| header|position| alphas| labels| coefficients|\n", - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "| all| 1|chr_1_block_0_alp...| 0|[alpha_0, alpha_1...|[sim16, sim16, si...|[0.02787784249249...|\n", - "| all| 1|chr_2_block_0_alp...| 0|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0164002560049...|\n", - "| all| 1|chr_3_block_0_alp...| 0|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0234168451974...|\n", - "| all| 1|chr_1_block_0_alp...| 1|[alpha_0, alpha_1...|[sim16, sim16, si...|[0.00381390574280...|\n", - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f9f9a80c-b2ed-4c46-81e5-17143eb3c017", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "modeldf_lvl1_est.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9c90c408-8dbc-4c8b-a3b7-881cba82f5a9", + "showTitle": false, + "title": "" + } + }, "source": [ "A cross validation (cv) report DataFrame, which reports the results of the hyperparameter (i.e., alpha) value optimization routine.\n", "* label: This is the label corresponding to the cross cv results on the row.\n", @@ -513,32 +558,32 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------+-------+--------------------+\n", - "| label| alpha| r2_mean|\n", - "+------+-------+--------------------+\n", - "| sim92|alpha_5| 0.18389799898047948|\n", - "| sim16|alpha_8|-0.22499071350515992|\n", - "| sim58|alpha_6|-0.02504464471643515|\n", - "|sim100|alpha_5| 0.2566748993770534|\n", - "+------+-------+--------------------+\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8a93813f-00e2-4012-9687-d54f2ad84db8", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "cvdf_lvl1.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8b84d440-8d54-4ad9-9ac7-0520cc9d13a8", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Producing phenotype predictions *y_hat*\n", "After fitting the `RidgeRegression` model, the model DataFrame and cv DataFrame are used to apply the model to the block matrix DataFrame to produce predictions (*y_hat*) for each label in each sample block using the `RidgeRegression.transform(blockdf, labeldf, modeldf, cvdf)` method" @@ -546,8 +591,16 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2fe19547-76ea-4302-af02-d9cadc0d87ae", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "y_hat_lvl1 = estimator_lvl1.transform(blockdf_lvl1, labeldf, modeldf_lvl1_est, cvdf_lvl1)" @@ -555,7 +608,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f163d829-d752-4258-882b-dd0b1d7f9349", + "showTitle": false, + "title": "" + } + }, "source": [ "The resulting *y_hat* DataFrame has the following fields:\n", "* sample_block: The sample block ID for the samples corresponding to the *y_hat* values on this row.\n", @@ -566,33 +627,32 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+-----+-------+--------------------+\n", - "|sample_block|label| alpha| values|\n", - "+------------+-----+-------+--------------------+\n", - "| 1|sim16|alpha_8|[0.08461773658136...|\n", - "| 4|sim16|alpha_8|[0.08343907935865...|\n", - "| 7|sim16|alpha_8|[-0.0976335915514...|\n", - "| 8|sim16|alpha_8|[-0.0461222342349...|\n", - "+------------+-----+-------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3bc9c2bc-ca22-4791-972f-0c0ae6d6b7e7", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "y_hat_lvl1.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "52f548da-497e-45d3-97b4-aa7b41763cde", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Fitting a second round of ridge reduction instead of ridge regression\n", "After fitting the first ridge reduction step and producing *X1* from *X0*, we can go directly into fitting the final ridge regression model, as we have just seen. Alternatively, we can fit a second round of ridge reduction to squeeze *X1* into an even smaller feature matrix, which we will call the **level 2** matrix *X2*. This has some advantages when it comes to generating the leave-one-chromosome-out versions of the *y_hat*s and does not come at much additional cost. The procedure for fitting the second round of ridge reduction is identical to the first (we will reuse the same alphas we chose for the ridge regression fit above):" @@ -600,8 +660,16 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0f2efd36-54b7-4699-98a9-fdebdaee18ac", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "stack_lvl1 = RidgeReducer(alphas_lvl1)" @@ -609,8 +677,16 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9b15c1b0-c22a-4909-858e-c550d0bebef0", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "modeldf_lvl1 = stack_lvl1.fit(blockdf_lvl1, labeldf, indexdf)\n", @@ -619,81 +695,95 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "619e9d61-0458-40a2-92bf-0fe7bc86c088", + "showTitle": false, + "title": "" + } + }, "source": [ "The **level 2** block matrix DataFrame produced here has an identical schema to the **level 1** block matrix. A key difference is that the header block ID for all headers is now \"all\" for all headers, indicating that there are now no more blocks to collapse." ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+-----+\n", - "| header|size| values|header_block|sample_block|position| mu| sig| alpha|label|\n", - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+-----+\n", - "|all_block_1_alpha...| 13|[-0.0796642628265...| all| 9| 10|-1.49453099468771...| 0.4138556129030118|alpha_0|sim16|\n", - "|all_block_1_alpha...| 13|[-0.0957216101643...| all| 9| 11| 0.0|0.40982580962691184|alpha_1|sim16|\n", - "|all_block_1_alpha...| 13|[-0.1070572367031...| all| 9| 12|5.124106267500723...|0.40028561461234197|alpha_2|sim16|\n", - "|all_block_1_alpha...| 13|[-0.1163419313886...| all| 9| 13|1.708035422500241...| 0.3798792747100316|alpha_3|sim16|\n", - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+-----+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "91e7f3d8-c503-4ecf-a8de-97eef8129edc", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl2.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a4016386-8851-4c64-9b4c-13523759f652", + "showTitle": false, + "title": "" + } + }, "source": [ "The headers for each column now follow the name convention 'all_block_B_alpha_A_label_L', which refer to the ridge model prediction using alpha A and for label L fit using the features from header block B from block matrix *X1*. Since the blocks in *X1* refer to chromosomes, the block number B here can be interpreted as a chromosome. The 'all' token reflects the fact that we are not assigning the columns in *X2* to any new blocks (i.e, *X2* only has sample blocks, but there is only one header block which encompasses the entire matrix)." ] }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------------------------------+\n", - "|header |\n", - "+-------------------------------+\n", - "|all_block_1_alpha_0_label_sim16|\n", - "|all_block_1_alpha_1_label_sim16|\n", - "|all_block_1_alpha_2_label_sim16|\n", - "|all_block_1_alpha_3_label_sim16|\n", - "+-------------------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "507eb550-5afe-4f9f-ad41-0935c08fba48", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl2.select('header').show(4, False)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8c6ab06b-8dc4-4231-985b-9c1101196b92", + "showTitle": false, + "title": "" + } + }, "source": [ "We can now fit a ridge regression model as we did above, except that we will use the matrix *X2* instead of *X1*" ] }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c6dbbd53-a209-419f-9cbd-5bc568b3f987", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "alphas_lvl2 = np.logspace(0, 3, 10)\n", @@ -702,8 +792,16 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f41eea9c-4fa9-4266-b860-27d5d551701b", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "modeldf_lvl2_est, cvdf_lvl2 = estimator_lvl2.fit(blockdf_lvl2, labeldf, indexdf)" @@ -711,59 +809,50 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "|header_block|sample_block| header|position| alphas| labels| coefficients|\n", - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "| all| 1|all_block_1_alpha...| 10|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0939792871878...|\n", - "| all| 1|all_block_1_alpha...| 11|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0788683759104...|\n", - "| all| 1|all_block_1_alpha...| 12|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0693010949556...|\n", - "| all| 1|all_block_1_alpha...| 13|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0446945065691...|\n", - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ea0c3e52-bd80-4c3b-9c4e-8fd33443a5ae", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "modeldf_lvl2_est.show(4)" ] }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------+-------+--------------------+\n", - "| label| alpha| r2_mean|\n", - "+------+-------+--------------------+\n", - "| sim92|alpha_7| 0.199251090828654|\n", - "| sim16|alpha_9|-0.22903758326079596|\n", - "| sim58|alpha_7|0.005461670993813417|\n", - "|sim100|alpha_8| 0.2314559298409073|\n", - "+------+-------+--------------------+\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6d746257-9d90-4aea-857f-d3a6d02d1887", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "cvdf_lvl2.show(4)" ] }, { "cell_type": "code", - "execution_count": 29, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "93746bb0-6a6c-42c1-8b6c-b40c7934008d", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "y_hat_lvl2 = estimator_lvl2.transform(blockdf_lvl2, labeldf, modeldf_lvl2_est, cvdf_lvl2)" @@ -771,41 +860,48 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+-----+-------+--------------------+\n", - "|sample_block|label| alpha| values|\n", - "+------------+-----+-------+--------------------+\n", - "| 9|sim58|alpha_7|[-0.2126330471314...|\n", - "| 6|sim58|alpha_7|[0.18042213283121...|\n", - "| 5|sim58|alpha_7|[-0.0126226427178...|\n", - "| 2|sim58|alpha_7|[0.00871975701462...|\n", - "+------------+-----+-------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "29fac9be-5092-4e08-a5ab-307536fbbf72", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "y_hat_lvl2.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a0304841-6dfc-416d-902b-ea9c41a9ebc8", + "showTitle": false, + "title": "" + } + }, "source": [ "For producing the LOCO versions of the *y_hat* vectors, it is only necessary to filter out rows from `blockdf_lvl2` corresponding to the chromosome we wish to drop before applying the transformation. For example, if we wanted to produce *y_hat* with chromosome 1 left out (recall that the chromosomes constitute the source blocks for the headers in `blockdf_lvl2`, so headers from chromosome 1 will have headers like %block_1%):" ] }, { "cell_type": "code", - "execution_count": 31, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4e5d9970-6d50-4f85-9ae3-dc0016bbca5d", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "y_hat_lvl2_loco1 = estimator_lvl2.transform(blockdf_lvl2.filter(f'header NOT LIKE \"%block_1%\"'), labeldf, modeldf_lvl2_est, cvdf_lvl2)" @@ -813,39 +909,46 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+-----+-------+--------------------+\n", - "|sample_block|label| alpha| values|\n", - "+------------+-----+-------+--------------------+\n", - "| 9|sim58|alpha_7|[-0.1347024836295...|\n", - "| 6|sim58|alpha_7|[0.20213653390706...|\n", - "| 5|sim58|alpha_7|[-0.1602333580401...|\n", - "| 2|sim58|alpha_7|[-0.1511874717623...|\n", - "+------------+-----+-------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4bf01e2b-fad0-494c-9dd9-a199bf04ebe3", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "y_hat_lvl2_loco1.show(4)" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "43c76cb0-28d8-40b0-86f2-5c1a3da9c210", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [] } ], "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": {}, + "notebookName": "levels_ridge_regression_tutorial", + "notebookOrigID": 2963708295114367, + "widgets": {} + }, "kernelspec": { "display_name": "glow", "language": "python", @@ -865,5 +968,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 0 }