From 86b5f2912ca03f59fc935f24872df4db2ca18cbc Mon Sep 17 00:00:00 2001 From: Brad Macdonald <52762200+BWMac@users.noreply.github.com> Date: Tue, 24 Sep 2024 10:23:15 -0600 Subject: [PATCH] [IBCDPE-954] Adds `proteomics_srm` GX Suite (#145) * adds srm nb * adds proteomics_srm GX suite * pre-commit fix * updates uniqid and uniprotid regexes * updates regex * remove ?: * pre-commit * updates uniprot regex * pre-commit --- config.yaml | 1 + gx_suite_definitions/proteomics_srm.ipynb | 276 ++++++++++++++++++ .../gx/expectations/proteomics_srm.json | 225 ++++++++++++++ test_config.yaml | 1 + 4 files changed, 503 insertions(+) create mode 100644 gx_suite_definitions/proteomics_srm.ipynb create mode 100644 src/agoradatatools/great_expectations/gx/expectations/proteomics_srm.json diff --git a/config.yaml b/config.yaml index 265e0d6f..ad330a77 100644 --- a/config.yaml +++ b/config.yaml @@ -117,6 +117,7 @@ datasets: genename: hgnc_symbol ensg: ensembl_gene_id destination: *dest + gx_enabled: true - target_exp_validation_harmonized: files: diff --git a/gx_suite_definitions/proteomics_srm.ipynb b/gx_suite_definitions/proteomics_srm.ipynb new file mode 100644 index 00000000..a8d33686 --- /dev/null +++ b/gx_suite_definitions/proteomics_srm.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import synapseclient\n", + "\n", + "import great_expectations as gx\n", + "\n", + "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Expectation Suite for Proteomics SRM Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Example Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "syn = synapseclient.Synapse()\n", + "syn.login()\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "proteomics_srm_data_file = syn.get(\"syn52579910\").path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Validator Object on Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator = context.sources.pandas_default.read_json(\n", + " proteomics_srm_data_file\n", + ")\n", + "validator.expectation_suite_name = \"proteomics_srm\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Expectations to Validator Object For Each Column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uniqid\n", + "validator.expect_column_values_to_be_of_type(\"uniqid\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"uniqid\")\n", + "validator.expect_column_values_to_match_regex(\"uniqid\", \"^[A-Za-z0-9_.-]+\\\\|([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# hgnc_symbol\n", + "validator.expect_column_values_to_be_of_type(\"hgnc_symbol\", \"str\")\n", + "validator.expect_column_value_lengths_to_be_between(\"hgnc_symbol\", 1, 15)\n", + "validator.expect_column_values_to_match_regex(\"hgnc_symbol\", \"^[A-Za-z0-9_.-]+$\")\n", + "validator.expect_column_values_to_not_be_null(\"hgnc_symbol\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uniprotid\n", + "validator.expect_column_values_to_be_of_type(\"uniprotid\", \"str\")\n", + "validator.expect_column_value_lengths_to_be_between(\"uniprotid\", 1, 15)\n", + "validator.expect_column_values_to_match_regex(\"uniprotid\", \"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}\")\n", + 
"validator.expect_column_values_to_not_be_null(\"uniprotid\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ensembl_gene_id\n", + "validator.expect_column_values_to_be_of_type(\"ensembl_gene_id\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"ensembl_gene_id\")\n", + "validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", \"^ENSG\\\\d{11}$\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# tissue\n", + "validator.expect_column_values_to_be_of_type(\"tissue\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"tissue\")\n", + "validator.expect_column_values_to_be_in_set(\"tissue\", [\"DLPFC\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# log2_fc\n", + "validator.expect_column_values_to_be_of_type(\"log2_fc\", \"float\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ci_upr\n", + "validator.expect_column_values_to_be_of_type(\"ci_upr\", \"float\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ci_lwr\n", + "validator.expect_column_values_to_be_of_type(\"ci_lwr\", \"float\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# pval\n", + "validator.expect_column_values_to_be_of_type(\"pval\", \"float\")\n", + "validator.expect_column_values_to_be_between(\"pval\", 0, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# cor_pval\n", + "validator.expect_column_values_to_be_of_type(\"cor_pval\", \"float\")\n", + "validator.expect_column_values_to_be_between(\"cor_pval\", 0, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "# multi-field\n", + "validator.expect_compound_columns_to_be_unique([\"hgnc_symbol\", \"uniprotid\", \"tissue\"])\n", + "validator.expect_compound_columns_to_be_unique([\"ensembl_gene_id\", \"uniprotid\", \"tissue\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Expectation Suite" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator.save_expectation_suite(discard_failed_expectations=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Checkpoint and View Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint = context.add_or_update_checkpoint(\n", + " name=\"agora-test-checkpoint\",\n", + " validator=validator,\n", + ")\n", + "checkpoint_result = checkpoint.run()\n", + "context.view_validation_result(checkpoint_result)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build Data Docs - Click on Expectation Suite to View All Expectations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context.build_data_docs()\n", + "context.open_data_docs()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/agoradatatools/great_expectations/gx/expectations/proteomics_srm.json b/src/agoradatatools/great_expectations/gx/expectations/proteomics_srm.json new file mode 100644 index 00000000..c026d1cc --- /dev/null +++ 
b/src/agoradatatools/great_expectations/gx/expectations/proteomics_srm.json @@ -0,0 +1,225 @@ +{ + "data_asset_type": null, + "expectation_suite_name": "proteomics_srm", + "expectations": [ + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "uniqid", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "uniqid" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "uniqid", + "regex": "^[A-Za-z0-9_.-]+\\|([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "hgnc_symbol", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "hgnc_symbol", + "max_value": 15, + "min_value": 1 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "hgnc_symbol", + "regex": "^[A-Za-z0-9_.-]+$" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "hgnc_symbol" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "uniprotid", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "uniprotid", + "max_value": 15, + "min_value": 1 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "uniprotid", + "regex": "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "uniprotid" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + 
"kwargs": { + "column": "ensembl_gene_id", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "ensembl_gene_id" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "ensembl_gene_id", + "regex": "^ENSG\\d{11}$" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "tissue", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "tissue" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "tissue", + "value_set": [ + "DLPFC" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "log2_fc", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "ci_upr", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "ci_lwr", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "pval", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "pval", + "max_value": 1, + "min_value": 0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "cor_pval", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "cor_pval", + "max_value": 1, + "min_value": 0 + }, + "meta": {} + }, + { + "expectation_type": "expect_compound_columns_to_be_unique", + "kwargs": { + "column_list": [ + "hgnc_symbol", + "uniprotid", + "tissue" + ] + }, + "meta": {} + }, + { + 
"expectation_type": "expect_compound_columns_to_be_unique", + "kwargs": { + "column_list": [ + "ensembl_gene_id", + "uniprotid", + "tissue" + ] + }, + "meta": {} + } + ], + "ge_cloud_id": null, + "meta": { + "great_expectations_version": "0.18.1" + } +} diff --git a/test_config.yaml b/test_config.yaml index cc1f29a6..0427a27b 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -117,6 +117,7 @@ datasets: genename: hgnc_symbol ensg: ensembl_gene_id destination: *dest + gx_enabled: true - target_exp_validation_harmonized: files: