chore: staging -> prod (#6292)
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: Joyce Yan <[email protected]>
Co-authored-by: Timmy Huang <[email protected]>
Co-authored-by: Ronen <[email protected]>
Co-authored-by: Trent Smith <[email protected]>
Co-authored-by: Mim Hastie <[email protected]>
Co-authored-by: Daniel Hegeman <[email protected]>
Co-authored-by: atarashansky <[email protected]>
Co-authored-by: pablo-gar <[email protected]>
Co-authored-by: Fran McDade <[email protected]>
Co-authored-by: Fran McDade <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: SethFeingold <[email protected]>
13 people authored Dec 1, 2023
1 parent 0a898a5 commit a9e7206
Showing 87 changed files with 3,664 additions and 1,522 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy-happy-stack.yml
@@ -18,7 +18,7 @@ jobs:
     steps:
       - uses: actions/setup-node@v2
         with:
-          node-version: 15
+          node-version: "16.14.2"
       - name: Configure AWS Prod Credentials
         uses: aws-actions/configure-aws-credentials@v2
         if: github.event.deployment.environment == 'prod'
12 changes: 10 additions & 2 deletions .github/workflows/push-tests.yml
@@ -39,12 +39,20 @@ jobs:
         with:
           fetch-depth: 2
       - uses: actions/setup-python@v3
+
       - name: check backend
         uses: pre-commit/[email protected]
+      - uses: actions/setup-node@v4
+        with:
+          node-version-file: "frontend/.nvmrc"
+          cache: "npm"
+          cache-dependency-path: "frontend/package-lock.json"
       - name: check frontend
         run: |
-          cp frontend/src/configs/local.js frontend/src/configs/configs.js
-          docker-compose run --no-deps --rm frontend make lint
+          cd frontend
+          npm ci
+          cp src/configs/local.js src/configs/configs.js
+          npm run lint
       - uses: 8398a7/action-slack@v3
         with:
           status: ${{ job.status }}
57 changes: 51 additions & 6 deletions .happy/terraform/modules/sfn/main.tf
@@ -281,8 +281,7 @@ resource "aws_sfn_state_machine" "state_machine" {
         "BackoffRate": 2.0
       } ],
       "Next": "DeregisterJobDefinition",
-      "ResultPath": null,
-      "OutputPath": "$.[0]"
+      "ResultPath": null
     },
     "HandleErrors": {
       "Type": "Task",
@@ -300,16 +299,62 @@
         "MaxAttempts": 3,
         "BackoffRate": 2.0
       } ],
-      "Next": "DeregisterJobDefinition",
+      "Next": "DeregisterJobDefinitionAfterHandleErrors",
       "ResultPath": null
     },
-    "DeregisterJobDefinition": {
+    "DeregisterJobDefinitionAfterHandleErrors": {
       "Type": "Task",
-      "End": true,
+      "Next": "CheckForErrors",
       "Parameters": {
         "JobDefinition.$": "$.batch.JobDefinitionName"
       },
-      "Resource": "arn:aws:states:::aws-sdk:batch:deregisterJobDefinition"
+      "Resource": "arn:aws:states:::aws-sdk:batch:deregisterJobDefinition",
+      "ResultPath": null
+    },
+    "DeregisterJobDefinition": {
+      "Type": "Task",
+      "Next": "CheckForErrors",
+      "Parameters": {
+        "JobDefinition.$": "$[0].batch.JobDefinitionName"
+      },
+      "Resource": "arn:aws:states:::aws-sdk:batch:deregisterJobDefinition",
+      "ResultPath": null
+    },
+    "CheckForErrors": {
+      "Type": "Choice",
+      "Choices": [
+        {
+          "Variable": "$.error",
+          "IsPresent": true,
+          "Next": "DownloadValidateError"
+        },
+        {
+          "Or": [
+            {
+              "Variable": "$[0].error",
+              "IsPresent": true
+            },
+            {
+              "Variable": "$[1].error",
+              "IsPresent": true
+            }
+          ],
+          "Next": "ConversionError"
+        }
+      ],
+      "Default": "EndPass"
+    },
+    "ConversionError": {
+      "Type": "Fail",
+      "Cause": "CXG and/or Seurat conversion failed."
+    },
+    "DownloadValidateError": {
+      "Type": "Fail",
+      "Cause": "An error occurred during Download/Validate."
+    },
+    "EndPass": {
+      "Type": "Pass",
+      "End": true
+    }
   }
 }
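
The routing added by the new CheckForErrors choice state is easiest to read as plain code. A minimal Python sketch of its semantics, assuming the state input is either a dict coming off the download/validate path or a two-element list coming off the parallel CXG/Seurat conversion branches (illustrative only, not part of the commit):

def check_for_errors(state_input):
    # "$.error" IsPresent on a dict input routes to DownloadValidateError.
    if isinstance(state_input, dict) and "error" in state_input:
        return "DownloadValidateError"
    # "$[0].error" or "$[1].error" IsPresent on a list input routes to ConversionError.
    if isinstance(state_input, list) and any("error" in branch for branch in state_input[:2]):
        return "ConversionError"
    # Default: EndPass, a terminal Pass state, so the execution succeeds.
    return "EndPass"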
21 changes: 0 additions & 21 deletions backend/layers/business/business.py
@@ -11,7 +11,6 @@
     CollectionMetadataUpdate,
     CollectionQueryFilter,
     DatasetArtifactDownloadData,
-    DeprecatedDatasetArtifactDownloadData,
 )
 from backend.layers.business.exceptions import (
     ArtifactNotFoundException,
@@ -544,26 +543,6 @@ def get_dataset_artifact_download_data(

         return DatasetArtifactDownloadData(file_size, url)

-    # TODO: Superseded by get_dataset_artifact_download_data. Remove with #5697.
-    def get_dataset_artifact_download_data_deprecated(
-        self, dataset_version_id: DatasetVersionId, artifact_id: DatasetArtifactId
-    ) -> DeprecatedDatasetArtifactDownloadData:
-        """
-        Returns download data for an artifact, including a presigned URL
-        """
-        artifacts = self.get_dataset_artifacts(dataset_version_id)
-        artifact = next((a for a in artifacts if a.id == artifact_id), None)
-
-        if not artifact:
-            raise ArtifactNotFoundException(f"Artifact {artifact_id} not found in dataset {dataset_version_id}")
-
-        file_name = artifact.uri.split("/")[-1]
-        file_type = artifact.type
-        file_size = self.s3_provider.get_file_size(artifact.uri)
-        presigned_url = self.s3_provider.generate_presigned_url(artifact.uri)
-
-        return DeprecatedDatasetArtifactDownloadData(file_name, file_type, file_size, presigned_url)
-
     def get_dataset_status(self, dataset_version_id: DatasetVersionId) -> DatasetStatus:
         """
         Returns the dataset status for a specific dataset version
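
With the deprecated path removed, callers get download data only from get_dataset_artifact_download_data, which carries just a size and a URL. A hypothetical call-site sketch (the business-logic object and ids are assumed; the fields match the DatasetArtifactDownloadData construction shown above):

download_data = business_logic.get_dataset_artifact_download_data(dataset_version_id, artifact_id)
# DatasetArtifactDownloadData now exposes only file_size and url;
# file_name, file_type, and presigned_url went away with the deprecated entity.
print(download_data.file_size, download_data.url)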
7 changes: 0 additions & 7 deletions backend/layers/business/business_interface.py
@@ -4,7 +4,6 @@
     CollectionMetadataUpdate,
     CollectionQueryFilter,
     DatasetArtifactDownloadData,
-    DeprecatedDatasetArtifactDownloadData,
 )
 from backend.layers.common.entities import (
     CanonicalCollection,
@@ -136,12 +135,6 @@ def get_dataset_artifact_download_data(
     ) -> DatasetArtifactDownloadData:
         pass

-    # TODO: Superseded by get_dataset_artifact_download_data. Remove with #5697.
-    def get_dataset_artifact_download_data_deprecated(
-        self, dataset_version_id: DatasetVersionId, artifact_id: DatasetArtifactId
-    ) -> DeprecatedDatasetArtifactDownloadData:
-        pass
-
     def update_dataset_version_status(
         self,
         dataset_version_id: DatasetVersionId,
11 changes: 1 addition & 10 deletions backend/layers/business/entities.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import List, Optional

-from backend.layers.common.entities import DatasetArtifactType, Link
+from backend.layers.common.entities import Link


 @dataclass
@@ -18,15 +18,6 @@ class DatasetArtifactDownloadData:
     url: str


-# TODO: Superseded by DatasetArtifactDownloadData. Remove with #5697.
-@dataclass
-class DeprecatedDatasetArtifactDownloadData:
-    file_name: str
-    file_type: DatasetArtifactType
-    file_size: int
-    presigned_url: str
-
-
 @dataclass
 class CollectionMetadataUpdate:
     """
2 changes: 1 addition & 1 deletion backend/layers/processing/schema_migration.py
@@ -123,7 +123,7 @@ def dataset_migrate(
         existing_dataset_version_id=DatasetVersionId(dataset_version_id),
         start_step_function=False,  # The schema_migration sfn will start the ingest sfn
     )
-    sfn_name = sfn_name_generator(dataset_version_id, prefix="migrate")
+    sfn_name = sfn_name_generator(new_dataset_version_id, prefix="migrate")
    return {
        "collection_version_id": collection_version_id,
        "dataset_version_id": new_dataset_version_id.id,
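
The effect of the one-line fix above is that the migration step function is named after the dataset version it creates rather than the one it replaces. A hypothetical sketch of the naming helper (the real sfn_name_generator lives elsewhere in the codebase; this body is assumed):

def sfn_name_generator(dataset_version_id, prefix: str = "") -> str:
    # Assumed implementation: execution names are keyed on the version id,
    # so using the stale id would label the run with the wrong version.
    return f"{prefix}_{dataset_version_id.id}"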
26 changes: 26 additions & 0 deletions backend/portal/api/enrichment.py
@@ -5,6 +5,8 @@

 from collections import OrderedDict

+from backend.common.feature_flag import FeatureFlagService, FeatureFlagValues
+

 def enrich_dataset_with_ancestors(dataset, key, ontology_mapping):
     """
@@ -15,6 +17,16 @@ def enrich_dataset_with_ancestors(dataset, key, ontology_mapping):

     terms = [e["ontology_term_id"] for e in dataset[key]]

+    is_schema_4 = FeatureFlagService.is_enabled(FeatureFlagValues.SCHEMA_4)
+    is_tissue = key == "tissue"
+    if is_tissue and is_schema_4:
+        # TODO remove is_schema_4 condition once Schema 4 is rolled out and
+        # feature flag is removed (#6266). "tissue" must include "tissue_type"
+        # when generating ancestors; "cell_type" and "development_stage" do not.
+        terms = [generate_tagged_tissue_ontology_id(e) for e in dataset[key]]
+    else:
+        terms = [e["ontology_term_id"] for e in dataset[key]]
+
     if not terms:
         return

@@ -23,3 +35,17 @@
     unique_ancestors = list(OrderedDict.fromkeys(flattened_ancestors))
     if unique_ancestors:
         dataset[f"{key}_ancestors"] = unique_ancestors
+
+
+def generate_tagged_tissue_ontology_id(tissue):
+    """
+    Generate ontology ID tagged with tissue_type for the given tissue. For
+    example, UBERON:1234567 (organoid).
+    """
+    tissue_id = tissue["ontology_term_id"]
+    # Handle possible None for tissue_type (possible during migration): default
+    # to "tissue".
+    tissue_type = tissue["tissue_type"] or "tissue"
+    if tissue_type == "tissue":
+        return tissue_id
+    return f"{tissue_id} ({tissue_type})"
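
A quick usage sketch of the new helper, reusing the docstring's illustrative UBERON id:

generate_tagged_tissue_ontology_id({"ontology_term_id": "UBERON:1234567", "tissue_type": "organoid"})
# -> "UBERON:1234567 (organoid)"
generate_tagged_tissue_ontology_id({"ontology_term_id": "UBERON:1234567", "tissue_type": None})
# -> "UBERON:1234567" (None falls back to "tissue", which stays untagged)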
36 changes: 0 additions & 36 deletions backend/portal/api/portal-api.yml
@@ -581,42 +581,6 @@ paths:
             $ref: "#/components/responses/403"
           "404":
             $ref: "#/components/responses/404"
-    post:
-      tags:
-        - datasets
-      summary: Request to download a dataset
-      description: >-
-        Request to download a file which on success will generate a pre-signed URL to download the dataset.
-      operationId: backend.portal.api.portal_api.post_dataset_asset
-      parameters:
-        - $ref: "#/components/parameters/path_dataset_id"
-        - name: asset_id
-          in: path
-          required: true
-          schema:
-            type: string
-      responses:
-        "200":
-          description: OK
-          content:
-            application/json:
-              schema:
-                type: object
-                properties:
-                  dataset_id:
-                    $ref: "#/components/schemas/dataset_id"
-                  presigned_url:
-                    type: string
-                  file_name:
-                    type: string
-                  file_size:
-                    type: number
-        "401":
-          $ref: "#/components/responses/401"
-        "403":
-          $ref: "#/components/responses/403"
-        "404":
-          $ref: "#/components/responses/404"

   /v1/datasets/{dataset_id}/status:
     get:
32 changes: 0 additions & 32 deletions backend/portal/api/portal_api.py
@@ -678,38 +678,6 @@ def get_dataset_asset(dataset_id: str, asset_id: str):
     return make_response(response, 200)


-def post_dataset_asset(dataset_id: str, asset_id: str):
-    """
-    Requests to download a dataset asset, by generating a presigned_url.
-    """
-
-    version = get_business_logic().get_dataset_version(DatasetVersionId(dataset_id))
-    if version is None:
-        raise NotFoundHTTPException(detail=f"'dataset/{dataset_id}' not found.")
-
-    try:
-        download_data = get_business_logic().get_dataset_artifact_download_data_deprecated(
-            DatasetVersionId(dataset_id), DatasetArtifactId(asset_id)
-        )
-    except ArtifactNotFoundException:
-        raise NotFoundHTTPException(detail=f"'dataset/{dataset_id}/asset/{asset_id}' not found.") from None
-
-    if download_data.file_size is None:
-        raise ServerErrorHTTPException() from None
-
-    if download_data.presigned_url is None:
-        raise ServerErrorHTTPException()
-
-    response = {
-        "dataset_id": dataset_id,
-        "file_name": download_data.file_name,
-        "file_size": download_data.file_size,
-        "presigned_url": download_data.presigned_url,
-    }
-
-    return make_response(response, 200)
-
-
 def get_dataset_assets(dataset_id: str):
     """
     Returns a list of all the artifacts registered to a dataset.
11 changes: 11 additions & 0 deletions backend/wmg/api/common/rollup.py
@@ -74,6 +74,17 @@ def _rollup_gene_expression(gene_expression_df, universal_set_cell_counts_df) ->
     Augments the input gene expression dataframe to include
     (gene_ontology_term_id, tissue_ontology_term_id, cell_type_ontology_term_id, <compare_dimension>)
     combinations for which numeric expression values should be aggregated during the rollup operation.
+    Then proceeds to perform rollup.
+
+    Specifically, this function pivots the tidy gene expression dataframe to get dense 2D arrays for each
+    numeric column (nnz, sum, sqsum). The rows of these 2D arrays are the (tissue, cell_type, <compare_dimension>)
+    combinations. The columns of these 2D arrays are the genes. The 2D arrays are then stacked into a 3D array.
+
+    Because gene expressions are sparse, we need to add missing data corresponding to (tissue, cell_type, <compare_dimension>)
+    combinations present in the cell counts dataframe and not present in the gene expression dataframe. This is done by
+    vertically stacking (axis=0) empty arrays corresponding to the missing combinations. The 3D array is then rolled up
+    along the first dimension, which aggregates the rows across cell type descendants for cell types present in the same
+    group (tissue, <compare_dimension>). The resulting non-zero values in the 3D array are then converted back to a tidy dataframe.

     Parameters
     ----------
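
The pivot-and-stack step described in the new docstring can be sketched in a few lines of Python. This is simplified (no <compare_dimension>, no padding of missing combinations from the cell counts dataframe) and the column names are assumed:

import numpy as np
import pandas as pd

def pivot_and_stack(tidy_df: pd.DataFrame) -> np.ndarray:
    # Build one dense 2D array per numeric column, then stack into 3D.
    planes = []
    for metric in ("nnz", "sum", "sqsum"):
        # Rows: (tissue, cell_type) groups; columns: genes; absent pairs -> 0.
        plane = tidy_df.pivot_table(
            index=["tissue_ontology_term_id", "cell_type_ontology_term_id"],
            columns="gene_ontology_term_id",
            values=metric,
            fill_value=0,
        )
        planes.append(plane.to_numpy())
    return np.stack(planes, axis=-1)  # shape: (groups, genes, metrics)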
8 changes: 6 additions & 2 deletions frontend/census-projects.json
@@ -19,7 +19,9 @@
     "publication_info": "Cul et al. (2023) bioRxiv",
     "publication_link": "",
     "project_page": "https://example.com/",
-    "notebook_links": [["link A", "https://example.com/"]]
+    "notebook_links": [["link A", "https://example.com/"]],
+    "n_cells": 6523,
+    "n_columns": 8721
   },
   {
     "tier": "maintained",
@@ -40,6 +42,8 @@
     "DOI": null,
     "publication_info": "Cul et al. (2023) bioRxiv",
     "publication_link": "",
-    "notebook_links": [["link A", "https://example.com/"]]
+    "notebook_links": [["link A", "https://example.com/"]],
+    "n_genes": 1312,
+    "n_columns": 2312
   }
 ]
8 changes: 4 additions & 4 deletions frontend/doc-site/03__Download Published Data.mdx
@@ -1,11 +1,11 @@
 # Downloading Published Data on CZ CELLxGENE Discover

-Clicking the download button launches a dialog that enables a dataset to be downloaded in h5ad (AnnData v0.8) and rds (Seurat v4) formats. All datasets adhere to the CELLxGENE single cell annotated data schema. Datasets can either be downloaded via the browser by clicking the blue download button, or via the command line by pasting the provided curl command.
-
-1. Select the data set you wish to download.
+Clicking the download button launches a dialog that enables a dataset to be downloaded in h5ad (AnnData v0.8) and rds (Seurat v4) formats. All datasets adhere to the CELLxGENE single cell annotated data schema.

 <Image src={"/doc-site/datasetHighlight.png"} />

-2. Make your selections and hit the download button
+Click the white Download button for the dataset that you wish to download.

 <Image src={"/doc-site/downloadDialog.png"} />
+
+Select either the h5ad (AnnData v0.8) or rds (Seurat v4) download format. Click the blue Download button to download the dataset via the browser. The permanent download link can also be copied, shared, and pasted into a browser address bar.
