Merge pull request #6063 from chanzuckerberg/staging

chore: prod deploy 10/23

atarashansky authored Oct 23, 2023
2 parents 2152552 + 1f8b142 commit f8f10c7
Showing 188 changed files with 3,363 additions and 5,630 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/build-images-and-create-deployment.yml
@@ -4,10 +4,9 @@ on:
push:
branches:
- main
- staging
- prod
repository_dispatch:
types: [build-images]
types: [build-images-for-staging]
env:
# Force using BuildKit instead of normal Docker, required so that metadata
# is written/read to allow us to use layers of previous builds as cache.
21 changes: 0 additions & 21 deletions .github/workflows/test-receiving-repository-dispatch.yml

This file was deleted.

19 changes: 0 additions & 19 deletions .github/workflows/test-sending-repository-dispatch.yml

This file was deleted.

25 changes: 0 additions & 25 deletions .github/workflows/test-workflow-run-after-push-tests-pass.yml

This file was deleted.

@@ -51,7 +51,7 @@ jobs:
uses: peter-evans/repository-dispatch@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
event-type: build-images
event-type: build-images-for-staging
client-payload: '{"ref": "refs/heads/staging"}'

- name: Send slack notification if main not merged into staging
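
Note that the renamed event type must match on both ends of the dispatch: the sender in this hunk and the receiving workflow's repository_dispatch `types` filter shown earlier. As a hedged illustration, the REST call that peter-evans/repository-dispatch@v2 performs is roughly the following sketch; OWNER/REPO and the token are placeholders.

import json
import urllib.request

def send_repository_dispatch(token: str) -> None:
    # POST /repos/{owner}/{repo}/dispatches fires a repository_dispatch event;
    # "event_type" must match the receiving workflow's `types` filter.
    req = urllib.request.Request(
        "https://api.github.com/repos/OWNER/REPO/dispatches",
        data=json.dumps(
            {
                "event_type": "build-images-for-staging",
                "client_payload": {"ref": "refs/heads/staging"},
            }
        ).encode("utf-8"),
        headers={
            "Accept": "application/vnd.github+json",
            "Authorization": f"Bearer {token}",
        },
        method="POST",
    )
    urllib.request.urlopen(req)  # GitHub returns 204 No Content on success
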
4 changes: 2 additions & 2 deletions .happy/terraform/envs/dev/main.tf
@@ -16,8 +16,8 @@ module stack {
batch_container_memory_limit = 28000
wmg_batch_container_memory_limit = 248000
wmg_desired_vcpus = 128
cg_desired_vcpus = 48
cg_batch_container_memory_limit = 92000
cg_desired_vcpus = 128
cg_batch_container_memory_limit = 248000
backend_memory = 8192
frontend_memory = 4096
backend_instance_count = 4
4 changes: 2 additions & 2 deletions .happy/terraform/envs/prod/main.tf
@@ -16,8 +16,8 @@ module stack {
batch_container_memory_limit = 63500
wmg_batch_container_memory_limit = 248000
wmg_desired_vcpus = 128
cg_desired_vcpus = 48
cg_batch_container_memory_limit = 92000
cg_desired_vcpus = 128
cg_batch_container_memory_limit = 248000
backend_memory = 30 * 1024
frontend_memory = 4096
backend_instance_count = 6
4 changes: 2 additions & 2 deletions .happy/terraform/envs/stage/main.tf
@@ -16,8 +16,8 @@ module stack {
batch_container_memory_limit = 63500
wmg_batch_container_memory_limit = 248000
wmg_desired_vcpus = 128
cg_batch_container_memory_limit = 92000
cg_desired_vcpus = 48
cg_batch_container_memory_limit = 248000
cg_desired_vcpus = 128
backend_memory = 8192
frontend_memory = 4096
backend_instance_count = 4
53 changes: 53 additions & 0 deletions .happy/terraform/modules/schema_migration/main.tf
@@ -113,6 +113,59 @@ resource aws_batch_job_definition schema_migrations {
})
}

resource aws_batch_job_definition publish_revisions {
type = "container"
name = "dp-${var.deployment_stage}-${var.custom_stack_name}-${local.name}-publish-revisions"
container_properties = jsonencode({
command = ["python3",
"-m",
"backend.layers.processing.publish_revisions",
],
jobRoleArn= var.batch_role_arn,
image= var.image,
environment= [
{
name= "ARTIFACT_BUCKET",
value= var.artifact_bucket
},
{
name= "DEPLOYMENT_STAGE",
value= var.deployment_stage
},
{
name= "AWS_DEFAULT_REGION",
value= data.aws_region.current.name
},
{
name= "REMOTE_DEV_PREFIX",
value= var.remote_dev_prefix
},
{
name= "DATASETS_BUCKET",
value= var.datasets_bucket
},
],
resourceRequirements = [
{
type= "VCPU",
value= "2"
},
{
type= "MEMORY",
value= "4096"
}
]
logConfiguration= {
logDriver= "awslogs",
options= {
awslogs-group= aws_cloudwatch_log_group.batch_cloud_watch_logs_group.id,
awslogs-region= data.aws_region.current.name
}
}
})
}
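
A minimal sketch of how this job definition could be submitted out of band, assuming boto3; the queue name and the rendered definition name below are placeholders that follow the naming pattern above.

import boto3

batch = boto3.client("batch")
batch.submit_job(
    jobName="publish-revisions-adhoc",
    jobQueue="dp-prod-schema-migration",  # placeholder queue name
    # Rendered from "dp-${deployment_stage}-${custom_stack_name}-${local.name}-publish-revisions"
    jobDefinition="dp-prod-stack-schema-migration-publish-revisions",
)
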


resource aws_sfn_state_machine sfn_schema_migration {
name = "dp-${var.deployment_stage}-${var.custom_stack_name}-${local.name}-sfn"
role_arn = var.sfn_role_arn
6 changes: 5 additions & 1 deletion .happy/terraform/modules/sfn/main.tf
@@ -58,7 +58,11 @@ resource "aws_sfn_state_machine" "state_machine" {
},
{
"Name": "DATASET_ID",
"Value.$": "$.dataset_id"
"Value.$": "$.dataset_id"
},
{
"Name": "COLLECTION_ID",
"Value.$": "$.collection_id"
},
{
"Name": "STEP_NAME",
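
The new COLLECTION_ID container override resolves against the state machine's input, so executions now need a collection_id key alongside dataset_id. A hedged sketch, with a placeholder ARN and placeholder IDs:

import json
import boto3

sfn = boto3.client("stepfunctions")
sfn.start_execution(
    stateMachineArn="arn:aws:states:us-west-2:123456789012:stateMachine:example-sfn",
    input=json.dumps(
        {
            "dataset_id": "example-dataset-id",
            "collection_id": "example-collection-id",  # consumed by "Value.$": "$.collection_id"
        }
    ),
)
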
4 changes: 3 additions & 1 deletion Dockerfile.wmg_pipeline
@@ -19,6 +19,8 @@ ADD backend/wmg/__init__.py backend/wmg/__init__.py
ADD backend/wmg/config.py backend/wmg/config.py
ADD backend/wmg/data backend/wmg/data
ADD backend/wmg/pipeline backend/wmg/pipeline
ADD backend/wmg/api backend/wmg/api
ADD backend/cellguide/pipeline backend/cellguide/pipeline
ADD backend/layers backend/layers
ADD backend/common backend/common

@@ -29,4 +31,4 @@ LABEL commit=${HAPPY_COMMIT}
ENV COMMIT_SHA=${HAPPY_COMMIT}
ENV COMMIT_BRANCH=${HAPPY_BRANCH}

CMD ["python3", "-m", "backend.wmg.pipeline.cube_pipeline"]
CMD ["python3", "-m", "backend.wmg.pipeline"]
@@ -1,7 +1,9 @@
import logging

from backend.cellguide.pipeline.computational_marker_genes.computational_markers import MarkerGenesCalculator
from backend.cellguide.pipeline.computational_marker_genes.constants import MARKER_SCORE_THRESHOLD
from backend.cellguide.pipeline.computational_marker_genes.computational_markers import (
MARKER_SCORE_THRESHOLD,
MarkerGenesCalculator,
)
from backend.cellguide.pipeline.constants import COMPUTATIONAL_MARKER_GENES_FOLDERNAME, MARKER_GENE_PRESENCE_FILENAME
from backend.cellguide.pipeline.ontology_tree import get_ontology_tree_builder
from backend.cellguide.pipeline.ontology_tree.tree_builder import OntologyTreeBuilder
@@ -10,9 +10,6 @@
from dask.diagnostics import ProgressBar
from tqdm import tqdm

from backend.cellguide.pipeline.computational_marker_genes.constants import (
MARKER_SCORE_THRESHOLD,
)
from backend.cellguide.pipeline.computational_marker_genes.types import ComputationalMarkerGenes
from backend.cellguide.pipeline.computational_marker_genes.utils import (
bootstrap_rows_percentiles,
@@ -43,6 +40,8 @@
or any arbitrary combinations of metadata dimensions.
"""

MARKER_SCORE_THRESHOLD = 0.5


class MarkerGenesCalculator:
def __init__(self, *, snapshot: WmgSnapshot, all_cell_type_ids_in_corpus: list[str], groupby_terms: list[str]):
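
For orientation, a score threshold like the relocated MARKER_SCORE_THRESHOLD is typically used to filter candidate markers. Illustrative only; the gene IDs and variable names below are hypothetical.

MARKER_SCORE_THRESHOLD = 0.5

candidate_scores = {"ENSG00000010610": 0.81, "ENSG00000142156": 0.32}
marker_genes = [
    gene for gene, score in candidate_scores.items() if score >= MARKER_SCORE_THRESHOLD
]  # keeps only the first gene in this toy example
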

This file was deleted.

@@ -1,16 +1,16 @@
from backend.cellguide.pipeline.canonical_marker_genes.utils import format_citation_dp
from backend.cellguide.pipeline.source_collections.types import SourceCollectionsData
from backend.common.utils.rollup import descendants
from backend.wmg.data.utils import get_collections_from_curation_api, get_datasets_from_curation_api
from backend.wmg.data.utils import get_collections_from_discover_api, get_datasets_from_discover_api


def generate_source_collections_data(all_cell_type_ids_in_corpus: list[str]) -> dict[str, list[SourceCollectionsData]]:
"""
For each cell type id in the corpus, we want to generate a SourceCollectionsData object, which contains
metadata about the source data for each cell type
"""
all_datasets = get_datasets_from_curation_api()
all_collections = get_collections_from_curation_api()
all_datasets = get_datasets_from_discover_api()
all_collections = get_collections_from_discover_api()

collections_dict = {collection["collection_id"]: collection for collection in all_collections}
datasets_dict = {dataset["dataset_id"]: dataset for dataset in all_datasets}
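
The renamed helpers point at the Discover API as the data source. A hypothetical sketch of such a helper; the base URL and endpoint path are assumptions, not taken from this diff.

import requests

def get_datasets_from_discover_api() -> list[dict]:
    # Hypothetical endpoint; the real helper lives in backend/wmg/data/utils.py.
    resp = requests.get("https://api.cellxgene.cziscience.com/curation/v1/datasets")
    resp.raise_for_status()
    return resp.json()
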
4 changes: 0 additions & 4 deletions backend/common/utils/exceptions.py
@@ -23,9 +23,5 @@ class NonExistentDatasetException(CorporaException):
pass


class CubeValidationException(Exception):
pass


class MarkerGeneCalculationException(Exception):
pass
5 changes: 2 additions & 3 deletions backend/common/utils/result_notification.py
@@ -85,7 +85,7 @@ def gen_wmg_pipeline_failure_message(failure_info: str) -> dict:
}


def gen_wmg_pipeline_success_message(snapshot_path: str, dataset_count: int, cell_count: int, gene_count: int) -> dict:
def gen_wmg_pipeline_success_message(snapshot_path: str, dataset_count: int, cell_count: int) -> dict:
return {
"blocks": [
{
@@ -101,8 +101,7 @@ def gen_wmg_pipeline_success_message(snapshot_path: str, dataset_count: int, cel
"text": {
"type": "mrkdwn",
"text": f"\n* WMG snapshot stored in {snapshot_path}"
f"\n* The cube contains {cell_count} cells from {dataset_count} "
f"\n datasets, with expression scores across {gene_count} genes.",
f"\n* The cube contains {cell_count} cells from {dataset_count} datasets.",
},
},
]
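
With gene_count dropped from the signature, callers build the Slack payload from three values. A minimal usage sketch, assuming a placeholder incoming-webhook URL and placeholder counts:

import requests

from backend.common.utils.result_notification import gen_wmg_pipeline_success_message

payload = gen_wmg_pipeline_success_message(
    snapshot_path="s3://example-bucket/snapshots/1698000000",  # placeholder path
    dataset_count=500,
    cell_count=30_000_000,
)
requests.post("https://hooks.slack.com/services/T000/B000/XXXX", json=payload)  # placeholder webhook
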
22 changes: 21 additions & 1 deletion backend/curation/api/curation-api.yml
@@ -716,6 +716,14 @@ components:
type: string
nullable: true
example: ["patient", "seqBatch"]
citation:
description: |
Citation that includes a downloadable permalink to the h5ad artifact for this dataset, a permalink to the
collection it belongs to in CZ CELLxGENE Discover, and--if applicable--the Publication DOI associated with the dataset.
See details about the exact format in the
[schema definition](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#citation)
type: string
nullable: true
collection_list:
description: Collection metadata
properties:
@@ -988,6 +996,8 @@ components:
items:
$ref: "#/components/schemas/ontology_element"
type: array
citation:
$ref: "#/components/schemas/citation"
dataset_id:
$ref: "#/components/schemas/dataset_id"
dataset_version_id:
@@ -1092,6 +1102,8 @@ components:
items:
$ref: "#/components/schemas/ontology_element"
type: array
citation:
$ref: "#/components/schemas/citation"
collection_doi:
$ref: "#/components/schemas/doi"
collection_id:
@@ -1189,6 +1201,8 @@ components:
items:
$ref: "#/components/schemas/ontology_element"
type: array
citation:
$ref: "#/components/schemas/citation"
collection_id:
$ref: "#/components/schemas/collection_id"
collection_version_id:
@@ -1291,7 +1305,13 @@ components:
[tissue label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#tissue)
default: []
items:
$ref: "#/components/schemas/ontology_element"
allOf:
- $ref: "#/components/schemas/ontology_element"
- type: object
properties:
tissue_type:
type: string
nullable: true
type: array
dataset_tombstone:
description: When True, this Dataset was withdrawn from data curation at the request of its submitter.
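
For illustration, a citation value shaped like the linked schema definition might read as below; the URLs and IDs are placeholders, and the exact wording is an unverified paraphrase of the schema.

citation = (
    "Publication: https://doi.org/10.1000/example "
    "Dataset Version: https://datasets.cellxgene.cziscience.com/<dataset_version_id>.h5ad "
    "curated and distributed by CZ CELLxGENE Discover in Collection: "
    "https://cellxgene.cziscience.com/collections/<collection_id>"
)
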
5 changes: 5 additions & 0 deletions backend/curation/api/v1/curation/collections/common.py
@@ -227,6 +227,10 @@ def reshape_dataset_for_curation_api(
if col is not None:
ds[column] = col

if ds.get("tissue") is not None and CorporaConfig().schema_4_feature_flag.lower() == "false":
for tissue in ds["tissue"]:
del tissue["tissue_type"]

ds["dataset_id"] = dataset_version.dataset_id.id
ds["dataset_version_id"] = dataset_version.version_id.id
# Get none preview specific dataset fields
@@ -318,6 +322,7 @@ class EntityColumns:
"mean_genes_per_cell",
"schema_version",
"donor_id",
"citation",
]

dataset_metadata_cols = [
8 changes: 7 additions & 1 deletion backend/layers/common/entities.py
@@ -158,13 +158,18 @@ class OntologyTermId:
ontology_term_id: str


@dataclass
class TissueOntologyTermId(OntologyTermId):
tissue_type: Optional[str] = None


@dataclass_json
@dataclass
class DatasetMetadata:
name: str
schema_version: str
organism: List[OntologyTermId]
tissue: List[OntologyTermId]
tissue: List[TissueOntologyTermId]
assay: List[OntologyTermId]
disease: List[OntologyTermId]
sex: List[OntologyTermId]
@@ -178,6 +183,7 @@ class DatasetMetadata:
donor_id: List[str]
is_primary_data: str
x_approximate_distribution: Optional[str]
citation: Optional[str] = None
default_embedding: Optional[str] = None
embeddings: Optional[List[str]] = None
feature_biotype: Optional[List[str]] = None
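
A small usage sketch of the new dataclass, assuming OntologyTermId also defines a label field (that line is not visible in this hunk):

from backend.layers.common.entities import TissueOntologyTermId

tissue = TissueOntologyTermId(
    label="lung",                       # assumed inherited field, not shown above
    ontology_term_id="UBERON:0002048",
    tissue_type="tissue",               # new Optional[str] field, defaults to None
)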