Merge pull request #6370 from chanzuckerberg/staging

chore: prod deployment, dec 18th
chanzuckerberg · Dec 18, 2023 · 91356f8 · 91356f8
2 parents 5687754 + 26d208a
commit 91356f8
Show file tree

Hide file tree

Showing 27 changed files with 961 additions and 743 deletions.
diff --git a/backend/curation/api/curation-api.yml b/backend/curation/api/curation-api.yml
@@ -709,7 +709,7 @@ components:
     batch_condition:
       description: |
         These keys define the batches that a normalization or integration algorithm should be aware of.
-        [batch condition schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#batch_condition)
+        [batch condition schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#batch_condition)
 
       type: array
       items:
@@ -992,7 +992,7 @@ components:
           type: integer
         cell_type:
           description: |
-            [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#cell_type)
+            [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#cell_type)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
@@ -1007,7 +1007,7 @@ components:
           $ref: "#/components/schemas/default_embedding"
         development_stage:
           description: |
-            [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#development_stage)
+            [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#development_stage)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
@@ -1034,7 +1034,7 @@ components:
           type: number
         title:
           description: |
-            [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#title)
+            [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#title)
           nullable: true
           type: string
         organism:
@@ -1061,14 +1061,14 @@ components:
           type: string
         self_reported_ethnicity:
           description: |
-            [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#self_reported_ethnicity)
+            [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#self_reported_ethnicity)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
           type: array
         sex:
           description: |
-            [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#sex)
+            [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#sex)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
@@ -1098,7 +1098,7 @@ components:
           type: integer
         cell_type:
           description: |
-            [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#cell_type)
+            [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#cell_type)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
@@ -1117,7 +1117,7 @@ components:
           $ref: "#/components/schemas/dataset_version_id"
         development_stage:
           description: |
-            [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#development_stage)
+            [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#development_stage)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
@@ -1136,7 +1136,7 @@ components:
           type: number
         title:
           description: |
-            [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#title)
+            [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#title)
           nullable: true
           type: string
         organism:
@@ -1160,14 +1160,14 @@ components:
           type: string
         self_reported_ethnicity:
           description: |
-            [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#self_reported_ethnicity)
+            [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#self_reported_ethnicity)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
           type: array
         sex:
           description: |
-            [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#sex)
+            [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#sex)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
@@ -1197,7 +1197,7 @@ components:
           type: integer
         cell_type:
           description: |
-            [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#cell_type)
+            [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#cell_type)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
@@ -1219,7 +1219,7 @@ components:
           $ref: "#/components/schemas/default_embedding"
         development_stage:
           description: |
-            [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#development_stage)
+            [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#development_stage)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
@@ -1244,7 +1244,7 @@ components:
           type: number
         title:
           description: |
-            [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#title)
+            [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#title)
           nullable: true
           type: string
         organism:
@@ -1261,14 +1261,14 @@ components:
           type: string
         self_reported_ethnicity:
           description: |
-            [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#self_reported_ethnicity)
+            [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#self_reported_ethnicity)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
           type: array
         sex:
           description: |
-            [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#sex)
+            [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#sex)
           default: []
           items:
             $ref: "#/components/schemas/ontology_element"
@@ -1282,28 +1282,28 @@ components:
       type: object
     dataset_assay:
       description: |
-        [assay label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#assay)
+        [assay label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#assay)
       default: []
       items:
         $ref: "#/components/schemas/ontology_element"
       type: array
     dataset_disease:
       description: |
-        [disease label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#disease)
+        [disease label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#disease)
       default: []
       items:
         $ref: "#/components/schemas/ontology_element"
       type: array
     dataset_organism:
       description: |
-        [organism label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#organism)
+        [organism label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#organism)
       default: []
       items:
         $ref: "#/components/schemas/ontology_element"
       type: array
     dataset_tissue:
       description: |
-        [tissue label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#tissue)
+        [tissue label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#tissue)
       default: []
       items:
         allOf:
@@ -1351,7 +1351,7 @@ components:
         CELLxGENE Discover runs a heuristic to detect the approximate distribution
         of the data in X so that it can accurately calculate statistical properties
         of the data. This field enables the curator to override this heuristic and
-        specify the data distribution explicitly. [x_approximate_distribution](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#x_approximate_distribution)
+        specify the data distribution explicitly. [x_approximate_distribution](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#x_approximate_distribution)
       enum:
         - COUNT
         - NORMAL
@@ -1407,7 +1407,7 @@ components:
       nullable: true
     is_primary_data:
       description: |
-        [is_primary_data](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#is_primary_data)
+        [is_primary_data](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#is_primary_data)
 
         Describes whether cellular observations for this Dataset are all
         canonical (True), all non-canonical (False), or contain a mixture (True, False).
@@ -1524,7 +1524,7 @@ components:
         - raw.X
     suspension_type:
       description: |
-        [suspension_type](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#suspension_type)
+        [suspension_type](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#suspension_type)
 
         List of unique suspension types represented in the dataset, corresponding to the dataset's assay(s).
         Possible item values are 'nucleus', 'cell', and/or 'na'.

diff --git a/frontend/census-projects.json b/frontend/census-projects.json
@@ -1,37 +1,4 @@
 [
-  {
-    "tier": "maintained",
-    "title": "Geneformer embeddings fine-tuned for CELLxGENE Census cell subclass classification",
-    "description": "Geneformer is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.\nThese cell embeddings are derived from a Geneformer model CZI fine-tuned for cell subclass classification. As the fine-tuning procedure remains experimental and wasn’t performed by the Geneformer authors, these embeddings should not be used to assess performance of the Geneformer ",
-    "primary_contact": {
-      "name": "CELLxGENE Discover Team",
-      "email": "[email protected]",
-      "affiliation": "CZI"
-    },
-    "DOI": "10.1038/s41586-023-06139-9",
-    "publication_info": "",
-    "publication_link": "",
-    "project_page": "",
-    "additional_information": "Beginning with the geneformer-12L-30M pretrained model published by Theodoris et al. (huggingface.co/ctheodoris/Geneformer), a BertForSequenceClassification model was trained to predict cell subclass (as annotated in CELLxGENE Discover see https://cellxgene.cziscience.com/collections). Embeddings were then generated using Geneformer’s EmbExtractor module with emb_layer=0.\nFor full details and a reproducible workflow please see: https://github.com/chanzuckerberg/cellxgene-census/blob/main/tools/models/geneformer/README.md",
-    "model_link": "s3://cellxgene-contrib-public/models/geneformer/2023-12-15/homo_sapiens/fined-tuned-model/",
-    "data_type": "obs_embedding",
-    "obsm_layer": "geneformer",
-    "census_version": "2023-12-15",
-    "experiment_name": "homo_sapiens",
-    "measurement_name": "RNA",
-    "n_cells": 62998417,
-    "n_columns": 512,
-    "n_features": 512,
-    "notebook_links": [
-      [
-        "Using trained model",
-        "https://chanzuckerberg.github.io/cellxgene-census/notebooks/analysis_demo/comp_bio_geneformer_prediction.html"
-      ]
-    ],
-    "submission_date": "2023-11-06",
-    "last_updated": null,
-    "revised_by": null
-  },
   {
     "tier": "maintained",
     "title": "scVI integrated-embeddings with explicit modeling of batch effects",
@@ -130,6 +97,40 @@
     "last_updated": null,
     "revised_by": null
   },
+  {
+    "tier": "maintained",
+    "title": "Geneformer embeddings fine-tuned for CELLxGENE Census cell subclass classification",
+    "description": "Geneformer is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.\nThese cell embeddings are derived from a Geneformer model CZI fine-tuned for cell subclass classification. As the fine-tuning procedure remains experimental and wasn’t performed by the Geneformer authors, these embeddings should not be used to assess performance of the pre-trained Geneformer model.",
+    "primary_contact": {
+      "name": "CELLxGENE Discover Team",
+      "email": "[email protected]",
+      "affiliation": "CZI"
+    },
+    "DOI": "10.1038/s41586-023-06139-9",
+    "publication_info": "",
+    "publication_link": "",
+    "project_page": "",
+    "additional_information": "Beginning with the geneformer-12L-30M pretrained model published by Theodoris et al. (huggingface.co/ctheodoris/Geneformer), a BertForSequenceClassification model was trained to predict cell subclass (as annotated in CELLxGENE Discover see https://cellxgene.cziscience.com/collections). Embeddings were then generated using Geneformer’s EmbExtractor module with emb_layer=0.\nFor full details and a reproducible workflow please see: https://github.com/chanzuckerberg/cellxgene-census/blob/main/tools/models/geneformer/README.md",
+    "model_link": "s3://cellxgene-contrib-public/models/geneformer/2023-12-15/homo_sapiens/fined-tuned-model/",
+    "data_type": "obs_embedding",
+    "obsm_layer": "geneformer",
+    "census_version": "2023-12-15",
+    "experiment_name": "homo_sapiens",
+    "measurement_name": "RNA",
+    "n_cells": 62998417,
+    "n_columns": 512,
+    "n_features": 512,
+    "notebook_links": [
+      [
+        "Using trained model",
+        "https://chanzuckerberg.github.io/cellxgene-census/notebooks/analysis_demo/comp_bio_geneformer_prediction.html"
+      ]
+    ],
+    "submission_date": "2023-11-06",
+    "last_updated": null,
+    "revised_by": null
+  },
+
   {
     "tier": "community",
     "title": "PINNACLE: Contextual AI Model for Single-Cell Protein Biology",
@@ -213,12 +214,12 @@
     "additional_contacts": [
       {
         "name": "Jialong Jiang",
-        "email": "[email protected]" ,
+        "email": "[email protected]",
         "affiliation": "Thomson Lab, Caltech"
       },
       {
         "name": "Yingying Gong",
-        "email": "[email protected]" ,
+        "email": "[email protected]",
         "affiliation": "Thomson Lab, Caltech"
       }
     ],

diff --git a/frontend/doc-site/032__Contribute and Publish Data.mdx b/frontend/doc-site/032__Contribute and Publish Data.mdx
@@ -33,6 +33,23 @@ We need the following collection metadata (i.e. details associated with your pub
   - Contact: name and email
   - Publication/preprint DOI: can be added later
   - URLs: any additional URLs for related data or resources, such as GEO or protocols.io - can be added later
+  - Consortia: optional, and can be added later. Can be one or more of:
+    - Allen Institute for Brain Science
+    - BRAIN Initiative
+    - CZ Biohub
+    - CZI Neurodegeneration Challenge Network
+    - CZI Single-Cell Biology
+    - European Union’s Horizon 2020
+    - GenitoUrinary Development Molecular Anatomy Project (GUDMAP)
+    - Gut Cell Atlas
+    - Human BioMolecular Atlas Program (HuBMAP)
+    - Human Cell Atlas (HCA)
+    - Human Pancreas Analysis Program (HPAP)
+    - Human Tumor Atlas Network (HTAN)
+    - Kidney Precision Medicine Project (KPMP)
+    - LungMAP
+    - SEA-AD
+    - Wellcome HCA Strategic Science Support
 
 Each dataset needs the following information added to a single h5ad (AnnData 0.8) format file:
 
@@ -49,12 +66,13 @@ Each dataset needs the following information added to a single h5ad (AnnData 0.8
   - donor_id: free-text identifier that distinguishes the unique individual that data were derived from. Choose an identifier that is unlikely to be used in other studies (e.g. donor_1 is unlikely to be unique in the data corpus)
   - development_stage_ontology_term_id: [HsapDv](https://www.ebi.ac.uk/ols/ontologies/hsapdv) if human, [MmusDv](https://www.ebi.ac.uk/ols/ontologies/mmusdv) if mouse, `unknown` if information unavailable
   - sex_ontology_term_id: `PATO:0000384` for male, `PATO:0000383` for female, or `unknown` if unavailable
-  - self_reported_ethnicity_ontology_term_id: [HANCESTRO](https://www.ebi.ac.uk/ols/ontologies/hancestro) use `multiethnic` if more than one ethnicity is reported. If human and information unavailable, use `unknown`. Use `na` if non-human.
+  - self_reported_ethnicity_ontology_term_id: [HANCESTRO](https://www.ebi.ac.uk/ols/ontologies/hancestro); multiple comma-separated terms may be used if more than one ethnicity is reported. If human and information unavailable, use `unknown`. Use `na` if non-human.
   - disease_ontology_term_id: [MONDO](https://www.ebi.ac.uk/ols/ontologies/mondo) or `PATO:0000461` for 'normal'
+  - tissue_type: `tissue`, `organoid`, or `cell culture`
   - tissue_ontology_term_id: [UBERON](https://www.ebi.ac.uk/ols/ontologies/uberon)
   - cell_type_ontology_term_id: [CL](https://www.ebi.ac.uk/ols/ontologies/cl)
   - assay_ontology_term_id: [EFO](https://www.ebi.ac.uk/ols/ontologies/efo)
-  - suspension_type: `cell`, `nucleus`, or `na`, as corresponding to assay. Use [this table](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#suspension_type) defined in the data schema for guidance. If the assay does not appear in this table, the most appropriate value MUST be selected and the [curation team informed](mailto:[email protected]) during submission so that the assay can be added to the table.
+  - suspension_type: `cell`, `nucleus`, or `na`, as corresponding to assay. Use [this table](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#suspension_type) defined in the data schema for guidance. If the assay does not appear in this table, the most appropriate value MUST be selected and the [curation team informed](mailto:[email protected]) during submission so that the assay can be added to the table.
 - **Embeddings in obsm**:
   - One or more two-dimensional embeddings, prefixed with 'X\_'
 - **Features in var & raw.var (if present)**: