Skip to content

Commit

Permalink
Rc 13.1.0 (#456)
Browse files Browse the repository at this point in the history
* Revise dashboards (#433)

* Don't cache local install (#431)

* Don't put cache dir

* Remove -e

* Update codeowners to use genie reviewers

* Remove missing vital status and assay info sections

* hotfix dockerfile

* pip3 install no longer works in dockerfile for locally built packages

* case-insensitive comparison

* test case-insensitive comparison

* black on genie_registry/assay.py only

* black on genie/process_functions.py only

* rename new test function

* replace underscores with hyphens

* add test for underscore versus hyphen

* black on genie_registry/assay.py

* Modify tsa1, tsa2, ref maf error message (#438)

* Modify tsa1, tsa2, ref maf error message

* Fix tests

* Add sample class filter (#441)

* Add sample class filter

* lint

* Lint

* only filter for public release

* lint

* Make sure processing pipeline doesn't fail for older releases that don't have SAMPLE_CLASS

* Fix Docker build (#445)

* Change docker tag and add dependency

* comment in sdist

* Update pandas version (#446)

* Use iloc

* Use pd.concat

* Use pd.concat

* Use pd.concat

* Use pd.concat instead of append

* Update genie/database_to_staging.py

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* use pd.concat

* Use pd.concat

* lint

* exclude tests

* Use pd.concat

* Use mask to replace values

* Lint

* append

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* Add code of conduct (#448)

* year or int of death is not applicable for living patients (#450)

* year or int of death is not applicable for living patients

* update tests for dead variable

Co-authored-by: Thomas Yu <[email protected]>

* support scheduled job secrets (#453)

* update version number

Co-authored-by: Haley Hunter-Zinck <[email protected]>
Co-authored-by: Haley Hunter-Zinck <[email protected]>
  • Loading branch information
3 people authored Mar 10, 2022
1 parent c37ee08 commit 4dea322
Show file tree
Hide file tree
Showing 15 changed files with 65 additions and 49 deletions.
3 changes: 3 additions & 0 deletions CODE_OF_CONDUCT.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Contributor Covenant Code of Conduct

We subscribe to [Sage Bionetwork's Code of Conduct](https://sagebionetworks.org/code-of-conduct/), which is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/).
4 changes: 2 additions & 2 deletions bin/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def main(args):
"and processingType = 'public'" % processTrackerSynId
)
processTrackerDf = processTracker.asDataFrame()
processTrackerDf["timeStartProcessing"][0] = str(int(time.time() * 1000))
processTrackerDf["timeStartProcessing"].iloc[0] = str(int(time.time() * 1000))
syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

caseListEntities, genePanelEntities = consortium_to_public.consortiumToPublic(
Expand Down Expand Up @@ -206,7 +206,7 @@ def main(args):
"processingType = 'public'" % processTrackerSynId
)
processTrackerDf = processTracker.asDataFrame()
processTrackerDf["timeEndProcessing"][0] = str(int(time.time() * 1000))
processTrackerDf["timeEndProcessing"].iloc[0] = str(int(time.time() * 1000))
syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

if not args.test:
Expand Down
2 changes: 1 addition & 1 deletion genie/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "13.0.0"
__version__ = "13.1.0"
2 changes: 1 addition & 1 deletion genie/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def consortiumToPublic(
elif "CNA" in entName:
cna = syn.get(entId, followLink=True)
cnaDf = pd.read_csv(cna.path, sep="\t")
cna_columns = publicReleaseSamples.append(pd.Series("Hugo_Symbol"))
cna_columns = pd.concat([publicReleaseSamples, pd.Series("Hugo_Symbol")])
# parse out the CNA columns to keep
cnaDf = cnaDf[cnaDf.columns[cnaDf.columns.isin(cna_columns)]]
text = process_functions.removeFloat(cnaDf)
Expand Down
10 changes: 5 additions & 5 deletions genie/dashboard_table_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def get_center_data_completion(center, df):
]
completeness = float(sum(not_missing)) / int(total)
returned = pd.DataFrame([[col, center, total, completeness]])
center_data = center_data.append(returned)
center_data = pd.concat([center_data, returned])
return center_data


Expand Down Expand Up @@ -488,12 +488,12 @@ def update_sample_difference_table(syn, database_mappingdf):
]

if not new_centers.empty:
prior_release = prior_release.append(pd.DataFrame(index=new_centers))
prior_release = pd.concat([prior_release, pd.DataFrame(index=new_centers)])
prior_release = prior_release.fillna(0)
difference = current_release - prior_release
difference["Center"] = difference.index
difference["Release"] = release_name
diff_between_releasesdf = diff_between_releasesdf.append(difference)
diff_between_releasesdf = pd.concat([diff_between_releasesdf, difference])

difftable_db = syn.tableQuery("SELECT * FROM %s" % sample_diff_count_synid)
difftable_dbdf = difftable_db.asDataFrame()
Expand Down Expand Up @@ -540,13 +540,13 @@ def update_data_completeness_table(syn, database_mappingdf):
lambda center: get_center_data_completion(center, sampledf)
)
for center_info in center_infos:
data_completenessdf = data_completenessdf.append(center_info)
data_completenessdf = pd.concat([data_completenessdf, center_info])

center_infos = patientdf.CENTER.drop_duplicates().apply(
lambda center: get_center_data_completion(center, patientdf)
)
for center_info in center_infos:
data_completenessdf = data_completenessdf.append(center_info)
data_completenessdf = pd.concat([data_completenessdf, center_info])

data_completeness_db = syn.tableQuery("select * from %s" % data_completion_synid)
data_completeness_dbdf = data_completeness_db.asDataFrame()
Expand Down
20 changes: 11 additions & 9 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,8 +753,10 @@ def store_fusion_files(
f"DNA_SUPPORT,RNA_SUPPORT,METHOD,FRAME from {fusion_synid}",
)
version = syn.create_snapshot_version(fusion_synid, comment=genie_version)
# FusionsDf = Fusions.asDataFrame()
FusionsDf["ENTREZ_GENE_ID"][FusionsDf["ENTREZ_GENE_ID"] == 0] = float("nan")

FusionsDf["ENTREZ_GENE_ID"].mask(
FusionsDf["ENTREZ_GENE_ID"] == 0, float("nan"), inplace=True
)

if not current_release_staging:
FusionsStagingDf = FusionsDf[
Expand Down Expand Up @@ -1300,13 +1302,13 @@ def store_cna_files(
cna_template.to_csv(cna_path, sep="\t", index=False)
# Loop through to create finalized CNA file
with_center_hugo_symbol = pd.Series("Hugo_Symbol")
with_center_hugo_symbol = with_center_hugo_symbol.append(
pd.Series(keep_for_center_consortium_samples)
with_center_hugo_symbol = pd.concat(
[with_center_hugo_symbol, pd.Series(keep_for_center_consortium_samples)]
)

with_merged_hugo_symbol = pd.Series("Hugo_Symbol")
with_merged_hugo_symbol = with_merged_hugo_symbol.append(
pd.Series(keep_for_merged_consortium_samples)
with_merged_hugo_symbol = pd.concat(
[with_merged_hugo_symbol, pd.Series(keep_for_merged_consortium_samples)]
)

cna_samples = []
Expand Down Expand Up @@ -1477,8 +1479,8 @@ def store_data_gene_matrix(
)
# Samples have already been removed
data_gene_matrix = pd.DataFrame(columns=["SAMPLE_ID", "SEQ_ASSAY_ID"])
data_gene_matrix = data_gene_matrix.append(
clinicaldf[["SAMPLE_ID", "SEQ_ASSAY_ID"]]
data_gene_matrix = pd.concat(
[data_gene_matrix, clinicaldf[["SAMPLE_ID", "SEQ_ASSAY_ID"]]]
)
data_gene_matrix = data_gene_matrix.rename(columns={"SEQ_ASSAY_ID": "mutations"})
data_gene_matrix = data_gene_matrix[data_gene_matrix["SAMPLE_ID"] != ""]
Expand Down Expand Up @@ -1832,7 +1834,7 @@ def update_process_trackingdf(
)
)
process_trackerdf = process_tracker.asDataFrame()
process_trackerdf[column][0] = str(int(time.time() * 1000))
process_trackerdf[column].iloc[0] = str(int(time.time() * 1000))
syn.store(synapseclient.Table(process_trackerdb_synid, process_trackerdf))


Expand Down
18 changes: 10 additions & 8 deletions genie/input_to_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,12 +445,12 @@ def get_duplicated_files(validation_statusdf):
cbs_seg_index = filename_str.endswith(("cbs", "seg"))
cbs_seg_files = validation_statusdf[cbs_seg_index]
if len(cbs_seg_files) > 1:
duplicated_filesdf = duplicated_filesdf.append(cbs_seg_files)
duplicated_filesdf = pd.concat([duplicated_filesdf, cbs_seg_files])
# clinical files should not be duplicated.
clinical_index = filename_str.startswith("data_clinical_supp")
clinical_files = validation_statusdf[clinical_index]
if len(clinical_files) > 2:
duplicated_filesdf = duplicated_filesdf.append(clinical_files)
duplicated_filesdf = pd.concat([duplicated_filesdf, clinical_files])
duplicated_filesdf.drop_duplicates("id", inplace=True)
logger.info("THERE ARE {} DUPLICATED FILES".format(len(duplicated_filesdf)))
duplicated_filesdf["errors"] = DUPLICATED_FILE_ERROR
Expand Down Expand Up @@ -611,8 +611,8 @@ def _update_tables_content(validation_statusdf, error_trackingdf):

# Append duplicated file errors
duplicated_filesdf["id"].isin(error_trackingdf["id"][duplicated_idx])
error_trackingdf = error_trackingdf.append(
duplicated_filesdf[error_trackingdf.columns]
error_trackingdf = pd.concat(
[error_trackingdf, duplicated_filesdf[error_trackingdf.columns]]
)
# Remove duplicates if there's already an error that exists for the file
error_trackingdf.drop_duplicates("id", inplace=True)
Expand Down Expand Up @@ -822,7 +822,7 @@ def center_input_to_database(
# Reorganize so BED file are always validated and processed first
bed_files = validFiles["fileType"] == "bed"
beds = validFiles[bed_files]
validFiles = beds.append(validFiles)
validFiles = pd.concat([beds, validFiles])
validFiles.drop_duplicates(inplace=True)
# merge clinical files into one row
clinical_ind = validFiles["fileType"] == "clinical"
Expand All @@ -832,7 +832,7 @@ def center_input_to_database(
merged_clinical = pd.DataFrame([clinical_files])
merged_clinical["fileType"] = "clinical"
merged_clinical["name"] = f"data_clinical_supp_{center}.txt"
validFiles = validFiles[~clinical_ind].append(merged_clinical)
validFiles = pd.concat([validFiles[~clinical_ind], merged_clinical])

processTrackerSynId = process_functions.getDatabaseSynId(
syn, "processTracker", databaseToSynIdMappingDf=database_to_synid_mappingdf
Expand All @@ -856,7 +856,9 @@ def center_input_to_database(

syn.store(synapseclient.Table(processTrackerSynId, new_rows))
else:
processTrackerDf["timeStartProcessing"][0] = str(int(time.time() * 1000))
processTrackerDf["timeStartProcessing"].iloc[0] = str(
int(time.time() * 1000)
)
syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

processfiles(
Expand All @@ -881,7 +883,7 @@ def center_input_to_database(
)
)
processTrackerDf = processTracker.asDataFrame()
processTrackerDf["timeEndProcessing"][0] = str(int(time.time() * 1000))
processTrackerDf["timeEndProcessing"].iloc[0] = str(int(time.time() * 1000))
syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

logger.info("SAMPLE/PATIENT RETRACTION")
Expand Down
13 changes: 9 additions & 4 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,9 +783,7 @@ def updateDatabase(
to_delete_rows = _delete_rows(new_dataset, database, primary_key)
else:
to_delete_rows = pd.DataFrame()
allupdates = allupdates.append(to_append_rows, sort=False)
allupdates = allupdates.append(to_update_rows, sort=False)

allupdates = pd.concat([allupdates, to_append_rows, to_update_rows], sort=False)
storedatabase = False
update_all_file = tempfile.NamedTemporaryFile(dir=SCRIPT_DIR, delete=False)

Expand Down Expand Up @@ -1046,8 +1044,15 @@ def synLogin(pemfile_path, debug=False):
"""
try:
syn = synapseclient.Synapse(debug=debug)
syn.login()
# Get auth token via scheduled job secrets
if os.getenv("SCHEDULED_JOB_SECRETS") is not None:
secrets = json.loads(os.getenv("SCHEDULED_JOB_SECRETS"))
auth_token = secrets["SYNAPSE_AUTH_TOKEN"]
else:
auth_token = None
syn.login(authToken=auth_token)
except Exception:
# TODO: deprecate this feature soon
genie_pass = get_password(pemfile_path)
syn = synapseclient.Synapse(debug=debug)
syn.login(os.environ["GENIE_USER"], genie_pass)
Expand Down
4 changes: 2 additions & 2 deletions genie/toRetract.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env python
import argparse

import synapseclient
import pandas as pd

from . import process_functions

Expand Down Expand Up @@ -66,7 +66,7 @@ def retract(syn, project_id):
)
sampleRetractIds = sampleRetract.asDataFrame()

allRetractedSamples = sampleRetractIds["genieSampleId"].append(appendSamples)
allRetractedSamples = pd.concat([sampleRetractIds["genieSampleId"], appendSamples])

# Only need to retract clinical data, because the rest of the data is filtered by clinical data
# Sample Clinical Data
Expand Down
10 changes: 6 additions & 4 deletions genie_registry/assay.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,11 @@ def _get_dataframe(self, filepath_list):

seq_assay_id_infodf = assay_info_transposeddf.loc[[assay]]

to_appenddf = [seq_assay_id_infodf] * (len(assay_specific_info) - 1)
if to_appenddf:
seq_assay_id_infodf = seq_assay_id_infodf.append(to_appenddf)
for i in range(0, len(assay_specific_info) - 1):
seq_assay_id_infodf = pd.concat(
[seq_assay_id_infodf, seq_assay_id_infodf]
)
# seq_assay_id_infodf = seq_assay_id_infodf.append(to_appenddf)
seq_assay_id_infodf.reset_index(drop=True, inplace=True)
assay_finaldf = pd.concat(
[assay_specific_infodf, seq_assay_id_infodf], axis=1
Expand All @@ -123,7 +125,7 @@ def _get_dataframe(self, filepath_list):
if assay_finaldf.get(col) is not None:
assay_finaldf[col] = [";".join(row) for row in assay_finaldf[col]]
assay_finaldf["SEQ_PIPELINE_ID"] = assay
all_panel_info = all_panel_info.append(assay_finaldf)
all_panel_info = pd.concat([all_panel_info, assay_finaldf])
return all_panel_info

def _validate(self, assay_info_df, project_id):
Expand Down
10 changes: 4 additions & 6 deletions genie_registry/bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,9 +253,7 @@ def add_feature_type(temp_bed_path, exon_gtf_path, gene_gtf_path):
"Feature_Type",
]
)
genie_combineddf = genie_combineddf.append(genie_exondf)
genie_combineddf = genie_combineddf.append(genie_introndf)
genie_combineddf = genie_combineddf.append(genie_intergenicdf)
genie_combineddf = pd.concat([genie_exondf, genie_introndf, genie_intergenicdf])
return genie_combineddf


Expand Down Expand Up @@ -344,21 +342,21 @@ def _map_position_within_boundary(row, positiondf, boundary=0.9):
# difference = difference * -1.0
max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
if max_overlap is not None:
end_rows = end_rows.append(chrom_rows.loc[[max_overlap]])
end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
# End goes over end boundary, but start is contained in position
if sum(chrom_rows["start_position"] <= row["Start_Position"]) > 0:
overlap = chrom_rows["end_position"] - row["Start_Position"]
max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
if max_overlap is not None:
end_rows = end_rows.append(chrom_rows.loc[[max_overlap]])
end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
# Start and end go over position boundary
check = chrom_rows[chrom_rows["start_position"] >= row["Start_Position"]]
check = check[check["end_position"] <= row["End_Position"]]
if not check.empty:
overlap = chrom_rows["end_position"] - chrom_rows["start_position"]
max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
if max_overlap is not None:
end_rows = end_rows.append(chrom_rows.loc[[max_overlap]])
end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
return end_rows


Expand Down
4 changes: 2 additions & 2 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ def _check_int_dead_consistency(clinicaldf: DataFrame) -> str:
# Check that all string values are equal each other
is_equal = all(clinicaldf.loc[is_str, "DEAD"] == clinicaldf.loc[is_str, "INT_DOD"])
# If dead, int column can't be Not Applicable
# If alive, int column can't have values
# If alive, int column must be Not Applicable
if (
any(clinicaldf.loc[is_dead, "INT_DOD"] == "Not Applicable")
or not all(clinicaldf.loc[is_alive, "INT_DOD"].isin(allowed_str))
or not all(clinicaldf.loc[is_alive, "INT_DOD"] == "Not Applicable")
or not is_equal
):
return (
Expand Down
4 changes: 2 additions & 2 deletions genie_registry/cna.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,9 @@ def _process(self, cnaDf, databaseToSynIdMappingDf):
)
temp = pd.DataFrame(newVal).transpose()
temp["Hugo_Symbol"] = i
duplicatedGenes = duplicatedGenes.append(temp, sort=False)
duplicatedGenes = pd.concat([duplicatedGenes, temp], sort=False)
cnaDf.drop_duplicates("Hugo_Symbol", keep=False, inplace=True)
cnaDf = cnaDf.append(duplicatedGenes, sort=False)
cnaDf = pd.concat([cnaDf, duplicatedGenes], sort=False)
cnaDf = cnaDf[order]
cnaDf.columns = [
process_functions.checkGenieId(i, self.center) if i != "Hugo_Symbol" else i
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
line-length = 88
target-version = ['py37']
include = '\.pyi?$'
extend-exclude = 'tests'
exclude = 'tests'
8 changes: 6 additions & 2 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,8 +804,8 @@ def test__check_int_year_consistency_inconsistent(inconsistent_df,
"DEAD": [True, False]}
),
pd.DataFrame(
{"INT_DOD": [1111, "Not Released"],
"DEAD": [True, False]}
{"INT_DOD": ["Not Applicable", "Not Applicable"],
"DEAD": [False, False]}
)
]
)
Expand All @@ -824,6 +824,10 @@ def test__check_int_dead_consistency_valid(valid_df):
{"INT_DOD": ["Not Applicable", "Not Applicable"],
"DEAD": [True, False]}
),
pd.DataFrame(
{"INT_DOD": [1111, "Not Released"],
"DEAD": [True, False]}
),
pd.DataFrame(
{"INT_DOD": [1111, 11111],
"DEAD": [True, False]}
Expand Down

0 comments on commit 4dea322

Please sign in to comment.