diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..86dd0b42
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,3 @@
+# Contributor Covenant Code of Conduct
+
+We subscribe to [Sage Bionetworks' Code of Conduct](https://sagebionetworks.org/code-of-conduct/), which is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/).
diff --git a/bin/consortium_to_public.py b/bin/consortium_to_public.py
index ca39bf5a..5e1693e1 100644
--- a/bin/consortium_to_public.py
+++ b/bin/consortium_to_public.py
@@ -143,7 +143,7 @@ def main(args):
         "and processingType = 'public'" % processTrackerSynId
     )
     processTrackerDf = processTracker.asDataFrame()
-    processTrackerDf["timeStartProcessing"][0] = str(int(time.time() * 1000))
+    processTrackerDf["timeStartProcessing"].iloc[0] = str(int(time.time() * 1000))
     syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

     caseListEntities, genePanelEntities = consortium_to_public.consortiumToPublic(
@@ -206,7 +206,7 @@ def main(args):
         "processingType = 'public'" % processTrackerSynId
     )
     processTrackerDf = processTracker.asDataFrame()
-    processTrackerDf["timeEndProcessing"][0] = str(int(time.time() * 1000))
+    processTrackerDf["timeEndProcessing"].iloc[0] = str(int(time.time() * 1000))
     syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

     if not args.test:
diff --git a/genie/__version__.py b/genie/__version__.py
index 1311252a..d876dfbe 100644
--- a/genie/__version__.py
+++ b/genie/__version__.py
@@ -1 +1 @@
-__version__ = "13.0.0"
+__version__ = "13.1.0"
diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py
index fe6a2ce6..3b3f464f 100644
--- a/genie/consortium_to_public.py
+++ b/genie/consortium_to_public.py
@@ -301,7 +301,7 @@ def consortiumToPublic(
         elif "CNA" in entName:
             cna = syn.get(entId, followLink=True)
             cnaDf = pd.read_csv(cna.path, sep="\t")
-            cna_columns = publicReleaseSamples.append(pd.Series("Hugo_Symbol"))
+            cna_columns = pd.concat([publicReleaseSamples, pd.Series("Hugo_Symbol")])
             # parse out the CNA columns to keep
             cnaDf = cnaDf[cnaDf.columns[cnaDf.columns.isin(cna_columns)]]
             text = process_functions.removeFloat(cnaDf)
diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index e3dc825c..7d6cd33e 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -46,7 +46,7 @@ def get_center_data_completion(center, df):
         ]
         completeness = float(sum(not_missing)) / int(total)
         returned = pd.DataFrame([[col, center, total, completeness]])
-        center_data = center_data.append(returned)
+        center_data = pd.concat([center_data, returned])
     return center_data


@@ -488,12 +488,12 @@ def update_sample_difference_table(syn, database_mappingdf):
         ]

         if not new_centers.empty:
-            prior_release = prior_release.append(pd.DataFrame(index=new_centers))
+            prior_release = pd.concat([prior_release, pd.DataFrame(index=new_centers)])
             prior_release = prior_release.fillna(0)
         difference = current_release - prior_release
         difference["Center"] = difference.index
         difference["Release"] = release_name
-        diff_between_releasesdf = diff_between_releasesdf.append(difference)
+        diff_between_releasesdf = pd.concat([diff_between_releasesdf, difference])

     difftable_db = syn.tableQuery("SELECT * FROM %s" % sample_diff_count_synid)
     difftable_dbdf = difftable_db.asDataFrame()
@@ -540,13 +540,13 @@ def update_data_completeness_table(syn, database_mappingdf):
         lambda center: get_center_data_completion(center, sampledf)
     )
     for center_info in center_infos:
-        data_completenessdf = data_completenessdf.append(center_info)
+        data_completenessdf = pd.concat([data_completenessdf, center_info])

     center_infos = patientdf.CENTER.drop_duplicates().apply(
         lambda center: get_center_data_completion(center, patientdf)
     )
     for center_info in center_infos:
-        data_completenessdf = data_completenessdf.append(center_info)
+        data_completenessdf = pd.concat([data_completenessdf, center_info])

     data_completeness_db = syn.tableQuery("select * from %s" % data_completion_synid)
     data_completeness_dbdf = data_completeness_db.asDataFrame()
diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
index 24b54451..6e33620e 100644
--- a/genie/database_to_staging.py
+++ b/genie/database_to_staging.py
@@ -753,8 +753,10 @@ def store_fusion_files(
         f"DNA_SUPPORT,RNA_SUPPORT,METHOD,FRAME from {fusion_synid}",
     )
     version = syn.create_snapshot_version(fusion_synid, comment=genie_version)
-    # FusionsDf = Fusions.asDataFrame()
-    FusionsDf["ENTREZ_GENE_ID"][FusionsDf["ENTREZ_GENE_ID"] == 0] = float("nan")
+
+    FusionsDf["ENTREZ_GENE_ID"].mask(
+        FusionsDf["ENTREZ_GENE_ID"] == 0, float("nan"), inplace=True
+    )

     if not current_release_staging:
         FusionsStagingDf = FusionsDf[
@@ -1300,13 +1302,13 @@ def store_cna_files(
     cna_template.to_csv(cna_path, sep="\t", index=False)
     # Loop through to create finalized CNA file
     with_center_hugo_symbol = pd.Series("Hugo_Symbol")
-    with_center_hugo_symbol = with_center_hugo_symbol.append(
-        pd.Series(keep_for_center_consortium_samples)
+    with_center_hugo_symbol = pd.concat(
+        [with_center_hugo_symbol, pd.Series(keep_for_center_consortium_samples)]
     )

     with_merged_hugo_symbol = pd.Series("Hugo_Symbol")
-    with_merged_hugo_symbol = with_merged_hugo_symbol.append(
-        pd.Series(keep_for_merged_consortium_samples)
+    with_merged_hugo_symbol = pd.concat(
+        [with_merged_hugo_symbol, pd.Series(keep_for_merged_consortium_samples)]
     )

     cna_samples = []
@@ -1477,8 +1479,8 @@ def store_data_gene_matrix(
     )
     # Samples have already been removed
     data_gene_matrix = pd.DataFrame(columns=["SAMPLE_ID", "SEQ_ASSAY_ID"])
-    data_gene_matrix = data_gene_matrix.append(
-        clinicaldf[["SAMPLE_ID", "SEQ_ASSAY_ID"]]
+    data_gene_matrix = pd.concat(
+        [data_gene_matrix, clinicaldf[["SAMPLE_ID", "SEQ_ASSAY_ID"]]]
     )
     data_gene_matrix = data_gene_matrix.rename(columns={"SEQ_ASSAY_ID": "mutations"})
     data_gene_matrix = data_gene_matrix[data_gene_matrix["SAMPLE_ID"] != ""]
@@ -1832,7 +1834,7 @@ def update_process_trackingdf(
         )
     )
     process_trackerdf = process_tracker.asDataFrame()
-    process_trackerdf[column][0] = str(int(time.time() * 1000))
+    process_trackerdf[column].iloc[0] = str(int(time.time() * 1000))
     syn.store(synapseclient.Table(process_trackerdb_synid, process_trackerdf))
diff --git a/genie/input_to_database.py b/genie/input_to_database.py
index 9a2626f8..9ab1ff10 100644
--- a/genie/input_to_database.py
+++ b/genie/input_to_database.py
@@ -445,12 +445,12 @@ def get_duplicated_files(validation_statusdf):
     cbs_seg_index = filename_str.endswith(("cbs", "seg"))
     cbs_seg_files = validation_statusdf[cbs_seg_index]
     if len(cbs_seg_files) > 1:
-        duplicated_filesdf = duplicated_filesdf.append(cbs_seg_files)
+        duplicated_filesdf = pd.concat([duplicated_filesdf, cbs_seg_files])
     # clinical files should not be duplicated.
     clinical_index = filename_str.startswith("data_clinical_supp")
     clinical_files = validation_statusdf[clinical_index]
     if len(clinical_files) > 2:
-        duplicated_filesdf = duplicated_filesdf.append(clinical_files)
+        duplicated_filesdf = pd.concat([duplicated_filesdf, clinical_files])
     duplicated_filesdf.drop_duplicates("id", inplace=True)
     logger.info("THERE ARE {} DUPLICATED FILES".format(len(duplicated_filesdf)))
     duplicated_filesdf["errors"] = DUPLICATED_FILE_ERROR
@@ -611,8 +611,8 @@ def _update_tables_content(validation_statusdf, error_trackingdf):

     # Append duplicated file errors
     duplicated_filesdf["id"].isin(error_trackingdf["id"][duplicated_idx])
-    error_trackingdf = error_trackingdf.append(
-        duplicated_filesdf[error_trackingdf.columns]
+    error_trackingdf = pd.concat(
+        [error_trackingdf, duplicated_filesdf[error_trackingdf.columns]]
     )
     # Remove duplicates if theres already an error that exists for the file
     error_trackingdf.drop_duplicates("id", inplace=True)
@@ -822,7 +822,7 @@ def center_input_to_database(
     # Reorganize so BED file are always validated and processed first
     bed_files = validFiles["fileType"] == "bed"
     beds = validFiles[bed_files]
-    validFiles = beds.append(validFiles)
+    validFiles = pd.concat([beds, validFiles])
     validFiles.drop_duplicates(inplace=True)
     # merge clinical files into one row
     clinical_ind = validFiles["fileType"] == "clinical"
@@ -832,7 +832,7 @@ def center_input_to_database(
         merged_clinical = pd.DataFrame([clinical_files])
         merged_clinical["fileType"] = "clinical"
         merged_clinical["name"] = f"data_clinical_supp_{center}.txt"
-        validFiles = validFiles[~clinical_ind].append(merged_clinical)
+        validFiles = pd.concat([validFiles[~clinical_ind], merged_clinical])

     processTrackerSynId = process_functions.getDatabaseSynId(
         syn, "processTracker", databaseToSynIdMappingDf=database_to_synid_mappingdf
@@ -856,7 +856,9 @@ def center_input_to_database(

         syn.store(synapseclient.Table(processTrackerSynId, new_rows))
     else:
-        processTrackerDf["timeStartProcessing"][0] = str(int(time.time() * 1000))
+        processTrackerDf["timeStartProcessing"].iloc[0] = str(
+            int(time.time() * 1000)
+        )
         syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

     processfiles(
@@ -881,7 +883,7 @@ def center_input_to_database(
         )
     )
     processTrackerDf = processTracker.asDataFrame()
-    processTrackerDf["timeEndProcessing"][0] = str(int(time.time() * 1000))
+    processTrackerDf["timeEndProcessing"].iloc[0] = str(int(time.time() * 1000))
     syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

     logger.info("SAMPLE/PATIENT RETRACTION")
diff --git a/genie/process_functions.py b/genie/process_functions.py
index 1f5697f8..17e4b77b 100644
--- a/genie/process_functions.py
+++ b/genie/process_functions.py
@@ -783,9 +783,7 @@ def updateDatabase(
         to_delete_rows = _delete_rows(new_dataset, database, primary_key)
     else:
         to_delete_rows = pd.DataFrame()
-    allupdates = allupdates.append(to_append_rows, sort=False)
-    allupdates = allupdates.append(to_update_rows, sort=False)
-
+    allupdates = pd.concat([allupdates, to_append_rows, to_update_rows], sort=False)
     storedatabase = False
     update_all_file = tempfile.NamedTemporaryFile(dir=SCRIPT_DIR, delete=False)

@@ -1046,8 +1044,15 @@ def synLogin(pemfile_path, debug=False):
     """
     try:
         syn = synapseclient.Synapse(debug=debug)
-        syn.login()
+        # Get auth token via scheduled job secrets
+        if os.getenv("SCHEDULED_JOB_SECRETS") is not None:
+            secrets = json.loads(os.getenv("SCHEDULED_JOB_SECRETS"))
+            auth_token = secrets["SYNAPSE_AUTH_TOKEN"]
+        else:
+            auth_token = None
+        syn.login(authToken=auth_token)
     except Exception:
+        # TODO: deprecate this feature soon
         genie_pass = get_password(pemfile_path)
         syn = synapseclient.Synapse(debug=debug)
         syn.login(os.environ["GENIE_USER"], genie_pass)
diff --git a/genie/toRetract.py b/genie/toRetract.py
index 8579c7a9..dc98f5b1 100644
--- a/genie/toRetract.py
+++ b/genie/toRetract.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 import argparse

-import synapseclient
+import pandas as pd

 from . import process_functions

@@ -66,7 +66,7 @@ def retract(syn, project_id):
     )
     sampleRetractIds = sampleRetract.asDataFrame()

-    allRetractedSamples = sampleRetractIds["genieSampleId"].append(appendSamples)
+    allRetractedSamples = pd.concat([sampleRetractIds["genieSampleId"], appendSamples])

     # Only need to retract clinical data, because the rest of the data is filtered by clinical data
     # Sample Clinical Data
diff --git a/genie_registry/assay.py b/genie_registry/assay.py
index eba1ec7c..297db8ae 100644
--- a/genie_registry/assay.py
+++ b/genie_registry/assay.py
@@ -103,9 +103,9 @@ def _get_dataframe(self, filepath_list):

             seq_assay_id_infodf = assay_info_transposeddf.loc[[assay]]

-            to_appenddf = [seq_assay_id_infodf] * (len(assay_specific_info) - 1)
-            if to_appenddf:
-                seq_assay_id_infodf = seq_assay_id_infodf.append(to_appenddf)
+            seq_assay_id_infodf = pd.concat(
+                [seq_assay_id_infodf] * len(assay_specific_info)
+            )
             seq_assay_id_infodf.reset_index(drop=True, inplace=True)
             assay_finaldf = pd.concat(
                 [assay_specific_infodf, seq_assay_id_infodf], axis=1
@@ -123,7 +123,7 @@ def _get_dataframe(self, filepath_list):
             if assay_finaldf.get(col) is not None:
                 assay_finaldf[col] = [";".join(row) for row in assay_finaldf[col]]
             assay_finaldf["SEQ_PIPELINE_ID"] = assay
-            all_panel_info = all_panel_info.append(assay_finaldf)
+            all_panel_info = pd.concat([all_panel_info, assay_finaldf])
         return all_panel_info

     def _validate(self, assay_info_df, project_id):
diff --git a/genie_registry/bed.py b/genie_registry/bed.py
index 3880701f..ed722ec6 100644
--- a/genie_registry/bed.py
+++ b/genie_registry/bed.py
@@ -253,9 +253,7 @@ def add_feature_type(temp_bed_path, exon_gtf_path, gene_gtf_path):
             "Feature_Type",
         ]
     )
-    genie_combineddf = genie_combineddf.append(genie_exondf)
-    genie_combineddf = genie_combineddf.append(genie_introndf)
-    genie_combineddf = genie_combineddf.append(genie_intergenicdf)
+    genie_combineddf = pd.concat([genie_exondf, genie_introndf, genie_intergenicdf])

     return genie_combineddf

@@ -344,13 +342,13 @@ def _map_position_within_boundary(row, positiondf, boundary=0.9):
             # difference = difference * -1.0
             max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
             if max_overlap is not None:
-                end_rows = end_rows.append(chrom_rows.loc[[max_overlap]])
+                end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
         # End goes over end boundary, but start is contained in position
         if sum(chrom_rows["start_position"] <= row["Start_Position"]) > 0:
             overlap = chrom_rows["end_position"] - row["Start_Position"]
             max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
             if max_overlap is not None:
-                end_rows = end_rows.append(chrom_rows.loc[[max_overlap]])
+                end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
         # Start and end go over position boundary
         check = chrom_rows[chrom_rows["start_position"] >= row["Start_Position"]]
         check = check[check["end_position"] <= row["End_Position"]]
@@ -358,7 +356,7 @@ def _map_position_within_boundary(row, positiondf, boundary=0.9):
             overlap = chrom_rows["end_position"] - chrom_rows["start_position"]
             max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
             if max_overlap is not None:
-                end_rows = end_rows.append(chrom_rows.loc[[max_overlap]])
+                end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])

     return end_rows
diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
index 90fca60d..7739b2ea 100644
--- a/genie_registry/clinical.py
+++ b/genie_registry/clinical.py
@@ -95,10 +95,10 @@ def _check_int_dead_consistency(clinicaldf: DataFrame) -> str:
     # Check that all string values are equal each other
     is_equal = all(clinicaldf.loc[is_str, "DEAD"] == clinicaldf.loc[is_str, "INT_DOD"])
     # If dead, int column can't be Not Applicable
-    # If alive, int column can't have values
+    # If alive, int column must be Not Applicable
     if (
         any(clinicaldf.loc[is_dead, "INT_DOD"] == "Not Applicable")
-        or not all(clinicaldf.loc[is_alive, "INT_DOD"].isin(allowed_str))
+        or not all(clinicaldf.loc[is_alive, "INT_DOD"] == "Not Applicable")
        or not is_equal
     ):
         return (
diff --git a/genie_registry/cna.py b/genie_registry/cna.py
index fedc8cf3..24fb78e3 100644
--- a/genie_registry/cna.py
+++ b/genie_registry/cna.py
@@ -136,9 +136,9 @@ def _process(self, cnaDf, databaseToSynIdMappingDf):
             )
             temp = pd.DataFrame(newVal).transpose()
             temp["Hugo_Symbol"] = i
-            duplicatedGenes = duplicatedGenes.append(temp, sort=False)
+            duplicatedGenes = pd.concat([duplicatedGenes, temp], sort=False)
         cnaDf.drop_duplicates("Hugo_Symbol", keep=False, inplace=True)
-        cnaDf = cnaDf.append(duplicatedGenes, sort=False)
+        cnaDf = pd.concat([cnaDf, duplicatedGenes], sort=False)
         cnaDf = cnaDf[order]
         cnaDf.columns = [
             process_functions.checkGenieId(i, self.center) if i != "Hugo_Symbol" else i
diff --git a/pyproject.toml b/pyproject.toml
index cb84cfac..bf52dbcf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,4 +2,4 @@
 line-length = 88
 target-version = ['py37']
 include = '\.pyi?$'
-extend-exclude = 'tests'
+exclude = 'tests'
diff --git a/tests/test_clinical.py b/tests/test_clinical.py
index 8668f717..18937f2f 100644
--- a/tests/test_clinical.py
+++ b/tests/test_clinical.py
@@ -804,8 +804,8 @@ def test__check_int_year_consistency_inconsistent(inconsistent_df,
             "DEAD": [True, False]}
         ),
         pd.DataFrame(
-            {"INT_DOD": [1111, "Not Released"],
-             "DEAD": [True, False]}
+            {"INT_DOD": ["Not Applicable", "Not Applicable"],
+             "DEAD": [False, False]}
         )
     ]
 )
@@ -824,6 +824,10 @@ def test__check_int_dead_consistency_valid(valid_df):
         {"INT_DOD": ["Not Applicable", "Not Applicable"],
          "DEAD": [True, False]}
     ),
+    pd.DataFrame(
+        {"INT_DOD": [1111, "Not Released"],
+         "DEAD": [True, False]}
+    ),
     pd.DataFrame(
         {"INT_DOD": [1111, 11111],
          "DEAD": [True, False]}
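
The bulk of this patch is a mechanical migration: DataFrame.append and Series.append were deprecated in pandas 1.4 and removed in pandas 2.0, so every call site is rewritten as pd.concat. A minimal sketch of the pattern, using hypothetical frames and values (not data from this repo):

    import pandas as pd

    df_a = pd.DataFrame({"SAMPLE_ID": ["GENIE-SAGE-1"], "CENTER": ["SAGE"]})
    df_b = pd.DataFrame({"SAMPLE_ID": ["GENIE-SAGE-2"], "CENTER": ["SAGE"]})

    # Before (removed in pandas 2.0):
    #   combined = df_a.append(df_b, sort=False)
    # After: pd.concat takes a list, so several chained appends
    # collapse into a single call
    combined = pd.concat([df_a, df_b], sort=False)
    combined = combined.reset_index(drop=True)

Like append, pd.concat keeps the original row indexes by default, which is why several hunks above follow the concat with reset_index or drop_duplicates.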
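
The synLogin hunk switches authentication to a Synapse personal access token read from the SCHEDULED_JOB_SECRETS environment variable that Synapse scheduled jobs inject, falling back to cached credentials when the variable is absent. A self-contained sketch of the same lookup; the function name and the secret payload shown in the comment are illustrative, not part of the patch:

    import json
    import os

    import synapseclient

    def login_with_job_secrets() -> synapseclient.Synapse:
        """Log in with a token from SCHEDULED_JOB_SECRETS when present."""
        secrets_json = os.getenv("SCHEDULED_JOB_SECRETS")
        if secrets_json is not None:
            # Scheduled job secrets arrive as a JSON object keyed by secret
            # name, e.g. '{"SYNAPSE_AUTH_TOKEN": "..."}'
            auth_token = json.loads(secrets_json)["SYNAPSE_AUTH_TOKEN"]
        else:
            # With authToken=None, login() falls back to cached/config credentials
            auth_token = None
        syn = synapseclient.Synapse()
        syn.login(authToken=auth_token)
        return syn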
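
The clinical.py and test_clinical.py hunks tighten _check_int_dead_consistency: previously an alive patient's INT_DOD only had to be one of the allowed strings (so "Not Released" passed), while now it must be exactly "Not Applicable", which is why the [1111, "Not Released"] fixture moves from the valid to the invalid parametrize list. A simplified mirror of the tightened rule, as a boolean predicate that omits the string-equality check the real function also applies:

    import pandas as pd

    def int_dod_consistent(clinicaldf: pd.DataFrame) -> bool:
        """Dead patients must not have 'Not Applicable'; alive patients must."""
        is_dead = clinicaldf["DEAD"] == True  # noqa: E712
        is_alive = clinicaldf["DEAD"] == False  # noqa: E712
        return not any(
            clinicaldf.loc[is_dead, "INT_DOD"] == "Not Applicable"
        ) and all(clinicaldf.loc[is_alive, "INT_DOD"] == "Not Applicable")

    # The new valid fixture passes; the old one (alive + "Not Released") fails
    assert int_dod_consistent(
        pd.DataFrame({"INT_DOD": ["Not Applicable"] * 2, "DEAD": [False, False]})
    )
    assert not int_dod_consistent(
        pd.DataFrame({"INT_DOD": [1111, "Not Released"], "DEAD": [True, False]})
    )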