Skip to content

Commit

Permalink
Rc 13.1.0 (#456)
Browse files Browse the repository at this point in the history
* Revise dashboards (#433)

* Don't cache local install (#431)

* Don't put cache dir

* Remove -e

* Update codeowners to use genie reviewers

* Remove missing vital status and assay info sections

* hotfix dockerfile

* pip3 install no longer works in dockerfile for locally built packages

* case-insensitive comparison

* test case-insensitive comparison

* black on genie_registry/assay.py only

* black on genie/process_functions.py only

* rename new test function

* replace underscores with hyphens

* add test for underscore versus hyphen

* black on genie_registry/assay.py

* Modify tsa1, tsa2, ref maf error message (#438)

* Modify tsa1, tsa2, ref maf error message

* Fix tests

* Add sample class filter (#441)

* Add sample class filter

* lint

* Lint

* only filter for public release

* lint

* Make sure processing pipeline doesn't fail for older releases that don't have SAMPLE_CLASS

* Fix Docker build (#445)

* Change docker tag and add dependency

* comment in sdist

* Update pandas version (#446)

* Use iloc

* Use pd.concat

* Use pd.concat

* Use pd.concat

* Use pd.concat instead of append

* Update genie/database_to_staging.py

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* use pd.concat

* Use pd.concat

* lint

* exclude tests

* Use pd.concat

* Use mask to replace values

* Lint

* append

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* Add code of conduct (#448)

* year or int of death is not applicable for living patients (#450)

* year or int of death is not applicable for living patients

* update tests for dead variable

Co-authored-by: Thomas Yu <[email protected]>

* support scheduled job secrets (#453)

* update version number

Co-authored-by: Haley Hunter-Zinck <[email protected]>
Co-authored-by: Haley Hunter-Zinck <[email protected]>
  • Loading branch information
3 people authored Mar 10, 2022
1 parent c37ee08 commit 4dea322
Show file tree
Hide file tree
Showing 15 changed files with 65 additions and 49 deletions.
3 changes: 3 additions & 0 deletions CODE_OF_CONDUCT.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Contributor Covenant Code of Conduct

We subscribe to [Sage Bionetwork's Code of Conduct](https://sagebionetworks.org/code-of-conduct/), which is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/).
4 changes: 2 additions & 2 deletions bin/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def main(args):
"and processingType = 'public'" % processTrackerSynId
)
processTrackerDf = processTracker.asDataFrame()
processTrackerDf["timeStartProcessing"][0] = str(int(time.time() * 1000))
processTrackerDf["timeStartProcessing"].iloc[0] = str(int(time.time() * 1000))
syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

caseListEntities, genePanelEntities = consortium_to_public.consortiumToPublic(
Expand Down Expand Up @@ -206,7 +206,7 @@ def main(args):
"processingType = 'public'" % processTrackerSynId
)
processTrackerDf = processTracker.asDataFrame()
processTrackerDf["timeEndProcessing"][0] = str(int(time.time() * 1000))
processTrackerDf["timeEndProcessing"].iloc[0] = str(int(time.time() * 1000))
syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

if not args.test:
Expand Down
2 changes: 1 addition & 1 deletion genie/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "13.0.0"
__version__ = "13.1.0"
2 changes: 1 addition & 1 deletion genie/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def consortiumToPublic(
elif "CNA" in entName:
cna = syn.get(entId, followLink=True)
cnaDf = pd.read_csv(cna.path, sep="\t")
cna_columns = publicReleaseSamples.append(pd.Series("Hugo_Symbol"))
cna_columns = pd.concat([publicReleaseSamples, pd.Series("Hugo_Symbol")])
# parse out the CNA columns to keep
cnaDf = cnaDf[cnaDf.columns[cnaDf.columns.isin(cna_columns)]]
text = process_functions.removeFloat(cnaDf)
Expand Down
10 changes: 5 additions & 5 deletions genie/dashboard_table_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def get_center_data_completion(center, df):
]
completeness = float(sum(not_missing)) / int(total)
returned = pd.DataFrame([[col, center, total, completeness]])
center_data = center_data.append(returned)
center_data = pd.concat([center_data, returned])
return center_data


Expand Down Expand Up @@ -488,12 +488,12 @@ def update_sample_difference_table(syn, database_mappingdf):
]

if not new_centers.empty:
prior_release = prior_release.append(pd.DataFrame(index=new_centers))
prior_release = pd.concat([prior_release, pd.DataFrame(index=new_centers)])
prior_release = prior_release.fillna(0)
difference = current_release - prior_release
difference["Center"] = difference.index
difference["Release"] = release_name
diff_between_releasesdf = diff_between_releasesdf.append(difference)
diff_between_releasesdf = pd.concat([diff_between_releasesdf, difference])

difftable_db = syn.tableQuery("SELECT * FROM %s" % sample_diff_count_synid)
difftable_dbdf = difftable_db.asDataFrame()
Expand Down Expand Up @@ -540,13 +540,13 @@ def update_data_completeness_table(syn, database_mappingdf):
lambda center: get_center_data_completion(center, sampledf)
)
for center_info in center_infos:
data_completenessdf = data_completenessdf.append(center_info)
data_completenessdf = pd.concat([data_completenessdf, center_info])

center_infos = patientdf.CENTER.drop_duplicates().apply(
lambda center: get_center_data_completion(center, patientdf)
)
for center_info in center_infos:
data_completenessdf = data_completenessdf.append(center_info)
data_completenessdf = pd.concat([data_completenessdf, center_info])

data_completeness_db = syn.tableQuery("select * from %s" % data_completion_synid)
data_completeness_dbdf = data_completeness_db.asDataFrame()
Expand Down
20 changes: 11 additions & 9 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,8 +753,10 @@ def store_fusion_files(
f"DNA_SUPPORT,RNA_SUPPORT,METHOD,FRAME from {fusion_synid}",
)
version = syn.create_snapshot_version(fusion_synid, comment=genie_version)
# FusionsDf = Fusions.asDataFrame()
FusionsDf["ENTREZ_GENE_ID"][FusionsDf["ENTREZ_GENE_ID"] == 0] = float("nan")

FusionsDf["ENTREZ_GENE_ID"].mask(
FusionsDf["ENTREZ_GENE_ID"] == 0, float("nan"), inplace=True
)

if not current_release_staging:
FusionsStagingDf = FusionsDf[
Expand Down Expand Up @@ -1300,13 +1302,13 @@ def store_cna_files(
cna_template.to_csv(cna_path, sep="\t", index=False)
# Loop through to create finalized CNA file
with_center_hugo_symbol = pd.Series("Hugo_Symbol")
with_center_hugo_symbol = with_center_hugo_symbol.append(
pd.Series(keep_for_center_consortium_samples)
with_center_hugo_symbol = pd.concat(
[with_center_hugo_symbol, pd.Series(keep_for_center_consortium_samples)]
)

with_merged_hugo_symbol = pd.Series("Hugo_Symbol")
with_merged_hugo_symbol = with_merged_hugo_symbol.append(
pd.Series(keep_for_merged_consortium_samples)
with_merged_hugo_symbol = pd.concat(
[with_merged_hugo_symbol, pd.Series(keep_for_merged_consortium_samples)]
)

cna_samples = []
Expand Down Expand Up @@ -1477,8 +1479,8 @@ def store_data_gene_matrix(
)
# Samples have already been removed
data_gene_matrix = pd.DataFrame(columns=["SAMPLE_ID", "SEQ_ASSAY_ID"])
data_gene_matrix = data_gene_matrix.append(
clinicaldf[["SAMPLE_ID", "SEQ_ASSAY_ID"]]
data_gene_matrix = pd.concat(
[data_gene_matrix, clinicaldf[["SAMPLE_ID", "SEQ_ASSAY_ID"]]]
)
data_gene_matrix = data_gene_matrix.rename(columns={"SEQ_ASSAY_ID": "mutations"})
data_gene_matrix = data_gene_matrix[data_gene_matrix["SAMPLE_ID"] != ""]
Expand Down Expand Up @@ -1832,7 +1834,7 @@ def update_process_trackingdf(
)
)
process_trackerdf = process_tracker.asDataFrame()
process_trackerdf[column][0] = str(int(time.time() * 1000))
process_trackerdf[column].iloc[0] = str(int(time.time() * 1000))
syn.store(synapseclient.Table(process_trackerdb_synid, process_trackerdf))


Expand Down
18 changes: 10 additions & 8 deletions genie/input_to_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,12 +445,12 @@ def get_duplicated_files(validation_statusdf):
cbs_seg_index = filename_str.endswith(("cbs", "seg"))
cbs_seg_files = validation_statusdf[cbs_seg_index]
if len(cbs_seg_files) > 1:
duplicated_filesdf = duplicated_filesdf.append(cbs_seg_files)
duplicated_filesdf = pd.concat([duplicated_filesdf, cbs_seg_files])
# clinical files should not be duplicated.
clinical_index = filename_str.startswith("data_clinical_supp")
clinical_files = validation_statusdf[clinical_index]
if len(clinical_files) > 2:
duplicated_filesdf = duplicated_filesdf.append(clinical_files)
duplicated_filesdf = pd.concat([duplicated_filesdf, clinical_files])
duplicated_filesdf.drop_duplicates("id", inplace=True)
logger.info("THERE ARE {} DUPLICATED FILES".format(len(duplicated_filesdf)))
duplicated_filesdf["errors"] = DUPLICATED_FILE_ERROR
Expand Down Expand Up @@ -611,8 +611,8 @@ def _update_tables_content(validation_statusdf, error_trackingdf):

# Append duplicated file errors
duplicated_filesdf["id"].isin(error_trackingdf["id"][duplicated_idx])
error_trackingdf = error_trackingdf.append(
duplicated_filesdf[error_trackingdf.columns]
error_trackingdf = pd.concat(
[error_trackingdf, duplicated_filesdf[error_trackingdf.columns]]
)
# Remove duplicates if there's already an error that exists for the file
error_trackingdf.drop_duplicates("id", inplace=True)
Expand Down Expand Up @@ -822,7 +822,7 @@ def center_input_to_database(
# Reorganize so BED file are always validated and processed first
bed_files = validFiles["fileType"] == "bed"
beds = validFiles[bed_files]
validFiles = beds.append(validFiles)
validFiles = pd.concat([beds, validFiles])
validFiles.drop_duplicates(inplace=True)
# merge clinical files into one row
clinical_ind = validFiles["fileType"] == "clinical"
Expand All @@ -832,7 +832,7 @@ def center_input_to_database(
merged_clinical = pd.DataFrame([clinical_files])
merged_clinical["fileType"] = "clinical"
merged_clinical["name"] = f"data_clinical_supp_{center}.txt"
validFiles = validFiles[~clinical_ind].append(merged_clinical)
validFiles = pd.concat([validFiles[~clinical_ind], merged_clinical])

processTrackerSynId = process_functions.getDatabaseSynId(
syn, "processTracker", databaseToSynIdMappingDf=database_to_synid_mappingdf
Expand All @@ -856,7 +856,9 @@ def center_input_to_database(

syn.store(synapseclient.Table(processTrackerSynId, new_rows))
else:
processTrackerDf["timeStartProcessing"][0] = str(int(time.time() * 1000))
processTrackerDf["timeStartProcessing"].iloc[0] = str(
int(time.time() * 1000)
)
syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

processfiles(
Expand All @@ -881,7 +883,7 @@ def center_input_to_database(
)
)
processTrackerDf = processTracker.asDataFrame()
processTrackerDf["timeEndProcessing"][0] = str(int(time.time() * 1000))
processTrackerDf["timeEndProcessing"].iloc[0] = str(int(time.time() * 1000))
syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

logger.info("SAMPLE/PATIENT RETRACTION")
Expand Down
13 changes: 9 additions & 4 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,9 +783,7 @@ def updateDatabase(
to_delete_rows = _delete_rows(new_dataset, database, primary_key)
else:
to_delete_rows = pd.DataFrame()
allupdates = allupdates.append(to_append_rows, sort=False)
allupdates = allupdates.append(to_update_rows, sort=False)

allupdates = pd.concat([allupdates, to_append_rows, to_update_rows], sort=False)
storedatabase = False
update_all_file = tempfile.NamedTemporaryFile(dir=SCRIPT_DIR, delete=False)

Expand Down Expand Up @@ -1046,8 +1044,15 @@ def synLogin(pemfile_path, debug=False):
"""
try:
syn = synapseclient.Synapse(debug=debug)
syn.login()
# Get auth token via scheduled job secrets
if os.getenv("SCHEDULED_JOB_SECRETS") is not None:
secrets = json.loads(os.getenv("SCHEDULED_JOB_SECRETS"))
auth_token = secrets["SYNAPSE_AUTH_TOKEN"]
else:
auth_token = None
syn.login(authToken=auth_token)
except Exception:
# TODO: deprecate this feature soon
genie_pass = get_password(pemfile_path)
syn = synapseclient.Synapse(debug=debug)
syn.login(os.environ["GENIE_USER"], genie_pass)
Expand Down
4 changes: 2 additions & 2 deletions genie/toRetract.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env python
import argparse

import synapseclient
import pandas as pd

from . import process_functions

Expand Down Expand Up @@ -66,7 +66,7 @@ def retract(syn, project_id):
)
sampleRetractIds = sampleRetract.asDataFrame()

allRetractedSamples = sampleRetractIds["genieSampleId"].append(appendSamples)
allRetractedSamples = pd.concat([sampleRetractIds["genieSampleId"], appendSamples])

# Only need to retract clinical data, because the rest of the data is filtered by clinical data
# Sample Clinical Data
Expand Down
10 changes: 6 additions & 4 deletions genie_registry/assay.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,11 @@ def _get_dataframe(self, filepath_list):

seq_assay_id_infodf = assay_info_transposeddf.loc[[assay]]

to_appenddf = [seq_assay_id_infodf] * (len(assay_specific_info) - 1)
if to_appenddf:
seq_assay_id_infodf = seq_assay_id_infodf.append(to_appenddf)
for i in range(0, len(assay_specific_info) - 1):
seq_assay_id_infodf = pd.concat(
[seq_assay_id_infodf, seq_assay_id_infodf]
)
# seq_assay_id_infodf = seq_assay_id_infodf.append(to_appenddf)
seq_assay_id_infodf.reset_index(drop=True, inplace=True)
assay_finaldf = pd.concat(
[assay_specific_infodf, seq_assay_id_infodf], axis=1
Expand All @@ -123,7 +125,7 @@ def _get_dataframe(self, filepath_list):
if assay_finaldf.get(col) is not None:
assay_finaldf[col] = [";".join(row) for row in assay_finaldf[col]]
assay_finaldf["SEQ_PIPELINE_ID"] = assay
all_panel_info = all_panel_info.append(assay_finaldf)
all_panel_info = pd.concat([all_panel_info, assay_finaldf])
return all_panel_info

def _validate(self, assay_info_df, project_id):
Expand Down
10 changes: 4 additions & 6 deletions genie_registry/bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,9 +253,7 @@ def add_feature_type(temp_bed_path, exon_gtf_path, gene_gtf_path):
"Feature_Type",
]
)
genie_combineddf = genie_combineddf.append(genie_exondf)
genie_combineddf = genie_combineddf.append(genie_introndf)
genie_combineddf = genie_combineddf.append(genie_intergenicdf)
genie_combineddf = pd.concat([genie_exondf, genie_introndf, genie_intergenicdf])
return genie_combineddf


Expand Down Expand Up @@ -344,21 +342,21 @@ def _map_position_within_boundary(row, positiondf, boundary=0.9):
# difference = difference * -1.0
max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
if max_overlap is not None:
end_rows = end_rows.append(chrom_rows.loc[[max_overlap]])
end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
# End goes over end boundary, but start is contained in position
if sum(chrom_rows["start_position"] <= row["Start_Position"]) > 0:
overlap = chrom_rows["end_position"] - row["Start_Position"]
max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
if max_overlap is not None:
end_rows = end_rows.append(chrom_rows.loc[[max_overlap]])
end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
# Start and end go over position boundary
check = chrom_rows[chrom_rows["start_position"] >= row["Start_Position"]]
check = check[check["end_position"] <= row["End_Position"]]
if not check.empty:
overlap = chrom_rows["end_position"] - chrom_rows["start_position"]
max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
if max_overlap is not None:
end_rows = end_rows.append(chrom_rows.loc[[max_overlap]])
end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
return end_rows


Expand Down
4 changes: 2 additions & 2 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ def _check_int_dead_consistency(clinicaldf: DataFrame) -> str:
# Check that all string values are equal each other
is_equal = all(clinicaldf.loc[is_str, "DEAD"] == clinicaldf.loc[is_str, "INT_DOD"])
# If dead, int column can't be Not Applicable
# If alive, int column can't have values
# If alive, int column must be Not Applicable
if (
any(clinicaldf.loc[is_dead, "INT_DOD"] == "Not Applicable")
or not all(clinicaldf.loc[is_alive, "INT_DOD"].isin(allowed_str))
or not all(clinicaldf.loc[is_alive, "INT_DOD"] == "Not Applicable")
or not is_equal
):
return (
Expand Down
4 changes: 2 additions & 2 deletions genie_registry/cna.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,9 @@ def _process(self, cnaDf, databaseToSynIdMappingDf):
)
temp = pd.DataFrame(newVal).transpose()
temp["Hugo_Symbol"] = i
duplicatedGenes = duplicatedGenes.append(temp, sort=False)
duplicatedGenes = pd.concat([duplicatedGenes, temp], sort=False)
cnaDf.drop_duplicates("Hugo_Symbol", keep=False, inplace=True)
cnaDf = cnaDf.append(duplicatedGenes, sort=False)
cnaDf = pd.concat([cnaDf, duplicatedGenes], sort=False)
cnaDf = cnaDf[order]
cnaDf.columns = [
process_functions.checkGenieId(i, self.center) if i != "Hugo_Symbol" else i
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
line-length = 88
target-version = ['py37']
include = '\.pyi?$'
extend-exclude = 'tests'
exclude = 'tests'
8 changes: 6 additions & 2 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,8 +804,8 @@ def test__check_int_year_consistency_inconsistent(inconsistent_df,
"DEAD": [True, False]}
),
pd.DataFrame(
{"INT_DOD": [1111, "Not Released"],
"DEAD": [True, False]}
{"INT_DOD": ["Not Applicable", "Not Applicable"],
"DEAD": [False, False]}
)
]
)
Expand All @@ -824,6 +824,10 @@ def test__check_int_dead_consistency_valid(valid_df):
{"INT_DOD": ["Not Applicable", "Not Applicable"],
"DEAD": [True, False]}
),
pd.DataFrame(
{"INT_DOD": [1111, "Not Released"],
"DEAD": [True, False]}
),
pd.DataFrame(
{"INT_DOD": [1111, 11111],
"DEAD": [True, False]}
Expand Down

0 comments on commit 4dea322

Please sign in to comment.