Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: CQDG-268 remove fields not observed_phenotypes #77

Merged
merged 2 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -226,32 +226,6 @@
}
}
},
"non_observed_phenotype_tagged": {
"type": "nested",
"properties": {
"internal_phenotype_id": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"source_text": {
"type": "keyword"
},
"parents": {
"type": "keyword"
},
"is_leaf": {
"type": "boolean"
},
"is_tagged": {
"type": "boolean"
},
"age_at_event": {
"type": "keyword"
}
}
},
"phenotypes_tagged": {
"type": "nested",
"properties": {
Expand Down
52 changes: 0 additions & 52 deletions index-task/src/main/resources/templates/template_file_centric.json
Original file line number Diff line number Diff line change
Expand Up @@ -239,32 +239,6 @@
}
}
},
"non_observed_phenotype_tagged": {
"type": "nested",
"properties": {
"internal_phenotype_id": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"parents": {
"type": "keyword"
},
"is_leaf": {
"type": "boolean"
},
"is_tagged": {
"type": "boolean"
},
"source_text": {
"type": "keyword"
},
"age_at_event": {
"type": "keyword"
}
}
},
"observed_phenotypes": {
"type": "nested",
"properties": {
Expand Down Expand Up @@ -509,32 +483,6 @@
}
}
},
"non_observed_phenotype_tagged": {
"type": "nested",
"properties": {
"internal_phenotype_id": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"parents": {
"type": "keyword"
},
"is_leaf": {
"type": "boolean"
},
"is_tagged": {
"type": "boolean"
},
"source_text": {
"type": "keyword"
},
"age_at_event": {
"type": "keyword"
}
}
},
"phenotypes_tagged": {
"type": "nested",
"properties": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,32 +206,6 @@
}
}
},
"non_observed_phenotype_tagged": {
"type": "nested",
"properties": {
"internal_phenotype_id": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"source_text": {
"type": "keyword"
},
"parents": {
"type": "keyword"
},
"is_leaf": {
"type": "boolean"
},
"is_tagged": {
"type": "boolean"
},
"age_at_event": {
"type": "keyword"
}
}
},
"phenotypes_tagged": {
"type": "nested",
"properties": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ object OntologyUtils {
.agg(collect_list(col("phenotype_with_age_grouped")) as colName)
}

def getTaggedPhenotypes(phenotypesDF: DataFrame, hpoTerms: DataFrame): (DataFrame, DataFrame, DataFrame, DataFrame) = {
def getTaggedPhenotypes(phenotypesDF: DataFrame, hpoTerms: DataFrame): (DataFrame, DataFrame, DataFrame) = {
val hpoExplodedAlt = hpoTerms
.withColumn("alt_id", explode(col("alt_ids")))

Expand Down Expand Up @@ -85,8 +85,6 @@ object OntologyUtils {
.withColumnRenamed("age_at_phenotype", "age_at_event")
.withColumnRenamed("phenotype_source_text", "source_text")



val observedPhenotypesWithAncestors = generatePhenotypeWithAncestors(observedPhenotypes, "observed_phenotypes")

val taggedObservedPhenotypes = generateTaggedPhenotypes(observedPhenotypes, "observed_phenotype_tagged")
Expand All @@ -99,7 +97,7 @@ object OntologyUtils {
.groupBy("cqdg_participant_id")
.agg(collect_list("exp_phenotypes") as "phenotypes_tagged")

(taggedObservedPhenotypes, taggedNonObservedPhenotypes, observedPhenotypesWithAncestors.drop("study_id"), phenotypes_tagged)
(taggedObservedPhenotypes, observedPhenotypesWithAncestors.drop("study_id"), phenotypes_tagged)
}

def getDiagnosis(diagnosisDf: DataFrame, mondoTerms: DataFrame, icdTerms: DataFrame): (DataFrame, DataFrame) = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ object Utils {
}

def addDiagnosisPhenotypes(phenotypeDF: DataFrame, diagnosesDF: DataFrame)(hpoTerms: DataFrame, mondoTerms: DataFrame, icdTerms: DataFrame): DataFrame = {
val (observedPhenotypes, nonObservedPhenotypes, observedPhenotypesWithAncestors, phenotypes) = getTaggedPhenotypes(phenotypeDF, hpoTerms)
val (observedPhenotypes, observedPhenotypesWithAncestors, phenotypes) = getTaggedPhenotypes(phenotypeDF, hpoTerms)

val (diagnosis, mondoWithAncestors) = getDiagnosis(diagnosesDF, mondoTerms, icdTerms)

Expand All @@ -97,7 +97,6 @@ object Utils {
.drop("cqdg_participant_id")
.join(phenotypes, col("fhir_id") === col("cqdg_participant_id"), "left_outer")
.join(observedPhenotypes, Seq("cqdg_participant_id"), "left_outer")
.join(nonObservedPhenotypes, Seq("cqdg_participant_id"), "left_outer")
.join(observedPhenotypesWithAncestors, Seq("cqdg_participant_id"), "left_outer")
.drop("cqdg_participant_id")
.withColumnRenamed("fhir_id", "participant_id")
Expand Down
47 changes: 20 additions & 27 deletions prepare-index/src/test/scala/OntologyUtilsSpec.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import bio.ferlab.datalake.spark3.loader.GenericLoader.read
import bio.ferlab.fhir.etl.common.OntologyUtils.{getDiagnosis, getTaggedPhenotypes}
import model.{DIAGNOSIS, DIAGNOSIS_INPUT, PHENOTYPE, PHENOTYPE_HPO_CODE, PHENOTYPE_TAGGED, PHENOTYPE_TAGGED_WITH_ANCESTORS, PHENOTYPE_TAGGED_WITH_OBSERVED}
import model.{DIAGNOSIS_INPUT, PHENOTYPE, PHENOTYPE_HPO_CODE, PHENOTYPE_TAGGED, PHENOTYPE_TAGGED_WITH_ANCESTORS, PHENOTYPE_TAGGED_WITH_OBSERVED}
import org.apache.spark.sql.DataFrame
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers
Expand All @@ -20,27 +20,20 @@ class OntologyUtilsSpec extends AnyFlatSpec with Matchers with WithSparkSession

val phenotypes = Seq(phenotype1, phenotype2, phenotype3).toDF()

val (t1, t2, t3, _) = getTaggedPhenotypes(phenotypes, hpo_terms)
val (_, t2, t3) = getTaggedPhenotypes(phenotypes, hpo_terms)

//observed phenotypes tagged
val taggedPhenotypes = t1.as[(String, Seq[PHENOTYPE_TAGGED])].collect().head
//phenotypes tagged
val taggedPhenotypes = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect().head

taggedPhenotypes shouldBe
("1", Seq(
PHENOTYPE_TAGGED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = Some("Young"), `source_text` = "text"),
PHENOTYPE_TAGGED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = Some("Old"), `source_text` = "text"),
))

//observed phenotypes tagged
val notTaggedPhenotypes = t2.as[(String, Seq[PHENOTYPE_TAGGED])].collect().head

notTaggedPhenotypes shouldBe
("1", Seq(
PHENOTYPE_TAGGED(`internal_phenotype_id` = "3", `parents` = Seq("A Name (HP:A)"), `name` = "C Name (HP:C)", `age_at_event` = Some("Super Old"), `source_text` = "text"),
PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = Some("Young"), `source_text` = "text", `is_observed` = Some(true)),
PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = Some("Old"), `source_text` = "text", `is_observed` = Some(true)),
PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "3", `parents` = Seq("A Name (HP:A)"), `name` = "C Name (HP:C)", `age_at_event` = Some("Super Old"), `source_text` = "text", `is_observed` = Some(false)),
))

//observed phenotypes with ancestors
val taggedPhenotypesWithAncestors = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head
val taggedPhenotypesWithAncestors = t2.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head

taggedPhenotypesWithAncestors._2 should contain theSameElementsAs
Seq(
Expand All @@ -58,20 +51,20 @@ class OntologyUtilsSpec extends AnyFlatSpec with Matchers with WithSparkSession

val phenotypes = Seq(phenotype1, phenotype2).toDF()

val (t1, _, t3, _) = getTaggedPhenotypes(phenotypes, hpo_terms)
val (_, t2, t3) = getTaggedPhenotypes(phenotypes, hpo_terms)

//phenotypes tagged
val taggedPhenotypes = t1.as[(String, Seq[PHENOTYPE_TAGGED])].collect().head
val taggedPhenotypes = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect().head

taggedPhenotypes shouldBe
("1", Seq(
PHENOTYPE_TAGGED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = Some("Young"), `source_text` = "text"),
PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = Some("Young"), `source_text` = "text", `is_observed` = Some(true)),
// HP:D (obsolete) should be changed to HP:G (alternate)
PHENOTYPE_TAGGED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = None, `source_text` = "text"),
PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = None, `source_text` = "text", `is_observed` = Some(true)),
))

//observed phenotypes with ancestors
val taggedPhenotypesWithAncestors = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head
val taggedPhenotypesWithAncestors = t2.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head

taggedPhenotypesWithAncestors._2 should contain theSameElementsAs
Seq(
Expand All @@ -89,19 +82,19 @@ class OntologyUtilsSpec extends AnyFlatSpec with Matchers with WithSparkSession

val phenotypes = Seq(phenotype1, phenotype2).toDF()

val (t1, _, t3, _) = getTaggedPhenotypes(phenotypes, hpo_terms)
val (_, t2, t3) = getTaggedPhenotypes(phenotypes, hpo_terms)

//phenotypes tagged
val taggedPhenotypes = t1.as[(String, Seq[PHENOTYPE_TAGGED])].collect().head
val taggedPhenotypes = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect().head

taggedPhenotypes shouldBe
("1", Seq(
PHENOTYPE_TAGGED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = Some("Young"), `source_text` = "text"),
PHENOTYPE_TAGGED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = None, `source_text` = "text"),
PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = Some("Young"), `source_text` = "text", `is_observed` = Some(true)),
PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = None, `source_text` = "text", `is_observed` = Some(true)),
))

//observed phenotypes with ancestors
val taggedPhenotypesWithAncestors = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head
val taggedPhenotypesWithAncestors = t2.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head

taggedPhenotypesWithAncestors._2 should contain theSameElementsAs
Seq(
Expand Down Expand Up @@ -166,8 +159,8 @@ class OntologyUtilsSpec extends AnyFlatSpec with Matchers with WithSparkSession

val phenotypes = Seq(phenotype1, phenotype2, phenotype3).toDF()

val (_, _, _, t4) = getTaggedPhenotypes(phenotypes, hpo_terms)
val taggedPhenotypes = t4.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect()
val (_, _, t3) = getTaggedPhenotypes(phenotypes, hpo_terms)
val taggedPhenotypes = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect()

taggedPhenotypes.flatMap(e => e._2.map(p => (p.`internal_phenotype_id`, p.`is_observed`)))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ case class PARTICIPANT_CENTRIC(
`mondo_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty,
`mondo`: Seq[PHENOTYPE_ENRICHED] = Seq.empty,
`observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty,
`non_observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty,
`observed_phenotypes`: Seq[PHENOTYPE_ENRICHED] = Seq.empty,
`family_relationships`: Seq[FAMILY_RELATIONSHIP_WITH_FAMILY] = Nil,
`is_a_proband`: Option[Boolean] = None,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ case class PARTICIPANT_WITH_BIOSPECIMEN(
// `mondo_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty,
// `observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty,
// `non_observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty,
//`observed_phenotype`: Seq[PHENOTYPE_ENRICHED] = Seq.empty,
//`non_observed_phenotype`: Seq[PHENOTYPE_ENRICHED] = Seq.empty,
// `familyRelationships`: FAMILY = null,
// `is_a_proband`: Boolean = false,
// `family_type`: String = "probant_only",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ case class SIMPLE_PARTICIPANT(
`mondo_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty,
`mondo`: Seq[PHENOTYPE_ENRICHED] = Seq.empty,
`observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty,
`non_observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty,
`phenotypes_tagged`: Seq[PHENOTYPE_TAGGED_WITH_OBSERVED] = Seq.empty,
`observed_phenotypes`: Seq[PHENOTYPE_ENRICHED] = Seq.empty,
`family_relationships`: Seq[FAMILY_RELATIONSHIP_WITH_FAMILY] = Nil,
Expand Down
Loading