diff --git a/index-task/src/main/resources/templates/template_biospecimen_centric.json b/index-task/src/main/resources/templates/template_biospecimen_centric.json index 64d38b6b..95add3a9 100644 --- a/index-task/src/main/resources/templates/template_biospecimen_centric.json +++ b/index-task/src/main/resources/templates/template_biospecimen_centric.json @@ -226,32 +226,6 @@ } } }, - "non_observed_phenotype_tagged": { - "type": "nested", - "properties": { - "internal_phenotype_id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "source_text": { - "type": "keyword" - }, - "parents": { - "type": "keyword" - }, - "is_leaf": { - "type": "boolean" - }, - "is_tagged": { - "type": "boolean" - }, - "age_at_event": { - "type": "keyword" - } - } - }, "phenotypes_tagged": { "type": "nested", "properties": { diff --git a/index-task/src/main/resources/templates/template_file_centric.json b/index-task/src/main/resources/templates/template_file_centric.json index f38d3378..917951d9 100644 --- a/index-task/src/main/resources/templates/template_file_centric.json +++ b/index-task/src/main/resources/templates/template_file_centric.json @@ -239,32 +239,6 @@ } } }, - "non_observed_phenotype_tagged": { - "type": "nested", - "properties": { - "internal_phenotype_id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "parents": { - "type": "keyword" - }, - "is_leaf": { - "type": "boolean" - }, - "is_tagged": { - "type": "boolean" - }, - "source_text": { - "type": "keyword" - }, - "age_at_event": { - "type": "keyword" - } - } - }, "observed_phenotypes": { "type": "nested", "properties": { @@ -509,32 +483,6 @@ } } }, - "non_observed_phenotype_tagged": { - "type": "nested", - "properties": { - "internal_phenotype_id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "parents": { - "type": "keyword" - }, - "is_leaf": { - "type": "boolean" - }, - "is_tagged": { - "type": "boolean" - }, - "source_text": { - "type": "keyword" - }, - "age_at_event": { - "type": "keyword" - } - } - }, "phenotypes_tagged": { "type": "nested", "properties": { diff --git a/index-task/src/main/resources/templates/template_participant_centric.json b/index-task/src/main/resources/templates/template_participant_centric.json index 63c73ead..65b84787 100644 --- a/index-task/src/main/resources/templates/template_participant_centric.json +++ b/index-task/src/main/resources/templates/template_participant_centric.json @@ -206,32 +206,6 @@ } } }, - "non_observed_phenotype_tagged": { - "type": "nested", - "properties": { - "internal_phenotype_id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "source_text": { - "type": "keyword" - }, - "parents": { - "type": "keyword" - }, - "is_leaf": { - "type": "boolean" - }, - "is_tagged": { - "type": "boolean" - }, - "age_at_event": { - "type": "keyword" - } - } - }, "phenotypes_tagged": { "type": "nested", "properties": { diff --git a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/common/OntologyUtils.scala b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/common/OntologyUtils.scala index 9a45240c..0d1049e7 100644 --- a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/common/OntologyUtils.scala +++ b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/common/OntologyUtils.scala @@ -57,7 +57,7 @@ object OntologyUtils { .agg(collect_list(col("phenotype_with_age_grouped")) as colName) } - def getTaggedPhenotypes(phenotypesDF: DataFrame, hpoTerms: DataFrame): (DataFrame, DataFrame, DataFrame, DataFrame) = { + def getTaggedPhenotypes(phenotypesDF: DataFrame, hpoTerms: DataFrame): (DataFrame, DataFrame, DataFrame) = { val hpoExplodedAlt = hpoTerms .withColumn("alt_id", explode(col("alt_ids"))) @@ -85,8 +85,6 @@ object OntologyUtils { .withColumnRenamed("age_at_phenotype", "age_at_event") .withColumnRenamed("phenotype_source_text", "source_text") - - val observedPhenotypesWithAncestors = generatePhenotypeWithAncestors(observedPhenotypes, "observed_phenotypes") val taggedObservedPhenotypes = generateTaggedPhenotypes(observedPhenotypes, "observed_phenotype_tagged") @@ -99,7 +97,7 @@ object OntologyUtils { .groupBy("cqdg_participant_id") .agg(collect_list("exp_phenotypes") as "phenotypes_tagged") - (taggedObservedPhenotypes, taggedNonObservedPhenotypes, observedPhenotypesWithAncestors.drop("study_id"), phenotypes_tagged) + (taggedObservedPhenotypes, observedPhenotypesWithAncestors.drop("study_id"), phenotypes_tagged) } def getDiagnosis(diagnosisDf: DataFrame, mondoTerms: DataFrame, icdTerms: DataFrame): (DataFrame, DataFrame) = { diff --git a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/common/Utils.scala b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/common/Utils.scala index a3aa644a..c7fe4105 100644 --- a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/common/Utils.scala +++ b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/common/Utils.scala @@ -87,7 +87,7 @@ object Utils { } def addDiagnosisPhenotypes(phenotypeDF: DataFrame, diagnosesDF: DataFrame)(hpoTerms: DataFrame, mondoTerms: DataFrame, icdTerms: DataFrame): DataFrame = { - val (observedPhenotypes, nonObservedPhenotypes, observedPhenotypesWithAncestors, phenotypes) = getTaggedPhenotypes(phenotypeDF, hpoTerms) + val (observedPhenotypes, observedPhenotypesWithAncestors, phenotypes) = getTaggedPhenotypes(phenotypeDF, hpoTerms) val (diagnosis, mondoWithAncestors) = getDiagnosis(diagnosesDF, mondoTerms, icdTerms) @@ -97,7 +97,6 @@ object Utils { .drop("cqdg_participant_id") .join(phenotypes, col("fhir_id") === col("cqdg_participant_id"), "left_outer") .join(observedPhenotypes, Seq("cqdg_participant_id"), "left_outer") - .join(nonObservedPhenotypes, Seq("cqdg_participant_id"), "left_outer") .join(observedPhenotypesWithAncestors, Seq("cqdg_participant_id"), "left_outer") .drop("cqdg_participant_id") .withColumnRenamed("fhir_id", "participant_id") diff --git a/prepare-index/src/test/scala/OntologyUtilsSpec.scala b/prepare-index/src/test/scala/OntologyUtilsSpec.scala index 64df10d7..4ad63c0c 100644 --- a/prepare-index/src/test/scala/OntologyUtilsSpec.scala +++ b/prepare-index/src/test/scala/OntologyUtilsSpec.scala @@ -1,6 +1,6 @@ import bio.ferlab.datalake.spark3.loader.GenericLoader.read import bio.ferlab.fhir.etl.common.OntologyUtils.{getDiagnosis, getTaggedPhenotypes} -import model.{DIAGNOSIS, DIAGNOSIS_INPUT, PHENOTYPE, PHENOTYPE_HPO_CODE, PHENOTYPE_TAGGED, PHENOTYPE_TAGGED_WITH_ANCESTORS, PHENOTYPE_TAGGED_WITH_OBSERVED} +import model.{DIAGNOSIS_INPUT, PHENOTYPE, PHENOTYPE_HPO_CODE, PHENOTYPE_TAGGED, PHENOTYPE_TAGGED_WITH_ANCESTORS, PHENOTYPE_TAGGED_WITH_OBSERVED} import org.apache.spark.sql.DataFrame import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers @@ -20,27 +20,20 @@ class OntologyUtilsSpec extends AnyFlatSpec with Matchers with WithSparkSession val phenotypes = Seq(phenotype1, phenotype2, phenotype3).toDF() - val (t1, t2, t3, _) = getTaggedPhenotypes(phenotypes, hpo_terms) + val (_, t2, t3) = getTaggedPhenotypes(phenotypes, hpo_terms) - //observed phenotypes tagged - val taggedPhenotypes = t1.as[(String, Seq[PHENOTYPE_TAGGED])].collect().head + //phenotypes tagged + val taggedPhenotypes = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect().head taggedPhenotypes shouldBe ("1", Seq( - PHENOTYPE_TAGGED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = Some("Young"), `source_text` = "text"), - PHENOTYPE_TAGGED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = Some("Old"), `source_text` = "text"), - )) - - //observed phenotypes tagged - val notTaggedPhenotypes = t2.as[(String, Seq[PHENOTYPE_TAGGED])].collect().head - - notTaggedPhenotypes shouldBe - ("1", Seq( - PHENOTYPE_TAGGED(`internal_phenotype_id` = "3", `parents` = Seq("A Name (HP:A)"), `name` = "C Name (HP:C)", `age_at_event` = Some("Super Old"), `source_text` = "text"), + PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = Some("Young"), `source_text` = "text", `is_observed` = Some(true)), + PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = Some("Old"), `source_text` = "text", `is_observed` = Some(true)), + PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "3", `parents` = Seq("A Name (HP:A)"), `name` = "C Name (HP:C)", `age_at_event` = Some("Super Old"), `source_text` = "text", `is_observed` = Some(false)), )) //observed phenotypes with ancestors - val taggedPhenotypesWithAncestors = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head + val taggedPhenotypesWithAncestors = t2.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head taggedPhenotypesWithAncestors._2 should contain theSameElementsAs Seq( @@ -58,20 +51,20 @@ class OntologyUtilsSpec extends AnyFlatSpec with Matchers with WithSparkSession val phenotypes = Seq(phenotype1, phenotype2).toDF() - val (t1, _, t3, _) = getTaggedPhenotypes(phenotypes, hpo_terms) + val (_, t2, t3) = getTaggedPhenotypes(phenotypes, hpo_terms) //observed phenotypes tagged - val taggedPhenotypes = t1.as[(String, Seq[PHENOTYPE_TAGGED])].collect().head + val taggedPhenotypes = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect().head taggedPhenotypes shouldBe ("1", Seq( - PHENOTYPE_TAGGED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = Some("Young"), `source_text` = "text"), + PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = Some("Young"), `source_text` = "text", `is_observed` = Some(true)), // HP:D (obsolete) should be changed to HP:G (alternate) - PHENOTYPE_TAGGED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = None, `source_text` = "text"), + PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = None, `source_text` = "text", `is_observed` = Some(true)), )) //observed phenotypes with ancestors - val taggedPhenotypesWithAncestors = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head + val taggedPhenotypesWithAncestors = t2.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head taggedPhenotypesWithAncestors._2 should contain theSameElementsAs Seq( @@ -89,19 +82,19 @@ class OntologyUtilsSpec extends AnyFlatSpec with Matchers with WithSparkSession val phenotypes = Seq(phenotype1, phenotype2).toDF() - val (t1, _, t3, _) = getTaggedPhenotypes(phenotypes, hpo_terms) + val (_, t2, t3) = getTaggedPhenotypes(phenotypes, hpo_terms) //observed phenotypes tagged - val taggedPhenotypes = t1.as[(String, Seq[PHENOTYPE_TAGGED])].collect().head + val taggedPhenotypes = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect().head taggedPhenotypes shouldBe ("1", Seq( - PHENOTYPE_TAGGED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = Some("Young"), `source_text` = "text"), - PHENOTYPE_TAGGED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = None, `source_text` = "text"), + PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "1", `is_leaf` = true, `name` = "G Name (HP:G)", `parents` = Seq("B Name (HP:B)"), `age_at_event` = Some("Young"), `source_text` = "text", `is_observed` = Some(true)), + PHENOTYPE_TAGGED_WITH_OBSERVED(`internal_phenotype_id` = "2", `is_leaf` = true, `name` = "E Name (HP:E)", `parents` = Seq("B Name (HP:B)", "C Name (HP:C)"), `age_at_event` = None, `source_text` = "text", `is_observed` = Some(true)), )) //observed phenotypes with ancestors - val taggedPhenotypesWithAncestors = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head + val taggedPhenotypesWithAncestors = t2.as[(String, Seq[PHENOTYPE_TAGGED_WITH_ANCESTORS])].collect().head taggedPhenotypesWithAncestors._2 should contain theSameElementsAs Seq( @@ -166,8 +159,8 @@ class OntologyUtilsSpec extends AnyFlatSpec with Matchers with WithSparkSession val phenotypes = Seq(phenotype1, phenotype2, phenotype3).toDF() - val (_, _, _, t4) = getTaggedPhenotypes(phenotypes, hpo_terms) - val taggedPhenotypes = t4.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect() + val (_, _, t3) = getTaggedPhenotypes(phenotypes, hpo_terms) + val taggedPhenotypes = t3.as[(String, Seq[PHENOTYPE_TAGGED_WITH_OBSERVED])].collect() taggedPhenotypes.flatMap(e => e._2.map(p => (p.`internal_phenotype_id`, p.`is_observed`))) diff --git a/prepare-index/src/test/scala/model/PARTICIPANT_CENTRIC.scala b/prepare-index/src/test/scala/model/PARTICIPANT_CENTRIC.scala index 39428c1e..24f9ba90 100644 --- a/prepare-index/src/test/scala/model/PARTICIPANT_CENTRIC.scala +++ b/prepare-index/src/test/scala/model/PARTICIPANT_CENTRIC.scala @@ -20,7 +20,6 @@ case class PARTICIPANT_CENTRIC( `mondo_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty, `mondo`: Seq[PHENOTYPE_ENRICHED] = Seq.empty, `observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty, - `non_observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty, `observed_phenotypes`: Seq[PHENOTYPE_ENRICHED] = Seq.empty, `family_relationships`: Seq[FAMILY_RELATIONSHIP_WITH_FAMILY] = Nil, `is_a_proband`: Option[Boolean] = None, diff --git a/prepare-index/src/test/scala/model/PARTICIPANT_WITH_BIOSPECIMEN.scala b/prepare-index/src/test/scala/model/PARTICIPANT_WITH_BIOSPECIMEN.scala index cf374733..2ce7eb00 100644 --- a/prepare-index/src/test/scala/model/PARTICIPANT_WITH_BIOSPECIMEN.scala +++ b/prepare-index/src/test/scala/model/PARTICIPANT_WITH_BIOSPECIMEN.scala @@ -15,8 +15,6 @@ case class PARTICIPANT_WITH_BIOSPECIMEN( // `mondo_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty, // `observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty, // `non_observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty, - //`observed_phenotype`: Seq[PHENOTYPE_ENRICHED] = Seq.empty, - //`non_observed_phenotype`: Seq[PHENOTYPE_ENRICHED] = Seq.empty, // `familyRelationships`: FAMILY = null, // `is_a_proband`: Boolean = false, // `family_type`: String = "probant_only", diff --git a/prepare-index/src/test/scala/model/SIMPLE_PARTICIPANT.scala b/prepare-index/src/test/scala/model/SIMPLE_PARTICIPANT.scala index c5ceb792..e56f708f 100644 --- a/prepare-index/src/test/scala/model/SIMPLE_PARTICIPANT.scala +++ b/prepare-index/src/test/scala/model/SIMPLE_PARTICIPANT.scala @@ -20,7 +20,6 @@ case class SIMPLE_PARTICIPANT( `mondo_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty, `mondo`: Seq[PHENOTYPE_ENRICHED] = Seq.empty, `observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty, - `non_observed_phenotype_tagged`: Seq[PHENOTYPE_TAGGED] = Seq.empty, `phenotypes_tagged`: Seq[PHENOTYPE_TAGGED_WITH_OBSERVED] = Seq.empty, `observed_phenotypes`: Seq[PHENOTYPE_ENRICHED] = Seq.empty, `family_relationships`: Seq[FAMILY_RELATIONSHIP_WITH_FAMILY] = Nil,