From 016687df0fc86b4948fd561365dbed8ef2d44ffa Mon Sep 17 00:00:00 2001 From: vojtechhuser Date: Tue, 2 Aug 2016 16:54:03 -0400 Subject: [PATCH] new rule35 and better documentation - new rule 35 (measurement domain, units) - rule overview improvements #133 - derived analyses overview - new CSV file for future drill down feature #139 --- extras/Heel-Rules.html | 516 ++++++++++++++++++++++++ extras/Rule-Drill-Down.html | 196 +++++++++ extras/notes.md | 54 +-- inst/csv/achilles_rule.csv | 71 ++-- inst/csv/derived_analysis_details.csv | 6 + inst/csv/rule_drill_down.csv | 5 + inst/sql/sql_server/AchillesHeel_v5.sql | 22 +- 7 files changed, 801 insertions(+), 69 deletions(-) create mode 100644 extras/Heel-Rules.html create mode 100644 extras/Rule-Drill-Down.html create mode 100644 inst/csv/derived_analysis_details.csv create mode 100644 inst/csv/rule_drill_down.csv diff --git a/extras/Heel-Rules.html b/extras/Heel-Rules.html new file mode 100644 index 00000000..77d8de9c --- /dev/null +++ b/extras/Heel-Rules.html @@ -0,0 +1,516 @@ + + + + + + + + + + + + + +Rules + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
rule_idrule_nameseverityrule_typerule_descriptionthresholdrule_classificationrule_scope
0Achilles Heel version 1.3this rule is not used for data analysis. It communicates the version of the ruleset.
1multiple checks for greater than zeroerrorDQumbrella rule: this rule includes multiple error checks on over 35 analysis_ids>0complex
2multiple checks where minimum value of a measure should not be negativeerrorDQumbrella rule: this rule includes multiple error checks on over 20 analysis_ids where min value in distribution should not be negativecomplex
3multiple checks related to death data where maximum value of a measure should not be positivewarningDQdeath distributions where max should not be positive (using analyses 511;512;513;514;515)plausibility
4invalid concept_iderrorCDM conformanceinvalid concept_id
5invalid type concept_iderrorCDM conformanceinvalid type concept_id
6data with unmapped conceptswarningDQfor multiple analyses
7concept from the wrong vocabularyerrorCDM conformanceconcept from the wrong vocabulary
8concept from the wrong vocabulary; raceerrorCDM conformanceconcept from the wrong vocabulary; race
9concept from the wrong vocabulary; ethnicityerrorCDM conformanceconcept from the wrong vocabulary; ethnicity
10concept from the wrong vocabulary; place of serviceerrorCDM conformanceconcept from the wrong vocabulary; place of service
11incorrect terminologyerrorCDM conformancespecialty - 48 specialty
12Dx is not a SNOMED codeerrorCDM conformanceconcept from the wrong vocabulary; Condition Occurrence, Condition Era (SNOMED)
13Drug is not RxNorm concepterrorCDM conformanceconcept from the wrong vocabulary; Drug Exposure, Drug Era (RxNorm)
14Procedure is not CPT, ICD9Proc or HCPCSerrorCDM conformanceprocedure - 4 CPT4/5 HCPCS/3 ICD9P
15incorrect terminologyerrorCDM conformanceCDM V4 only:LOINC
16incorrect terminologyerrorCDM conformanceCDM v4 only:DRG
17incorrect terminologyerrorCDM conformancerevenue code - 43 revenue code
18year of birth is in the futureerrorDQyear of birth should not be in the futureplausibility
19year of birth is prior 1800warningDQyear of birth < 1800<1800plausibility
20age below 0errorDQage < 0plausibility
21age too higherrorDQage > 150>150plausibility
22monthly trendwarningDQmonthly change > 100%fidelity
23monthly trendwarningDQmonthly change > 100% at concept levelfidelity
24too high days_supplywarningDQdays_supply > 180plausibility
25too high number of refilswarningDQrefills > 10>10plausibility
26implausible quantity for drugwarningDQquantity > 600>600plausibility
27more than 1 percent of unmapped rows (concept_0 rows)warningDQfor multiple analyses (4xx;6xx;7xx;8xx;18xx)>1completeness
28percentage of non-numerical measurement records exceeds general population thresholdwarningDQtypically, measurement data contains a significant proportion of rows with a numerical result. This rule looks at rows in MEASUREMENT and alerts the user if a large proportion of rows lack any numerical result>=80completenessGeneralPopulationOnly
29infant diagnosis at senior age of over 50yoerrorDQmeconium condition 195075; This rule is an example of a terminology-dependent data quality toolplausibility
31ratio of providers to total patientsnotificationDQThis rules fires if data indicate a high number of patients and only a few providers exist.plausibility
32Percentage of patients with no visits exceeds thresholdnotificationDQchecks if the percentage of patients with no visits exceeds threshold>5plausibility
33[GeneralPopulationOnly] Not all deciles represented at first observationnotificationDQin a general population, a database would observe first visit across all age groups. We at least expect deciles 0 to 8. Rule looks at the count of deciles.<9completenessGeneralPopulationOnly
34Count of unmapped source values in a domain exceeds thresholdnotificationDQlooks at values that are mapped to concept0 and their source values by table, rule 6 is related to this rule but it does not look at the size of the problem (only if unmapped data are present or not present)completeness
35Count of measurement_ids with more than 5 distinct units exceeds thresholdnotificationDQIdeally, each measurement would use only one unit. For example, kg for weight. This rule notifies the user if the database has measurements that have 5 or more units. This rule technically has two thresholds.>=5;>=10fidelity
+ + + + +
+ + + + + + + + diff --git a/extras/Rule-Drill-Down.html b/extras/Rule-Drill-Down.html new file mode 100644 index 00000000..ab0a7bc2 --- /dev/null +++ b/extras/Rule-Drill-Down.html @@ -0,0 +1,196 @@ + + + + + + + + + + + + + +Overview + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
rule_idlabeldrill_down_typeleveldescriptioncode
25which_drugsachilles1list which drug concepts violate this rule, extreme refil count is shown in column max_number_of_refils, column freq_in_data indicates overal frequency in data and not the frequency of the extremely high refil valuesselect stratum_1 as drug_concept_id, max_value as max_number_of_refils, count_value as freq_in_data from achilles_results_dist where analysis_id = 716 and max_value > 10 order by max_value desc, count_value desc;
34which_source_valuesachilles1list source_values that are unmappedselect stratum_1 as table_name, stratum_2 as unmapped_source_value, count_value from achilles_results where analysis_id = 1900 order by table_name, count_value desc;
35which_measurementsachilles1list which measurements trigger this rule (currently without concept names)select stratum_1 as measurement_concept_id, count(*) as count_of_units_in_data from achilles_results where analysis_id = 1807 group by stratum_1 having count(*) >= 5;
35which_unitsachilles1list which units (in addition to measurements) trigger this rule (currently without concept names)select stratum_1, stratum_2, count_value from achilles_results where analysis_id = 1807 and stratum_1 in (select stratum_1 from achilles_results where analysis_id = 1807 group by stratum_1 having count(*) >= 5) order by stratum_1,count_value;
+ + + + +
+ + + + + + + + diff --git a/extras/notes.md b/extras/notes.md index 9992278d..142c57d5 100644 --- a/extras/notes.md +++ b/extras/notes.md @@ -55,6 +55,24 @@ achillesResults <- achilles(connectionDetails,cdmDatabaseSchema=cdmDatabaseSchem ``` +#overview html files +The code below updates html files that show content overview. Use rawgit.com/OHDSI/... to view it nicely. +```R +tempf<-tempfile(pattern = 'temp', fileext = '.Rmd') +writeLines('---\ntitle: "Rules"\n---\n```{r, echo=FALSE}\n rules<-read.csv(system.file("csv","achilles_rule.csv",package="Achilles"),as.is=T);knitr::kable(rules)\n```',tempf) +rmarkdown::render(tempf,output_file = 'c:/temp/Heel-Rules.html',rmarkdown::html_document(toc = F, fig_caption = TRUE)) + + +tempf<-tempfile(pattern = 'temp', fileext = '.Rmd') +writeLines('---\ntitle: "Overview"\n---\n```{r, echo=FALSE}\n rules<-read.csv(system.file("csv","derived_analysis_details.csv",package="Achilles"),as.is=T);knitr::kable(rules)\n```',tempf) +rmarkdown::render(tempf,output_file = 'c:/temp/Derived-Analyses.html',rmarkdown::html_document(toc = F, fig_caption = TRUE)) + +tempf<-tempfile(pattern = 'temp', fileext = '.Rmd') +writeLines('---\ntitle: "Overview"\n---\n```{r, echo=FALSE}\n rules<-read.csv(system.file("csv","rule_drill_down.csv",package="Achilles"),as.is=T);knitr::kable(rules)\n```',tempf) +rmarkdown::render(tempf,output_file = 'c:/temp/Rule-Drill-Down.html',rmarkdown::html_document(toc = F, fig_caption = TRUE)) +``` + + #Data Quality CDM These notes relate Achilles and Achilles Heel to Data Quality CDM (DQ CDM) @@ -80,25 +98,13 @@ rule = check - -#overview of analyses (analysisDetail.csv) -This overview gets updated via insert statements in the Achilles main SQL file. -The R code below updates the CSV file. 
(used by some other parts of code) -```R -#internal function called here but it simply executes the sql passed to it -analyses_overview<-fetchAnySql(connectionDetails,resultsDatabaseSchema,'select * from achilles_analysis') -write.csv(analyses_overview,file='c:/d/Achilles/inst/csv/analysisDetails.csv',row.names=F,na='') -``` - -#Types of analyses - ##By outputed results ###Stratified analyses -use table ACHILLES_results +These anlyses use table ACHILLES_results ###distributions -use table ACHILLES_results_dist +Such analyses use table ACHILLES_results_dist e.g., 103,104,105,106,107,203,206,211,403,406,506,511,512,513,514,515,603,606,704,706,715,716,717,803,806,815 ##By nature @@ -112,27 +118,9 @@ e.g., analysis_id 7,8,9,207 - - #Analyzing Heel Results ###Simple rules: There are simple rules that generate a single error or warning. ###Complex rules -However, some rules (e.g., rule_id 6) can generate multiple rows. The true primary key for output is combination of rule_id and analysis_id - - -#Possible outputs -##Heel -All errors encoutered - -##Full Data characterization -Full listing of AchillesResults and AchillesResultsDist tables - -##Describe -We propose a new output table for Achilles that we refer to as AchillesDescribe -This can be a subset of full output tables that removes some elements (e.g., frequency data that are sensitive to a data partner within a consortium/study). - - - - +However, some rules (e.g., rule_id 6) can generate multiple rows. The true primary key for output is combination of rule_id and analysis_id \ No newline at end of file diff --git a/inst/csv/achilles_rule.csv b/inst/csv/achilles_rule.csv index ea74331e..e4f415eb 100644 --- a/inst/csv/achilles_rule.csv +++ b/inst/csv/achilles_rule.csv @@ -1,35 +1,36 @@ -rule_id,rule_name,severity,rule_description -0,Achilles Heel version 1.2,,this rule is not used for data analysis. It communicates the version of the ruleset. 
-1,multiple checks,error,multiple error checks -2,multiple checks,error,distributions where min should not be negative -3,multiple checks,warning,death distributions where max should not be positive -4,invalid concept_id,error,invalid concept_id -5,invalid type concept_id,error,invalid type concept_id -6,data with unmapped concepts,warning,for multiple analyses -7,concept from the wrong vocabulary,error,concept from the wrong vocabulary -8,concept from the wrong vocabulary; race,error,concept from the wrong vocabulary; race -9,concept from the wrong vocabulary; ethnicity,error,concept from the wrong vocabulary; ethnicity -10,concept from the wrong vocabulary; place of service,error,concept from the wrong vocabulary; place of service -11,incorrect terminology,error,specialty - 48 specialty -12,Dx is not a SNOMED code,error,"concept from the wrong vocabulary; Condition Occurrence, Condition Era (SNOMED)" -13,Drug is not RxNorm concept,error,"concept from the wrong vocabulary; Drug Exposure, Drug Era (RxNorm)" -14,"Procedure is not CPT, ICD9Proc or HCPCS",error,procedure - 4 CPT4/5 HCPCS/3 ICD9P -15,incorrect terminology,error,V4 only:LOINC -16,incorrect terminology,error,v4 only:DRG -17,incorrect terminology,error,revenue code - 43 revenue code -18,year of birth is in the future,error,year of birth should not be in the future -19,year of birth is prior 1800,warning, year of birth < 1800 -20,age below 0,error,age < 0 -21,age too high,error,age > 150 -22,monthly trend,warning,monthly change > 100% -23,monthly trend,warning,monthly change > 100% at concept level -24,too high days_supply,warning,days_supply > 180 -25,too high number of refils,warning,refills > 10 -26,implausible quantity for drug,warning,quantity > 600 -27,more than 1 percent of unmapped rows (concept_0 rows),warning,for multiple analyses (4xx;6xx;7xx;8xx;18xx) -28,percentage of deceased patients,warning,fires if (deceased/all person count * 100) is less than 1 (anusual if dataset represents a general 
healthcare data warehouse) -29,infant diagnosis at senior age of over 50yo,error,mecconium condition 195075; This rule is example of a terminology depended data quality tool -31,ratio of providers to total patients,notification,ratio -32,NOTIFICATION: Percentage of patients with no visits exceeds threshold,notification, checks if there are too many patients with no visits -33,NOTIFICATION: [GeneralPopulationOnly] Not all deciles represented at first observation,notification, the rule only applies to general population datasets -34,NOTIFICATION: Count of unmapped source values in a domain exceeds threshold,notification,looks at values that are mapped to concept0 and their source values by table \ No newline at end of file +rule_id,rule_name,severity,rule_type,rule_description,threshold,rule_classification,rule_scope +0,Achilles Heel version 1.3,,,this rule is not used for data analysis. It communicates the version of the ruleset.,,, +1,multiple checks for greater than zero,error,DQ,umbrella rule: this rule includes multiple error checks on over 35 analysis_ids,>0,complex, +2,multiple checks where minimum value of a measure should not be negative,error,DQ,umbrella rule: this rule includes multiple error checks on over 20 analysis_ids where min value in distribution should not be negative,,complex, +3,multiple checks related to death data where maximum value of a measure should not be positive,warning,DQ,death distributions where max should not be positive (using anlyses 511;512;513;514;515),,plausibility, +4,invalid concept_id,error,CDM conformance,invalid concept_id,,, +5,invalid type concept_id,error,CDM conformance,invalid type concept_id,,, +6,data with unmapped concepts,warning,DQ,for multiple analyses,,, +7,concept from the wrong vocabulary,error,CDM conformance,concept from the wrong vocabulary,,, +8,concept from the wrong vocabulary; race,error,CDM conformance,concept from the wrong vocabulary; race,,, +9,concept from the wrong vocabulary; ethnicity,error,CDM 
conformance,concept from the wrong vocabulary; ethnicity,,, +10,concept from the wrong vocabulary; place of service,error,CDM conformance,concept from the wrong vocabulary; place of service,,, +11,incorrect terminology,error,CDM conformance,specialty - 48 specialty,,, +12,Dx is not a SNOMED code,error,CDM conformance,"concept from the wrong vocabulary; Condition Occurrence, Condition Era (SNOMED)",,, +13,Drug is not RxNorm concept,error,CDM conformance,"concept from the wrong vocabulary; Drug Exposure, Drug Era (RxNorm)",,, +14,"Procedure is not CPT, ICD9Proc or HCPCS",error,CDM conformance,procedure - 4 CPT4/5 HCPCS/3 ICD9P,,, +15,incorrect terminology,error,CDM conformance,CDM V4 only:LOINC,,, +16,incorrect terminology,error,CDM conformance,CDM v4 only:DRG,,, +17,incorrect terminology,error,CDM conformance,revenue code - 43 revenue code,,, +18,year of birth is in the future,error,DQ,year of birth should not be in the future ,,plausibility, +19,year of birth is prior 1800,warning,DQ, year of birth < 1800,<1800,plausibility, +20,age below 0,error,DQ,age < 0,<0,plausibility, +21,age too high,error,DQ,age > 150,>150,plausibility, +22,monthly trend,warning,DQ,monthly change > 100%,,fidelity, +23,monthly trend,warning,DQ,monthly change > 100% at concept level,,fidelity, +24,too high days_supply,warning,DQ,days_supply > 180,,plausibility, +25,too high number of refils,warning,DQ,refills > 10,>10,plausibility, +26,implausible quantity for drug,warning,DQ,quantity > 600,>600,plausibility, +27,more than 1 percent of unmapped rows (concept_0 rows),warning,DQ,for multiple analyses (4xx;6xx;7xx;8xx;18xx),>1,completeness, +28,percentage of non-numerical measurement records exceeds general population threshold,warning,DQ,"typically, measurement data contans a significant proportion of rows with numerical result. 
This rule looks at rows in MEASUREMENT and alerts the user if a large proportion of rows lack any numerical result",>=80,completeness,GeneralPopulationOnly +29,infant diagnosis at senior age of over 50yo,error,DQ,mecconium condition 195075; This rule is example of a terminology depended data quality tool,,plausibility, +31,ratio of providers to total patients,notification,DQ,This rules fires if data indicate a high number of patients and only a few providers exist. ,,plausibility, +32,Percentage of patients with no visits exceeds threshold,notification,DQ,checks if the percentage of patients with no visits exceeds threshold,>5,plausibility, +33,[GeneralPopulationOnly] Not all deciles represented at first observation,notification,DQ,"in a general population, a database would observe first visit across all age groups. We at least expect deciles 0 to 8. Rule looks at the count of deciles.",<9,completeness,GeneralPopulationOnly +34,Count of unmapped source values in a domain exceeds threshold,notification,DQ,"looks at values that are mapped to concept0 and their source values by table, rule 6 is related to this rule but it does not look at the size of the problem (only if unmapped data are present or not present)",,completeness, +35,Count of measurement_ids with more than 5 distinct units exceeds threshold,notification,DQ,"Idealy, each measurement would use only one unit. For example, kg for weight. This rule notifies the user if database has measurements that have 5 or more units. This rule has technically thresholds. 
",>=5;>=10,fidelity, diff --git a/inst/csv/derived_analysis_details.csv b/inst/csv/derived_analysis_details.csv new file mode 100644 index 00000000..e34c5749 --- /dev/null +++ b/inst/csv/derived_analysis_details.csv @@ -0,0 +1,6 @@ +measure_id,name,statistic_value_name,stratum_1_name,description,associated_rules +UnmappedDataByDomain:SourceValueCnt,Count of source values in unmapped data,count of source values,domain,The measure analyzes how many source codes are unmapped.,34 +AgeAtFirstObsByDecile:DecileCnt,Count of deciles appearing in the data (at first observation),count of deciles,,"The measure analyzes deciles of patients at their first observation. If only certain age groups are being observed, the count of deciles will be low.",33 +Provider:PatientProviderRatio,Patient Provider Ratio,ratio,,"The measure looks at how many patients and how many providers are defined in the data. For example, the ratio may indicate abnormaly low number of providers.",31 +Meas:NoNumValue:Percentage,Percentage of rows in MEASUREMENT table that have NULL recorded as numerical value,percentage,,The measure looks at data recorded in MESUREMENT table. 
A significant percentage of such rows typically contain a numerical result.,28 +UnmappedData:byDomain:Percentage,Percentage of rows that are unmapped,percentage,domain,The measure looks at relative size of unmapped data.,27 diff --git a/inst/csv/rule_drill_down.csv b/inst/csv/rule_drill_down.csv new file mode 100644 index 00000000..81a766e4 --- /dev/null +++ b/inst/csv/rule_drill_down.csv @@ -0,0 +1,5 @@ +rule_id,label,drill_down_type,level,description,code +25,which_drugs,achilles,1,"list which drug concepts violate this rule, extreme refil count is shown in column max_number_of_refils, column freq_in_data indicates overal frequency in data and not the frequency of the extremely high refil values","select stratum_1 as drug_concept_id, max_value as max_number_of_refils, count_value as freq_in_data from achilles_results_dist where analysis_id = 716 and max_value > 10 order by max_value desc, count_value desc;" +34,which_source_values,achilles,1,list source_values that are unmapped,"select stratum_1 as table_name, stratum_2 as unmapped_source_value, count_value from achilles_results where analysis_id = 1900 order by table_name, count_value desc;" +35,which_measurements,achilles,1,list which measurements trigger this rule (currently without concept names),"select stratum_1 as measurement_concept_id, count(*) as count_of_units_in_data from achilles_results where analysis_id = 1807 group by stratum_1 having count(*) >= 5;" +35,which_units,achilles,1,list which units (in addition to measurements) trigger this rule (currently without concept names),"select stratum_1, stratum_2, count_value from achilles_results where analysis_id = 1807 and stratum_1 in (select stratum_1 from achilles_results where analysis_id = 1807 group by stratum_1 having count(*) >= 5) order by stratum_1,count_value;" diff --git a/inst/sql/sql_server/AchillesHeel_v5.sql b/inst/sql/sql_server/AchillesHeel_v5.sql index b9bd9801..5c2cdf2f 100644 --- a/inst/sql/sql_server/AchillesHeel_v5.sql +++ 
b/inst/sql/sql_server/AchillesHeel_v5.sql @@ -75,7 +75,7 @@ create table @results_database_schema.ACHILLES_results_derived measure_id varchar(255) ); - + --general derived measures --non-CDM sources may generate derived measures directly --for CDM and Achilles: the fastest way to compute derived measures is to use @@ -1086,5 +1086,25 @@ and statistic_value > 1000; --threshold will be decided in DQ study 2 +--rule35 DQ rule, NOTIFICATION +--this rule analyzes Units recorded for measurement + +INSERT INTO @results_database_schema.ACHILLES_HEEL_results (ACHILLES_HEEL_warning,rule_id,record_count) + SELECT + 'NOTIFICATION: Count of measurement_ids with more than 5 distinct units exceeds threshold' as ACHILLES_HEEL_warning, + 35 as rule_id, + cast(meas_concept_id_cnt as int) as record_count + from ( + select meas_concept_id_cnt from (select sum(freq) as meas_concept_id_cnt from + (select u_cnt, count(*) as freq from + (select stratum_1, count(*) as u_cnt + from @results_database_schema.achilles_results where analysis_id = 1807 group by stratum_1) a + group by u_cnt + ) b + where u_cnt >= 5 --threshold one for the rule + ) c + where meas_concept_id_cnt >= 10 --threshold two for the rule + ) d +;