From b7dc87461e3da03fefe53d6880de0240cb9e05cd Mon Sep 17 00:00:00 2001 From: yh202109 Date: Wed, 24 Jul 2024 21:26:31 -0400 Subject: [PATCH] v0.2.20 --- docs/example_emt3.ipynb | 2 +- docs/statlab_kappa.rst | 2 +- docs/statlab_kappa2.rst | 7 +++- docs/std_cdisc.ipynb | 77 +++++++++++++++++++++++++++++++++++------ mtbp3/stdcdisc/lib.py | 71 ++++++++++++++++++++++++++++++++----- 5 files changed, 138 insertions(+), 21 deletions(-) diff --git a/docs/example_emt3.ipynb b/docs/example_emt3.ipynb index 58222683..cd910885 100644 --- a/docs/example_emt3.ipynb +++ b/docs/example_emt3.ipynb @@ -111,7 +111,7 @@ "## Find Terms Given a FMQ\n", "\n", "There are currently 104 FMQs.\n", - "There are 8,839 PTs are utilized to define to either broad or narrow scope of FMQs.\n", + "There are 8,839 PTs utilized to define either broad or narrow scope of FMQs.\n", "Note that 4 of 104 FMQs are algorithm based. \n", "Algorithm based FMQs may utilize additional data sources, including labs, concomitant medication records, patient histories, etc.\n" ] diff --git a/docs/statlab_kappa.rst b/docs/statlab_kappa.rst index 11cf932e..8da5acc1 100644 --- a/docs/statlab_kappa.rst +++ b/docs/statlab_kappa.rst @@ -24,7 +24,7 @@ ############# -StatLab/Cohen's Kappa +StatLab/Reli/Cohen's Kappa ############# :red-b:`Disclaimer:` diff --git a/docs/statlab_kappa2.rst b/docs/statlab_kappa2.rst index 602273c4..92f5292f 100644 --- a/docs/statlab_kappa2.rst +++ b/docs/statlab_kappa2.rst @@ -19,7 +19,7 @@ .. role:: red ############# -StatLab/Fleiss's Kappa +StatLab/Reli/Fleiss's Kappa ############# :red-b:`Disclaimer:` @@ -266,6 +266,7 @@ Please see the Fleiss (1971) for more discussions. The variance of :math:`\kappa` under the assumption of no agreement beyond chance can be approximated as: .. 
math:: + :label: eq_kappa2_vk var(\kappa) = c(n,R,\{p_j\}) var\left(\sum_{j=1}^J N_{1j}^2 \right), @@ -325,6 +326,10 @@ Combining :eq:`eq_kappa2_vn3`, :eq:`eq_kappa2_vn4`, and :eq:`eq_kappa2_vn5`, var\left(\sum_{j} N_{ij}^2 \right) = 2R(R-1)\left(S_{p2} - (2R-3)S_{p2}^2 + 2(R-2)S_{p3}\right). +Let :math:`s^2` be the estimated variance of :math:`\kappa` using :eq:`eq_kappa2_vk`. +Under the hypothesis of no agreement beyond chance, the limiting distribution of :math:`\kappa/s` would be a standard normal distribution. +The value of :math:`\kappa/s` could then be used to assess whether the overall agreement is greater than expected by chance alone [2]_. + ************* Lab Exercise ************* diff --git a/docs/std_cdisc.ipynb b/docs/std_cdisc.ipynb index d5cf6312..f39c871f 100644 --- a/docs/std_cdisc.ipynb +++ b/docs/std_cdisc.ipynb @@ -36,7 +36,7 @@ "\n", "Please note that this page is not executed while building. \n", "Functions and options might be updated as needed.\n", - "Please visit API reference page for the newest information on this module." + "Please visit API reference page for the newest information on this module.\n" ] }, { @@ -131,7 +131,7 @@ "### Get CT Packages\n", "\n", "CDISC Controlled Terminology (CT) is maintained and distributed as part of the National Cancer Institute (NCI) Thesaurus [^5].\n", - "\n", + "CT can be downloaded from NCI FTP and multiple file formats are available (Excel, text, odm.xml, pdf, html, and OWL/RDF formats) [^6].\n", "\n", "To get a list of newest CT packages available:\n", "\n", @@ -148,7 +148,7 @@ "Output:\n", "\n", "```\n", - "{'CDASH', 'MRCT', 'COA', 'ADaM', 'Protocol', 'SDTM', 'QRS', 'SEND', 'QS-FT', 'Define-XML', 'Glossary', 'DDF'}\n", + "['ADaM', 'CDASH', 'COA', 'DDF', 'Define-XML', 'Glossary', 'MRCT', 'Protocol', 'QRS', 'QS-FT', 'SDTM', 'SEND']\n", "```" ] }, @@ -160,9 +160,38 @@ "\n", "- DDF: digital data flow.\n", "- MRCT: multi-regional clinical trials\n", - "- COA: clinical outcome assessment. This is an archived version. 
COA is part of QRS supplement supporting FDA COA qualification program [^6].\n", + "- COA: clinical outcome assessment. This is an archived version. COA is part of QRS supplement supporting FDA COA qualification program [^7].\n", "- QRS: questionnaires, ratings and scales. This is an archived version. QRS SDTM CT is currently included in SDTM CT. \n", - "- QS-FT: questionnaire and functional test. This is an archived version. This set was merged into COA [^7]." + "- QS-FT: questionnaire and functional test. This is an archived version. This set was merged into COA [^8]." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To show package effective date:\n", + "\n", + "```python\n", + "print(cl.ct_list[['Title','PkgSeries','Effective']])\n", + "```\n", + "\n", + "Output:\n", + "\n", + "```\n", + " Title PkgSeries Effective\n", + "0 ADaM 57 2024-03-29\n", + "1 CDASH 52 2022-12-16\n", + "2 COA 21 2015-03-27\n", + "3 DDF 57 2024-03-29\n", + "4 Define-XML 57 2024-03-29\n", + "5 Glossary 56 2023-12-15\n", + "6 MRCT 57 2024-03-29\n", + "7 Protocol 57 2024-03-29\n", + "8 QRS 23 2015-09-25\n", + "9 QS-FT 19 2014-09-26\n", + "10 SDTM 57 2024-03-29\n", + "11 SEND 57 2024-03-29\n", + "```\n" ] }, { @@ -174,8 +203,8 @@ "```python\n", "import json\n", "\n", - "c = cl.get_ct_package('Protocol')\n", - "print(json.dumps(c['package_info'], indent=4))\n", + "cl.get_ct_package('Protocol')\n", + "print(json.dumps(cl.ct_package['Protocol']['package_info']))\n", "```\n", "\n", "Output:\n", @@ -193,6 +222,33 @@ "```" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To get the newest protocol ct package information:\n", + "\n", + "```python\n", + "title = 'Protocol'\n", + "cl = accessLib(\"/Users/yh2020/cdisc.txt\")\n", + "cl.get_ct_list()\n", + "cl.get_ct_package(title)\n", + "cl.get_ct_codelists_df(title)\n", + "print(cl.ct_package[title]['ct_df'][['conceptId', 'name', 'preferredTerm']].head())\n", + "```\n", + "\n", + "Output:\n", + "\n", + 
"```\n", + " conceptId name preferredTerm\n", + "0 C179587 Biological Sample Attribute Terminology CDISC Protocol Biological Sample Attribute Ter...\n", + "1 C201266 Biomarker Attribute Terminology CDISC Protocol Biomarker Attribute Terminology\n", + "2 C201267 Biomarker Category Value Set Terminology CDISC Protocol Biomarker Category Value Set Te...\n", + "3 C142191 Clinical Study Attribute Terminology CDISC Protocol Entities Clinical Study Attribu...\n", + "4 C139020 Clinical Trial Attribute Terminology CDISC Protocol Entities Clinical Trial Attribu...\n", + "```" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -203,9 +259,10 @@ "[^2]: FDA. (year). Data Standards Resource. ([web page](https://www.fda.gov/industry/fda-data-standards-advisory-board/study-data-standards-resources))\n", "[^3]: FDA. (year). Data Standards Catalog. ([web page](https://www.fda.gov/regulatory-information/search-fda-guidance-documents/data-standards-catalog))\n", "[^4]: FDA. (year). Study Data Technical Conformance Guide. ([web page](https://www.fda.gov/regulatory-information/search-fda-guidance-documents/study-data-technical-conformance-guide-technical-specifications-document))\n", - "[^5]: NCI. (yeawr). CDISC Terminology. ([web page](https://datascience.cancer.gov/resources/cancer-vocabulary/cdisc-terminology))\n", - "[^6]: FDA. (2023). Clinical Outcome Assessment (COA) Qualification Program. ([web page](https://www.fda.gov/drugs/drug-development-tool-ddt-qualification-programs/clinical-outcome-assessment-coa-qualification-program))\n", - "[^7]: P21. (2015). CDISC Questionnaire and Functional Test (QS-FT) Terminology has been deprecated. ([web page](https://www.pinnacle21.com/forum/cdisc-questionnaire-and-functional-test-qs-ft-terminology-has-been-deprecated))\n", + "[^5]: NCI. (year). NCI Thesaurus (NCIt). ([web page](https://ncit.nci.nih.gov/ncitbrowser/))\n", + "[^6]: NCI. (year). CDISC Terminology. 
([web page](https://datascience.cancer.gov/resources/cancer-vocabulary/cdisc-terminology))\n", + "[^7]: FDA. (2023). Clinical Outcome Assessment (COA) Qualification Program. ([web page](https://www.fda.gov/drugs/drug-development-tool-ddt-qualification-programs/clinical-outcome-assessment-coa-qualification-program))\n", + "[^8]: P21. (2015). CDISC Questionnaire and Functional Test (QS-FT) Terminology has been deprecated. ([web page](https://www.pinnacle21.com/forum/cdisc-questionnaire-and-functional-test-qs-ft-terminology-has-been-deprecated))\n", "\n", "\n", "\n", diff --git a/mtbp3/stdcdisc/lib.py b/mtbp3/stdcdisc/lib.py index 57b3ddb9..5e9a158b 100644 --- a/mtbp3/stdcdisc/lib.py +++ b/mtbp3/stdcdisc/lib.py @@ -43,6 +43,8 @@ def __init__(self, input_file): self.baseURL = "https://library.cdisc.org/api" self.ct_list = pd.DataFrame() self.ct_list_titles = [] + self.ct_package = {} + def get_ct_list(self, newest=True): """ @@ -74,13 +76,16 @@ def get_ct_list(self, newest=True): df = df._append({'Title': t1, 'PkgSeries': t2, 'Effective': t3, 'Path': package['href'], 'Type': package['type']}, ignore_index=True) df['TitleL'] = df['Title'].str.lower() df['Newest'] = df['Effective'] == df.groupby('TitleL')['Effective'].transform('max') + df = df.sort_values(['TitleL', 'PkgSeries']).reset_index(drop=True) else: raise ValueError("Invalid status code: " + str(req.status_code) + " - " + req.reason) if newest: - df = df[df['Newest']] + df = df[df['Newest']].reset_index(drop=True) + self.ct_list = df - self.ct_list_titles = set(df['Title']) + self.ct_list_titles = sorted(set(df['Title'])) + return def get_ct_package(self, title = "", pkg_series = "", out_folder = ""): @@ -137,14 +142,64 @@ def get_ct_package(self, title = "", pkg_series = "", out_folder = ""): out_path = os.path.join(out_folder, package_name) with open(out_path, 'w') as file: json.dump(out, file) - - return out + + self.ct_package[title] = out + return else: raise ValueError("Invalid status code: " + 
str(req.status_code) + " - " + req.reason) + + def get_ct_codelists_df(self, title="", max_level = 3): + """ + Converts the codelists of a stored CT package to pandas DataFrames. + + Args: + title (str): Title of a CT package previously loaded by get_ct_package. max_level (int): Maximum number of nested term levels to process. + + Returns: + None: Results are stored in self.ct_package[title]['ct_df'] and self.ct_package[title]['label_df']. + """ + if not title in self.ct_package.keys(): + print(self.ct_package.keys()) + raise ValueError("Invalid CT package.") + if 'package_info' not in self.ct_package[title].keys() or 'codelists' not in self.ct_package[title].keys(): + raise ValueError("Invalid package. Package does not contain package_info or codelists.") + package = self.ct_package[title] + tmp = package['package_info']['name'].replace(r'[^a-zA-Z0-9]', '_') + remaining_list = package['codelists'] + remaining_list_label = [tmp] * len(remaining_list) + data = [] + not_processed = [] + for level in range(max_level): + if len(remaining_list) == 0: + break + codelists = remaining_list + codelists_label = remaining_list_label + remaining_list = [] + remaining_list_label = [] + for index, item in enumerate(codelists): + if isinstance(item, dict): + if 'conceptId' in item.keys() and 'name' not in item.keys(): + item['name']="" + if 'synonyms' not in item.keys(): + item['synonyms']=[] + if 'terms' in item.keys(): + nterms = len(item['terms']) + remaining_list.extend(item['terms']) + remaining_list_label.extend([codelists_label[index]+'.pseudo'+item['conceptId']]*nterms) + else: + nterms = 0 + data.append([codelists_label[index], level, item['conceptId'], item['name'], item['preferredTerm'], item['submissionValue'], '; '.join(item['synonyms']), item['definition'], nterms]) + else: + not_processed.append(item) + + df = pd.DataFrame(data, columns=['label', 'level', 'conceptId', 'name', 'preferredTerm', 'submissionValue', 'synonyms', 'definition', 'terms']) + label = df[['label', 'preferredTerm']] + df = df.drop('label', axis=1) + + self.ct_package[title]['label_df'] = 
label + self.ct_package[title]['ct_df'] = df + + return if __name__ == "__main__": pass - - #cl = accessLib("/Users/yh2020/cdisc.txt") - #cl.get_ct_list() - #c = cl.get_ct_package('Protocol') \ No newline at end of file