v0.2.20

yh202109 · Jul 25, 2024 · b7dc874 · b7dc874
1 parent 1c508a2
commit b7dc874
Show file tree

Hide file tree

Showing 5 changed files with 138 additions and 21 deletions.
diff --git a/docs/example_emt3.ipynb b/docs/example_emt3.ipynb
@@ -111,7 +111,7 @@
     "## Find Terms Given a FMQ\n",
     "\n",
     "There are currently 104 FMQs.\n",
-    "There are 8,839 PTs are utilized to define to either broad or narrow scope of FMQs.\n",
+    "There are 8,839 PTs utilized to define either broad or narrow scope of FMQs.\n",
     "Note that 4 of 104 FMQs are algorithm based. \n",
     "Algorithm based FMQs may utilize additional data sources, including labs, concomitant medication records, patient histories, etc.\n"
    ]

diff --git a/docs/statlab_kappa.rst b/docs/statlab_kappa.rst
@@ -24,7 +24,7 @@
 
 
 #############
-StatLab/Cohen's Kappa 
+StatLab/Reli/Cohen's Kappa 
 #############
 
 :red-b:`Disclaimer:`

diff --git a/docs/statlab_kappa2.rst b/docs/statlab_kappa2.rst
@@ -19,7 +19,7 @@
 .. role:: red
 
 #############
-StatLab/Fleiss's Kappa  
+StatLab/Reli/Fleiss's Kappa  
 #############
 
 :red-b:`Disclaimer:`
@@ -266,6 +266,7 @@ Please see the Fleiss (1971) for more discussions.
 The variance of :math:`\kappa` under the assumption of no agreement beyond chance can be approximated as:
 
 .. math::
+  :label: eq_kappa2_vk
 
   var(\kappa) = c(n,R,\{p_j\}) var\left(\sum_{j=1}^J N_{1j}^2 \right),
 
@@ -325,6 +326,10 @@ Combining :eq:`eq_kappa2_vn3`, :eq:`eq_kappa2_vn4`, and :eq:`eq_kappa2_vn5`,
   var\left(\sum_{j} N_{ij}^2 \right) 
   = 2R(R-1)\left(S_{p2} - (2R-3)S_{p2}^2 + 2(R-2)S_{p3}\right).
 
+Let :math:`s^2` be the estimated variance of :math:`\kappa` using :eq:`eq_kappa2_vk`.
+Under the hypothesis of no agreement beyond chance, the limiting distribution of :math:`\kappa/s` is the standard normal distribution.
+The value of :math:`\kappa/s` can then be used to assess whether the overall agreement is greater than expected by chance alone [2]_.
+
 *************
 Lab Exercise
 *************

diff --git a/docs/std_cdisc.ipynb b/docs/std_cdisc.ipynb
@@ -36,7 +36,7 @@
                 "\n",
                 "Please note that this page is not executed while building. \n",
                 "Functions and options might be updated as needed.\n",
-                "Please visit API reference page for the newest information on this module."
+                "Please visit API reference page for the newest information on this module.\n"
             ]
         },
         {
@@ -131,7 +131,7 @@
                 "### Get CT Packages\n",
                 "\n",
                 "CDISC Controlled Terminology (CT) is maintained and distributed as part of the National Cancer Institute (NCI) Thesaurus [^5].\n",
-                "\n",
+                "CT can be downloaded from NCI FTP and multiple file formats are available (Excel, text, odm.xml, pdf, html, and OWL/RDF formats) [^6].\n",
                 "\n",
                 "To get a list of newest CT packages available:\n",
                 "\n",
@@ -148,7 +148,7 @@
                 "Output:\n",
                 "\n",
                 "```\n",
-                "{'CDASH', 'MRCT', 'COA', 'ADaM', 'Protocol', 'SDTM', 'QRS', 'SEND', 'QS-FT', 'Define-XML', 'Glossary', 'DDF'}\n",
+                "['ADaM', 'CDASH', 'COA', 'DDF', 'Define-XML', 'Glossary', 'MRCT', 'Protocol', 'QRS', 'QS-FT', 'SDTM', 'SEND']\n",
                 "```"
             ]
         },
@@ -160,9 +160,38 @@
                 "\n",
                 "- DDF: digital data flow.\n",
                 "- MRCT: multi-regional clinical trials\n",
-                "- COA: clinical outcome assessment. This is an archived version. COA is part of QRS supplement supporting FDA COA qualification program [^6].\n",
+                "- COA: clinical outcome assessment. This is an archived version. COA is part of QRS supplement supporting FDA COA qualification program [^7].\n",
                 "- QRS: questionnaires, ratings and scales. This is an archived version. QRS SDTM CT is currently included in SDTM CT. \n",
-                "- QS-FT: questionnaire and functional test. This is an archived version. This set was merged into COA [^7]."
+                "- QS-FT: questionnaire and functional test. This is an archived version. This set was merged into COA [^8]."
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "To show package effective date:\n",
+                "\n",
+                "```python\n",
+                "print(cl.ct_list[['Title','PkgSeries','Effective']])\n",
+                "```\n",
+                "\n",
+                "Output:\n",
+                "\n",
+                "```\n",
+                "         Title PkgSeries   Effective\n",
+                "0         ADaM        57  2024-03-29\n",
+                "1        CDASH        52  2022-12-16\n",
+                "2          COA        21  2015-03-27\n",
+                "3          DDF        57  2024-03-29\n",
+                "4   Define-XML        57  2024-03-29\n",
+                "5     Glossary        56  2023-12-15\n",
+                "6         MRCT        57  2024-03-29\n",
+                "7     Protocol        57  2024-03-29\n",
+                "8          QRS        23  2015-09-25\n",
+                "9        QS-FT        19  2014-09-26\n",
+                "10        SDTM        57  2024-03-29\n",
+                "11        SEND        57  2024-03-29\n",
+                "```\n"
             ]
         },
         {
@@ -174,8 +203,8 @@
                 "```python\n",
                 "import json\n",
                 "\n",
-                "c = cl.get_ct_package('Protocol')\n",
-                "print(json.dumps(c['package_info'], indent=4))\n",
+                "cl.get_ct_package('Protocol')\n",
+                "print(json.dumps(cl.ct_package['Protocol']['package_info']))\n",
                 "```\n",
                 "\n",
                 "Output:\n",
@@ -193,6 +222,33 @@
                 "```"
             ]
         },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "To get the codelist information of the newest Protocol CT package as a data frame:\n",
+                "\n",
+                "```python\n",
+                "title = 'Protocol'\n",
+                "cl = accessLib(\"/Users/yh2020/cdisc.txt\")\n",
+                "cl.get_ct_list()\n",
+                "cl.get_ct_package(title)\n",
+                "cl.get_ct_codelists_df(title)\n",
+                "print(cl.ct_package[title]['ct_df'][['conceptId', 'name', 'preferredTerm']].head())\n",
+                "```\n",
+                "\n",
+                "Output:\n",
+                "\n",
+                "```\n",
+                "  conceptId                                      name                                      preferredTerm\n",
+                "0   C179587   Biological Sample Attribute Terminology  CDISC Protocol Biological Sample Attribute Ter...\n",
+                "1   C201266           Biomarker Attribute Terminology     CDISC Protocol Biomarker Attribute Terminology\n",
+                "2   C201267  Biomarker Category Value Set Terminology  CDISC Protocol Biomarker Category Value Set Te...\n",
+                "3   C142191      Clinical Study Attribute Terminology  CDISC Protocol Entities Clinical Study Attribu...\n",
+                "4   C139020      Clinical Trial Attribute Terminology  CDISC Protocol Entities Clinical Trial Attribu...\n",
+                "```"
+            ]
+        },
         {
             "cell_type": "markdown",
             "metadata": {},
@@ -203,9 +259,10 @@
                 "[^2]: FDA. (year). Data Standards Resource. ([web page](https://www.fda.gov/industry/fda-data-standards-advisory-board/study-data-standards-resources))\n",
                 "[^3]: FDA. (year). Data Standards Catalog. ([web page](https://www.fda.gov/regulatory-information/search-fda-guidance-documents/data-standards-catalog))\n",
                 "[^4]: FDA. (year). Study Data Technical Conformance Guide. ([web page](https://www.fda.gov/regulatory-information/search-fda-guidance-documents/study-data-technical-conformance-guide-technical-specifications-document))\n",
-                "[^5]: NCI. (yeawr). CDISC Terminology. ([web page](https://datascience.cancer.gov/resources/cancer-vocabulary/cdisc-terminology))\n",
-                "[^6]: FDA. (2023). Clinical Outcome Assessment (COA) Qualification Program. ([web page](https://www.fda.gov/drugs/drug-development-tool-ddt-qualification-programs/clinical-outcome-assessment-coa-qualification-program))\n",
-                "[^7]: P21. (2015). CDISC Questionnaire and Functional Test (QS-FT) Terminology has been deprecated. ([web page](https://www.pinnacle21.com/forum/cdisc-questionnaire-and-functional-test-qs-ft-terminology-has-been-deprecated))\n",
+                "[^5]: NCI. (year). NCI Thesaurus (NCIt). ([web page](https://ncit.nci.nih.gov/ncitbrowser/))\n",
+                "[^6]: NCI. (year). CDISC Terminology. ([web page](https://datascience.cancer.gov/resources/cancer-vocabulary/cdisc-terminology))\n",
+                "[^7]: FDA. (2023). Clinical Outcome Assessment (COA) Qualification Program. ([web page](https://www.fda.gov/drugs/drug-development-tool-ddt-qualification-programs/clinical-outcome-assessment-coa-qualification-program))\n",
+                "[^8]: P21. (2015). CDISC Questionnaire and Functional Test (QS-FT) Terminology has been deprecated. ([web page](https://www.pinnacle21.com/forum/cdisc-questionnaire-and-functional-test-qs-ft-terminology-has-been-deprecated))\n",
                 "\n",
                 "\n",
                 "\n",

diff --git a/mtbp3/stdcdisc/lib.py b/mtbp3/stdcdisc/lib.py
@@ -43,6 +43,8 @@ def __init__(self, input_file):
         self.baseURL = "https://library.cdisc.org/api"
         self.ct_list = pd.DataFrame()
         self.ct_list_titles = []
+        self.ct_package = {}
+
 
     def get_ct_list(self, newest=True):
         """
@@ -74,13 +76,16 @@ def get_ct_list(self, newest=True):
                 df = df._append({'Title': t1, 'PkgSeries': t2, 'Effective': t3, 'Path': package['href'], 'Type': package['type']}, ignore_index=True)
             df['TitleL'] = df['Title'].str.lower()
             df['Newest'] = df['Effective'] == df.groupby('TitleL')['Effective'].transform('max')
+            df = df.sort_values(['TitleL', 'PkgSeries']).reset_index(drop=True)
         else:
             raise ValueError("Invalid status code: " + str(req.status_code) + " - " + req.reason)
 
         if newest:
-            df = df[df['Newest']]
+            df = df[df['Newest']].reset_index(drop=True)
+
         self.ct_list = df
-        self.ct_list_titles = set(df['Title'])
+        self.ct_list_titles = sorted(set(df['Title']))
+
         return 
 
     def get_ct_package(self, title = "", pkg_series = "", out_folder = ""):
@@ -137,14 +142,64 @@ def get_ct_package(self, title = "", pkg_series = "", out_folder = ""):
                 out_path = os.path.join(out_folder, package_name)
                 with open(out_path, 'w') as file:
                     json.dump(out, file)
-
-            return out
+
+            self.ct_package[title] = out
+            return 
         else:
             raise ValueError("Invalid status code: " + str(req.status_code) + " - " + req.reason)
+
+    def get_ct_codelists_df(self, title="", max_level = 3):
+        """
+        Converts the codelist information from the package to a pandas DataFrame.
+
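+        Args:
+            title (str): Title of a CT package already stored in self.ct_package by get_ct_package.
+            max_level (int): Maximum number of nested 'terms' levels to traverse. Defaults to 3.
+        Returns:
+            None. Results are stored in self.ct_package[title]['ct_df'] and self.ct_package[title]['label_df'].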
+        """
+        if not title in self.ct_package.keys():
+            print(self.ct_package.keys())
+            raise ValueError("Invalid CT package.")
+        if 'package_info' not in self.ct_package[title].keys() or 'codelists' not in self.ct_package[title].keys():
+            raise ValueError("Invalid package. Package does not contain package_info or codelists.")
+        package = self.ct_package[title]
+        tmp = package['package_info']['name'].replace(r'[^a-zA-Z0-9]', '_')
+        remaining_list = package['codelists']
+        remaining_list_label = [tmp] * len(remaining_list)
+        data = []
+        not_processed = []
+        for level in range(max_level):
+            if len(remaining_list) == 0:
+                break
+            codelists = remaining_list
+            codelists_label = remaining_list_label
+            remaining_list = []
+            remaining_list_label = []
+            for index, item in enumerate(codelists):
+                if isinstance(item, dict):
+                    if 'conceptId' in item.keys() and 'name' not in item.keys():
+                        item['name']=""
+                    if 'synonyms' not in item.keys():
+                        item['synonyms']=[]
+                    if 'terms' in item.keys():
+                        nterms = len(item['terms'])
+                        remaining_list.extend(item['terms'])
+                        remaining_list_label.extend([codelists_label[index]+'.pseudo'+item['conceptId']]*nterms)
+                    else:
+                        nterms = 0
+                    data.append([codelists_label[index], level, item['conceptId'], item['name'], item['preferredTerm'], item['submissionValue'], '; '.join(item['synonyms']), item['definition'], nterms])
+                else:
+                    not_processed.append(item)
+
+        df = pd.DataFrame(data, columns=['label', 'level', 'conceptId', 'name', 'preferredTerm', 'submissionValue', 'synonyms', 'definition', 'terms'])
+        label = df[['label', 'preferredTerm']]
+        df = df.drop('label', axis=1)
+
+        self.ct_package[title]['label_df'] = label
+        self.ct_package[title]['ct_df'] = df
+
+        return 
 
 if __name__ == "__main__":
     pass
-
-    #cl = accessLib("/Users/yh2020/cdisc.txt")
-    #cl.get_ct_list()
-    #c = cl.get_ct_package('Protocol')