Skip to content

Commit 2d2224d

Browse files
authored
Initial commit
0 parents  commit 2d2224d

File tree

201 files changed

+149618
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

201 files changed

+149618
-0
lines changed

.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
__pycache__/
2+
wandb/
3+
.lock
4+
.ipynb_checkpoints/
5+
.Trash-0/

Dockerfile

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
FROM python:3.8
2+
3+
WORKDIR /workspace
4+
5+
RUN apt-get update && apt-get install -y apt-file vim && apt-file update
6+
RUN git config --global user.email "[email protected]"
7+
RUN git config --global user.name "Kunal Suri"
8+
9+
COPY requirements.txt .
10+
11+
RUN pip install -r requirements.txt

LICENSE

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
This is free and unencumbered software released into the public domain.
2+
3+
Anyone is free to copy, modify, publish, use, compile, sell, or
4+
distribute this software, either in source code form or as a compiled
5+
binary, for any purpose, commercial or non-commercial, and by any
6+
means.
7+
8+
In jurisdictions that recognize copyright laws, the author or authors
9+
of this software dedicate any and all copyright interest in the
10+
software to the public domain. We make this dedication for the benefit
11+
of the public at large and to the detriment of our heirs and
12+
successors. We intend this dedication to be an overt act of
13+
relinquishment in perpetuity of all present and future rights to this
14+
software under copyright law.
15+
16+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19+
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22+
OTHER DEALINGS IN THE SOFTWARE.
23+
24+
For more information, please refer to <https://unlicense.org>

README.md

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# mediqa-chat
2+
To run Task A, please refer to the instructions in TaskA/ <br/>
3+
To run Task B, please refer to the instructions in TaskB/

Task A - EDA.ipynb

+231
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "b2dab6b3",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import pandas as pd\n",
11+
"import plotly\n",
12+
"import plotly.express as px\n",
13+
"import numpy as np\n",
14+
"from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification\n",
15+
"from pathlib import Path\n",
16+
"from datasets import Dataset,DatasetDict,load_dataset,load_metric\n",
17+
"import evaluate\n",
18+
"import re\n",
19+
"from sklearn.model_selection import KFold, StratifiedKFold\n",
20+
"import torch\n",
21+
"from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": null,
27+
"id": "ca4fcfa5",
28+
"metadata": {},
29+
"outputs": [],
30+
"source": [
31+
"train_path = Path.cwd().joinpath(\"mediqa-chat-data\",\"TaskA\",\"TaskA-TrainingSet.csv\")\n",
32+
"validation_path = Path.cwd().joinpath(\"mediqa-chat-data\",\"TaskA\",\"TaskA-ValidationSet.csv\")\n",
33+
"\n",
34+
"train_df = pd.read_csv(train_path,index_col=\"ID\")\n",
35+
"valid_df = pd.read_csv(validation_path,index_col=\"ID\")\n",
36+
"merge_df = pd.concat([train_df,valid_df],axis=0,ignore_index=True)\n",
37+
"merge_df[\"dialogue_wo_whitespaces\"] = merge_df[\"dialogue\"].apply(lambda text: re.sub(r'[\\r\\n\\s]+',' ',text)) # collapse each whitespace run into one space\n",
38+
"merge_df = merge_df.reset_index()\n",
39+
"merge_df = merge_df.rename(columns={'index':'ID'}) # expose the regenerated row index as the ID column"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": null,
45+
"id": "5c4da23f",
46+
"metadata": {},
47+
"outputs": [],
48+
"source": [
49+
"merge_df.head()"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": null,
55+
"id": "4bbf5d4c",
56+
"metadata": {},
57+
"outputs": [],
58+
"source": [
59+
"section_header_dist = \\\n",
60+
"merge_df[\"section_header\"].value_counts(normalize=True).reset_index()\n",
61+
"section_header_dist.columns = [\"section_header\",\"proportion\"]\n",
62+
"section_header_cnt = \\\n",
63+
"merge_df[\"section_header\"].value_counts().reset_index()\n",
64+
"section_header_cnt.columns = [\"section_header\",\"Count\"]"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": null,
70+
"id": "478c9763",
71+
"metadata": {},
72+
"outputs": [],
73+
"source": [
74+
"px.bar(\n",
75+
"    data_frame=section_header_cnt,\n",
76+
"    x='section_header', y='Count',\n",
77+
"    title=\"Section_Header Count\")"
78+
]
79+
},
80+
{
81+
"cell_type": "code",
82+
"execution_count": null,
83+
"id": "82bcda4b",
84+
"metadata": {},
85+
"outputs": [],
86+
"source": [
87+
"px.bar(\n",
88+
"    data_frame=section_header_dist,\n",
89+
"    x='section_header', y='proportion',\n",
90+
"    title=\"Section_Header Proportion\")"
91+
]
92+
},
93+
{
94+
"cell_type": "code",
95+
"execution_count": null,
96+
"id": "82156147",
97+
"metadata": {},
98+
"outputs": [],
99+
"source": [
100+
"model_checkpoint = \"emilyalsentzer/Bio_ClinicalBERT\"\n",
101+
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,do_lower_case=True) # no force_download: reuse the cached files instead of re-fetching them on every run"
102+
]
103+
},
104+
{
105+
"cell_type": "code",
106+
"execution_count": null,
107+
"id": "b5262ff4",
108+
"metadata": {},
109+
"outputs": [],
110+
"source": [
111+
"merge_df.head()"
112+
]
113+
},
114+
{
115+
"cell_type": "code",
116+
"execution_count": null,
117+
"id": "75c83f26",
118+
"metadata": {},
119+
"outputs": [],
120+
"source": [
121+
"# Token count (special tokens included) of every whitespace-normalised dialogue\n",
122+
"token_len_list = [\n",
123+
"    len(tokenizer.encode(dialogue,add_special_tokens=True))\n",
124+
"    for dialogue in merge_df[\"dialogue_wo_whitespaces\"]]"
125+
]
126+
},
127+
{
128+
"cell_type": "code",
129+
"execution_count": null,
130+
"id": "6c12940b",
131+
"metadata": {},
132+
"outputs": [],
133+
"source": [
134+
"px.histogram(token_len_list,title=\"Token Length distribution for Dialogue\").update_layout(\n",
135+
"    xaxis_title=\"Number of Tokens in a Dialogue\",yaxis_title=\"Number of IDs\",showlegend=False)"
136+
]
137+
},
138+
{
139+
"cell_type": "code",
140+
"execution_count": null,
141+
"id": "db328d7c",
142+
"metadata": {},
143+
"outputs": [],
144+
"source": [
145+
"# Getting min, median, max lengths of the text\n",
146+
"min(token_len_list), np.median(token_len_list), max(token_len_list)"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": null,
152+
"id": "a6d81c18",
153+
"metadata": {},
154+
"outputs": [],
155+
"source": [
156+
"np.percentile(token_len_list,q=[0.,25,50,75,80,85,90,95,99,100])"
157+
]
158+
},
159+
{
160+
"cell_type": "markdown",
161+
"id": "d0a22b02",
162+
"metadata": {},
163+
"source": [
164+
"Dialogues of up to 300 tokens account for about 90% of the data, so 300 is used as the maximum length below"
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": null,
170+
"id": "8558b067",
171+
"metadata": {},
172+
"outputs": [],
173+
"source": [
174+
"max_len = 300"
175+
]
176+
},
177+
{
178+
"cell_type": "code",
179+
"execution_count": null,
180+
"id": "090bcd49",
181+
"metadata": {},
182+
"outputs": [],
183+
"source": [
184+
"# Token count (special tokens included) of every section text\n",
185+
"token_len_list = [\n",
186+
"    len(tokenizer.encode(section,add_special_tokens=True))\n",
187+
"    for section in merge_df[\"section_text\"]]"
188+
]
189+
},
190+
{
191+
"cell_type": "code",
192+
"execution_count": null,
193+
"id": "4697f698",
194+
"metadata": {},
195+
"outputs": [],
196+
"source": [
197+
"px.histogram(token_len_list,title=\"Token Length distribution for Section Text\").update_layout(\n",
198+
"    xaxis_title=\"Number of Tokens in a Section Text\",yaxis_title=\"Number of IDs\",showlegend=False)"
199+
]
200+
},
201+
{
202+
"cell_type": "code",
203+
"execution_count": null,
204+
"id": "2f21e9ff",
205+
"metadata": {},
206+
"outputs": [],
207+
"source": []
208+
}
209+
],
210+
"metadata": {
211+
"kernelspec": {
212+
"display_name": "Python 3 (ipykernel)",
213+
"language": "python",
214+
"name": "python3"
215+
},
216+
"language_info": {
217+
"codemirror_mode": {
218+
"name": "ipython",
219+
"version": 3
220+
},
221+
"file_extension": ".py",
222+
"mimetype": "text/x-python",
223+
"name": "python",
224+
"nbconvert_exporter": "python",
225+
"pygments_lexer": "ipython3",
226+
"version": "3.8.16"
227+
}
228+
},
229+
"nbformat": 4,
230+
"nbformat_minor": 5
231+
}

0 commit comments

Comments
 (0)