suri-kunal
diff --git a/‎.gitignore
+5 b/‎.gitignore
+5
diff --git a/‎Dockerfile
+11 b/‎Dockerfile
+11
diff --git a/‎LICENSE
+24 b/‎LICENSE
+24
diff --git a/‎README.md
+3 b/‎README.md
+3
diff --git a/‎Task A - EDA.ipynb
+231 b/‎Task A - EDA.ipynb
+231
@@ -0,0 +1,5 @@
+__pycache__/
+wandb/
+.lock
+.ipynb_checkpoints/
+.Trash-0/
@@ -0,0 +1,11 @@
+FROM python:3.8
+
+WORKDIR /workspace
+
+RUN apt-get update && apt-get install apt-file -y && apt-file update && apt-get install vim -y
+RUN git config --global user.email "[email protected]"
+RUN git config --global user.name "Kunal Suri"
+
+COPY requirements.txt .
+
+RUN pip install -r requirements.txt
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <https://unlicense.org>
@@ -0,0 +1,3 @@
+# mediqa-chat
+For executing Task A - Please refer to instructions in TaskA/ <br/>
+For executing Task B - Please refer to instructions in TaskB/
@@ -0,0 +1,231 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2dab6b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import plotly\n",
+    "import plotly.express as px\n",
+    "import numpy as np\n",
+    "from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification\n",
+    "from pathlib import Path\n",
+    "from datasets import Dataset,DatasetDict,load_dataset,load_metric\n",
+    "import evaluate\n",
+    "import re\n",
+    "from sklearn.model_selection import KFold, StratifiedKFold\n",
+    "import torch\n",
+    "from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca4fcfa5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_path = Path.cwd().joinpath(\"mediqa-chat-data\",\"TaskA\",\"TaskA-TrainingSet.csv\")\n",
+    "validation_path = Path.cwd().joinpath(\"mediqa-chat-data\",\"TaskA\",\"TaskA-ValidationSet.csv\")\n",
+    "\n",
+    "train_df = pd.read_csv(train_path,index_col=\"ID\")\n",
+    "valid_df = pd.read_csv(validation_path,index_col=\"ID\")\n",
+    "merge_df = pd.concat([train_df,valid_df],axis=0,ignore_index=True)\n",
+    "merge_df[\"dialogue_wo_whitespaces\"] = merge_df[\"dialogue\"].apply(lambda x: re.sub(r'[\\r\\n\\s]+',' ',x))\n",
+    "merge_df.reset_index(inplace=True)\n",
+    "merge_df.rename(mapper={'index':'ID'},axis=1,inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c4da23f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merge_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bbf5d4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "section_header_dist = \\\n",
+    "merge_df[\"section_header\"].value_counts(normalize=True).reset_index()\n",
+    "section_header_dist.columns = [\"section_header\",\"proportion\"]\n",
+    "section_header_cnt = \\\n",
+    "merge_df[\"section_header\"].value_counts().reset_index()\n",
+    "section_header_cnt.columns = [\"section_header\",\"Count\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "478c9763",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "px.bar(data_frame=section_header_cnt, \\\n",
+    "       x='section_header', \\\n",
+    "       y='Count', \\\n",
+    "       title=\"Section_Header Count\",)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82bcda4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "px.bar(data_frame=section_header_dist, \\\n",
+    "       x='section_header', \\\n",
+    "       y='proportion', \\\n",
+    "       title=\"Section_Header Proportion\",)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82156147",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_checkpoint = \"emilyalsentzer/Bio_ClinicalBERT\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,do_lower_case=True,force_download=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5262ff4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merge_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "75c83f26",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "token_len_list = []\n",
+    "for sentence in merge_df[\"dialogue_wo_whitespaces\"]:\n",
+    "    token_list = tokenizer.encode(sentence,add_special_tokens=True)\n",
+    "    token_len_list.append(len(token_list))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c12940b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "px.histogram(token_len_list,title=\"Token Length distribution for Dialogue\").update_layout(xaxis_title=\"Number of Tokens in a Dialogue\", \\\n",
+    "                                                                                       yaxis_title=\"Number of IDs\",showlegend=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db328d7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Getting min, median, max lengths of the text\n",
+    "min(token_len_list), np.median(token_len_list), max(token_len_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6d81c18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.percentile(token_len_list,q=[0.,25,50,75,80,85,90,95,99,100])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0a22b02",
+   "metadata": {},
+   "source": [
+    "Sentences with length <= 300 account for about 90% of the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8558b067",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_len = 300"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "090bcd49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "token_len_list = []\n",
+    "for sentence in merge_df[\"section_text\"]:\n",
+    "    token_list = tokenizer.encode(sentence,add_special_tokens=True)\n",
+    "    token_len_list.append(len(token_list))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4697f698",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "px.histogram(token_len_list,title=\"Token Length distribution for Section Text\").update_layout(xaxis_title=\"Number of Tokens in a Section Text\", \\\n",
+    "                                                                                              yaxis_title=\"Number of IDs\",showlegend=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f21e9ff",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# mediqa-chat`
	`2`	`+For executing Task A - Please refer to instructions in TaskA/ <br/>`
	`3`	`+For executing Task B - Please refer to instructions in TaskB/`