-
Notifications
You must be signed in to change notification settings - Fork 0
/
termset_generator.py
228 lines (181 loc) · 7.36 KB
/
termset_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
Run the Termset Generator user interface from the command line.

This file contains the front-end code to run the Termset Generator, a tool
which generates a "termset", or list of synonyms and spelling variations,
for a given "concept of interest", or medical term.

Usage:
    streamlit run termset_generator.py
"""
import json
import os
import pandas as pd
import streamlit as st
import streamlit_functions as sf
def load_concept_file(concept_file):
    """
    Load an uploaded concept file, dispatching on its file extension.

    The extension check is case-insensitive. Files ending in ".csv" are
    handed to ``sf.load_concept_csv`` and files ending in ".json" to
    ``sf.load_concept_json``; the loader's result is returned unchanged.

    Args:
        concept_file: Uploaded file object exposing a ``name`` attribute.

    Returns:
        The result of the matching loader, or ``None`` (after showing an
        info message) when the extension is neither ".csv" nor ".json".
    """
    lowered_name = concept_file.name.lower()

    if lowered_name.endswith(".csv"):
        return sf.load_concept_csv(concept_file)

    if lowered_name.endswith(".json"):
        return sf.load_concept_json(concept_file)

    # Should never get here
    st.info(f'Unsupported filename "{concept_file.name}".')
    return None
def save_termset(concept, terms, filename):
    """
    Write a termset to disk as JSON and report the outcome in the UI.

    The file contains a single-key object mapping ``concept`` to ``terms``,
    indented by four spaces.

    Args:
        concept: Concept name; becomes the JSON key.
        terms: Terms for the concept; becomes the JSON value.
        filename: Path of the file to write (opened in "w" mode).

    Only ``FileNotFoundError`` is handled here: it is shown as a text
    message instead of being raised. Any other error propagates.
    """
    payload = {concept: terms}
    try:
        with open(filename, "w") as out_file:
            json.dump(payload, out_file, indent=4)
        st.success(f"{filename} saved successfully!")
    except FileNotFoundError:
        st.text("File path does not exist. Correct or remove file path to save.")
def show_save_controls(concept, terms, selected, button_name, suffix, default_dir):
    """
    Display options to save a termset as a JSON.

    Renders an optional path text box and a "Save JSON" button. When the
    button is pressed, the target folder is created if it does not exist
    and the termset is written to ``<folder>/<concept><suffix>`` via
    ``save_termset``.

    Args:
        concept: Concept name; used in the widget keys, the file name and
            as the JSON key.
        terms: Terms to save; their string form is also part of the
            widget keys.
        selected: Not used by this function; kept so existing callers
            keep working.
        button_name: Suffix that makes the button's widget key unique.
        suffix: Text appended to ``concept`` to form the file name.
        default_dir: Folder used when the path text box is left empty.
    """
    key_prefix = concept + str(terms)
    opt_pathname = st.text_input(
        "Specify file path to save JSON into. (Optional)",
        key=key_prefix + "specify_path",
    )

    if st.button("Save JSON", key=key_prefix + button_name):
        # A user-supplied path replaces the default and is joined onto ".."
        if opt_pathname:
            default_dir = os.path.join("..", opt_pathname)

        # If directory does not exist, create directory
        if not os.path.isdir(default_dir):
            # makedirs rather than mkdir: a nested path such as "a/b" whose
            # parent folder is missing would make mkdir raise
            # FileNotFoundError before anything is saved.
            os.makedirs(default_dir, exist_ok=True)
            st.success(f"Created folder: {default_dir}")

        save_name = os.path.join(default_dir, concept + suffix)
        save_termset(concept, terms, save_name)
def show_top_sidebar():
    """
    Render the top of the sidebar and let the user pick a mode.

    Shows the logo image, the "Termset Generator" header, a one-line
    description and a radio control.

    Returns:
        The value of the radio control, whose options are "Generate" and
        "Review".
    """
    sidebar = st.sidebar
    sidebar.image("../static/logo.png", use_column_width=True)
    sidebar.header("Termset Generator")
    sidebar.markdown(
        "\n"
        "Generate candidate text spans from a corpus of annotated medical notes.\n"
    )

    mode_options = ("Generate", "Review")
    return sidebar.radio("Select Generate or Review mode", mode_options)
def show_bottom_sidebar():
    """
    Render the acknowledgements section of the sidebar.

    Emits, in order: the acknowledgements markdown, the IBM/FDA logo
    image, and the markdown crediting spaCy and ScispaCy.
    """
    sidebar = st.sidebar

    acknowledgements = (
        "\n"
        "### Acknowledgements\n"
        "Built by IBM as a part of FDA's Biologics Effectiveness and Safety Initiative (BEST)\n"
    )
    sidebar.markdown(acknowledgements)

    sidebar.image("../static/ibm_fda_logos.png", use_column_width=True)

    credits = (
        "\n"
        "NLP pipeline built with [spaCy](https://spacy.io/) and [ScispaCy](https://allenai.github.io/scispacy/).\n"
        "UI adapted from UI at [ScispaCy](https://scispacy.apps.allenai.org).\n"
    )
    sidebar.markdown(credits)
def show_terms(concept, terms):
    """
    Show a concept's terms in a multiselect so the user can drop some.

    Args:
        concept: Concept name; shown as the subheader and used to build
            the multiselect's widget key.
        terms: List of terms offered as options, all pre-selected. The
            list is sorted in place.

    Returns:
        The terms left selected in the multiselect, sorted.
    """
    st.subheader(concept)

    # In-place sort on purpose: the caller's list is reordered too.
    terms.sort()

    # Allow user to add/remove spelling variations
    chosen = st.multiselect(
        "Terms found in corpus",
        terms,
        default=terms,
        key=concept + "_ms",
    )
    chosen.sort()
    return chosen
def show_results_controls(concept, terms, selected):
    """
    Display options to review and edit each termset.

    Renders a "Show JSON" button (which displays ``selected``), a text box
    for manually adding comma-separated terms, and a table of the final
    termset.

    Args:
        concept: Concept name; used in the widget keys and as the JSON key.
        terms: List of terms for the concept. Manually entered terms are
            appended to this list in place and the list is re-sorted.
        selected: Terms currently selected by the user; only used by the
            "Show JSON" button.

    Returns:
        The same ``terms`` list, including any manually added terms.
    """
    # Build the widget-key prefix before ``terms`` is modified below so
    # both widgets share the same, pre-edit key.
    key_prefix = concept + str(terms)

    # Show JSON of selected terms
    if st.button("Show JSON", key=key_prefix + "_button"):
        st.json({concept: selected})

    # Manually add new terms
    new_term = st.text_input(
        label="If you would like to manually add custom terms, enter a comma separated list below. (Optional)",
        key=key_prefix + "_add_term",
    )
    if new_term:
        known_terms = set(terms)
        for piece in new_term.split(","):
            candidate = piece.strip()
            # Skip blanks (from "a,,b", a trailing comma or whitespace-only
            # input) and terms already present, so the termset never
            # contains empty or duplicate entries.
            if candidate and candidate not in known_terms:
                terms.append(candidate)
                known_terms.add(candidate)
        terms.sort()

    # NOTE(review): the final termset is built from ``terms``, not
    # ``selected``, so terms deselected in the multiselect still appear
    # here -- confirm this is intended.
    df_2 = pd.DataFrame({"Final Termset": terms})
    st.write(df_2)
    return terms
def generate_mode():
    """
    Run Generate mode functionality.

    Collects an annotated corpus file, a confidence value and a concept
    file from the sidebar. Once the corpus is uploaded and the concept file
    has loaded, it builds the phrase dictionary with
    ``sf.make_phrase_dict`` and, for each concept found, shows the term
    selector, a table of term counts, the review controls and the save
    controls.

    A ``ValueError`` raised while building the phrase dictionary is shown
    as a message in the page instead of being raised.
    """
    # Controls to get the corpus, confidence, and concepts for search
    corpus_file = st.sidebar.file_uploader("Annotated file", type="json")
    confidence = st.sidebar.slider("Confidence", min_value=0.0, value=0.9)
    file_types = ["csv", "json"]
    concept_file = st.sidebar.file_uploader("Concept file", type=file_types)

    # Controls for acknowledgements
    show_bottom_sidebar()

    # If a concept file is uploaded, load it into a DataFrame
    concept_df = load_concept_file(concept_file) if concept_file else None

    # If we have both the corpus and concepts, get the phrases that are found.
    # The check is on the loaded data, not the upload: load_concept_file
    # returns None for an unsupported file, and indexing None below would
    # raise a TypeError that the ValueError handler does not catch.
    phrase_dict = {}
    if corpus_file and concept_df is not None:
        with st.spinner("Processing..."):
            try:
                concepts = list(concept_df["concept"].unique())
                concept_list = st.sidebar.multiselect("Concepts", concepts, default=concepts)
                phrase_dict = sf.make_phrase_dict(corpus_file, concept_df, concept_list, confidence)
            except ValueError:
                st.subheader("Annotated file does not contain any of the concepts of interest.")

    # Specify the directory for saved termsets
    default_dir = "../Saved Termsets/"

    # Show the found phrases
    for concept, term_counts in phrase_dict.items():
        # Buttons for the phrases
        terms = list(term_counts)
        selected = show_terms(concept, terms)

        # Phrase counts
        df = pd.DataFrame.from_dict(term_counts, orient="index", columns=["count"])
        df["term"] = df.index
        df = df[["term", "count"]]  # Reorder columns for UI

        # Order by count descending, then term alphabetically. Chained
        # rather than inplace so the column-sliced frame above is never
        # modified in place.
        df = df.sort_values(by=["count", "term"], ascending=[False, True]).reset_index(drop=True)
        st.dataframe(df)

        # Controls to show and modify the results
        terms = show_results_controls(concept, terms, selected)

        # Control to save the results
        show_save_controls(concept, terms, selected, "generate_save_button", " termset.json", default_dir)
def review_mode():
    """
    Run Review mode functionality.

    Asks for a previously saved "Generate" results file in the sidebar.
    Once one is uploaded it is loaded with ``sf.load_saved_json`` and, for
    each concept in it, the term selector, a table of the selected terms,
    the review controls and the save controls are shown.
    """
    # Controls to get the concepts for search
    saved_file = st.sidebar.file_uploader('Saved "Generate" results file', type="json")

    # Controls for acknowledgements
    show_bottom_sidebar()

    # Nothing to review until a file has been uploaded
    if not saved_file:
        return

    default_dir = "../Reviewed Termsets/"
    concept_df = sf.load_saved_json(saved_file)

    for concept, group in concept_df.groupby("concept"):
        terms = list(group["term"].unique())
        terms.sort()

        # Buttons for the phrases
        selected = show_terms(concept, terms)
        st.write(pd.DataFrame({"Original Values": selected}))

        # Controls to show and modify the results
        terms = show_results_controls(concept, terms, selected)

        # Control to save the results
        show_save_controls(concept, terms, selected, "review_save_button", " termset_reviewed.json", default_dir)
if __name__ == "__main__":
    # Map each sidebar mode to the function that renders it; a mode with
    # no entry renders nothing.
    mode_handlers = {"Generate": generate_mode, "Review": review_mode}
    mode = show_top_sidebar()
    run_mode = mode_handlers.get(mode)
    if run_mode is not None:
        run_mode()