-
Notifications
You must be signed in to change notification settings - Fork 0
/
support.py
120 lines (97 loc) · 2.79 KB
/
support.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
r"""
Copyright 2018, 2019, 2020 Rui Antunes
João Figueira Silva
Arnaldo Pereira
Sérgio Matos
https://github.com/ruiantunes/2018-n2c2-track-1
Text mining support utilities.
Due to performance and readability purposes, there are weak or none
consistency validations (duck typing).
Routines listing
----------------
load_stopwords
clean_medical_documents
"""
# third-party modules
from gensim.utils import deaccent
import re
def load_stopwords(filepath):
r"""
Load stopwords from a specific file.
Parameters
----------
filepath : str
Stopwords list filepath.
Returns
-------
stopwords : set
A `set` with the stopwords.
Example
-------
>>> import os
>>> from __init__ import REPO
>>> from support import load_stopwords
>>> filepath = os.path.join(REPO, 'data', 'wrd_stop.txt')
>>> stopwords = load_stopwords(filepath)
>>> len(stopwords)
313
>>> 'not' in stopwords
True
>>>
"""
stopwords = set()
with open(filepath) as f:
for line in f:
stopwords.add(line.strip())
return stopwords
def clean_medical_documents(docs):
r"""
Clean medical reports.
This function makes a simple pre-processing of medical texts. The
steps are:
1. String is deaccented.
2. Sequences of at least 2 letters are extracted (numbers and other
characters are ignored).
3. Tokens with all uppercase letters are kept. Other tokens are
converted to lowercase.
4. The tokens 'pt', and 'pts' are replaced by 'patient'.
Parameters
----------
docs : list
`list` of documents. Each document is a `str`.
Returns
-------
docs : list
`list` of documents. Each document is a `list` of lines. Each
line is a `list` of tokens.
Example
-------
>>> from support import clean_medical_documents
>>> docs = [
... 'The pt. appears awake.',
... 'Cardiologist: Dr. C. Núttèr',
... ]
>>> clean_docs = clean_medical_documents(docs)
>>> print(clean_docs)
['the patient appears awake', 'cardiologist dr nutter']
>>>
"""
# to not modify the input parameter
docs = list(docs)
regex = re.compile(pattern=r'[a-zA-Z]{2,}')
for i, doc in enumerate(docs):
doc = deaccent(doc)
# keep uppercase words
tokens = [
token.lower() if not token.isupper() else token
for token in regex.findall(doc)
]
# replace tokens
for j, token in enumerate(tokens):
if token in ('pt', 'pts'):
tokens[j] = 'patient'
# clean document
docs[i] = ' '.join(tokens)
return docs