-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathprocess_text.py
155 lines (114 loc) · 4.31 KB
/
process_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#from string import digits
#from nltk import word_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
def make_text_list(postings_dict, first_n_postings=100):
    """
    Extract the texts from postings_dict into a list of strings

    Parameters:
        postings_dict: mapping keyed by the posting number as a string
            ('0', '1', ...); each value is indexed with the key 'posting'
            to obtain the posting text
        first_n_postings: (int) highest posting number to look up; numbers
            0 through first_n_postings (inclusive) are tried in order
    Returns:
        text_list: list of job posting texts, in ascending key order, with
            missing or malformed postings skipped
    """
    text_list = []
    for i in range(first_n_postings + 1):
        # Since some numbers could be missing due to errors in scraping,
        # skip entries that are absent or that have no 'posting' field.
        # Only lookup/type errors are tolerated so that genuine failures
        # (and KeyboardInterrupt) are no longer silently swallowed.
        try:
            text_list.append(postings_dict[str(i)]['posting'])
        except (LookupError, TypeError):
            continue
    return text_list
def remove_digits(token):
    """
    Remove digits from a token

    Params:
        token: (str) a string token
    Returns:
        cleaned_token: (str) the token with every ASCII digit (0-9) removed
    """
    # Imported locally because the module-level import of `digits` is
    # commented out; without it this function raised NameError.
    from string import digits

    # Translation table that deletes every digit character
    digit_table = str.maketrans('', '', digits)
    return token.translate(digit_table)
def tokenize_text(text, stem=False):
    """
    Tokenize the given text, remove stop words and optionally stem the tokens

    Parameters:
        text: a text string
        stem: (bool) if True, stem every token with the English Snowball
            stemmer; defaults to False
    Returns:
        tokens: the processed text as a list of lowercase tokens
    """
    stop_words = set(stopwords.words('english'))

    # Use regex to tokenize.
    # Replace any non word characters except .+# with space; keeping those
    # three characters lets tokens such as "c++" and "c#" survive.
    text = re.sub(r"[^\w.+#]", " ", text)
    # Two cases to replace with space
    # Case 1: \d+\.?\d+\s -- one or more digits, an optional dot, one or more
    #         digits, then a whitespace character (so at least two digits)
    # Case 2: \d+\+ -- one or more digits followed by a plus sign
    text = re.sub(r"\d+\.?\d+\s|\d+\+", " ", text)

    # Lowercase before the stop word check, then split on whitespace
    tokens = text.lower().split()
    tokens = [token for token in tokens if token not in stop_words]

    # Stem tokens
    if stem:
        stemmer = SnowballStemmer("english")
        tokens = [stemmer.stem(token) for token in tokens]
    return tokens
def tokenize_list(text_list, stem=False, return_string=False):
    """
    Tokenize each text in the given list and merge all tokens together

    Parameters:
        text_list -- list of job posting strings
        stem -- passed through to tokenize_text for every posting
        return_string -- if truthy, join the merged tokens with single
            spaces and return one string (e.g. for a word cloud plot)
    Returns:
        The flat list of all tokens, or the space-joined string when
        return_string is truthy
    """
    # Flatten the per-posting token lists into one list, preserving order
    merged = [
        token
        for posting in text_list
        for token in tokenize_text(text=posting, stem=stem)
    ]
    if not return_string:
        return merged
    return ' '.join(merged)
def check_freq(dict_to_check, text_list):
    """
    Count how often each given skill occurs in a list of posting strings.

    Params:
        dict_to_check: (dict) skills to count, grouped by category, format:
            {'languages': ['Python', 'R'..],
             'big data': ['AWS', 'Azure'...],
             ..}
        text_list: (list) a list of posting strings to search in
    Returns:
        freq: (dict) nested counts, {category: {skill: count}}, keyed by the
            skill names exactly as given
    """
    # One lowercase blob so the substring search is case-insensitive
    corpus = ' '.join(text_list).lower()

    def _search_term(skill):
        # Single letter skills such as "R" are padded with spaces so they
        # only match as a standalone, space-delimited word
        lowered = skill.lower()
        return ' ' + lowered + ' ' if len(skill) == 1 else lowered

    return {
        category: {
            skill: corpus.count(_search_term(skill)) for skill in skill_list
        }
        for category, skill_list in dict_to_check.items()
    }