#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from tika import parser
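
# NOTE: these helpers rely on NLTK data packages (typically 'stopwords',
# 'averaged_perceptron_tagger', 'maxent_ne_chunker' and 'words') -- fetch them
# with nltk.download(...) before first use. generate_plot() also needs
# matplotlib, and tika requires a Java runtime.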


def get_only_once_occurred_words(tokens):
    """
    Get only those words that occurred exactly once in tokenized data (hapaxes).
    """
    return FreqDist(tokens).hapaxes()


def remove_duplicated_words(tokens):
    """
    Remove duplicated words from tokenized data and return the unique words sorted.
    """
    return sorted(set(tokens))


def remove_stopwords(tokens):
    """
    Remove English stopwords (and, the, unless, about, etc.) from tokenized data.
    """
    stop_words = set(stopwords.words('english'))  # build the set once, not per token
    return [t for t in tokens if t not in stop_words]


def generate_plot(tokens):
    """
    Plot a cumulative frequency chart of the 50 most frequent words that are
    longer than 4 characters (requires matplotlib).
    """
    return FreqDist(word for word in tokens if len(word) > 4).plot(50, cumulative=True)


def find_long_words(tokens):
    """
    Get only words longer than 15 characters, sorted alphabetically.
    """
    return sorted(word for word in set(tokens) if len(word) > 15)


def find_long_and_common_words(tokens):
    """
    Get words longer than 7 characters that occur more than 7 times.
    Such words are usually the common content-bearing words of a text.
    """
    freq = FreqDist(tokens)  # build the frequency distribution once, not per word
    return sorted(word for word in set(tokens) if len(word) > 7 and freq[word] > 7)


def get_10_most_frequent_words(tokens):
    """
    Get the 10 most frequent words (case-folded) from tokenized data.
    """
    return FreqDist(word.lower() for word in tokens).most_common(10)
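
# A quick sketch of the output shape (tie order among equal counts may vary):
# get_10_most_frequent_words(['The', 'cat', 'saw', 'the', 'dog'])
#   -> [('the', 2), ('cat', 1), ('saw', 1), ('dog', 1)]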


def normalize_word(word):
    """
    Normalize a word by converting it to lower case.
    """
    return word.lower()


def convert_singular_word_into_plural_form(word):
    """
    Convert the singular form of a word into its plural form (heuristic rules).
    """
    if word.endswith('y') and word[-2:-1] not in 'aeiou':
        return word[:-1] + 'ies'  # city -> cities (but day -> days)
    elif word[-1:] in 'sx' or word[-2:] in ['sh', 'ch']:
        return word + 'es'  # bus -> buses, bush -> bushes
    elif word.endswith('man'):
        return word[:-2] + 'en'  # man -> men, woman -> women
    else:
        return word + 's'
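
# A quick sketch of the pluralisation heuristics above:
# convert_singular_word_into_plural_form('city')  -> 'cities'
# convert_singular_word_into_plural_form('bush')  -> 'bushes'
# convert_singular_word_into_plural_form('woman') -> 'women'
# convert_singular_word_into_plural_form('day')   -> 'days'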


def get_part_of_speech(tokens):
    """
    POS-tag tokenized data, dropping named-entity chunks so that only plain
    (word, tag) pairs are returned.
    """
    return [e for e in nltk.chunk.ne_chunk(nltk.pos_tag(tokens)) if isinstance(e, tuple)]
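
# Example sketch: get_part_of_speech(['I', 'saw', 'Paris']) typically keeps
# ('I', 'PRP') and ('saw', 'VBD') while the 'Paris' named-entity subtree is dropped.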


def noun_phrase_chunking(part_of_speech_data):
    """
    Generate a parse tree from the given part-of-speech-tagged data and display it.
    """
    grammar = r"""
    NP: {<DT|JJ|NN.*>+}           # chunk determiners, adjectives and nouns
    PP: {<IN><NP>}                # chunk prepositions followed by NP
    VP: {<VB.*><NP|PP|CLAUSE>+$}  # chunk verbs and their arguments
    CLAUSE: {<NP><VP>}            # chunk NP and VP into a clause
    """
    # A simpler alternative grammar, kept here for reference:
    # NP: {<DT|NN>+}      # chunk sequences of NN and DT
    #     {<DT><JJ><NN>}  # chunk det+adj+noun
    tree = RegexpParser(grammar).parse(part_of_speech_data)
    tree.draw()  # .draw() opens a window and returns None, so return the tree itself
    return tree
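
# Example sketch (assumes the POS-tagger data is installed):
# tagged = nltk.pos_tag(['the', 'little', 'yellow', 'dog', 'barked'])
# noun_phrase_chunking(tagged)  # draws a tree with the NP chunk 'the little yellow dog'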


def extract_text_from_pdf(file):
    """
    Extract text from the input PDF with Apache Tika and tokenize it into words.
    """
    content = parser.from_file(file)['content'] or ''  # 'content' can be None for empty PDFs
    return RegexpTokenizer(r'\w+').tokenize(content)
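

# Minimal usage sketch, assuming the NLTK data listed above is downloaded and a
# PDF exists at the (hypothetical) path 'sample.pdf'.
if __name__ == '__main__':
    tokens = remove_stopwords(extract_text_from_pdf('sample.pdf'))  # 'sample.pdf' is a placeholder
    print(get_10_most_frequent_words(tokens))
    print(find_long_and_common_words(tokens))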