-
Notifications
You must be signed in to change notification settings - Fork 1
/
wk3_file_indexer.erl
311 lines (251 loc) · 11.1 KB
/
wk3_file_indexer.erl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
%% coding: latin-1
-module(wk3_file_indexer).
-export([get_file_contents/1,show_file_contents/1,
get_file_contents_by_line/1, test_suite/0]).
% File Indexer
% (Functional Programming in Erlang MOOC, Week 3 assignment)
% This module builds on the provided functions :
% (get_file_contents, get_all_lines & show_file_contents)
% "The aim of this exercise is to index a text file, by line number.
% In solving this problem you’ll need to think about the different stages of
% processing of the data: you begin with a list of lines, each of which will
% need to be broken into words, and those lines (and words) will need to be
% associated with the corresponding line numbers.
% So, thinking about useful intermediate stages – and helper functions –
% should help you to make progress in solving the problem.
% The output of the main function should be a list of entries consisting of a word
% and a list of the ranges of lines on which it occurs.
% For example, the entry { "foo" , [{3,5},{7,7},{11,13}] } means that the word
% "foo" occurs on lines 3, 4, 5, 7, 11, 12 and 13 in the file.
% To take the problem further, you might like to think about these ways of
% refining the solution.
% • Removing all short words (e.g. words of length less than 3) or all common
% words (you‘ll have to think about how to define these).
% • Sorting the output so that the words occur in lexicographic order.
% • Normalising the words so that capitalised ("Foo") and non-capitalised
% versions ("foo") of a word are identified.
% • Normalising so that common endings, plurals, etc. are identified.
% • (Harder) Thinking how you could make the data representation more
% efficient than the one you first chose. This might be efficient for lookup
% only, or for both creation and lookup.
% • Can you think of other ways that you might extend your solution?
% Exports the indexed words of the file as a list of entries indicating the
% range of lines they appear on. Example of one entry in this list :
% { "foo" , [{3,5},{7,7},{11,13}] }
get_file_contents_by_line(Name) ->
LinesText = get_file_contents(Name),
MapOfWordsToListOfReversedRangeTuples = map_line_words(LinesText),
%io:format("Results : ~p ~n", [MapOfWordsToListOfReversedRangeTuples]),
build_output(MapOfWordsToListOfReversedRangeTuples).
%io:format("Results : ~p ~n", [Output]).
% Build the output from the map provided : specifically, each key in the map
% is build up as part of a tuple with its value; first the map value list needs to be reversed
build_output(M) ->
build_output(M, []).
build_output(M, L) ->
maps:fold(
fun(K, V, _) ->
Result = {K, reverse(V)}, % build the tuple
io:format("~p ~n", [Result])
end, [], M).
% Takes a list of all lines in a file, and returns a mapping of the all words
% to a list of tuples indicating the line ranges
%-spec map_line_words([string()]) -> #{string() => [integer()]}.
map_line_words(Xs) -> map_line_words(Xs, 1, #{}).
%-spec map_line_words([string()], integer(), #{string() => [integer()]})
% -> #{string() => [integer()]}.
map_line_words([], _C, Map) -> Map;
map_line_words([X|Xs], C, Map) ->
CurrWords = sanatize_words(
get_words_in_line(X)),
% io:format("Line : ~p : ~p ~p ~n", [C, X, CurrWords]),
UpdatedMap = map_words(CurrWords, C, Map),
map_line_words(Xs, C+1, UpdatedMap).
% Convenience func to sanatize the list of words provided
-spec sanatize_words([string()]) -> [string()].
sanatize_words(Xs) ->
lowercase_words(
filter_small_words(
strip_punc_in_words(Xs))).
% filter small words from a list
filter_small_words([]) -> [];
filter_small_words(Xs) -> filter_small_words(Xs, []).
filter_small_words([], A) -> A;
filter_small_words([X|Xs], A) ->
case (string:length(X) > 3) of
true ->
filter_small_words(Xs, [X|A]);
false ->
filter_small_words(Xs, A)
end.
% Strip known punctuation from the supplied word, matching each char
-spec strip_punc(string()) -> string().
strip_punc(Word) ->
lists:filter( fun(C) ->
not lists:member(C, ".,\ ;:\t\n\'\"")
end,
Word ).
% Strip known punctuation from the supplied list.
-spec strip_punc_in_words([string()]) -> [string()].
strip_punc_in_words([]) -> [];
strip_punc_in_words(Words) ->
lists:map( fun(W) -> strip_punc(W) end,
Words).
% lowercase all words in the supplied list.
-spec lowercase_words([string()]) -> [string()].
lowercase_words([]) -> [];
lowercase_words(Words) ->
lists:map( fun(W) -> string:lowercase(W) end,
Words).
% Update the provided map M, mapping W to a list of values (integers);
% C is pre-pended to the values. If the key W exists, the value
% list is updated, otherwise the key is inserted with the new value
-spec map_word(string(), integer(), #{string() => [integer()]})
-> #{string() => [integer()]}.
map_word(W, C, M) ->
maps:update_with(W, fun(V) ->
[C|V]
end,
[C|[]], M).
% map a word (key) to a range of lines it appears on (list of tuples)
map_word_to_range_list(W, C, M) ->
maps:update_with(W, fun(V) ->
create_or_update_ranges(V, C)
end,
create_or_update_ranges([], C),
M).
% Update the provided map M for all words in the list, updating the values of range tuples for the word
%-spec map_words([string()], integer(), #{string() => [{integer(),integer()}]}
% -> #{string() => [{integer(),integer()}]}.
map_words([], _C, M) -> M;
map_words([X|Xs], C, M) ->
UpdatedMap = map_word_to_range_list(X, C, M),
map_words(Xs, C, UpdatedMap).
% Return the words in a string as a list
%uses a regex to split the line into a list of words
%to ensure it does not return binaries, we can use split/3,
%supplying Options to ask for a list return type
% additionally, the trim options avoids getting an empty list in the result
% without it, the following would match:
%["trailing", ":", []] = get_words_in_line("trailing : "),
-spec get_words_in_line(string()) -> [string()].
get_words_in_line(Line) ->
re:split(Line, " ", [{return, list}, trim]).
% Update or create a range in the provided list given the line number.
% We scan the list from the head, and check the ranges existing, if any.
% If none, we create one with the line number as start & end of the range.
% If found, we inspect the range end - if it's equal, we exit;
% if it is contiguous with the provided line number, we update it,
% otherwise we insert a new tuple at head and return.
% This assumes the list is in reverse order (head == last inserted pair)
-spec create_or_update_ranges([{integer(),integer()}], integer()) ->
[{integer(),integer()}].
create_or_update_ranges([], I) ->
%no ranges yet, create one.
[{I,I}];
create_or_update_ranges([{H,I}|Xs], I) ->
% the end range in the tuple at head matched our input, nothing to do, return the list
[{H,I}|Xs];
% here we use a pattern match to indicate the end range is not the same as
% the provided integer AND a guard clause that satisfies the condition
% that the provided int is contiguous; this saves us a case statement in the
% function body, and simplifies our last function clause !
create_or_update_ranges([{H,I}|Xs], J) when (I+1 == J) ->
% update the tuple with the new end & return.
[{H,J}|Xs];
create_or_update_ranges(Xs, J) ->
%all other matches have failed, simply concat a new tuple to head
[{J,J}|Xs].
% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.
-spec get_file_contents(string()) -> [string()].
get_file_contents(Name) ->
{ok,File} = file:open(Name,[read]),
Rev = get_all_lines(File,[]),
lists:reverse(Rev).
% Auxiliary function for get_file_contents.
% Not exported.
-spec get_all_lines(string(),[string()]) -> [string()].
get_all_lines(File,Partial) ->
case io:get_line(File,"") of
eof -> file:close(File),
Partial;
% Note : there is a bug here - depending on whether
% the last char in Line is "\n", it should be
% length(Line) or length(Line)-1
Line -> {Strip,_} = lists:split(length(Line)-1,Line),
get_all_lines(File,[Strip|Partial])
end.
% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.
show_file_contents([L|Ls]) ->
io:format("~s~n",[L]),
show_file_contents(Ls);
show_file_contents([]) ->
ok.
reverse(Xs) ->
shunt(Xs,[]).
shunt([],Ys) ->
Ys;
shunt([X|Xs],Ys) ->
shunt(Xs,[X|Ys]).
% ======== TESTS ========
test_map_word_to_range_list() ->
M = map_word_to_range_list("Getting", 1, #{}),
#{"Getting" := [{1,1}]} = M,
MUpdated = map_word_to_range_list("Getting", 2, M),
#{"Getting" := [{1,2}]} = MUpdated,
MUpdated2 = map_word_to_range_list("This", 10, MUpdated),
[{1,2}] = maps:get("Getting", MUpdated2),
[{10,10}] = maps:get("This", MUpdated2),
MUpdated3 = map_word_to_range_list("This", 12, MUpdated2),
[{1,2}] = maps:get("Getting", MUpdated3),
[{12,12},{10,10}] = maps:get("This", MUpdated3),
pass.
test_create_or_update_ranges() ->
%empty list, a new range is created
[{1,1}] = create_or_update_ranges([],1),
%the end range is the same as the int provided, no change
[{2,2}] = create_or_update_ranges([{2,2}],2),
%the end range is less and contiguous to the int provided, end range updated
[{2,3}] = create_or_update_ranges([{2,2}],3),
%the end range is not contiguous, a new tuple should be created
[{4,4},{2,2}] = create_or_update_ranges([{2,2}],4),
pass.
test_map_word() ->
M = map_word("initial", 1, #{}),
#{ "initial" := [1] } = M,
MUpdate = map_word("initial", 5, M),
#{ "initial" := [5,1] } = MUpdate,
MUpdate2 = map_word("second", 1, MUpdate),
#{ "initial" := [5,1], "second" := [1]} = MUpdate2,
pass.
test_lowercase_words() ->
["this was a test"] = lowercase_words(["this Was A TesT"]),
pass.
test_filter_small_words() ->
["test","this"] = filter_small_words(["this", "is", "a", "test"]),
pass.
test_strip_punc() ->
"PlsStriiMe" = strip_punc("Pls'Stri;i.Me,"),
pass.
test_strip_punc_in_words() ->
["PlsStri", "iMe"] = strip_punc_in_words(["Pls'Stri", ";i.Me"]),
pass.
test_get_words_in_line() ->
["sam", "i", "am"] = get_words_in_line("sam i am"),
["trailing", "punc."] = get_words_in_line("trailing punc."),
["trailing", ":"] = get_words_in_line("trailing : "),
["trailing", ":"] = get_words_in_line("trailing :"),
pass.
% ======== TEST SUITE ========
test_suite() ->
pass = test_map_word_to_range_list(),
pass = test_filter_small_words(),
pass = test_create_or_update_ranges(),
pass = test_get_words_in_line(),
pass = test_strip_punc(),
pass = test_strip_punc_in_words(),
pass = test_lowercase_words(),
pass = test_map_word(),
pass.