-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTextExtractor.js
162 lines (142 loc) · 5.28 KB
/
TextExtractor.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
/**
* @param {mwn} bot
*/
module.exports = function(bot) {
class TextExtractor {
/**
* Get wikitext extract. If you want plain text or HTML extracts, consider using
* the TextExtracts API instead.
* @param {string} pagetext - full page text
* @param {number} [charLimit] - cut off the extract at this many readable characters, or wherever
* the sentence ends after this limit
* @param {number} [hardUpperLimit] - cut off the extract at this many readable characters even if
* the sentence hasn't ended
* @param {Function} [preprocessHook] - optional function to work on the text at the
* beginnning
*/
static getExtract(pagetext, charLimit, hardUpperLimit, preprocessHook) {
let extract = pagetext;
if (preprocessHook) {
extract = preprocessHook(extract);
}
// Remove images. Can't be done correctly with just regex as there could be wikilinks
// in the captions.
extract = TextExtractor.removeImages(pagetext);
// Remove templates beginning on a new line, such as infoboxes.
// These ocassionally contain parameters with part of the content
// beginning on a newline not starting with a | or * or # or !
// thus can't be handled with the line regex.
extract = TextExtractor.removeTemplatesOnNewlines(extract);
// Remove some other templates too
extract = TextExtractor.removeTemplates(extract, ['efn', 'refn']);
extract = extract
.replace(/<!--.*?-->/sg, '')
// remove refs, including named ref definitions and named ref invocations
.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
// the magic
.replace(/^\s*[-{|}=*#:<!].*$/mg, '')
// trim left to prepare for next step
.trimLeft()
// keep only the first paragraph
.replace(/\n\n.*/s, '')
// unbold
.replace(/'''(.*?)'''/g, '$1')
.replace(/\(\{\{[Ll]ang-.*?\}\}\)/, '')
.trim();
if (charLimit) {
// We consider a period followed by a space or newline NOT followed by a lowercase char
// as a sentence ending. Lowercase chars after period+space is generally use of an abbreviation
// XXX: this still results in issues with name like Arthur A. Kempod.
// (?![^[]*?\]\]) so that this is not a period within a link
// (?![^{*]?\}\}) so that this is not a period within a template - doesn't work if there
// is a nested templates after the period.
var sentenceEnd = /\.\s(?![a-z])(?![^[]*?\]\])(?![^{]*?\}\})/g;
if (extract.length > charLimit) {
var match = sentenceEnd.exec(extract);
while (match) {
if (TextExtractor.effCharCount(extract.slice(0, match.index)) > charLimit) {
extract = extract.slice(0, match.index + 1);
break;
} else {
match = sentenceEnd.exec(extract);
}
}
}
}
if (hardUpperLimit) {
if (TextExtractor.effCharCount(extract) > hardUpperLimit) {
extract = extract.slice(0, hardUpperLimit) + ' ...';
}
}
return extract;
}
static removeImages(text) {
var wkt = new bot.wikitext(text);
wkt.parseLinks();
wkt.files.forEach(file => {
wkt.removeEntity(file);
});
return wkt.getText();
}
static removeTemplatesOnNewlines(text) {
var templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
var match = templateOnNewline.exec(text);
while (match) {
var template = new bot.wikitext(text.slice(match.index)).parseTemplates({count: 1})[0];
if (template) {
text = text.replace(template.wikitext, '');
} else { // just get rid of that line, otherwise we'd enter an infinite loop
text = text.replace(/^\{\{.*$/m, '');
}
match = templateOnNewline.exec(text);
}
return text;
}
static removeTemplates(text, templates) {
var wkt = new bot.wikitext(text);
const makeRegexFromTemplate = function(template) {
return new RegExp('^[' + template[0].toLowerCase() + template[0].toUpperCase() + ']' + template.slice(1) + '$', 'g');
}
wkt.parseTemplates({
namePredicate: name => {
return templates.some(template => {
return makeRegexFromTemplate(template).test(name);
});
}
});
for (let template of wkt.templates) {
wkt.removeEntity(template);
}
return wkt.getText();
}
static effCharCount(text) {
return text
.replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
.replace(/''/g, '')
.length;
}
/**
* Do away with some of the more bizarre stuff from page extracts that aren't worth
* checking for on a per-page basis
* Minimise the amount of removals done here, since if the extract was cut off, it may
* happen one of the regexes below will match across two different extracts.
* @param {string} content
*/
static finalSanitise(content) {
return content.replace(/\[\[Category:.*?\]\]/gi, '')
// these are just bad
.replace(/__[A-Z]+__/g, '')
// Harvard referencing
.replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
// shortcut for named ref invocation
.replace(/\{\{r\|.*?\}\}/gi, '')
// inline parenthetical referencing
.replace(/\{\{[hH]arv\|.*?\}\}/g, '')
// pronunciation
.replace(/\{\{IPA.*?\}\}/g, '')
// audio
.replace(/\{\{[aA]udio\|.*?\}\}/g, '');
}
}
return TextExtractor;
};