forked from osmlab/name-suggestion-index
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_wikiTags.js
347 lines (291 loc) · 11.7 KB
/
check_wikiTags.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
const clearConsole = require('clear');
const colors = require('colors/safe');
const fetch = require('node-fetch');
const fileTree = require('./lib/file_tree');
const wdk = require('wikidata-sdk');
// Shared state of the script: the brand entries under review plus one bucket
// per kind of finding. The check functions append to these buckets and
// `finish` prints them.
let _brands = fileTree.read('brands');

let _errors = [];
let _wrongFormat = [];
let _deletedWikidata = [];
let _deletedWikipedia = [];
let _foundSitelink = [];
let _wrongLink = [];
let _wrongEntity = [];
let _missingInstance = [];
let _missingReferences = [];

// gatherData() pushes into _wrongFormat, so it has to run after the buckets exist.
let _data = gatherData(_brands);

// Request URLs for both passes; doFetch() walks each list one URL at a time.
let _urls = {
  wikidata: wdk.getManyEntities({
    ids: Object.keys(_data.wikidata),
    languages: ['en'],
    props: ['info', 'claims', 'sitelinks'],
    format: 'json',
    redirects: false
  }),
  wikipedia: getWikipediaUrls(
    Object.keys(_data.wikipedia)
  )
};

process.stdout.write('\nchecking and validating');

// Run the Wikidata pass, then the Wikipedia pass, then print the report.
// The second step must be handed to `.then` as a callback: calling doFetch()
// inline would start the Wikipedia pass immediately and pass a Promise to
// `.then`, which ignores non-function arguments, so `finish` could run before
// the Wikipedia checks are done.
doFetch(null, _urls.wikidata, checkWikidata)
  .then(() => doFetch(null, _urls.wikipedia, checkWikipedia))
  .then(finish);
// Collect every Wikidata QID and Wikipedia article referenced by the entries.
//
// brands: object keyed by entry id; each entry carries a `tags` object.
// Returns `{ wikidata, wikipedia }`, two lookups from tag value to the id of the
// entry using it (when several entries share a value, the last one wins).
// Values that are present but do not match the expected pattern are recorded
// in `_wrongFormat` as `[entryId, value, tagKey]` instead.
function gatherData(brands) {
  const wikidata = {};
  const wikipedia = {};

  // One rule per kind of tag: which keys to read, what a valid value looks
  // like, and which lookup receives valid values.
  const rules = [
    {
      keys: ['brand:wikidata', 'operator:wikidata'],
      pattern: /^Q\d+$/,
      bucket: wikidata
    },
    {
      keys: ['brand:wikipedia', 'operator:wikipedia'],
      pattern: /^[a-z_]{2,}:[^_]*$/,
      bucket: wikipedia
    }
  ];

  for (const id of Object.keys(brands)) {
    for (const { keys, pattern, bucket } of rules) {
      for (const key of keys) {
        const value = brands[id].tags[key];
        if (!value) {
          continue;   // tag not set on this entry
        }
        if (pattern.test(value)) {
          bucket[value] = id;
        } else {
          _wrongFormat.push([id, value, key]);
        }
      }
    }
  }

  return { wikidata, wikipedia };
}
// Build the Wikipedia API query URLs for a list of `language:title` values.
//
// values: array of strings such as `en:Some article`.
// Returns an array of URLs, grouped by language in first-seen order, each URL
// asking for at most 50 titles.
function getWikipediaUrls(values) {
  // A Map (rather than a plain object) keeps odd language codes such as
  // `__proto__` from colliding with Object.prototype members.
  const titles = new Map();
  const result = [];

  // Separate each value into its language and the actual title
  values.forEach(value => {
    // Cut at the FIRST colon only: titles may contain colons themselves
    // ("en:Star Trek: Discovery"), and `split(':', 2)` would drop everything
    // after the second colon.
    const separator = value.indexOf(':');
    if (separator === -1) {
      return;   // no language prefix, nothing sensible to query
    }
    const language = value.slice(0, separator);
    const title = value.slice(separator + 1);

    if (!titles.has(language)) {
      titles.set(language, []);
    }
    titles.get(language).push(title);
  });

  titles.forEach((languageTitles, language) => {
    // The API does not accept more than 50 titles at once, so the array of titles needs to be split into chunks
    chunk(languageTitles, 50).forEach(batch => {
      result.push(
        `https://${language}.wikipedia.org/w/api.php?action=query&prop=info|pageprops&ppprop=wikibase_item&titles=${encodeURIComponent(batch.join('|'))}&format=json`
      );
    });
  });

  return result;
}
// Fetch the URLs one after another, starting at `index`, and hand each parsed
// JSON response to `check`.
//
// index: position to start from; any falsy value (e.g. null) means 0.
// urls:  list of URLs to request.
// check: callback receiving the parsed JSON of each response.
// Returns a Promise that settles once every remaining URL has been handled.
// A failing request or check is recorded in `_errors` and printed, and the
// walk then continues with the next URL.
function doFetch(index, urls, check) {
  const position = index || 0;

  // Nothing left to request: wipe the progress dots and stop.
  if (position >= urls.length) {
    clearConsole();
    return Promise.resolve();
  }

  const target = urls[position];
  process.stdout.write('.');

  const recordFailure = failure => {
    _errors.push(failure);
    console.error(colors.red(failure));
  };
  const pause = () => delay(500);   // wait between consecutive requests
  const proceed = () => doFetch(position + 1, urls, check);

  return fetch(target)
    .then(response => response.json())
    .then(check)
    .catch(recordFailure)
    .then(pause)
    .then(proceed);
}
// Validate one batch of Wikidata entities against the entries that use them.
//
// result: parsed response holding an `entities` object keyed by QID.
// Findings are appended to `_deletedWikidata`, `_foundSitelink`, `_wrongLink`,
// `_wrongEntity`, `_missingInstance` and `_missingReferences`.
// Returns a resolved Promise.
function checkWikidata(result) {
  // blacklist containing wrong claims for entities which are brands
  // and an additional message on how to deal with them
  const blacklist = {
    // P625 - coordinate location
    P625: "If this value describes the location of the headquarter of the brand, then add this as a qualifier for P159 (headquarters location) and remove this claim."
  };

  Object.keys(result.entities).forEach(qid => {
    const entity = result.entities[qid];
    const target = _data.wikidata[qid];
    const entry = _brands[target];

    // An id that maps to no known entry cannot be checked. Skip it (as
    // checkWikipedia does) so one stray entity does not throw and take the
    // rest of the batch down with it.
    if (!entry) {
      return;
    }

    const tag = entry.tags['brand:wikidata'] === qid ? 'brand' : 'operator';
    const tagKey = `${tag}:wikidata`;

    // Wikidata entity was either deleted or is a redirect. Checked before
    // touching claims/sitelinks, which such an entity does not need.
    if (entity.missing === '') {
      _deletedWikidata.push([target, qid, tagKey]);
      return;
    }

    const sitelinks = getSitelinks(entity);
    const claims = wdk.simplify.claims(entity.claims, { keepReferences: true });
    const instance = entity.claims && entity.claims.P31;
    const wikipedia = entry.tags[`${tag}:wikipedia`];

    if (wikipedia) {
      // Check whether the linked Wikipedia article of the Wikidata entity is the correct one
      const correct = getCorrectSitelink(wikipedia, entity.sitelinks);
      if (correct) {
        _wrongLink.push([target, qid, tagKey, wikipedia, correct]);
      }
    } else if (sitelinks.length) {
      // If there is a Wikidata entity specified but no Wikipedia article,
      // list the sitelinks that could be used for it
      _foundSitelink.push([target, qid, tagKey, sitelinks.join(', ')]);
    }

    // Check if there are any blacklisted claims
    Object.keys(blacklist).forEach(property => {
      if (claims[property]) {
        _wrongEntity.push([target, qid, tagKey, property, blacklist[property]]);
      }
    });

    // Entries without any sitelinks have a high risk of being deleted
    if (!sitelinks.length) {
      // Warn if there are no instance claims and no sitelinks
      if (!instance) {
        _missingInstance.push([target, qid, tagKey]);
      }

      // Warn if there are no references and no sitelinks
      const references = getReferences(claims);
      if (!references.length) {
        _missingReferences.push([target, qid, tagKey]);
      }
    }
  });

  return Promise.resolve();
}
// Validate one batch of Wikipedia pages against the entries that link to them.
//
// result: parsed response holding the pages under `query.pages`.
// Deleted or redirected pages go to `_deletedWikipedia`; pages whose own
// Wikidata item differs from the entry's tag go to `_wrongLink`.
// Returns a resolved Promise.
function checkWikipedia(result) {
  const pages = result.query.pages;

  for (const pageId of Object.keys(pages)) {
    const page = pages[pageId];
    const link = `${page.pagelanguage}:${page.title}`;
    const target = _data.wikipedia[link];
    const entry = _brands[target];

    // Page does not correspond to any entry we asked about
    if (!entry) {
      continue;
    }

    const prefix = entry.tags['brand:wikipedia'] === link ? 'brand' : 'operator';
    const tagKey = `${prefix}:wikipedia`;
    const qid = entry.tags[`${prefix}:wikidata`];

    // Wikipedia page has been deleted or is a redirect
    const isGone = page.missing === '' || page.redirect === '';
    if (isGone) {
      _deletedWikipedia.push([target, link, qid, tagKey]);
      continue;
    }

    // Check whether the (local) linked Wikidata entity of the Wikipedia article is the correct one
    const props = page.pageprops;
    if (props && props.wikibase_item !== qid) {
      _wrongLink.push([target, link, tagKey, qid, props.wikibase_item]);
    }
  }

  return Promise.resolve();
}
// Checks whether the currently used sitelink to Wikipedia is really the correct one
// and returns the correct sitelink if the current sitelink is wrong.
//
// wikipedia: tag value in the form `language:title`.
// sitelinks: the entity's sitelinks keyed by site (e.g. `enwiki`); may be missing.
// Returns `language:title` of the entity's sitelink when it differs from the
// tag value, otherwise undefined (also when the entity has no sitelink for
// that language or the value has no language prefix).
function getCorrectSitelink(wikipedia, sitelinks) {
  // Cut at the FIRST colon only: `split(':', 2)` would truncate titles that
  // contain a colon themselves and report them as wrong links.
  const separator = wikipedia.indexOf(':');
  if (separator === -1) {
    return undefined;
  }

  const language = wikipedia.slice(0, separator);
  const title = wikipedia.slice(separator + 1);
  const sitelink = sitelinks && sitelinks[`${language}wiki`];

  if (sitelink && title.localeCompare(sitelink.title) !== 0) {
    return `${language}:${sitelink.title}`;
  }
  return undefined;
}
// List an entity's sitelinks as `language:title` strings, leaving out a few
// wikis (ceb, commons, simple) with low or no information gain.
//
// entity: object with an optional `sitelinks` object keyed by site name.
// Returns an array of strings; empty when the entity has no sitelinks.
function getSitelinks(entity) {
  const links = entity.sitelinks;
  if (!links) {
    return [];
  }

  const ignored = /^(ceb|commons|simple)$/;

  return Object.keys(links).reduce((kept, site) => {
    // Site name with its first "wiki" removed, e.g. `enwiki` -> `en`
    const language = site.replace(/wiki/, '');
    if (!ignored.test(language)) {
      kept.push(`${language}:${links[site].title}`);
    }
    return kept;
  }, []);
}
// Flatten the `references` of every claim value into a single array.
//
// claims: object keyed by property, each holding an array of values that
//         carry a `references` member.
// Returns one array with all references in encounter order.
function getReferences(claims) {
  return Object.keys(claims).reduce(
    (collected, property) => claims[property].reduce(
      (soFar, statement) => soFar.concat(statement.references),
      collected
    ),
    []
  );
}
// Print the final report: fetch errors first, then every non-empty bucket of
// findings. Error buckets go to console.error, warning buckets to
// console.warn. Each bucket is sorted in place before it is printed and is
// followed by its total.
function finish() {
  const toError = line => console.error(line);
  const toWarning = line => console.warn(line);

  // Colour roles used throughout the report lines
  const name = text => colors.yellow.bold(text);
  const bad = text => colors.red.bold(text);
  const key = text => colors.blue.bold(text);
  const good = text => colors.green.bold(text);

  // Prints one section; does nothing when the bucket is empty.
  const report = (print, findings, heading, advice, describe) => {
    if (!findings.length) {
      return;
    }
    print(colors.yellow.bold(heading));
    print(advice);
    findings.sort();
    findings.forEach(finding => print(describe(finding)));
    print('total ' + findings.length);
  };

  if (_errors.length) {
    console.log(colors.yellow.bold(`\nError Summary:`));
    for (const failure of _errors) {
      console.error(colors.red.bold(failure));
    }
  }

  report(
    toError,
    _wrongFormat,
    `\nError - Wrong format:`,
    'To resolve these, make sure that the values are in the correct format',
    f => `${name(f[0])}: ${bad(f[1])} (${key(f[2])}) is in a wrong format`
  );

  report(
    toError,
    _deletedWikidata,
    `\nError - Deleted Wikidata entities:`,
    'To resolve these, either remove the Wikidata entity from the entry or create a new one and add the correct id of the entity',
    f => `${name(f[0])}: ${bad(f[1])} (${key(f[2])}) does not exist or is a redirect`
  );

  report(
    toError,
    _deletedWikipedia,
    `\nError - Deleted Wikipedia articles:`,
    'To resolve these, either remove the Wikipedia article from the entry or create a new one and add the correct link to the article',
    f => `${name(f[0])}: ${bad(f[1])} (${f[2]}) (${key(f[3])}) does not exist or is a redirect`
  );

  report(
    toWarning,
    _foundSitelink,
    `\nWarning - Matched Wikipedia articles:`,
    'To resolve these, add a sitelink to the correct entry',
    f => `${name(f[0])}: ${name(f[1])} (${key(f[2])}) has sitelinks to ${good(f[3])}`
  );

  report(
    toWarning,
    _wrongLink,
    `\nWarning - Wrong Wikipedia article which is not linked to the Wikidata entity:`,
    'To resolve these, check whether the Wikidata or the Wikipedia value is wrong and correct one of them',
    f => `${name(f[0])}: ${name(f[1])} (${key(f[2])}) is not linked to ${bad(f[3])} but to ${good(f[4])}`
  );

  report(
    toWarning,
    _wrongEntity,
    `\nWarning - Possibly wrong linked Wikidata entity:`,
    'To resolve these, check whether the Wikidata entity really describes the brand and not something else or follow the hint on how to fix the entry',
    f => `${name(f[0])}: ${name(f[1])} (${key(f[2])}) ${bad(f[3])}: ${f[4]}`
  );

  report(
    toWarning,
    _missingInstance,
    `\nWarning - Missing sitelink and instance claim (P31) which might lead to a deletion in the future:`,
    'To resolve these, add an instance claim (P31) or a sitelink to the Wikidata item',
    f => `${name(f[0])}: ${name(f[1])} (${key(f[2])}) is missing a sitelink and an instance claim (P31)`
  );

  report(
    toWarning,
    _missingReferences,
    `\nWarning - Missing sitelink and external references which might lead to a deletion in the future:`,
    'To resolve these, add a reference to an external source or a sitelink to the Wikidata item',
    f => `${name(f[0])}: ${name(f[1])} (${key(f[2])}) is missing a sitelink and a reference`
  );
}
// Returns a Promise that resolves (with no value) after `msec` milliseconds.
function delay(msec) {
  return new Promise(function schedule(done) {
    setTimeout(done, msec);
  });
}
// Split an array into consecutive pieces of at most `size` elements.
//
// input: array to split (not modified).
// size:  maximum length of each piece; must be a positive number.
// Returns an array of arrays; the last piece may be shorter. An empty input
// yields an empty array.
// Throws RangeError when `size` is not positive: the loop below advances by
// `size`, so zero or a negative value would never terminate.
function chunk(input, size) {
  if (!(size > 0)) {
    throw new RangeError(`chunk size must be a positive number, got ${size}`);
  }

  const result = [];
  for (let index = 0; index < input.length; index += size) {
    result.push(input.slice(index, index + size));
  }
  return result;
}