#!/usr/bin/env python3
# -*- coding: utf-8 -*-
##
## Uploads image files (for objects that already have metadata in Wikidata) from muis.ee
## to Wikimedia Commons and adds an image (P18) claim to the Wikidata item.
## The work list is stored in a local MongoDB collection in the format:
## {'_id': 'http://www.wikidata.org/entity/Q55869691', 'MuIS': 248095, 'muislink': 'https://www.muis.ee/museaalview/248095', 'commons': '', 'looja': 'http://www.wikidata.org/entity/Q3742903', 'loojaLabel': 'Jaan Koort'}
## The initial MongoDB collection is populated from the Wikidata query: https://w.wiki/ihh
##
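## A minimal sketch of populating the work list from the query export (assumes the
## result was saved as JSON shaped like the document above; the file name
## 'wd_export.json' is hypothetical):
##
##   import json
##   from pymongo import MongoClient
##   col = MongoClient()["muis"]["pildid"]
##   with open('wd_export.json') as f:
##       for row in json.load(f):
##           row.setdefault('commons', '')
##           col.insert_one(row)  # one document per Wikidata item, keyed by '_id'
##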
from pymongo import MongoClient
import urllib.request
from PIL import ImageFile
import rdflib
from rdflib import URIRef
import pywikibot
from pywikibot.specialbots import UploadRobot
import os.path, re
from io import BytesIO
import hashlib, base64

def cleanUpTitle(title):
    ''' Clean up the title of a potential mediawiki page. Otherwise the title of
    the page might not be allowed by the software.
    '''
    title = title.strip()
    # turn wiki-hostile brackets into parentheses and drop quotes
    title = re.sub(u"[<{\\[]", u"(", title)
    title = re.sub(u"[>}\\]]", u")", title)
    title = re.sub(u"[ _]?\\(!\\)", u"", title)
    title = re.sub(u"\"", u"", title)
    # normalize punctuation and collapse whitespace runs into single spaces
    title = re.sub(u",:[ _]", u", ", title)
    title = re.sub(u"[;:][ _]", u", ", title)
    title = re.sub(u"[\r\n\t ]+", u" ", title)
    title = re.sub(u"[?!]([.\"]|$)", u"\\1", title)
    # replace characters that are not allowed in page titles
    title = re.sub(u"[&#%?!]", u"^", title)
    title = re.sub(u"[;]", u",", title)
    title = re.sub(u"[/+\\\\:=]", u"-", title)
    title = re.sub(u"--+", u"-", title)
    title = re.sub(u",,+", u",", title)
    title = re.sub(u"[-,^]([.]|$)", u"\\1", title)
    title = title.replace(u" ", u"_")
    return title
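
## For example, with the substitutions above:
##   cleanUpTitle(u'Poiss & koer: "visand"')  ->  u'Poiss_^_koer,_visand'
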
def generateFileMetadata(wditem):
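    '''Build a Commons file name and an {{Artwork}} file description for a work-list
    item; returns (filename, wikitext description, existing P18 image or None).
    '''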
    PART_MAX_LEN = 35
    filename = ''
    desc = ''  # stays empty when no usable label is found
    wd_image = None
    wd_id = wditem['_id']
    wd_id = wd_id.replace('http://www.wikidata.org/entity/', '')
    site = pywikibot.Site("wikidata", "wikidata")
    repo = site.data_repository()
    item = pywikibot.ItemPage(repo, wd_id)
    item_dict = item.get()  # get the item dictionary
    if "et" in item_dict["labels"]:
        filename = item_dict["labels"]["et"][:PART_MAX_LEN]
    clm_dict = item_dict["claims"]  # get the claim dictionary
    ## image
    if "P18" in clm_dict:
        wd_image = clm_dict["P18"][0].getTarget()
    ## author
    if "P170" in clm_dict:
        wd_author = clm_dict["P170"][0].getTarget()
        author_dict = wd_author.get()
        if "et" in author_dict["labels"]:
            filename = "%s, %s" % (filename, author_dict["labels"]["et"][:PART_MAX_LEN])
        elif "en" in author_dict["labels"]:
            filename = "%s, %s" % (filename, author_dict["labels"]["en"][:PART_MAX_LEN])
    ## P217 inventory number
    inventory_nr = ''
    if "P217" in clm_dict:
        inventory_nr = clm_dict["P217"][0].getTarget()
        filename = "%s, %s" % (filename, inventory_nr[:PART_MAX_LEN])
    if filename:
        desc = """=={{int:filedesc}}==
{{Artwork
|wikidata=%s
|source=[%s %s]
}}
=={{int:license-header}}==
{{PD-Art|PD-old-auto}}
""" % (wd_id, wditem['muislink'], inventory_nr)
        filename = cleanUpTitle(filename)
    return (filename, desc, wd_image)

def findDuplicateImage(photo=None, site=None):
    ''' Takes the photo as BytesIO object, calculates the SHA1 hash and asks the
    mediawiki api for a duplicate
    '''
    if site is None:
        site = pywikibot.Site(u'commons', u'commons')
    sha1 = hashlib.sha1()
    sha1.update(photo.getvalue())
    # the API expects the SHA-1 as a base-16 (hex) string
    for page in site.allimages(sha1=base64.b16encode(sha1.digest())):
        return page.title(underscore=True)
    return None

def uploadToCommons(filename, desc, file_url):
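    '''Download the image from file_url to TEMP_FILE and upload it to Commons,
    unless an identical file already exists there; returns (upload result or None,
    duplicate file title or None).
    '''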
    targetSite = pywikibot.Site('commons', 'commons')
    upFile = None
    if os.path.exists(TEMP_FILE):
        os.remove(TEMP_FILE)
    file = urllib.request.urlopen(file_url)
    img_data = file.read()
    with open(TEMP_FILE, 'wb') as handler:
        handler.write(img_data)
    file.close()
    duplicate = findDuplicateImage(BytesIO(img_data))
    if duplicate:
        pywikibot.output(u'Found duplicate at %s' % (duplicate))
    else:
        bot = UploadRobot(TEMP_FILE, description=desc, useFilename=filename,
                          keepFilename=True, verifyDescription=False,
                          targetSite=targetSite)
        upFile = bot.run()
    return (upFile, duplicate)

def addImageClaim(wd_id, full_filename):
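    '''Add an image (P18) claim pointing to the given Commons file to the Wikidata item.'''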
    site = pywikibot.Site("wikidata", "wikidata")
    repo = site.data_repository()
    item = pywikibot.ItemPage(repo, wd_id)
    newclaim = pywikibot.Claim(repo, u'P18')
    commonssite = pywikibot.Site('commons', 'commons')
    imagelink = pywikibot.Link(full_filename, source=commonssite,
                               default_namespace=6)
    image = pywikibot.FilePage(imagelink)
    newclaim.setTarget(image)
    item.addClaim(newclaim)
    return

def getFileSizeAndType(uri):
    '''Get file size, image dimensions and file type from uri
    (0 size, (0, 0) dimensions and None type if not known).
    '''
    file = urllib.request.urlopen(uri)
    size = file.headers.get("content-length")
    size = int(size) if size else 0  # content-length header may be missing
    p = ImageFile.Parser()
    # feed the parser until it has seen enough of the file to know the image size
    while True:
        data = file.read(1024)
        if not data:
            break
        p.feed(data)
        if p.image:
            break
    file.close()
    if p.image:
        return size, p.image.size, p.image.format
    return size, (0, 0), None

def processItem(wditem):
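    '''Pick the largest acceptable image from the muis.ee media list of the work-list
    item, upload it to Commons, add the P18 claim and record the status in MongoDB.
    '''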
    MAX_FILE_SIZE = 15*1024*1024  # bytes
    MIN_SIDE_LEN = 400  # pixels
    allowed_file_types = {
        "JPEG": "jpg",
        "TIFF": "tif"
    }
    wd_id = wditem['_id']
    wd_id = wd_id.replace('http://www.wikidata.org/entity/', '')
    rdf_url = "https://www.muis.ee/rdf/media-list/%s" % (wditem['MuIS'])
    # create a Graph from the media list RDF
    g = rdflib.Graph()
    g.parse(rdf_url)
    cur_list = URIRef("http://opendata.muis.ee/media-list/%s" % (wditem['MuIS']))
    min_side = 0
    selected_url = ''
    selected_desc = ''
    wd_image = None
    full_filename = ''
    oversized = False
    # pick the largest image whose shorter side exceeds MIN_SIDE_LEN
    for img_url in g.objects(subject=cur_list, predicate=None):
        if 'muis' in img_url:
            print(img_url)
            prev_min_side = min_side
            (size_bytes, (img_width, img_height), img_type) = getFileSizeAndType(img_url)
            if size_bytes > MAX_FILE_SIZE:
                oversized = True
                print("file size exceeds limit")
                break
            min_side = img_height
            if img_width < img_height:
                min_side = img_width
            if min_side > MIN_SIDE_LEN:
                if img_type in allowed_file_types and min_side > prev_min_side:
                    (shortname, selected_desc, wd_image) = generateFileMetadata(wditem)
                    full_filename = shortname + '.' + allowed_file_types[img_type]
                    selected_url = img_url
    print(full_filename)
    print(selected_desc)
    print(selected_url)
    print(wd_image)
    if oversized:
        mycol.update_one({"_id": wditem['_id']}, {"$set": {"status": "oversize"}})
    elif full_filename and not wd_image:
        (uploaded, duplicate) = uploadToCommons(full_filename, selected_desc, selected_url)
        if uploaded:
            addImageClaim(wd_id, full_filename)
            # update mongodb
            mycol.update_one({"_id": wditem['_id']}, {"$set": {"status": "uploaded"}})
        elif duplicate:
            addImageClaim(wd_id, duplicate)
            mycol.update_one({"_id": wditem['_id']}, {"$set": {"status": "updated_wd"}})
    else:
        print("----skipped\n\n")  # no image, image too small, or item already has a P18 image
        mycol.update_one({"_id": wditem['_id']}, {"$set": {"status": "skipped"}})

## main
TEMP_FILE = 'temp_image'
MAX_ITEM_COUNT = 50

client = MongoClient()
mydb = client["muis"]
mycol = mydb["pildid"]

item_count = 0
for wditem in mycol.find({"status": {"$exists": False}}):
    if not wditem['commons']:
        print(wditem)
        processItem(wditem)
        item_count += 1
    if item_count >= MAX_ITEM_COUNT:
        break
pywikibot.stopme()
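
## Usage sketch (assumes a configured pywikibot user-config.py and a local
## MongoDB instance holding the "muis" database):
##   python3 upload-images.py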