-
Notifications
You must be signed in to change notification settings - Fork 1
/
parser_avalikteenistus.py
60 lines (48 loc) · 2.1 KB
/
parser_avalikteenistus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Parsing of the Estonian public service (Avalik teenistus) listing used as input for the "Tartu" RSS feed
"""
import parsers_common
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the lists of offers found on the listing page, keeping only "Tartu" entries.

    Four parallel lists (title, link, name, location) are read from the
    ``views-table cols-5`` table and combined row by row. An entry is kept
    only when the text "Tartu" occurs in its combined description
    (name + "<br>" + location).

    Args:
        pageTree: object exposing ``xpath(query)`` that returns a list of
            strings for each query (presumably an lxml tree - confirm
            against the caller).
        domain: not used by this parser; kept for the shared parser signature.
        maxPageURLstoVisit: not used by this parser; kept for the shared
            parser signature.

    Returns:
        dict with the keys ``articleDescriptions``, ``articleIds``,
        ``articleImages``, ``articlePubDates``, ``articleTitles`` and
        ``articleUrls``. Images and publication dates are always empty
        lists because the listing is not queried for them.
    """
    rowXpath = '//table[@class="views-table cols-5"]/tbody/tr'
    articleTitles = pageTree.xpath(rowXpath + '/td[1]/text()')
    articleUrls = pageTree.xpath(rowXpath + '/td[5]/div[1]/a/@href')
    articleDescName = pageTree.xpath(rowXpath + '/td[2]/div[1]/text()')
    articleDescLoc = pageTree.xpath(rowXpath + '/td[4]/div[1]/text()')

    retArticleDescriptions = []
    retArticleIds = []
    retArticleImages = []
    retArticlePubDates = []
    retArticleTitles = []
    retArticleUrls = []

    # zip() stops at the shortest list, so a row that lacks one of the cells
    # can no longer raise IndexError and abort the whole feed.
    rows = zip(articleTitles, articleUrls, articleDescName, articleDescLoc)
    for title, url, descName, descLoc in rows:
        description = parsers_common.toPlaintext(descName) + "<br>" + parsers_common.toPlaintext(descLoc)

        # remove non "Tartu" location lines
        if 'Tartu' not in description:
            continue

        retArticleDescriptions.append(description)
        # the last path segment of the link serves as the unique id
        retArticleIds.append(url.split('/')[-1])
        retArticleTitles.append(parsers_common.toPlaintext(title).capitalize())
        retArticleUrls.append(url)

    return {
        "articleDescriptions": retArticleDescriptions,
        "articleIds": retArticleIds,
        "articleImages": retArticleImages,
        "articlePubDates": retArticlePubDates,
        "articleTitles": retArticleTitles,
        "articleUrls": retArticleUrls,
    }