forked from allanlepp/te_rss
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser_elektriteater.py
31 lines (22 loc) · 1.51 KB
/
parser_elektriteater.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import parsers_common
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill ``articleDataDict`` with listing data scraped from ``pageTree``.

    The "images", "titles" and "urls" entries are replaced with lists
    obtained through XPath queries on the listing page. Then, for every
    index yielded by ``parsers_common.article_urls_range``:

    * the image entry (a ``data-srcset`` attribute value) is reduced to a
      single token, and
    * when ``parsers_common.should_get_article_body(i)`` is truthy, the
      article page is loaded and its description is stored under
      "descriptions".

    NOTE(review): every ``parsers_common`` helper is defined outside this
    file; their exact return values are not visible here.

    Args:
        articleDataDict: Mapping that is updated in place. It must already
            hold a "descriptions" entry whenever an article body is
            fetched, because that entry is read before being reassigned.
        pageTree: Parsed listing page handed to ``parsers_common.xpath_to``.
        domain: Passed through to ``parsers_common.get_article_tree``.

    Returns:
        The same ``articleDataDict`` object that was passed in.
    """
    articleDataDict["images"] = parsers_common.xpath_to("list", pageTree, '//div[@class="image session__image"]/img/@data-srcset')
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//h2[@class="session__title"]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//a[@class="session__link"]/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # images
        curArtImage = parsers_common.get(articleDataDict["images"], i)
        if " " in curArtImage:
            # Tokenize on any run of whitespace: split(" ") produces empty
            # strings for doubled or trailing spaces, which makes the [-2]
            # lookup land on a descriptor or "" instead of the URL.
            srcsetTokens = curArtImage.split()
            if len(srcsetTokens) > 1:
                # Presumably every candidate is "URL descriptor", so the
                # second-to-last token is the last candidate's URL -
                # confirm against the site's markup.
                curArtImage = srcsetTokens[-2]
            elif srcsetTokens:
                # A lone URL padded with spaces: keep just the URL.
                curArtImage = srcsetTokens[0]
            else:
                # Whitespace-only value: nothing usable.
                curArtImage = ""
        articleDataDict["images"] = parsers_common.list_add_or_assign(articleDataDict["images"], i, curArtImage)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into its own tree; the listing tree passed in as
            # pageTree is deliberately not rebound
            articleTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # description: try the film-detail container first, then fall
            # back to the generic text container
            curArtDesc = parsers_common.xpath_to("single", articleTree, '//div[@class="film-detail__main"]', parent=True)
            if not curArtDesc:
                curArtDesc = parsers_common.xpath_to("single", articleTree, '//div[@class="text editor"]', parent=True)
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], i, curArtDesc)

    return articleDataDict