forked from allanlepp/te_rss
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rss_requests.py
158 lines (127 loc) · 7.49 KB
/
rss_requests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
Fetching HTML: downloading pages, decoding the response and repairing broken encodings.
"""
import html
import requests
import urllib3
import parsers_common
import rss_config
import rss_print
# Module-wide HTTP session; get_article_string() sends its GET requests through it.
SESSION = requests.Session()
# Switch off urllib3's warnings. get_article_string() requests with verify=False,
# presumably this is what the call is meant to silence - TODO confirm.
urllib3.disable_warnings()
def encoding_check(htmlPageString, htmlPageBytesEncoding):
    """
    Look for typical traces of a wrongly decoded page and try to repair them.

    Three checks run in order, each one on the result of the previous one:

    1. HTML character references ("&#...") are resolved with html.unescape().
    2. UTF-8 text that was decoded with a single-byte codec (U+00C3 followed
       by U+00A4 or U+00B5 where a-umlaut / o-tilde was meant) is passed to
       encoding_fix_with_deencode().
    3. A broken bar, diaeresis or cedilla sign is taken as a sign of
       ISO 8859-15 text read as ISO 8859-1 and the text is passed to
       encoding_fix_with_deencode() as well.

    htmlPageString: the decoded page text.
    htmlPageBytesEncoding: name of the codec the page was decoded with; it is
        only logged and passed on, so None is tolerated.
    Returns the page text, repaired where a check matched.
    """
    def excerpt(text, pos):
        # Clamp the start: a negative start index counts from the END of the
        # string, which gave an empty or unrelated excerpt for any hit within
        # the first 10 characters.
        return text[max(0, pos - 10):pos + 30]

    # requests may report no encoding at all; str() keeps the concatenations
    # below from raising TypeError in that case.
    encodingName = str(htmlPageBytesEncoding)

    rss_print.print_debug(__file__, "kodeeringuprobleemide kontroll, sisend vormingus: '" + encodingName + "'", 3)

    pos = htmlPageString.find("&#")
    if pos >= 0:
        rss_print.print_debug(__file__, "'&#' tüüpi kodeering asukohas(" + str(pos) + "): '" + excerpt(htmlPageString, pos) + "', proovime parandada", 2)
        htmlPageString = html.unescape(htmlPageString)

    # https://www.i18nqa.com/debug/table-iso8859-1-vs-iso8859-15.html
    # The two patterns are spelled as escapes on purpose: they are the
    # two-character results of reading UTF-8 a-umlaut (C3 A4) and o-tilde
    # (C3 B5) with a single-byte codec. Searching for the plain letters
    # instead would flag every correctly decoded page, and a literal
    # two-character pattern is easily "cleaned up" into the plain letter
    # when this file itself is re-encoded.
    brokenAUmlaut = "\u00c3\u00a4"
    brokenOTilde = "\u00c3\u00b5"
    # max() only reports one of the hits; any hit (>= 0) triggers the repair.
    pos = max(-1, htmlPageString.find(brokenAUmlaut), htmlPageString.find(brokenOTilde))
    if pos >= 0:
        rss_print.print_debug(__file__, "'Ã_' tüüpi kodeering asukohas(" + str(pos) + "): '" + excerpt(htmlPageString, pos) + "', proovime parandada", 1)
        htmlPageString = encoding_fix_with_deencode(htmlPageString, 'utf-8', encodingName)

    # The currency sign and the acute accent are common symbols and do not
    # point to an encoding problem with the caron letters, so they are not
    # searched for here.
    pos = max(-1, htmlPageString.find("¦"), htmlPageString.find("¨"), htmlPageString.find("¸"))
    if pos >= 0:
        rss_print.print_debug(__file__, "'iso8859_15 as iso8859_1' kodeering asukohas(" + str(pos) + "): '" + excerpt(htmlPageString, pos) + "', proovime parandada", 0)
        htmlPageString = encoding_fix_with_deencode(htmlPageString, 'iso8859_15', encodingName)

    return htmlPageString
def _imitate_single_byte_decoding(rawBytes):
    """
    Decode bytes one at a time the way a Windows-1252 reader would.

    Byte values that Python's cp1252 codec rejects as undefined are taken
    over as ISO 8859-1, so every input can be decoded.
    """
    chars = []
    for byteValue in rawBytes:
        singleByte = bytes([byteValue])
        try:
            chars.append(singleByte.decode('cp1252'))
        except UnicodeDecodeError:
            chars.append(singleByte.decode('latin-1'))
    return "".join(chars)


def encoding_fix_with_deencode(inpString, sourceEncoding, destEncoding):
    """
    Undo a faulty "encoding" -> "encoding" conversion for a handpicked set of
    characters: the faulty conversion is imitated for each character and the
    resulting broken sequence is replaced with the character that was meant.

    Example: sourceEncoding='utf-8', destEncoding='iso8859_15'
    https://i18nqa.com/debug/UTF8-debug.html

    inpString: text that may contain wrongly decoded sequences.
    sourceEncoding, destEncoding: only written to the debug log; the imitated
        conversion is always "UTF-8 bytes read as a single-byte codec".
    Returns the text with the known broken sequences replaced.
    """
    # str(): the caller may hand over None when no encoding is known.
    rss_print.print_debug(__file__, "EBAõnnestunud '" + str(sourceEncoding) + "' as '" + str(destEncoding) + "' kodeeringu parandamine 'tagasiasendamisega'", 1)
    rss_print.print_debug(__file__, "'" + inpString + "'", 4)

    # handpicked changes: (character that was meant, text to put in its place)
    # The broken forms are computed instead of being spelled out as literals:
    # such literals are easily damaged when this file is re-encoded, which
    # silently turns a rule into a no-op or into a far too broad pattern.
    # NOTE(review): the en dash and the three quotation-mark targets
    # (U+201C, U+201D, U+201E) are reconstructed from the UTF-8 debug chart
    # linked above - confirm that these are the intended ones.
    repairs = (
        ('ä', 'ä'),
        ('õ', 'õ'),
        ('ö', 'ö'),
        ('ü', 'ü'),
        ('Ü', 'Ü'),
        ('Ö', 'Ö'),
        ('Ä', 'Ä'),
        ('Õ', 'Õ'),
        ('\u2013', '\u2013'),
        ('\u201c', '"'),
        ('\u201d', '"'),
        ('\u201e', '"'),
    )
    for meant, replacement in repairs:
        rawBytes = meant.encode('utf-8')
        # The same bytes look different depending on the single-byte codec
        # that misread them; cover both the ISO 8859-1 and the Windows-1252
        # reading (they coincide for bytes A0-FF).
        brokenForms = (rawBytes.decode('latin-1'), _imitate_single_byte_decoding(rawBytes))
        for broken in brokenForms:
            inpString = inpString.replace(broken, replacement)

    return inpString
def encoding_fix_with_replace(inpString):
    r"""
    Replace written-out byte escapes with the characters they stand for.

    The input is expected to contain UTF-8 byte values spelled as text
    (a backslash, "x" and two hex digits per byte). A handpicked table of
    such sequences is replaced, in table order, with the matching character.

    inpString: text containing the written-out escapes.
    Returns the text after all replacements.
    """
    rss_print.print_debug(__file__, "parandame EBAõnnestunud kodeeringu asendamise läbi", 1)

    # Collapse doubled backslashes first so the table has to know one form only.
    fixedString = inpString.replace("\\\\x", "\\x")

    # handpicked changes
    escapeTable = (
        ("\\xc2\\xa0", " "),
        ("\\xc2\\xab", '"'),
        ("\\xc2\\xbb", '"'),
        ('\\xc3\\x84', 'Ä'),
        ('\\xc3\\x95', 'Õ'),
        ('\\xc3\\x96', 'Ö'),
        ('\\xc3\\x97', '-'),
        ('\\xc3\\x9c', 'Ü'),
        ('\\xc3\\xa4', 'ä'),
        ('\\xc3\\xb5', 'õ'),
        ('\\xc3\\xb6', 'ö'),
        ('\\xc3\\xbc', 'ü'),
        # the next two point to text that was mis-encoded more than once
        ('\\xc3\\x83', 'Ã'),
        ('\\xc3\\x85', 'Å'),
        ("\\xe2\\x80\\x93", "–"),
        ('\\xc3\\xa9', 'é'),
        ('\\xe2\\x80\\x9d', '"'),
        ('\\xe2\\x80\\x9c', '"'),
    )
    for escaped, plain in escapeTable:
        fixedString = fixedString.replace(escaped, plain)

    rss_print.print_debug(__file__, "output='" + fixedString + "'", 4)
    return fixedString
def get_article_string(articleUrl, headers):
    """
    Perform the HTTP request that downloads a page and decode the response.

    The body is decoded with the encoding requests reports for the response;
    if that fails, the encoding requests guesses from the content
    (apparent_encoding) is tried. The decoded text is finally run through
    encoding_check().

    articleUrl: address to fetch.
    headers: request headers passed on to the session's get().
    Returns the page as a unicode string, or "" when the request fails or
    the body cannot be decoded with either encoding.
    """
    rss_print.print_debug(__file__, "tavaline internetipäring: " + articleUrl, 0)

    try:
        # NOTE(security): verify=False switches TLS certificate verification
        # off for every request made here.
        htmlPage = SESSION.get(articleUrl, headers=headers, timeout=rss_config.REQUEST_TIMEOUT, verify=False)  # pylint: disable=E1101
    except Exception as e:
        rss_print.print_debug(__file__, "internetipäring EBAõnnestus, tagastame tühja vastuse", 0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 0)
        return ""

    rss_print.print_debug(__file__, "kontrollime urli redirectimist: " + articleUrl, 4)
    htmlPageUrl = parsers_common.str_rchop(htmlPage.url, "/")
    # Only the first five characters are compared, so in practice this
    # reports a change at the very start of the URL (e.g. "http:" vs "https").
    if htmlPageUrl[0:5] != articleUrl[0:5]:
        rss_print.print_debug(__file__, "vastuse URLi algus erineb päringu URList: " + htmlPageUrl + "!=" + articleUrl, 0)

    rss_print.print_debug(__file__, "internetipäringu tulemuse dekodeerimine: " + articleUrl, 3)
    htmlPageBytes = htmlPage.content
    htmlPageBytesEncoding = htmlPage.encoding

    try:
        htmlPageString = parsers_common.bytes_to_str(htmlPageBytes, htmlPageBytesEncoding)
    except Exception as e:
        # str(): requests reports None when it cannot determine an encoding;
        # plain concatenation would raise TypeError inside this handler.
        rss_print.print_debug(__file__, "internetipäringu tulemuse dekodeerimine '" + str(htmlPageBytesEncoding) + "' kujul EBAõnnestus", 0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)

        # try decoding the bytes with the encoding guessed from the content
        rss_print.print_debug(__file__, "internetipäringu tulemuse dekodeerimine 'apparent' enkoodingus: " + articleUrl, 0)
        htmlPageBytesEncoding = htmlPage.apparent_encoding
        try:
            htmlPageString = parsers_common.bytes_to_str(htmlPageBytes, htmlPageBytesEncoding)
            rss_print.print_debug(__file__, "internetipäringu tulemuse dekodeerimine '" + str(htmlPageBytesEncoding) + "' kujul õnnestus", 0)
        except Exception as fallbackError:
            rss_print.print_debug(__file__, "internetipäringu tulemuse dekodeerimine '" + str(htmlPageBytesEncoding) + "' kujul EBAõnnestus", 0)
            rss_print.print_debug(__file__, "exception = '" + str(fallbackError) + "'", 1)
            # Nothing was decoded, so there is no text to check or return.
            # Falling through would hit an unbound htmlPageString; answer
            # with the same empty result as a failed request instead.
            rss_print.print_debug(__file__, "internetipäringu tulemuse dekodeerimine EBAõnnestus, tagastame tühja vastuse", 0)
            return ""

    # encoding_check() uses the encoding name for logging only; hand it over
    # as text so a missing (None) encoding cannot break its messages.
    htmlPageString = encoding_check(htmlPageString, str(htmlPageBytesEncoding))
    return htmlPageString
def upload_file(filePath, filename):
    """
    Upload one file with an HTTP POST and log the outcome.

    The file filePath + "/" + filename is opened in binary mode and sent as
    the form file field rss_config.UPLOAD_NAME to rss_config.UPLOAD_URL with
    a 10 second timeout. A reply status of 200 is logged as success, any
    other status as failure together with the reply text. Exceptions raised
    while opening or sending are caught and logged.

    filePath: directory of the file.
    filename: name of the file inside filePath.
    Returns None.
    """
    uploadPath = "/".join((filePath, filename))
    rss_print.print_debug(__file__, "asume üles laadima faili: " + uploadPath, 3)

    try:
        # The file has to stay open while requests reads it for the upload.
        with open(uploadPath, 'rb') as fileHandle:
            formFiles = {rss_config.UPLOAD_NAME: fileHandle}
            response = requests.post(rss_config.UPLOAD_URL, files=formFiles, timeout=10)

        statusCode = response.status_code
        statusLine = "serveri vastus: reply.status_code = " + str(statusCode)

        if statusCode != 200:
            rss_print.print_debug(__file__, "faili üleslaadimine EBAõnnestus: " + filename, 0)
            rss_print.print_debug(__file__, statusLine, 1)
            rss_print.print_debug(__file__, "serveri vastus: reply.text = " + str(response.text), 2)
            return

        rss_print.print_debug(__file__, "faili üleslaadimine õnnestus: " + filename, 3)
        rss_print.print_debug(__file__, statusLine, 4)
    except Exception as uploadError:
        rss_print.print_debug(__file__, "faili üleslaadimine EBAõnnestus: " + filename, 0)
        rss_print.print_debug(__file__, "exception = '" + str(uploadError) + "'", 1)