domain_checker.py
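"""Probe numbered domains from domain_checklist.txt for newer variants,
cross-check them against the EasyList Korean filter list, and open a
GitHub issue on easylist/KoreanList with update suggestions. Expects a
personal access token in the MY_GITHUB_TOKEN environment variable."""
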
import requests
import re
import urllib3
import os
from datetime import datetime
#from pytz import timezone
from github import Github

ISSUE_COUNT = 0
FILTERLIST_URL = "https://easylist-downloads.adblockplus.org/koreanlist+easylist.txt"
FILTER_DATA = ""
ISSUE_BODY = ""

def printIssue(mystr):
    # Print a line and collect it into the body of the GitHub issue.
    global ISSUE_BODY
    print(mystr)
    ISSUE_BODY += mystr + "\n"

def publishAnIssue():
    # File a single GitHub issue that bundles every suggestion collected so far.
    global ISSUE_COUNT, ISSUE_BODY
    if ISSUE_COUNT == 0:
        print("No issues to publish.")
        return
    access_token = os.environ['MY_GITHUB_TOKEN']
    organization_name = "easylist"
    repository_name = "KoreanList"
    # german_timezone = timezone('Europe/Berlin')
    # today = datetime.now(german_timezone)
    # today_data = today.strftime("%d.%m.%Y")
    title = "Found " + str(ISSUE_COUNT) + " domain issue(s)."
    g = Github(access_token)
    org = g.get_organization(organization_name)
    repo = org.get_repo(repository_name)
    res = repo.create_issue(title=title, body=ISSUE_BODY)
    print(res)

def readSourceFromABPFilters(url, target):
    # Look for `target` in the filter list and suggest an update whenever a
    # published filter still carries an older domain number.
    global ISSUE_COUNT, ISSUE_BODY, FILTER_DATA
    # Download the filter data once and reuse it across calls.
    if FILTER_DATA == "":
        http = urllib3.PoolManager()
        response = http.request('GET', url)
        FILTER_DATA = response.data.decode('utf-8')
    target = target.replace("http://", "").replace("https://", "")
    number_of_domain = extract_number(target)
    if number_of_domain == -1:
        return
    # Escape the domain, then let its number match any digits or a wildcard.
    _pattern = re.escape(target).replace(str(number_of_domain), r"(\d{1,3}|\*)")
    for line in FILTER_DATA.splitlines():
        searched = re.search(_pattern, line)
        # found a target domain
        if searched:
            # substringed: only the domain; extracted: only its number
            substringed = line[searched.start():searched.end()]
            extracted = extract_number(substringed)
            if extracted == -1:
                printIssue(line + " :arrow_right: (nothing to fix) :thumbsup:")
                continue
            # The number in the published filter is outdated.
            if extracted < number_of_domain:
                ISSUE_COUNT += 1
                renewed_filter = line.replace(str(extracted), str(number_of_domain))
                printIssue(str(ISSUE_COUNT) + ". Filter update suggestion:\n- [ ] " + line + " :arrow_right: " + renewed_filter)
            elif extracted == number_of_domain:
                printIssue(line + " :arrow_right: (nothing to fix) :thumbsup:")
    printIssue("")

def extract_number(url):
    # Return the first run of digits in `url` as an int, or -1 if none exist.
    parsed_int_list = re.findall(r"\d+", url)
    if len(parsed_int_list) == 0:
        #raise Exception("no digits in url(" + url + ")")
        print("no digits in url(" + url + ")")
        return -1
    parsed_int = int(parsed_int_list[0])
    return parsed_int

def tryAccessWithFakeHeaders(url):
    # Fetch the page with a browser-like User-Agent and return its <title>
    # text, or None when the page has no title tag.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0'}
    n = requests.get(url, headers=headers)
    al = n.text
    start = al.find('<title>')
    if start == -1:
        return None
    title = al[start + 7:al.find('</title>')]
    return title

def url_ok(url):
    # Probe numbered variants of `url` (e.g. site123.com -> site124.com, ...)
    # and return every variant that responds.
    global ISSUE_BODY
    if "http" not in url:
        url = "https://" + url
    working_domains = []
    parsed_int = extract_number(url)
    if parsed_int == -1:
        # No number in the domain, so there is nothing to iterate over.
        return working_domains
    found_domain_works = False
    which_number_latest_works = 0
    i = parsed_int - 1
    while True:
        # Stop once we are past the known number and the previous probe failed.
        if found_domain_works and i > parsed_int + 1 and which_number_latest_works != i - 1:
            printIssue("")
            break
        if i > 150:
            printIssue("")
            break
        replaced = url.replace(str(parsed_int), str(i))
        replaced_prev = url.replace(str(parsed_int), str(i - 1))
        try:
            r = requests.head(replaced, allow_redirects=False, timeout=10)
            if r.status_code == 301 or r.status_code == 302:
                printIssue(replaced + ": redirection detected(" + str(r.status_code) + ") :leftwards_arrow_with_hook:")
                found_domain_works = True
                which_number_latest_works = i
            elif r.status_code != 200:
                printIssue(replaced + ": status(" + str(r.status_code) + ") :grey_question:")
                found_domain_works = True
                which_number_latest_works = i
                #title = tryAccessWithFakeHeaders(url)
                # if title is not None:
                #     printIssue("Title: " + title)
                #     found_domain_works = True
                #     which_number_latest_works = i
                #     working_domains.append(replaced)
            else:
                printIssue(replaced + ": working fine :white_check_mark:")
                found_domain_works = True
                which_number_latest_works = i
                working_domains.append(replaced)
            i = i + 1
        except Exception as e:
            #print(str(e))
            printIssue(replaced + ": not working :no_entry_sign:")
            if found_domain_works:
                working_domains.append(replaced_prev)
            i = i + 1
            continue
    return working_domains

# Read the checklist, probe each domain for newer numbered variants, then
# check the filter list for each working variant and file one summary issue.
with open('domain_checklist.txt', 'r') as file1:
    Lines = file1.readlines()
for line in Lines:
    if line.startswith("#"):  # ignore comments
        continue
    working_domains = url_ok(line.strip())
    working_domains = list(set(working_domains))  # drop duplicates
    for domain in working_domains:
        readSourceFromABPFilters(FILTERLIST_URL, domain)
publishAnIssue()
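
# A minimal sketch of the expected domain_checklist.txt input, assuming the
# one-domain-per-line layout this script parses (hypothetical entries):
#
#   # lines starting with '#' are ignored
#   example123.com
#   https://example45.net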