-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbgtask.py
185 lines (159 loc) · 7.22 KB
/
bgtask.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# #coding=utf-8
import time
import urllib, urllib2
import re
import anydbm as dbm
from codepy import menulog
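'''
Run in the background with: (python bgtask.py &)
Used to scrape menu information.
Has the address format changed? This one can be accessed directly:
http://wap.plus.yixin.im/wap/material/viewImageText?id=31613351
'''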
# Regular expressions applied to the HTML of a fetched material page.
# Captures the text inside the <title> element (not referenced by the code visible in this file).
pattern_title = r"<title>(.+)</title>"
# Matches the weekday label: the two characters for "weekday" plus one more character
# (not referenced by the code visible in this file).
pattern_weekday = ur"(星期(.))"
# Captures the two digits that follow "20" and precede a "-"; used as the two-digit year.
pattern_year = ur'20(\d\d)-'
# Captures digits enclosed by two "-"; used as the month in which the menu was published.
pattern_month_update = r'-(\d+)-'
# Captures digits immediately before "</span>"; the month and day fragments are both read from its matches.
pattern_month = r'>(\d+)</span>'
# Captures the digits between the "month" character and the "day" character.
pattern_day = ur'月(\d+)日'
# Captures digits that sit between ">" and the "day" character.
pattern_day2 = ur'>(\d+)日'
# URL prefix; the material id is appended to it, and getWebContent() appends "&companyId=1".
urlhead = 'http://numenplus.yixin.im/singleNewsWap.do?materialId='
# File name passed to dbm.open() for the persisted crawl state.
datafile = 'datafile'
# Seed material id written to the data file when that file is empty.
startId = 53370
def getWebContent(url, timeout=30):
    """Fetch a material page and return its body decoded as UTF-8.

    '&companyId=1' is appended to ``url`` before the request is sent, and the
    request carries a mobile YiXin User-Agent header.

    :param url: page address without the companyId parameter.
    :param timeout: seconds passed to ``urllib2.urlopen`` so that a stalled
        connection cannot block the caller indefinitely.
    :return: the decoded page text, or '' when anything goes wrong (the
        error is logged through ``menulog.debug``).
    """
    res = None
    try:
        url += '&companyId=1'
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 6.0; PRO 6 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/44.0.2403.130 Mobile Safari/537.36 YiXin/4.8.3')
        res = urllib2.urlopen(req, timeout=timeout)
        return res.read().decode('utf-8')
    except Exception as e:
        # Best effort by design: callers only ever see '' on failure.
        menulog.debug(str(e))
        return ''
    finally:
        # Release the connection on both the success and the failure path.
        if res is not None:
            res.close()
class Background:
    """Periodic crawler that scans material ids looking for daily menu pages.

    Crawl state is kept between rounds in the dbm file named by ``datafile``
    under the keys 'startId', 'lastQuery', 'cache', 'maybe' and 'future'.
    """

    def __init__(self):
        self.frequency = 10800  # a round starts when the epoch second is a multiple of this
        self.interval = 150  # number of ids scanned past startId in one round
        # Each round starts from startId - back; presumably guards against ids
        # that were reserved earlier and only filled in later.
        self.back = 0
        self.firstRun = True  # whether to run one round right after start-up
        self.today = 0  # today's date as a yymmdd integer, refreshed every round
        self.running = False  # set while a round is in progress
        self.startId = 0
        self.count = 0  # number of rounds started so far
        self.usedId = 0  # largest id seen whose page had content but was not a menu
        self.nowId = self.startId
        self.result = u'未找到菜单'
        self.lastQuery = 0  # epoch second at which the latest round began scanning
        self.cache = {}  # {151019: 15163}, i.e. yymmdd date -> material id
        self.maybe = []  # ids of menu-looking pages whose date could not be parsed
        self.empty = 0  # consecutive "material does not exist" pages in this round
        self.maxEmpty = 100  # stop the round after more than this many consecutive empty pages

    def getTime(self):
        """Return the current epoch time as whole seconds."""
        return int(time.time())

    def schedule(self):
        """Run the crawler forever; this method never returns.

        Runs one round immediately when ``firstRun`` is set, then polls the
        clock. A round starts whenever the epoch second is a multiple of
        ``frequency``; otherwise a heartbeat line is logged whenever it is a
        multiple of 3600.
        """
        if self.firstRun:
            self.firstRun = False
            self.process()
        lastTick = None  # epoch second of the most recent scheduled round
        while True:
            time.sleep(0.1)  # polling with a short sleep keeps CPU usage low
            now = self.getTime()
            if now % self.frequency == 0 and not self.running:
                # process() clears ``running`` itself, so that flag alone cannot
                # stop a fast round from being repeated within the same second.
                if now != lastTick:
                    lastTick = now
                    self.running = True
                    self.process()
            elif now % 3600 == 0:
                # Heartbeat; the extra sleep leaves this second so it is logged once.
                menulog.info('%s@%d'% (time.strftime('20%y-%m-%d %H:%M:%S', time.localtime()), now))
                time.sleep(1)

    def process(self):
        """Run one crawl round: load state, scan ids, persist the results.

        IOError and EOFError raised while opening or using the data file are
        logged and swallowed; any other exception propagates. The data file
        is closed and ``running`` is cleared in every case.
        """
        self.count += 1
        self.today = int(time.strftime('%y%m%d', time.localtime()))
        # The consecutive-empty counter must not leak in from the previous
        # round, otherwise this round could stop at its first empty page.
        self.empty = 0
        menulog.info(u'开始第%d次查找@%d'% (self.count, self.getTime()))
        try:
            db = dbm.open(datafile, 'c')
            try:
                self._loadState(db)
                self._scan()
                self._saveState(db)
            finally:
                db.close()
        except (IOError, EOFError):
            menulog.info(u'缓存读取/创建异常')
        finally:
            self.running = False

    def _loadState(self, db):
        """Seed an empty data file, then read the persisted state into self."""
        if not len(db):
            # No data file from a previous run.
            db['startId'] = str(startId)
            db['lastQuery'] = str(self.getTime())
            db['cache'] = str(self.cache)
            db['maybe'] = str(self.maybe)
        # NOTE(review): eval() executes whatever the data file contains. The
        # values are written by _saveState() via str(); consider
        # ast.literal_eval if the file could ever be edited by someone else.
        self.startId = eval(db['startId']) - self.back
        self.cache = eval(db['cache'])
        self.maybe = eval(db['maybe'])
        self.nowId = self.startId
        self.lastQuery = self.getTime()  # remember when this scan began

    def _scan(self):
        """Walk ids upward from startId, classifying every fetched page."""
        # startId moves forward each time a menu is found, so the window slides.
        while self.nowId - self.startId < self.interval:
            menulog.info(u'开始查找: %d'% self.nowId)
            text = getWebContent(urlhead+ str(self.nowId))
            if not text:
                # getWebContent() returns '' when the request failed. Such an
                # id must not be marked as used, or an outage would push
                # startId past pages that were never actually read.
                menulog.info('fetch failed %d'% self.nowId)
            elif text.find(u'今日菜单') != -1 and text.find(u'本帮菜') != -1:
                self.empty = 0
                self._recordMenu(text)
            elif text.find(u'请求素材不存在') == -1:
                # The page has content, but it is not a menu.
                self.usedId = self.nowId
                self.empty = 0
            else:
                self.empty += 1
                menulog.info('empty(%d) %d'% (self.empty, self.nowId))
                if self.empty > self.maxEmpty:
                    menulog.debug('break this round')
                    break
            self.nowId += 1

    def _recordMenu(self, text):
        """Store the menu page at nowId in the cache, or list it in ``maybe``."""
        try:
            thisday = self._parseMenuDate(text)
        except IndexError:
            if self.nowId not in self.maybe:
                self.maybe.append(self.nowId)
            menulog.debug('IndexError add maybe')
            return
        self.startId = self.nowId
        if thisday in self.cache:
            menulog.info(u'更新%s的菜单id为%s'% (thisday, self.nowId))
        self.cache[thisday] = self.nowId
        menulog.info('find %d'% self.nowId)

    def _parseMenuDate(self, text):
        """Return the date shown on a menu page as a yymmdd integer.

        Raises IndexError when an expected fragment is missing from the page.
        """
        year = re.findall(pattern_year, text)[0]
        monthday = re.findall(pattern_month, text)
        if monthday[0] == '0' and len(monthday) > 2:
            # The month's two digits are split across two spans ("0", "9").
            month = monthday[0] + monthday[1]
            dayIndex = 2
        else:
            month = monthday[0]
            dayIndex = 1
        # NOTE(review): the file's indentation was lost; the fallback below is
        # assumed to belong to this test (no span holds the day).
        if len(monthday) > dayIndex:
            day = monthday[dayIndex]
            if len(day) == 1:
                # Handles "1</span>...>5<day>": the second digit sits outside the span.
                # The month can be split the same way.
                day += re.findall(pattern_day2, text)[0]
        else:
            day = re.findall(pattern_day, text)[0]
        # Month in which the menu was published, used to detect a year rollover.
        update_month = re.findall(pattern_month_update, text)[0]
        if int(update_month) == 12 and int(month) == 1:
            year = str(int(year) + 1)
        return self._dateKey(year, month, day)

    def _dateKey(self, year, month, day):
        """Combine digit strings into a yymmdd integer, zero-padding month and day."""
        # Without padding, ('15', '9', '5') would become 1595 instead of
        # 150905 and would never compare correctly against ``today``.
        return int(year + month.zfill(2) + day.zfill(2))

    def _saveState(self, db):
        """Persist the scan results and the sorted list of upcoming menu dates."""
        # startId is deliberately not advanced to max(self.maybe): the page
        # format changes too much and that could stall the crawl.
        if self.usedId > self.startId:
            menulog.info(u'更新起点至%d'% self.usedId)
            self.startId = self.usedId
        db['startId'] = str(self.startId)
        db['lastQuery'] = str(self.lastQuery)
        db['cache'] = str(self.cache)
        db['maybe'] = str(self.maybe)
        menulog.info(u'第%d次查找结束'% self.count)
        # Dates from today onward for which a menu has been found.
        future = sorted(day for day in self.cache if day >= self.today)
        db['future'] = str(future)
        menulog.info(u'更新今后已找到的菜单列表')
# Script entry point: build the crawler and enter its scheduling loop, which runs in an endless loop.
Background().schedule()