@@ -19,7 +19,7 @@
 sys.path.append(str(CUR_DIR.parent.parent))
 
 from data_collector.index import IndexBase
-from data_collector.utils import get_calendar_list, get_trading_date_by_shift
+from data_collector.utils import get_calendar_list, get_trading_date_by_shift, deco_retry
 
 
 NEW_COMPANIES_URL = "http://www.csindex.com.cn/uploads/file/autofile/cons/{index_code}cons.xls"
@@ -29,6 +29,22 @@
 # 2020-11-27 Announcement title change
 INDEX_CHANGES_URL = "http://www.csindex.com.cn/zh-CN/search/total?key=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89"
 
+REQ_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48"
+}
+
+
+@deco_retry
+def retry_request(url: str, method: str = "get", exclude_status: List = None):
+    if exclude_status is None:
+        exclude_status = []
+    method_func = getattr(requests, method)
+    _resp = method_func(url, headers=REQ_HEADERS)
+    _status = _resp.status_code
+    if _status not in exclude_status and _status != 200:
+        raise ValueError(f"response status: {_status}, url={url}")
+    return _resp
+
 
 class CSIIndex(IndexBase):
     @property
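The new retry_request helper is wrapped with deco_retry, imported from data_collector.utils; the decorator's implementation is not part of this diff. A minimal sketch of the shape it presumably has (retry on exception with a short pause; the real helper in data_collector/utils.py may differ in retry count, backoff, and logging):

    import time
    import functools

    def deco_retry(func, retry: int = 5):  # hypothetical signature, for illustration only
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, retry + 1):
                try:
                    return func(*args, **kwargs)  # success: hand back the response
                except Exception:
                    if attempt == retry:
                        raise  # attempts exhausted: surface the last error
                    time.sleep(1)  # brief pause before retrying
        return wrapper

Under that assumption, retry_request(url) is re-issued whenever its ValueError for a non-200 status fires, instead of failing on the first transient error. The remaining hunks below switch each requests.get call site over to this helper.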
@@ -137,9 +153,8 @@ def _read_change_from_url(self, url: str) -> pd.DataFrame:
                 date: pd.Timestamp
                 type: str, value from ["add", "remove"]
         """
-        resp = requests.get(url)
+        resp = retry_request(url)
         _text = resp.text
-
        date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
         if len(date_list) >= 2:
             add_date = pd.Timestamp("-".join(date_list[0]))
@@ -150,7 +165,7 @@ def _read_change_from_url(self, url: str) -> pd.DataFrame:
         logger.info(f"get {add_date} changes")
         try:
             excel_url = re.findall('.*href="(.*?xls.*?)".*', _text)[0]
-            content = requests.get(f"http://www.csindex.com.cn{excel_url}").content
+            content = retry_request(f"http://www.csindex.com.cn{excel_url}", exclude_status=[404]).content
             _io = BytesIO(content)
             df_map = pd.read_excel(_io, sheet_name=None)
             with self.cache_dir.joinpath(
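Note the exclude_status=[404] on the spreadsheet download above: a 404 response is returned to the caller rather than raised, so a missing .xls file does not consume the retry loop; presumably the subsequent pd.read_excel failure is then caught by the except clause of the surrounding try in the unchanged code. A hedged usage sketch (the URL here is made up for illustration):

    # Any non-200 status other than 404 raises inside retry_request and is
    # retried by deco_retry; a 404 simply comes back as a normal response.
    resp = retry_request("http://www.csindex.com.cn/some/changes.xls", exclude_status=[404])
    if resp.status_code == 404:
        ...  # caller-side handling, e.g. treat as "no spreadsheet attached"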
@@ -204,7 +219,7 @@ def _get_change_notices_url(self) -> List[str]:
         -------
             [url1, url2]
         """
-        resp = requests.get(self.changes_url)
+        resp = retry_request(self.changes_url)
         html = etree.HTML(resp.text)
         return html.xpath("//*[@id='itemContainer']//li/a/@href")
 
@@ -224,7 +239,7 @@ def get_new_companies(self) -> pd.DataFrame:
             end_date: pd.Timestamp
         """
         logger.info("get new companies......")
-        context = requests.get(self.new_companies_url).content
+        context = retry_request(self.new_companies_url).content
         with self.cache_dir.joinpath(
             f"{self.index_name.lower()}_new_companies.{self.new_companies_url.split('.')[-1]}"
         ).open("wb") as fp: