
Commit 985fd08

Fix cn_index.collector network error
1 parent b6c3154 commit 985fd08

1 file changed: +21 -6

scripts/data_collector/cn_index/collector.py

@@ -19,7 +19,7 @@
 sys.path.append(str(CUR_DIR.parent.parent))
 
 from data_collector.index import IndexBase
-from data_collector.utils import get_calendar_list, get_trading_date_by_shift
+from data_collector.utils import get_calendar_list, get_trading_date_by_shift, deco_retry
 
 
 NEW_COMPANIES_URL = "http://www.csindex.com.cn/uploads/file/autofile/cons/{index_code}cons.xls"
@@ -29,6 +29,22 @@
 # 2020-11-27 Announcement title change
 INDEX_CHANGES_URL = "http://www.csindex.com.cn/zh-CN/search/total?key=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89"
 
+REQ_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48"
+}
+
+
+@deco_retry
+def retry_request(url: str, method: str = "get", exclude_status: List = None):
+    if exclude_status is None:
+        exclude_status = []
+    method_func = getattr(requests, method)
+    _resp = method_func(url, headers=REQ_HEADERS)
+    _status = _resp.status_code
+    if _status not in exclude_status and _status != 200:
+        raise ValueError(f"response status: {_status}, url={url}")
+    return _resp
+
 
 class CSIIndex(IndexBase):
     @property
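The retry logic itself comes from deco_retry, imported above from data_collector.utils; its body is outside this diff. As a minimal sketch of what a decorator like that typically does, assuming a fixed attempt count and a sleep between attempts (names and defaults here are illustrative, not qlib's actual implementation):

import functools
import time

def deco_retry_sketch(func, retries: int = 5, sleep: float = 3.0):
    # Illustrative stand-in for data_collector.utils.deco_retry; the real
    # decorator may differ in signature, attempt count, and backoff policy.
    @functools.wraps(func)
    def inner(*args, **kwargs):
        last_exc = None
        for _ in range(retries):
            try:
                return func(*args, **kwargs)  # success: stop retrying
            except Exception as exc:  # e.g. the ValueError raised on a bad status
                last_exc = exc
                time.sleep(sleep)  # back off before the next attempt
        raise last_exc  # every attempt failed
    return inner

Applied bare, as @deco_retry is above, a decorator of this shape turns each non-200 response into another attempt and re-raises only once the attempts are exhausted.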
@@ -137,9 +153,8 @@ def _read_change_from_url(self, url: str) -> pd.DataFrame:
                 date: pd.Timestamp
                 type: str, value from ["add", "remove"]
         """
-        resp = requests.get(url)
+        resp = retry_request(url)
         _text = resp.text
-
         date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
         if len(date_list) >= 2:
             add_date = pd.Timestamp("-".join(date_list[0]))
@@ -150,7 +165,7 @@ def _read_change_from_url(self, url: str) -> pd.DataFrame:
         logger.info(f"get {add_date} changes")
         try:
             excel_url = re.findall('.*href="(.*?xls.*?)".*', _text)[0]
-            content = requests.get(f"http://www.csindex.com.cn{excel_url}").content
+            content = retry_request(f"http://www.csindex.com.cn{excel_url}", exclude_status=[404]).content
             _io = BytesIO(content)
             df_map = pd.read_excel(_io, sheet_name=None)
             with self.cache_dir.joinpath(
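Note the exclude_status=[404] on the spreadsheet download: some announcements link to files that no longer exist, and retrying a 404 would only delay the surrounding try/except from handling the miss. A hedged usage sketch (the URL is hypothetical):

# Illustration only: with 404 excluded, retry_request returns the response
# instead of raising, so no retry budget is spent on a missing file.
resp = retry_request("http://www.csindex.com.cn/uploads/missing.xls", exclude_status=[404])
if resp.status_code == 404:
    ...  # caller-side handling of the absent spreadsheet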
@@ -204,7 +219,7 @@ def _get_change_notices_url(self) -> List[str]:
         -------
             [url1, url2]
         """
-        resp = requests.get(self.changes_url)
+        resp = retry_request(self.changes_url)
         html = etree.HTML(resp.text)
         return html.xpath("//*[@id='itemContainer']//li/a/@href")
 
@@ -224,7 +239,7 @@ def get_new_companies(self) -> pd.DataFrame:
                 end_date: pd.Timestamp
         """
         logger.info("get new companies......")
-        context = requests.get(self.new_companies_url).content
+        context = retry_request(self.new_companies_url).content
         with self.cache_dir.joinpath(
             f"{self.index_name.lower()}_new_companies.{self.new_companies_url.split('.')[-1]}"
         ).open("wb") as fp:
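Because retry_request resolves the HTTP verb via getattr(requests, method), the helper is not tied to the GET calls in this commit. A small usage sketch, with a hypothetical index_code value:

# Illustration only: any attribute of requests ("get", "post", "head", ...)
# can be passed as method; this file only uses the default "get".
resp = retry_request(INDEX_CHANGES_URL, method="get")
head = retry_request(NEW_COMPANIES_URL.format(index_code="000300"), method="head")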
