Skip to content

Commit 4ec41ea

Browse files
zhupryou-n-g
authored and committed
Add a check if change is mutated to YahooNormalize1d
1 parent 6d91f28 commit 4ec41ea

File tree

2 files changed

+22
-6
lines changed

2 files changed

+22
-6
lines changed

scripts/data_collector/contrib/future_trading_date_collector/future_trading_date_collector.py

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def future_calendar_collector(qlib_dir: [str, Path], freq: str = "day"):
7878
data_list.append(_row_data[0])
7979
data_list = sorted(data_list)
8080
date_list = generate_qlib_calendar(data_list, freq=freq)
81+
date_list = sorted(set(daily_calendar.loc[:, 0].values.tolist() + date_list))
8182
write_calendar_to_qlib(qlib_dir, date_list, freq=freq)
8283
bs.logout()
8384
logger.info(f"get trading dates success: {start_year}-01-01 to {end_year}-12-31")

scripts/data_collector/yahoo/collector.py

+21-6
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,16 @@ class YahooNormalize(BaseNormalize):
283283
COLUMNS = ["open", "close", "high", "low", "volume"]
284284
DAILY_FORMAT = "%Y-%m-%d"
285285

286+
@staticmethod
287+
def calc_change(df: pd.DataFrame, last_close: float) -> pd.Series:
288+
df = df.copy()
289+
_tmp_series = df["close"].fillna(method="ffill")
290+
_tmp_shift_series = _tmp_series.shift(1)
291+
if last_close is not None:
292+
_tmp_shift_series.iloc[0] = float(last_close)
293+
change_series = _tmp_series / _tmp_shift_series - 1
294+
return change_series
295+
286296
@staticmethod
287297
def normalize_yahoo(
288298
df: pd.DataFrame,
@@ -310,11 +320,16 @@ def normalize_yahoo(
310320
)
311321
df.sort_index(inplace=True)
312322
df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), set(df.columns) - {symbol_field_name}] = np.nan
313-
_tmp_series = df["close"].fillna(method="ffill")
314-
_tmp_shift_series = _tmp_series.shift(1)
315-
if last_close is not None:
316-
_tmp_shift_series.iloc[0] = float(last_close)
317-
df["change"] = _tmp_series / _tmp_shift_series - 1
323+
324+
change_series = YahooNormalize.calc_change(df, last_close)
325+
# NOTE: The data obtained by Yahoo finance sometimes has exceptions
326+
# WARNING: If it is normal for a `symbol(exchange)` to differ by a factor of *89* to *111* for consecutive trading days,
327+
# WARNING: the logic in the following line needs to be modified
328+
_mask = (change_series >= 89) & (change_series <= 111)
329+
_tmp_cols = ["high", "close", "low", "open", "adjclose"]
330+
df.loc[_mask, _tmp_cols] = df.loc[_mask, _tmp_cols] / 100
331+
df["change"] = YahooNormalize.calc_change(df, last_close)
332+
318333
columns += ["change"]
319334
df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), columns] = np.nan
320335

@@ -852,7 +867,7 @@ def normalize_data(
852867
if self.interval.lower() == "1min":
853868
if qlib_data_1d_dir is None or not Path(qlib_data_1d_dir).expanduser().exists():
854869
raise ValueError(
855-
"If normalize 1min, the qlib_data_1d_dir parameter must be set: --qlib_data_1d_dir <user qlib 1d data >, Reference: https://github.com/zhupr/qlib/tree/support_extend_data/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance"
870+
"If normalize 1min, the qlib_data_1d_dir parameter must be set: --qlib_data_1d_dir <user qlib 1d data >, Reference: https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance"
856871
)
857872
super(Run, self).normalize_data(
858873
date_field_name, symbol_field_name, end_date=end_date, qlib_data_1d_dir=qlib_data_1d_dir

0 commit comments

Comments
 (0)