From 192ddc8fa4398811f655329366adddd473746a76 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 19 Oct 2023 21:33:41 +0800 Subject: [PATCH 01/34] improve pit performance --- .gitignore | 1 + qlib/data/base.py | 4 +- qlib/data/cache.py | 4 ++ qlib/data/data.py | 114 ++++++++++++++++++++++------------------- qlib/data/pit.py | 44 ++++++++++------ qlib/utils/__init__.py | 33 ++++++++++++ 6 files changed, 129 insertions(+), 71 deletions(-) diff --git a/.gitignore b/.gitignore index 8854c25e99..0be5d251a6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ __pycache__/ _build build/ dist/ +tests/test_pit_data/ *.pkl *.hd5 diff --git a/qlib/data/base.py b/qlib/data/base.py index 496ae38ee2..f7d3df682a 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -267,10 +267,10 @@ class PFeature(Feature): def __str__(self): return "$$" + self._name - def _load_internal(self, instrument, start_index, end_index, cur_time, period=None): + def _load_internal(self, instrument, start_index, end_index, cur_time, period=None, start_time=None): from .data import PITD # pylint: disable=C0415 - return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period) + return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period, start_time) class ExpressionOps(Expression): diff --git a/qlib/data/cache.py b/qlib/data/cache.py index 3264dcd020..f1e6bd3764 100644 --- a/qlib/data/cache.py +++ b/qlib/data/cache.py @@ -160,6 +160,7 @@ def __init__(self, mem_cache_size_limit=None, limit_type="length"): self.__calendar_mem_cache = klass(size_limit) self.__instrument_mem_cache = klass(size_limit) self.__feature_mem_cache = klass(size_limit) + self.__pit_mem_cache = klass(size_limit) def __getitem__(self, key): if key == "c": @@ -168,6 +169,8 @@ def __getitem__(self, key): return self.__instrument_mem_cache elif key == "f": return self.__feature_mem_cache + elif key == "p": + return self.__pit_mem_cache else: raise KeyError("Unknown memcache unit") @@ -175,6 +178,7 @@ def clear(self): self.__calendar_mem_cache.clear() self.__instrument_mem_cache.clear() self.__feature_mem_cache.clear() + self.__pit_mem_cache.clear() class MemCacheExpire: diff --git a/qlib/data/data.py b/qlib/data/data.py index 116827f232..22b2f9c47b 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -33,8 +33,7 @@ normalize_cache_fields, code_to_fname, time_to_slc_point, - read_period_data, - get_period_list, + get_period_list_by_offset, ) from ..utils.paral import ParallelExt from .ops import Operators # pylint: disable=W0611 # noqa: F401 @@ -746,13 +745,24 @@ class LocalPITProvider(PITProvider): # TODO: Add PIT backend file storage # NOTE: This class is not multi-threading-safe!!!! - def period_feature(self, instrument, field, start_index, end_index, cur_time, period=None): + def period_feature(self, instrument, field, start_offset, end_offset, cur_time, period=None, start_time=None): + """get raw data from PIT + we have 3 modes to query data from PIT, all method need current datetime + + 1. given period, return value observed at current datetime + return series with index as datetime + 2. given start_time, return value **observed by each day** from start_time to current datetime + return series with index as datetime + 3. given start_offset and end_offset, return period data between [-start_offset, end_offset] observed at current datetime + return series with index as period + + """ if not isinstance(cur_time, pd.Timestamp): raise ValueError( f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')" ) - assert end_index <= 0 # PIT don't support querying future data + assert end_offset <= 0 # PIT don't support querying future data DATA_RECORDS = [ ("date", C.pit_record_type["date"]), @@ -777,58 +787,56 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time, pe # self.period_index[field] = {} # For acceleration} - if not field.endswith("_q") and not field.endswith("_a"): - raise ValueError("period field must ends with '_q' or '_a'") + key = f"{instrument}.{field}" quarterly = field.endswith("_q") - index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" - data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" - if not (index_path.exists() and data_path.exists()): - raise FileNotFoundError("No file is found.") - # NOTE: The most significant performance loss is here. - # Does the acceleration that makes the program complicated really matters? - # - It makes parameters of the interface complicate - # - It does not performance in the optimal way (places all the pieces together, we may achieve higher performance) - # - If we design it carefully, we can go through for only once to get the historical evolution of the data. - # So I decide to deprecated previous implementation and keep the logic of the program simple - # Instead, I'll add a cache for the index file. - data = np.fromfile(data_path, dtype=DATA_RECORDS) - - # find all revision periods before `cur_time` - cur_time_int = int(cur_time.year) * 10000 + int(cur_time.month) * 100 + int(cur_time.day) - loc = np.searchsorted(data["date"], cur_time_int, side="right") - if loc <= 0: - return pd.Series(dtype=C.pit_record_type["value"]) - last_period = data["period"][:loc].max() # return the latest quarter - first_period = data["period"][:loc].min() - period_list = get_period_list(first_period, last_period, quarterly) + if key in H["f"]: + df = H["f"][key] + else: + if not field.endswith("_q") and not field.endswith("_a"): + raise ValueError("period field must ends with '_q' or '_a'") + index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" + data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" + if not (index_path.exists() and data_path.exists()): + raise FileNotFoundError("No file is found.") + ## get first period offset + ## NOTE: current index file return offset from a given period not date + ## so we cannot findout the offset by given date + ## stop using index in this version + # start_point = get_pitdata_offset(index_path, period, ) + data = np.fromfile(data_path, dtype=DATA_RECORDS) + df = pd.DataFrame(data, columns=[i[0] for i in DATA_RECORDS]) + df.sort_values(by=["date", "period"], inplace=True) + df["date"] = pd.to_datetime(df["date"].astype(str)) + H["f"][key] = df + + df_ret = df[(df["date"] <= cur_time)] + if df_ret.empty: + return pd.Series(dtype=VALUE_DTYPE) + # keep only the latest period value + df_ret = df_ret.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") + df_ret = df_ret.set_index("period") + # return df if period is not None: - # NOTE: `period` has higher priority than `start_index` & `end_index` - if period not in period_list: - return pd.Series(dtype=C.pit_record_type["value"]) - else: - period_list = [period] + retur = df[df["period"] == period].set_index("date")["value"] + elif start_time is not None: + # df is sorted by date, and the term whose period is monotonically non-decreasing is selected. + s_sign = pd.Series(False, index=df.index) + max_p = df["period"].iloc[0] + for i in range(0, len(s_sign)): + if df["period"].iloc[i] >= max_p: + s_sign.iloc[i] = True + max_p = df["period"].iloc[i] + df_sim = df[s_sign].drop_duplicates(subset=["date"], keep="last") + s_part = df_sim.set_index("date")[start_time:]["value"] + if start_time != s_part.index[0] and start_time >= df["date"].iloc[0]: + # add previous value to result to avoid nan in the first period + pre_value = pd.Series(df[df["date"] < start_time]["value"].iloc[-1], index=[start_time]) + s_part = pd.concat([pre_value, s_part]) + return s_part else: - period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index] - value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE) - for i, p in enumerate(period_list): - # last_period_index = self.period_index[field].get(period) # For acceleration - value[i], now_period_index = read_period_data( - index_path, data_path, p, cur_time_int, quarterly # , last_period_index # For acceleration - ) - # self.period_index[field].update({period: now_period_index}) # For acceleration - # NOTE: the index is period_list; So it may result in unexpected values(e.g. nan) - # when calculation between different features and only part of its financial indicator is published - series = pd.Series(value, index=period_list, dtype=VALUE_DTYPE) - - # {For acceleration - # if cur_index == end_index: - # self.all_fields.remove(field) - # if not len(self.all_fields): - # del self.all_fields - # del self.period_index - # For acceleration} - - return series + period_list = get_period_list_by_offset(df_ret.index[-1], -start_offset, quarterly) + retur = df_ret["value"].reindex(period_list, fill_value=np.nan) + return retur class LocalExpressionProvider(ExpressionProvider): diff --git a/qlib/data/pit.py b/qlib/data/pit.py index 33d5e0c5cc..9b5b7a88c0 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -24,31 +24,43 @@ class P(ElemOperator): def _load_internal(self, instrument, start_index, end_index, freq): _calendar = Cal.calendar(freq=freq) resample_data = np.empty(end_index - start_index + 1, dtype="float32") - - for cur_index in range(start_index, end_index + 1): - cur_time = _calendar[cur_index] - # To load expression accurately, more historical data are required - start_ws, end_ws = self.feature.get_extended_window_size() - if end_ws > 0: - raise ValueError( - "PIT database does not support referring to future period (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported" - ) - - # The calculated value will always the last element, so the end_offset is zero. + # To load expression accurately, more historical data are required + start_ws, end_ws = self.feature.get_extended_window_size() + # if start_ws = 0, means expression use only current data, so pit history data is not required + if start_ws == 0 and end_ws == 0: try: - s = self._load_feature(instrument, -start_ws, 0, cur_time) - resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan + # get start and end date + s = self._load_feature(instrument, 0, 0, _calendar[end_index], None, _calendar[start_index]) + # index in s may not in calendar, so we need to reindex it to continue date first + s = s.reindex(pd.date_range(start=s.iloc[0], end=_calendar[end_index])).fillna(method="ffill") + resample_data = s.reindex(_calendar[start_index : end_index + 1]).fillna(method="ffill").values except FileNotFoundError: get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") return pd.Series(dtype="float32", name=str(self)) + else: + for cur_index in range(start_index, end_index + 1): + cur_time = _calendar[cur_index] + + if end_ws > 0: + raise ValueError( + "PIT database does not support referring to future period (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported" + ) + + # The calculated value will always the last element, so the end_offset is zero. + try: + s = self._load_feature(instrument, -start_ws, 0, cur_time) + resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan + except FileNotFoundError: + get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") + return pd.Series(dtype="float32", name=str(self)) resample_series = pd.Series( resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self) ) return resample_series - def _load_feature(self, instrument, start_index, end_index, cur_time): - return self.feature.load(instrument, start_index, end_index, cur_time) + def _load_feature(self, instrument, start_index, end_index, cur_time, period=None, start_time=None): + return self.feature.load(instrument, start_index, end_index, cur_time, period, start_time) def get_longest_back_rolling(self): # The period data will collapse as a normal feature. So no extending and looking back @@ -67,5 +79,5 @@ def __init__(self, feature, period): def __str__(self): return f"{super().__str__()}[{self.period}]" - def _load_feature(self, instrument, start_index, end_index, cur_time): + def _load_feature(self, instrument, start_index, end_index, cur_time, period=None, start_time=None): return self.feature.load(instrument, start_index, end_index, cur_time, self.period) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 9e63c104a1..5bd51922d5 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -88,6 +88,39 @@ def get_period_list(first: int, last: int, quarterly: bool) -> List[int]: return res +def get_period_list_by_offset(last: int, offset: int, quarterly: bool) -> List[int]: + """ + This method will be used in PIT database. + It return all the possible values between `first(offset-last)` and `end` (first and end is included) + + Parameters + ---------- + offset: int + offset quarter or year from last + quarterly : bool + will it return quarterly index or yearly index. + + Returns + ------- + List[int] + the possible index between [first, last] + """ + + if not quarterly: + assert all(1900 <= x <= 2099 for x in (last,)), "invalid arguments" + return list(range(last - offset, last + 1)) + else: + assert all(190000 <= x <= 209904 for x in (last,)), "invalid arguments" + res = [] + # last minus offset quarters + for year in range(int(last // 100 - (offset // 4 + 1)), int(last // 100 + 1)): + for q in range(1, 5): + period = year * 100 + q + if period <= last: + res.append(year * 100 + q) + return res[len(res) - offset - 1 :] + + def get_period_offset(first_year, period, quarterly): if quarterly: offset = (period // 100 - first_year) * 4 + period % 100 - 1 From afff25752e028f6e14a7963d2a5ac89dd65aa0f7 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 20 Oct 2023 11:14:54 +0800 Subject: [PATCH 02/34] improve pit cache --- qlib/data/data.py | 30 ++++++++++++++--------- qlib/utils/__init__.py | 55 +++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index 22b2f9c47b..e9e0c803da 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -787,10 +787,10 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, # self.period_index[field] = {} # For acceleration} - key = f"{instrument}.{field}" + key = (instrument, field) quarterly = field.endswith("_q") - if key in H["f"]: - df = H["f"][key] + if key in H["p"]: + df = H["p"][key] else: if not field.endswith("_q") and not field.endswith("_a"): raise ValueError("period field must ends with '_q' or '_a'") @@ -809,12 +809,6 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, df["date"] = pd.to_datetime(df["date"].astype(str)) H["f"][key] = df - df_ret = df[(df["date"] <= cur_time)] - if df_ret.empty: - return pd.Series(dtype=VALUE_DTYPE) - # keep only the latest period value - df_ret = df_ret.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") - df_ret = df_ret.set_index("period") # return df if period is not None: retur = df[df["period"] == period].set_index("date")["value"] @@ -834,8 +828,22 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, s_part = pd.concat([pre_value, s_part]) return s_part else: - period_list = get_period_list_by_offset(df_ret.index[-1], -start_offset, quarterly) - retur = df_ret["value"].reindex(period_list, fill_value=np.nan) + df_remain = df[(df["date"] <= cur_time)] + if df_remain.empty: + return pd.Series(dtype=VALUE_DTYPE) + last_observe_date = df_remain["date"].iloc[-1] + # keep only the latest period value + df_remain = df_remain.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") + df_remain = df_remain.set_index("period") + + cache_key = (instrument, field, last_observe_date, start_offset, end_offset, quarterly) # f"{instrument}.{field}.{last_observe_date}.{start_offset}.{end_offset}.{quarterly}" + if cache_key in H["p"]: + retur = H["p"][cache_key] + else: + last_period = df_remain.index[-1] + period_list = get_period_list_by_offset(last_period, start_offset, end_offset, quarterly) + retur = df_remain["value"].reindex(period_list, fill_value=np.nan) + H["p"][cache_key] = retur return retur diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 5bd51922d5..b04b459f1d 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -25,7 +25,12 @@ from pathlib import Path from typing import List, Union, Optional, Callable from packaging import version -from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer +from .file import ( + get_or_create_path, + save_multiple_parts_file, + unpack_archive_with_buffer, + get_tmp_file_with_buffer, +) from ..config import C from ..log import get_module_logger, set_log_with_config @@ -37,7 +42,9 @@ #################### Server #################### def get_redis_connection(): """get redis connection instance.""" - return redis.StrictRedis(host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password) + return redis.StrictRedis( + host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password + ) #################### Data #################### @@ -88,7 +95,9 @@ def get_period_list(first: int, last: int, quarterly: bool) -> List[int]: return res -def get_period_list_by_offset(last: int, offset: int, quarterly: bool) -> List[int]: +def get_period_list_by_offset( + last: int, start_offset: int, end_offset: int, quarterly: bool +) -> List[int]: """ This method will be used in PIT database. It return all the possible values between `first(offset-last)` and `end` (first and end is included) @@ -105,20 +114,22 @@ def get_period_list_by_offset(last: int, offset: int, quarterly: bool) -> List[i List[int] the possible index between [first, last] """ - + assert end_offset <= 0 if not quarterly: assert all(1900 <= x <= 2099 for x in (last,)), "invalid arguments" - return list(range(last - offset, last + 1)) + return list(range(last + start_offset, last + 1 + end_offset)) else: assert all(190000 <= x <= 209904 for x in (last,)), "invalid arguments" res = [] # last minus offset quarters - for year in range(int(last // 100 - (offset // 4 + 1)), int(last // 100 + 1)): + for year in range( + int(last // 100 + start_offset // 4 - 1), int(last // 100 + 1) + end_offset + ): for q in range(1, 5): period = year * 100 + q if period <= last: res.append(year * 100 + q) - return res[len(res) - offset - 1 :] + return res[len(res) + start_offset - 1 : len(res) + end_offset + 1] def get_period_offset(first_year, period, quarterly): @@ -129,7 +140,9 @@ def get_period_offset(first_year, period, quarterly): return offset -def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None): +def read_period_data( + index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None +): """ At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). Only the updating info before cur_date or at cur_date will be used. @@ -180,7 +193,9 @@ def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly with open(data_path, "rb") as fd: while _next != NAN_INDEX: fd.seek(_next) - date, period, value, new_next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) + date, period, value, new_next = struct.unpack( + DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE)) + ) if date > cur_date_int: break prev_next = _next @@ -416,7 +431,9 @@ def get_date_range(trading_date, left_shift=0, right_shift=0, future=False): return calendar -def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None): +def get_date_by_shift( + trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None +): """get trading date with shift bias will cur_date e.g. : shift == 1, return next trading date shift == -1, return previous trading date @@ -449,7 +466,9 @@ def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq=" if clip_shift: shift_index = np.clip(shift_index, 0, len(cal) - 1) else: - raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range") + raise IndexError( + f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range" + ) return cal[shift_index] @@ -486,7 +505,11 @@ def transform_end_date(end_date=None, freq="day"): from ..data import D # pylint: disable=C0415 last_date = D.calendar(freq=freq)[-1] - if end_date is None or (str(end_date) == "-1") or (pd.Timestamp(last_date) < pd.Timestamp(end_date)): + if ( + end_date is None + or (str(end_date) == "-1") + or (pd.Timestamp(last_date) < pd.Timestamp(end_date)) + ): log.warning( "\nInfo: the end_date in the configuration file is {}, " "so the default last date {} is used.".format(end_date, last_date) @@ -602,7 +625,9 @@ def exists_qlib_data(qlib_dir): # check instruments code_names = set(map(lambda x: fname_to_code(x.name.lower()), features_dir.iterdir())) _instrument = instruments_dir.joinpath("all.txt") - miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names) + miss_code = set( + pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower) + ) - set(code_names) if miss_code and any(map(lambda x: "sht" not in x, miss_code)): return False @@ -838,7 +863,9 @@ def register(self, provider): self._provider = provider def __repr__(self): - return "{name}(provider={provider})".format(name=self.__class__.__name__, provider=self._provider) + return "{name}(provider={provider})".format( + name=self.__class__.__name__, provider=self._provider + ) def __getattr__(self, key): if self.__dict__.get("_provider", None) is None: From 6c214aabea84bec08c428bd05b39b420173b0acc Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 20 Oct 2023 11:16:36 +0800 Subject: [PATCH 03/34] lint --- qlib/data/data.py | 11 +++++++++-- qlib/utils/__init__.py | 42 ++++++++++-------------------------------- 2 files changed, 19 insertions(+), 34 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index e9e0c803da..bbfaeb9c12 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -835,8 +835,15 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, # keep only the latest period value df_remain = df_remain.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") df_remain = df_remain.set_index("period") - - cache_key = (instrument, field, last_observe_date, start_offset, end_offset, quarterly) # f"{instrument}.{field}.{last_observe_date}.{start_offset}.{end_offset}.{quarterly}" + + cache_key = ( + instrument, + field, + last_observe_date, + start_offset, + end_offset, + quarterly, + ) # f"{instrument}.{field}.{last_observe_date}.{start_offset}.{end_offset}.{quarterly}" if cache_key in H["p"]: retur = H["p"][cache_key] else: diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index b04b459f1d..66aa2ef07d 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -42,9 +42,7 @@ #################### Server #################### def get_redis_connection(): """get redis connection instance.""" - return redis.StrictRedis( - host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password - ) + return redis.StrictRedis(host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password) #################### Data #################### @@ -95,9 +93,7 @@ def get_period_list(first: int, last: int, quarterly: bool) -> List[int]: return res -def get_period_list_by_offset( - last: int, start_offset: int, end_offset: int, quarterly: bool -) -> List[int]: +def get_period_list_by_offset(last: int, start_offset: int, end_offset: int, quarterly: bool) -> List[int]: """ This method will be used in PIT database. It return all the possible values between `first(offset-last)` and `end` (first and end is included) @@ -122,9 +118,7 @@ def get_period_list_by_offset( assert all(190000 <= x <= 209904 for x in (last,)), "invalid arguments" res = [] # last minus offset quarters - for year in range( - int(last // 100 + start_offset // 4 - 1), int(last // 100 + 1) + end_offset - ): + for year in range(int(last // 100 + start_offset // 4 - 1), int(last // 100 + 1) + end_offset): for q in range(1, 5): period = year * 100 + q if period <= last: @@ -140,9 +134,7 @@ def get_period_offset(first_year, period, quarterly): return offset -def read_period_data( - index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None -): +def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None): """ At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). Only the updating info before cur_date or at cur_date will be used. @@ -193,9 +185,7 @@ def read_period_data( with open(data_path, "rb") as fd: while _next != NAN_INDEX: fd.seek(_next) - date, period, value, new_next = struct.unpack( - DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE)) - ) + date, period, value, new_next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) if date > cur_date_int: break prev_next = _next @@ -431,9 +421,7 @@ def get_date_range(trading_date, left_shift=0, right_shift=0, future=False): return calendar -def get_date_by_shift( - trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None -): +def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None): """get trading date with shift bias will cur_date e.g. : shift == 1, return next trading date shift == -1, return previous trading date @@ -466,9 +454,7 @@ def get_date_by_shift( if clip_shift: shift_index = np.clip(shift_index, 0, len(cal) - 1) else: - raise IndexError( - f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range" - ) + raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range") return cal[shift_index] @@ -505,11 +491,7 @@ def transform_end_date(end_date=None, freq="day"): from ..data import D # pylint: disable=C0415 last_date = D.calendar(freq=freq)[-1] - if ( - end_date is None - or (str(end_date) == "-1") - or (pd.Timestamp(last_date) < pd.Timestamp(end_date)) - ): + if end_date is None or (str(end_date) == "-1") or (pd.Timestamp(last_date) < pd.Timestamp(end_date)): log.warning( "\nInfo: the end_date in the configuration file is {}, " "so the default last date {} is used.".format(end_date, last_date) @@ -625,9 +607,7 @@ def exists_qlib_data(qlib_dir): # check instruments code_names = set(map(lambda x: fname_to_code(x.name.lower()), features_dir.iterdir())) _instrument = instruments_dir.joinpath("all.txt") - miss_code = set( - pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower) - ) - set(code_names) + miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names) if miss_code and any(map(lambda x: "sht" not in x, miss_code)): return False @@ -863,9 +843,7 @@ def register(self, provider): self._provider = provider def __repr__(self): - return "{name}(provider={provider})".format( - name=self.__class__.__name__, provider=self._provider - ) + return "{name}(provider={provider})".format(name=self.__class__.__name__, provider=self._provider) def __getattr__(self, key): if self.__dict__.get("_provider", None) is None: From a144bc9af7b4d2501014739341a49370a495b950 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 20 Oct 2023 13:29:01 +0800 Subject: [PATCH 04/34] deal with empty data --- qlib/data/data.py | 2 ++ qlib/data/pit.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index bbfaeb9c12..c15ff60885 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -822,6 +822,8 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, max_p = df["period"].iloc[i] df_sim = df[s_sign].drop_duplicates(subset=["date"], keep="last") s_part = df_sim.set_index("date")[start_time:]["value"] + if s_part.empty: + return pd.Series(dtype=VALUE_DTYPE) if start_time != s_part.index[0] and start_time >= df["date"].iloc[0]: # add previous value to result to avoid nan in the first period pre_value = pd.Series(df[df["date"] < start_time]["value"].iloc[-1], index=[start_time]) diff --git a/qlib/data/pit.py b/qlib/data/pit.py index 9b5b7a88c0..97a6dff938 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -31,8 +31,10 @@ def _load_internal(self, instrument, start_index, end_index, freq): try: # get start and end date s = self._load_feature(instrument, 0, 0, _calendar[end_index], None, _calendar[start_index]) + if len(s) == 0: + return pd.Series(dtype="float32", name=str(self)) # index in s may not in calendar, so we need to reindex it to continue date first - s = s.reindex(pd.date_range(start=s.iloc[0], end=_calendar[end_index])).fillna(method="ffill") + s = s.reindex(pd.date_range(start=s.index[0], end=_calendar[end_index])).fillna(method="ffill") resample_data = s.reindex(_calendar[start_index : end_index + 1]).fillna(method="ffill").values except FileNotFoundError: get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") From 3ed3f171f36ee05a1d5691fb55f1032cc9b157b7 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 26 Oct 2023 10:58:25 +0800 Subject: [PATCH 05/34] add pit backend: FilePITStorage --- qlib/data/data.py | 34 +++-- qlib/data/storage/file_storage.py | 209 +++++++++++++++++++++++++++++- qlib/data/storage/storage.py | 126 ++++++++++++++++++ tests/test_pit.py | 148 ++++++++++++++++----- 4 files changed, 463 insertions(+), 54 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index c15ff60885..86ddf893c5 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -47,7 +47,10 @@ class ProviderBackendMixin: def get_default_backend(self): backend = {} - provider_name: str = re.findall("[A-Z][^A-Z]*", self.__class__.__name__)[-2] + if hasattr(self, "provider_name"): + provider_name = getattr(self, "provider_name") + else: + provider_name: str = re.findall("[A-Z][^A-Z]*", self.__class__.__name__)[-2] # set default storage class backend.setdefault("class", f"File{provider_name}Storage") # set default storage module @@ -335,6 +338,10 @@ def feature(self, instrument, field, start_time, end_time, freq): class PITProvider(abc.ABC): + @property + def provider_name(self): + return "PIT" + @abc.abstractmethod def period_feature( self, @@ -741,10 +748,15 @@ def feature(self, instrument, field, start_index, end_index, freq): return self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1] -class LocalPITProvider(PITProvider): +class LocalPITProvider(PITProvider, ProviderBackendMixin): # TODO: Add PIT backend file storage # NOTE: This class is not multi-threading-safe!!!! + def __init__(self, remote=False, backend={}): + super().__init__() + self.remote = remote + self.backend = backend + def period_feature(self, instrument, field, start_offset, end_offset, cur_time, period=None, start_time=None): """get raw data from PIT we have 3 modes to query data from PIT, all method need current datetime @@ -764,17 +776,11 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, assert end_offset <= 0 # PIT don't support querying future data - DATA_RECORDS = [ - ("date", C.pit_record_type["date"]), - ("period", C.pit_record_type["period"]), - ("value", C.pit_record_type["value"]), - ("_next", C.pit_record_type["index"]), - ] - VALUE_DTYPE = C.pit_record_type["value"] - field = str(field).lower()[2:] instrument = code_to_fname(instrument) + backend_obj = self.backend_obj(instrument=instrument, field=field) + # {For acceleration # start_index, end_index, cur_index = kwargs["info"] # if cur_index == start_index: @@ -803,8 +809,8 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, ## so we cannot findout the offset by given date ## stop using index in this version # start_point = get_pitdata_offset(index_path, period, ) - data = np.fromfile(data_path, dtype=DATA_RECORDS) - df = pd.DataFrame(data, columns=[i[0] for i in DATA_RECORDS]) + data = backend_obj.np_data() + df = pd.DataFrame(data) df.sort_values(by=["date", "period"], inplace=True) df["date"] = pd.to_datetime(df["date"].astype(str)) H["f"][key] = df @@ -823,7 +829,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, df_sim = df[s_sign].drop_duplicates(subset=["date"], keep="last") s_part = df_sim.set_index("date")[start_time:]["value"] if s_part.empty: - return pd.Series(dtype=VALUE_DTYPE) + return pd.Series(index=backend_obj.columns, dtype="float64") if start_time != s_part.index[0] and start_time >= df["date"].iloc[0]: # add previous value to result to avoid nan in the first period pre_value = pd.Series(df[df["date"] < start_time]["value"].iloc[-1], index=[start_time]) @@ -832,7 +838,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, else: df_remain = df[(df["date"] <= cur_time)] if df_remain.empty: - return pd.Series(dtype=VALUE_DTYPE) + return pd.Series(index=backend_obj.columns, dtype="float64") last_observe_date = df_remain["date"].iloc[-1] # keep only the latest period value df_remain = df_remain.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 8a100a2d19..2d36fe3bef 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -7,13 +7,21 @@ import numpy as np import pandas as pd +from qlib.data.storage.storage import PITStorage from qlib.utils.time import Freq from qlib.utils.resam import resam_calendar from qlib.config import C from qlib.data.cache import H from qlib.log import get_module_logger -from qlib.data.storage import CalendarStorage, InstrumentStorage, FeatureStorage, CalVT, InstKT, InstVT +from qlib.data.storage import ( + CalendarStorage, + InstrumentStorage, + FeatureStorage, + CalVT, + InstKT, + InstVT, +) logger = get_module_logger("file_storage") @@ -48,7 +56,10 @@ def support_freq(self) -> List[str]: if len(self.provider_uri) == 1 and C.DEFAULT_FREQ in self.provider_uri: freq_l = filter( lambda _freq: not _freq.endswith("_future"), - map(lambda x: x.stem, self.dpm.get_data_uri(C.DEFAULT_FREQ).joinpath("calendars").glob("*.txt")), + map( + lambda x: x.stem, + self.dpm.get_data_uri(C.DEFAULT_FREQ).joinpath("calendars").glob("*.txt"), + ), ) else: freq_l = self.provider_uri.keys() @@ -140,7 +151,10 @@ def data(self) -> List[CalVT]: _calendar = self._read_calendar() if Freq(self._freq_file) != Freq(self.freq): _calendar = resam_calendar( - np.array(list(map(pd.Timestamp, _calendar))), self._freq_file, self.freq, self.region + np.array(list(map(pd.Timestamp, _calendar))), + self._freq_file, + self.freq, + self.region, ) return _calendar @@ -287,6 +301,7 @@ def __init__(self, instrument: str, field: str, freq: str, provider_uri: dict = super(FileFeatureStorage, self).__init__(instrument, field, freq, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.file_name = f"{instrument.lower()}/{field.lower()}.{freq.lower()}.bin" + self._start_index = None def clear(self): with self.uri.open("wb") as _: @@ -303,6 +318,7 @@ def write(self, data_array: Union[List, np.ndarray], index: int = None) -> None: "if you need to clear the FeatureStorage, please execute: FeatureStorage.clear" ) return + self._start_index = None if not self.uri.exists(): # write index = 0 if index is None else index @@ -320,7 +336,9 @@ def write(self, data_array: Union[List, np.ndarray], index: int = None) -> None: _old_data = np.fromfile(fp, dtype=" None: def start_index(self) -> Union[int, None]: if not self.uri.exists(): return None - with self.uri.open("rb") as fp: - index = int(np.frombuffer(fp.read(4), dtype=" Union[int, None]: @@ -377,3 +396,179 @@ def __getitem__(self, i: Union[int, slice]) -> Union[Tuple[int, float], pd.Serie def __len__(self) -> int: self.check() return self.uri.stat().st_size // 4 - 1 + + +class FilePITStorage(FileStorageMixin, PITStorage): + """PIT data is a special case of Feature data, it looks like + + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + + It is sorted by [date, period]. + + next field currently is not used. just for forward compatible. + """ + + # NOTE: + # PIT data should have two files, one is the index file, the other is the data file. + + # pesudo code: + # date_index = calendar.index(date) + # data_start_index, data_end_index = index_file[date_index] + # data = data_file[data_start_index:data_end_index] + + # the index file is like feature's data file, but given a start index in index file, it will return the first and the last observe index of the data file. + # the data file has tree columns, the first column is observe date, the second column is financial period, the third column is the value. + + # so given start and end date, we can get the start_index and end_index from calendar. + # use it to read two line from index file, then we can get the start and end index of the data file. + + # but consider this implementation, we will create a index file which will have 50 times lines than the data file. Is it a good idea? + # if we just create a index file the same line with data file, we have to read the whole index file for any time slice search, so why not read whole data file? + + def __init__(self, instrument: str, field: str, freq: str = "day", provider_uri: dict = None, **kwargs): + super(FilePITStorage, self).__init__(instrument, field, freq, **kwargs) + + if not field.endswith("_q") and not field.endswith("_a"): + raise ValueError("period field must ends with '_q' or '_a'") + self.quarterly = field.endswith("_q") + + self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) + self.file_name = f"{instrument.lower()}/{field.lower()}.data" + self.raw_dtype = [ + ("date", C.pit_record_type["date"]), + ("period", C.pit_record_type["period"]), + ("value", C.pit_record_type["value"]), + ("_next", C.pit_record_type["index"]), # not used in current implementation + ] + self.dtypes = np.dtype(self.raw_dtype) + self.itemsize = self.dtypes.itemsize + self.dtype_string = "".join([i[1] for i in self.raw_dtype]) + self.columns = [i[0] for i in self.raw_dtype] + + @property + def uri(self) -> Path: + if self.freq not in self.support_freq: + raise ValueError(f"{self.storage_name}: {self.provider_uri} does not contain data for {self.freq}") + return self.dpm.get_data_uri(self.freq).joinpath(f"{self.storage_name}", self.file_name) + + def clear(self): + with self.uri.open("wb") as _: + pass + + @property + def data(self) -> pd.DataFrame: + return self[:] + + def update(self, data_array: np.ndarray) -> None: + """update data to storage, replace current data from start_date to end_date with given data_array + + Args: + data_array: Structured arrays contains date, period, value and next. same with self.raw_dtype + """ + if not self.uri.exists(): + # write + index = 0 + else: + # sort it + data_array = np.sort(data_array, order=["date", "period"]) + # get index + update_start_date = data_array[0][0] + update_end_date = data_array[-1][0] + current_data = self.np_data() + index = (current_data["date"] >= update_start_date).argmax() + end_index = (current_data["date"] > update_end_date).argmax() + new_data = np.concatenate([data_array, current_data[end_index:]]) + self.write(new_data, index) + + def write(self, data_array: np.ndarray, index: int = None) -> None: + """write data to storage at specific index + + Args: + data_array: Structured arrays contains date, period, value and next + index: _description_. Defaults to None. + """ + + if len(data_array) == 0: + logger.info( + "len(data_array) == 0, write" + "if you need to clear the FeatureStorage, please execute: FeatureStorage.clear" + ) + return + + # sort data_array with first 2 columns + data_array = np.sort(data_array, order=["date", "period"]) + + if not self.uri.exists(): + # write + index = 0 if index is None else index + with self.uri.open("wb") as fp: + data_array.tofile(self.uri) + else: + with self.uri.open("rb+") as fp: + fp.seek(index * self.itemsize) + data_array.tofile(fp) + + @property + def start_index(self) -> Union[int, None]: + return 0 + + @property + def end_index(self) -> Union[int, None]: + if not self.uri.exists(): + return None + # The next data appending index point will be `end_index + 1` + return self.start_index + len(self) - 1 + + def np_data(self, i: Union[int, slice] = None) -> np.ndarray: + if not self.uri.exists(): + if isinstance(i, int): + return None, None + elif isinstance(i, slice): + return pd.Series(dtype=np.float32) + else: + raise TypeError(f"type(i) = {type(i)}") + + if i is None: + i = slice(None, None) + storage_start_index = self.start_index + storage_end_index = self.end_index + with self.uri.open("rb") as fp: + if isinstance(i, int): + if storage_start_index > i: + raise IndexError(f"{i}: start index is {storage_start_index}") + fp.seek(i * self.itemsize) + return np.array([struct.unpack(self.dtype_string, fp.read(self.itemsize))], dtype=self.dtypes) + elif isinstance(i, slice): + start_index = storage_start_index if i.start is None else i.start + end_index = storage_end_index if i.stop is None else i.stop - 1 + si = max(start_index, storage_start_index) + if si > end_index: + return pd.Series(dtype=np.float32) + fp.seek(start_index * self.itemsize) + # read n bytes + count = end_index - si + 1 + data = np.frombuffer(fp.read(self.itemsize * count), dtype=self.dtypes) + return data + else: + raise TypeError(f"type(i) = {type(i)}") + + def __getitem__(self, i: Union[int, slice]) -> Union[Tuple[int, float], pd.DataFrame]: + if isinstance(i, int): + return pd.Series(self.np_data(i), index=self.columns, name=i) + elif isinstance(i, slice): + data = self.np_data(i) + si = self.start_index if i.start is None else i.start + if si < 0: + si = len(self) + si + return pd.DataFrame(data, index=pd.RangeIndex(si, si + len(data)), columns=self.columns) + else: + raise TypeError(f"type(i) = {type(i)}") + + def __len__(self) -> int: + self.check() + return self.uri.stat().st_size // self.itemsize diff --git a/qlib/data/storage/storage.py b/qlib/data/storage/storage.py index 2eb7da1de6..0d0ee0e7eb 100644 --- a/qlib/data/storage/storage.py +++ b/qlib/data/storage/storage.py @@ -492,3 +492,129 @@ def __len__(self) -> int: """ raise NotImplementedError("Subclass of FeatureStorage must implement `__len__` method") + + +class PITStorage(FeatureStorage): + @property + def storage_name(self) -> str: + return "financial" # for compatibility + + @property + def data(self) -> pd.DataFrame: + """get all data + + dataframe index is date, columns are report_period and value + + Notes + ------ + if data(storage) does not exist, return empty pd.DataFrame: `return pd.DataFrame(dtype=np.float32)` + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `data` method") + + def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): + """Write data_array to FeatureStorage starting from index. + + Notes + ------ + If index is None, append data_array to feature. + + If len(data_array) == 0; return + + If (index - self.end_index) >= 1, self[end_index+1: index] will be filled with np.nan + + Examples + --------- + .. code-block:: + + feature: + 3 4 + 4 5 + 5 6 + + + >>> self.write([6, 7], index=6) + + feature: + 3 4 + 4 5 + 5 6 + 6 6 + 7 7 + + >>> self.write([8], index=9) + + feature: + 3 4 + 4 5 + 5 6 + 6 6 + 7 7 + 8 np.nan + 9 8 + + >>> self.write([1, np.nan], index=3) + + feature: + 3 1 + 4 np.nan + 5 6 + 6 6 + 7 7 + 8 np.nan + 9 8 + + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") + + def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): + """overwrite all data in FeatureStorage with data + + Parameters + ---------- + data: Union[List, np.ndarray, Tuple] + data + index: int + data start index + """ + self.clear() + self.write(data, index) + + @overload + def __getitem__(self, s: slice) -> pd.Series: + """x.__getitem__(slice(start: int, stop: int, step: int)) <==> x[start:stop:step] + + Returns + ------- + pd.Series(values, index=pd.RangeIndex(start, len(values)) + """ + + @overload + def __getitem__(self, i: int) -> Tuple[int, float]: + """x.__getitem__(y) <==> x[y]""" + + def __getitem__(self, i) -> Union[Tuple[int, float], pd.Series]: + """x.__getitem__(y) <==> x[y] + + Notes + ------- + if data(storage) does not exist: + if isinstance(i, int): + return (None, None) + if isinstance(i, slice): + # return empty pd.Series + return pd.Series(dtype=np.float32) + """ + raise NotImplementedError( + "Subclass of FeatureStorage must implement `__getitem__(i: int)`/`__getitem__(s: slice)` method" + ) + + def __len__(self) -> int: + """ + + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `__len__` method") diff --git a/tests/test_pit.py b/tests/test_pit.py index 8320e1d361..26655b85ab 100644 --- a/tests/test_pit.py +++ b/tests/test_pit.py @@ -3,6 +3,8 @@ import sys + +import numpy as np import qlib import shutil import unittest @@ -12,6 +14,7 @@ from pathlib import Path from qlib.data import D +from qlib.data.storage.file_storage import FilePITStorage from qlib.tests.data import GetData sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) @@ -32,37 +35,37 @@ class TestPIT(unittest.TestCase): - @classmethod - def tearDownClass(cls) -> None: - shutil.rmtree(str(DATA_DIR.resolve())) - - @classmethod - def setUpClass(cls) -> None: - cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) - pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) - pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) - GetData().qlib_data( - name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True - ) - GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) - - # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. - # bs.login() - # Run( - # source_dir=pit_dir, - # interval="quarterly", - # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") - # bs.logout() - - Run( - source_dir=pit_dir, - normalize_dir=pit_normalized_dir, - interval="quarterly", - ).normalize_data() - DumpPitData( - csv_path=pit_normalized_dir, - qlib_dir=cn_data_dir, - ).dump(interval="quarterly") + # @classmethod + # def tearDownClass(cls) -> None: + # shutil.rmtree(str(DATA_DIR.resolve())) + + # @classmethod + # def setUpClass(cls) -> None: + # cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) + # pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) + # pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) + # GetData().qlib_data( + # name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True + # ) + # GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) + + # # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. + # # bs.login() + # # Run( + # # source_dir=pit_dir, + # # interval="quarterly", + # # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") + # # bs.logout() + + # Run( + # source_dir=pit_dir, + # normalize_dir=pit_normalized_dir, + # interval="quarterly", + # ).normalize_data() + # DumpPitData( + # csv_path=pit_normalized_dir, + # qlib_dir=cn_data_dir, + # ).dump(interval="quarterly") def setUp(self): # qlib.init(kernels=1) # NOTE: set kernel to 1 to make it debug easier @@ -70,11 +73,84 @@ def setUp(self): qlib.init(provider_uri=provider_uri) def to_str(self, obj): - return "".join(str(obj).split()) + return "\n".join(str(obj).split()) def check_same(self, a, b): self.assertEqual(self.to_str(a), self.to_str(b)) + def test_storage_read(self): + s = FilePITStorage("sh600519", "roewa_q") + np_data = s.np_data(1) + self.assertEqual(np_data.shape, (1,)) + data = s.data + self.check_same( + data.head(), + """ + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + """, + ) + + def test_storage_write(self): + base = FilePITStorage("sh600519", "roewa_q") + s = FilePITStorage("sh600519", "roewa2_q") + + shutil.copy(base.uri, s.uri) + s.write( + np.array([(20070917, 200703, 0.239330, 0)], dtype=s.raw_dtype), + 1, + ) + data = s.data + self.check_same( + data.head(), + """ + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070917 200703 0.239330 0 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + """, + ) + + def test_storage_slice(self): + s = FilePITStorage("sh600519", "roewa_q") + data = s[1:4] + self.check_same( + data, + """ + date period value _next + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + """, + ) + + def test_storage_update(self): + base = FilePITStorage("sh600519", "roewa_q") + s = FilePITStorage("sh600519", "roewa3_q") + + shutil.copy(base.uri, s.uri) + s.update( + np.array([(20070917, 200703, 0.111111, 0), (20100314, 200703, 0.111111, 0)], dtype=s.raw_dtype), + ) + data = s.data + self.check_same( + data.head(), + """ + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20070917 200703 0.111111 0 + 3 20100314 200703 0.111111 0 + 4 20100402 200904 0.335461 4294967295 + """, + ) + def test_query(self): instruments = ["sh600519"] fields = ["P($$roewa_q)", "P($$yoyni_q)"] @@ -107,7 +183,13 @@ def test_query(self): def test_no_exist_data(self): fields = ["P($$roewa_q)", "P($$yoyni_q)", "$close"] - data = D.features(["sh600519", "sh601988"], fields, start_time="2019-01-01", end_time="2019-07-19", freq="day") + data = D.features( + ["sh600519", "sh601988"], + fields, + start_time="2019-01-01", + end_time="2019-07-19", + freq="day", + ) data["$close"] = 1 # in case of different dataset gives different values expect = """ P($$roewa_q) P($$yoyni_q) $close From 61c31ca8a480e68606c588ae49503e866838654a Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 26 Oct 2023 11:25:56 +0800 Subject: [PATCH 06/34] improve docstring --- qlib/data/storage/file_storage.py | 12 +++- qlib/data/storage/storage.py | 102 ++++++++++++++++++++---------- tests/test_pit.py | 62 +++++++++--------- 3 files changed, 111 insertions(+), 65 deletions(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 2d36fe3bef..a36d1cca60 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -490,7 +490,7 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: Args: data_array: Structured arrays contains date, period, value and next - index: _description_. Defaults to None. + index: target index to start writing. Defaults to None. """ if len(data_array) == 0: @@ -509,6 +509,8 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: with self.uri.open("wb") as fp: data_array.tofile(self.uri) else: + if index is None or index > self.end_index: + index = self.end_index + 1 with self.uri.open("rb+") as fp: fp.seek(index * self.itemsize) data_array.tofile(fp) @@ -525,6 +527,14 @@ def end_index(self) -> Union[int, None]: return self.start_index + len(self) - 1 def np_data(self, i: Union[int, slice] = None) -> np.ndarray: + """return numpy structured array + + Args: + i: index or slice. Defaults to None. + + Returns: + np.ndarray + """ if not self.uri.exists(): if isinstance(i, int): return None, None diff --git a/qlib/data/storage/storage.py b/qlib/data/storage/storage.py index 0d0ee0e7eb..acd6172ab5 100644 --- a/qlib/data/storage/storage.py +++ b/qlib/data/storage/storage.py @@ -495,10 +495,36 @@ def __len__(self) -> int: class PITStorage(FeatureStorage): + """PIT data is a special case of Feature data, it looks like + + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + + It is sorted by [date, period]. + + next field currently is not used. just for forward compatible. + """ + @property def storage_name(self) -> str: return "financial" # for compatibility + def np_data(self, i: Union[int, slice] = None) -> np.ndarray: + """return numpy structured array + + Args: + i: index or slice. Defaults to None. + + Returns: + np.ndarray + """ + + raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") + @property def data(self) -> pd.DataFrame: """get all data @@ -511,7 +537,7 @@ def data(self) -> pd.DataFrame: """ raise NotImplementedError("Subclass of FeatureStorage must implement `data` method") - def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): + def write(self, data_array: np.ndarray, index: int = None): """Write data_array to FeatureStorage starting from index. Notes @@ -526,42 +552,24 @@ def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): --------- .. code-block:: - feature: - 3 4 - 4 5 - 5 6 + pit data: + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 - >>> self.write([6, 7], index=6) + >>> s.write(np.array([(20070917, 200703, 0.239330, 0)], dtype=s.raw_dtype), 1) feature: - 3 4 - 4 5 - 5 6 - 6 6 - 7 7 - - >>> self.write([8], index=9) - - feature: - 3 4 - 4 5 - 5 6 - 6 6 - 7 7 - 8 np.nan - 9 8 - - >>> self.write([1, np.nan], index=3) - - feature: - 3 1 - 4 np.nan - 5 6 - 6 6 - 7 7 - 8 np.nan - 9 8 + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070917 200703 0.239330 0 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 """ raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") @@ -579,6 +587,34 @@ def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): self.clear() self.write(data, index) + def update(self, data_array: np.ndarray) -> None: + """update data to storage, replace current data from start_date to end_date with given data_array + + Args: + data_array: Structured arrays contains date, period, value and next. same with self.raw_dtype + + Examples + --------- + .. code-block:: + + pit data: + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + + >>> s.update(np.array([(20070917, 200703, 0.111111, 0), (20100314, 200703, 0.111111, 0)], dtype=s.raw_dtype)) + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20070917 200703 0.111111 0 + 3 20100314 200703 0.111111 0 + + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `update` method") + @overload def __getitem__(self, s: slice) -> pd.Series: """x.__getitem__(slice(start: int, stop: int, step: int)) <==> x[start:stop:step] diff --git a/tests/test_pit.py b/tests/test_pit.py index 26655b85ab..359be618dd 100644 --- a/tests/test_pit.py +++ b/tests/test_pit.py @@ -35,37 +35,37 @@ class TestPIT(unittest.TestCase): - # @classmethod - # def tearDownClass(cls) -> None: - # shutil.rmtree(str(DATA_DIR.resolve())) - - # @classmethod - # def setUpClass(cls) -> None: - # cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) - # pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) - # pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) - # GetData().qlib_data( - # name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True - # ) - # GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) - - # # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. - # # bs.login() - # # Run( - # # source_dir=pit_dir, - # # interval="quarterly", - # # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") - # # bs.logout() - - # Run( - # source_dir=pit_dir, - # normalize_dir=pit_normalized_dir, - # interval="quarterly", - # ).normalize_data() - # DumpPitData( - # csv_path=pit_normalized_dir, - # qlib_dir=cn_data_dir, - # ).dump(interval="quarterly") + @classmethod + def tearDownClass(cls) -> None: + shutil.rmtree(str(DATA_DIR.resolve())) + + @classmethod + def setUpClass(cls) -> None: + cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) + pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) + pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) + GetData().qlib_data( + name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True + ) + GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) + + # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. + # bs.login() + # Run( + # source_dir=pit_dir, + # interval="quarterly", + # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") + # bs.logout() + + Run( + source_dir=pit_dir, + normalize_dir=pit_normalized_dir, + interval="quarterly", + ).normalize_data() + DumpPitData( + csv_path=pit_normalized_dir, + qlib_dir=cn_data_dir, + ).dump(interval="quarterly") def setUp(self): # qlib.init(kernels=1) # NOTE: set kernel to 1 to make it debug easier From d82ab8d8bdcd4aee26f782aafef0cc9a9a713eca Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 26 Oct 2023 11:34:05 +0800 Subject: [PATCH 07/34] remove index file check --- qlib/data/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index 86ddf893c5..ac853fda64 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -800,9 +800,9 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, else: if not field.endswith("_q") and not field.endswith("_a"): raise ValueError("period field must ends with '_q' or '_a'") - index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" + # index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" - if not (index_path.exists() and data_path.exists()): + if not data_path.exists(): raise FileNotFoundError("No file is found.") ## get first period offset ## NOTE: current index file return offset from a given period not date From 87020494b99121288efa482b3aea5fe634c37a7a Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 26 Oct 2023 11:40:07 +0800 Subject: [PATCH 08/34] pit rewrite does not need index --- qlib/data/storage/storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/storage/storage.py b/qlib/data/storage/storage.py index acd6172ab5..d5151c7d3d 100644 --- a/qlib/data/storage/storage.py +++ b/qlib/data/storage/storage.py @@ -574,7 +574,7 @@ def write(self, data_array: np.ndarray, index: int = None): """ raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") - def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): + def rewrite(self, data: Union[List, np.ndarray, Tuple]): """overwrite all data in FeatureStorage with data Parameters @@ -585,7 +585,7 @@ def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): data start index """ self.clear() - self.write(data, index) + self.write(data, 0) def update(self, data_array: np.ndarray) -> None: """update data to storage, replace current data from start_date to end_date with given data_array From e07487def10da606b817c1be2eb173226bc1c620 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 31 Oct 2023 14:07:14 +0800 Subject: [PATCH 09/34] fix typo --- qlib/data/storage/file_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index a36d1cca60..f78a2f104e 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -507,7 +507,7 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: # write index = 0 if index is None else index with self.uri.open("wb") as fp: - data_array.tofile(self.uri) + data_array.tofile(fp) else: if index is None or index > self.end_index: index = self.end_index + 1 From 8d96bd651a9feca261b3754613340888fcce5508 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 31 Oct 2023 14:22:48 +0800 Subject: [PATCH 10/34] make sure dir exist --- qlib/data/storage/file_storage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index f78a2f104e..a94b6ff1f6 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -439,6 +439,7 @@ def __init__(self, instrument: str, field: str, freq: str = "day", provider_uri: self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.file_name = f"{instrument.lower()}/{field.lower()}.data" + self.uri.parent.mkdir(exist_ok=True) self.raw_dtype = [ ("date", C.pit_record_type["date"]), ("period", C.pit_record_type["period"]), From 4213b68c005540941590ff2933b074fc4c0b4217 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 31 Oct 2023 14:44:59 +0800 Subject: [PATCH 11/34] fix parents not exist --- qlib/data/storage/file_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index a94b6ff1f6..fe091a89ce 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -439,7 +439,7 @@ def __init__(self, instrument: str, field: str, freq: str = "day", provider_uri: self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.file_name = f"{instrument.lower()}/{field.lower()}.data" - self.uri.parent.mkdir(exist_ok=True) + self.uri.parent.mkdir(parents=True, exist_ok=True) self.raw_dtype = [ ("date", C.pit_record_type["date"]), ("period", C.pit_record_type["period"]), From 8a354efe0e7169930feeed68488ab7cbfd353e43 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 31 Oct 2023 17:08:19 +0800 Subject: [PATCH 12/34] fix pitstorage update --- qlib/data/storage/file_storage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index fe091a89ce..a22c083fd5 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -474,6 +474,7 @@ def update(self, data_array: np.ndarray) -> None: if not self.uri.exists(): # write index = 0 + self.write(data_array, index) else: # sort it data_array = np.sort(data_array, order=["date", "period"]) From dbfe153a91dc1ea0c72625a7769e04538b3c7be7 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Wed, 1 Nov 2023 11:07:47 +0800 Subject: [PATCH 13/34] check dtype --- qlib/data/storage/file_storage.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index a22c083fd5..f6d0ec517d 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -501,7 +501,10 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: "if you need to clear the FeatureStorage, please execute: FeatureStorage.clear" ) return - + # check data_array dtype + if data_array.dtype != self.dtypes: + raise ValueError(f"data_array.dtype = {data_array.dtype}, self.dtypes = {self.dtypes}") + # sort data_array with first 2 columns data_array = np.sort(data_array, order=["date", "period"]) From 20889cafb5460964d435f0fc945f9d33bfd12195 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Wed, 1 Nov 2023 13:12:21 +0800 Subject: [PATCH 14/34] fix empty data --- qlib/data/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index ac853fda64..a07a48beff 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -829,7 +829,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, df_sim = df[s_sign].drop_duplicates(subset=["date"], keep="last") s_part = df_sim.set_index("date")[start_time:]["value"] if s_part.empty: - return pd.Series(index=backend_obj.columns, dtype="float64") + return pd.Series(dtype="float64") if start_time != s_part.index[0] and start_time >= df["date"].iloc[0]: # add previous value to result to avoid nan in the first period pre_value = pd.Series(df[df["date"] < start_time]["value"].iloc[-1], index=[start_time]) @@ -838,7 +838,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, else: df_remain = df[(df["date"] <= cur_time)] if df_remain.empty: - return pd.Series(index=backend_obj.columns, dtype="float64") + return pd.Series(dtype="float64") last_observe_date = df_remain["date"].iloc[-1] # keep only the latest period value df_remain = df_remain.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") From 5c161235d247e910761d8152a6553582f0fb9aac Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 2 Nov 2023 13:46:20 +0800 Subject: [PATCH 15/34] lint --- qlib/data/storage/file_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index f6d0ec517d..d335bb22be 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -504,7 +504,7 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: # check data_array dtype if data_array.dtype != self.dtypes: raise ValueError(f"data_array.dtype = {data_array.dtype}, self.dtypes = {self.dtypes}") - + # sort data_array with first 2 columns data_array = np.sort(data_array, order=["date", "period"]) From 31c37470885f1ae256ac6eee576c431b880ed256 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 7 Nov 2023 19:31:57 +0800 Subject: [PATCH 16/34] deal with empty data file --- qlib/data/storage/file_storage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index d335bb22be..125cd35a2d 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -471,7 +471,7 @@ def update(self, data_array: np.ndarray) -> None: Args: data_array: Structured arrays contains date, period, value and next. same with self.raw_dtype """ - if not self.uri.exists(): + if not self.uri.exists() or len(self) == 0: # write index = 0 self.write(data_array, index) @@ -544,7 +544,7 @@ def np_data(self, i: Union[int, slice] = None) -> np.ndarray: if isinstance(i, int): return None, None elif isinstance(i, slice): - return pd.Series(dtype=np.float32) + return np.array(dtype=self.dtypes) else: raise TypeError(f"type(i) = {type(i)}") @@ -563,7 +563,7 @@ def np_data(self, i: Union[int, slice] = None) -> np.ndarray: end_index = storage_end_index if i.stop is None else i.stop - 1 si = max(start_index, storage_start_index) if si > end_index: - return pd.Series(dtype=np.float32) + return np.array(dtype=self.dtypes) fp.seek(start_index * self.itemsize) # read n bytes count = end_index - si + 1 From ef9242e6be61ce461d018d5769e2dfdb9d052b25 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 28 Nov 2023 18:38:06 +0800 Subject: [PATCH 17/34] remove useless function --- qlib/utils/__init__.py | 60 ------------------------------------------ 1 file changed, 60 deletions(-) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 66aa2ef07d..3c0723138f 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -134,66 +134,6 @@ def get_period_offset(first_year, period, quarterly): return offset -def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None): - """ - At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). - Only the updating info before cur_date or at cur_date will be used. - - Parameters - ---------- - period: int - date period represented by interger, e.g. 201901 corresponds to the first quarter in 2019 - cur_date_int: int - date which represented by interger, e.g. 20190102 - last_period_index: int - it is a optional parameter; it is designed to avoid repeatedly access the .index data of PIT database when - sequentially observing the data (Because the latest index of a specific period of data certainly appear in after the one in last observation). - - Returns - ------- - the query value and byte index the index value - """ - DATA_DTYPE = "".join( - [ - C.pit_record_type["date"], - C.pit_record_type["period"], - C.pit_record_type["value"], - C.pit_record_type["index"], - ] - ) - - PERIOD_DTYPE = C.pit_record_type["period"] - INDEX_DTYPE = C.pit_record_type["index"] - - NAN_VALUE = C.pit_record_nan["value"] - NAN_INDEX = C.pit_record_nan["index"] - - # find the first index of linked revisions - if last_period_index is None: - with open(index_path, "rb") as fi: - (first_year,) = struct.unpack(PERIOD_DTYPE, fi.read(struct.calcsize(PERIOD_DTYPE))) - all_periods = np.fromfile(fi, dtype=INDEX_DTYPE) - offset = get_period_offset(first_year, period, quarterly) - _next = all_periods[offset] - else: - _next = last_period_index - - # load data following the `_next` link - prev_value = NAN_VALUE - prev_next = _next - - with open(data_path, "rb") as fd: - while _next != NAN_INDEX: - fd.seek(_next) - date, period, value, new_next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) - if date > cur_date_int: - break - prev_next = _next - _next = new_next - prev_value = value - return prev_value, prev_next - - def np_ffill(arr: np.array): """ forward fill a 1D numpy array From 87d65e1a33fdbcb819804d7f916c53bc4fe1de24 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 19 Oct 2023 21:33:41 +0800 Subject: [PATCH 18/34] improve pit performance --- .gitignore | 1 + qlib/data/base.py | 4 +- qlib/data/cache.py | 4 ++ qlib/data/data.py | 114 ++++++++++++++++++++++------------------- qlib/data/pit.py | 44 ++++++++++------ qlib/utils/__init__.py | 33 ++++++++++++ 6 files changed, 129 insertions(+), 71 deletions(-) diff --git a/.gitignore b/.gitignore index 8854c25e99..0be5d251a6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ __pycache__/ _build build/ dist/ +tests/test_pit_data/ *.pkl *.hd5 diff --git a/qlib/data/base.py b/qlib/data/base.py index 496ae38ee2..f7d3df682a 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -267,10 +267,10 @@ class PFeature(Feature): def __str__(self): return "$$" + self._name - def _load_internal(self, instrument, start_index, end_index, cur_time, period=None): + def _load_internal(self, instrument, start_index, end_index, cur_time, period=None, start_time=None): from .data import PITD # pylint: disable=C0415 - return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period) + return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period, start_time) class ExpressionOps(Expression): diff --git a/qlib/data/cache.py b/qlib/data/cache.py index 3264dcd020..f1e6bd3764 100644 --- a/qlib/data/cache.py +++ b/qlib/data/cache.py @@ -160,6 +160,7 @@ def __init__(self, mem_cache_size_limit=None, limit_type="length"): self.__calendar_mem_cache = klass(size_limit) self.__instrument_mem_cache = klass(size_limit) self.__feature_mem_cache = klass(size_limit) + self.__pit_mem_cache = klass(size_limit) def __getitem__(self, key): if key == "c": @@ -168,6 +169,8 @@ def __getitem__(self, key): return self.__instrument_mem_cache elif key == "f": return self.__feature_mem_cache + elif key == "p": + return self.__pit_mem_cache else: raise KeyError("Unknown memcache unit") @@ -175,6 +178,7 @@ def clear(self): self.__calendar_mem_cache.clear() self.__instrument_mem_cache.clear() self.__feature_mem_cache.clear() + self.__pit_mem_cache.clear() class MemCacheExpire: diff --git a/qlib/data/data.py b/qlib/data/data.py index 116827f232..22b2f9c47b 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -33,8 +33,7 @@ normalize_cache_fields, code_to_fname, time_to_slc_point, - read_period_data, - get_period_list, + get_period_list_by_offset, ) from ..utils.paral import ParallelExt from .ops import Operators # pylint: disable=W0611 # noqa: F401 @@ -746,13 +745,24 @@ class LocalPITProvider(PITProvider): # TODO: Add PIT backend file storage # NOTE: This class is not multi-threading-safe!!!! - def period_feature(self, instrument, field, start_index, end_index, cur_time, period=None): + def period_feature(self, instrument, field, start_offset, end_offset, cur_time, period=None, start_time=None): + """get raw data from PIT + we have 3 modes to query data from PIT, all method need current datetime + + 1. given period, return value observed at current datetime + return series with index as datetime + 2. given start_time, return value **observed by each day** from start_time to current datetime + return series with index as datetime + 3. given start_offset and end_offset, return period data between [-start_offset, end_offset] observed at current datetime + return series with index as period + + """ if not isinstance(cur_time, pd.Timestamp): raise ValueError( f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')" ) - assert end_index <= 0 # PIT don't support querying future data + assert end_offset <= 0 # PIT don't support querying future data DATA_RECORDS = [ ("date", C.pit_record_type["date"]), @@ -777,58 +787,56 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time, pe # self.period_index[field] = {} # For acceleration} - if not field.endswith("_q") and not field.endswith("_a"): - raise ValueError("period field must ends with '_q' or '_a'") + key = f"{instrument}.{field}" quarterly = field.endswith("_q") - index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" - data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" - if not (index_path.exists() and data_path.exists()): - raise FileNotFoundError("No file is found.") - # NOTE: The most significant performance loss is here. - # Does the acceleration that makes the program complicated really matters? - # - It makes parameters of the interface complicate - # - It does not performance in the optimal way (places all the pieces together, we may achieve higher performance) - # - If we design it carefully, we can go through for only once to get the historical evolution of the data. - # So I decide to deprecated previous implementation and keep the logic of the program simple - # Instead, I'll add a cache for the index file. - data = np.fromfile(data_path, dtype=DATA_RECORDS) - - # find all revision periods before `cur_time` - cur_time_int = int(cur_time.year) * 10000 + int(cur_time.month) * 100 + int(cur_time.day) - loc = np.searchsorted(data["date"], cur_time_int, side="right") - if loc <= 0: - return pd.Series(dtype=C.pit_record_type["value"]) - last_period = data["period"][:loc].max() # return the latest quarter - first_period = data["period"][:loc].min() - period_list = get_period_list(first_period, last_period, quarterly) + if key in H["f"]: + df = H["f"][key] + else: + if not field.endswith("_q") and not field.endswith("_a"): + raise ValueError("period field must ends with '_q' or '_a'") + index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" + data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" + if not (index_path.exists() and data_path.exists()): + raise FileNotFoundError("No file is found.") + ## get first period offset + ## NOTE: current index file return offset from a given period not date + ## so we cannot findout the offset by given date + ## stop using index in this version + # start_point = get_pitdata_offset(index_path, period, ) + data = np.fromfile(data_path, dtype=DATA_RECORDS) + df = pd.DataFrame(data, columns=[i[0] for i in DATA_RECORDS]) + df.sort_values(by=["date", "period"], inplace=True) + df["date"] = pd.to_datetime(df["date"].astype(str)) + H["f"][key] = df + + df_ret = df[(df["date"] <= cur_time)] + if df_ret.empty: + return pd.Series(dtype=VALUE_DTYPE) + # keep only the latest period value + df_ret = df_ret.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") + df_ret = df_ret.set_index("period") + # return df if period is not None: - # NOTE: `period` has higher priority than `start_index` & `end_index` - if period not in period_list: - return pd.Series(dtype=C.pit_record_type["value"]) - else: - period_list = [period] + retur = df[df["period"] == period].set_index("date")["value"] + elif start_time is not None: + # df is sorted by date, and the term whose period is monotonically non-decreasing is selected. + s_sign = pd.Series(False, index=df.index) + max_p = df["period"].iloc[0] + for i in range(0, len(s_sign)): + if df["period"].iloc[i] >= max_p: + s_sign.iloc[i] = True + max_p = df["period"].iloc[i] + df_sim = df[s_sign].drop_duplicates(subset=["date"], keep="last") + s_part = df_sim.set_index("date")[start_time:]["value"] + if start_time != s_part.index[0] and start_time >= df["date"].iloc[0]: + # add previous value to result to avoid nan in the first period + pre_value = pd.Series(df[df["date"] < start_time]["value"].iloc[-1], index=[start_time]) + s_part = pd.concat([pre_value, s_part]) + return s_part else: - period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index] - value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE) - for i, p in enumerate(period_list): - # last_period_index = self.period_index[field].get(period) # For acceleration - value[i], now_period_index = read_period_data( - index_path, data_path, p, cur_time_int, quarterly # , last_period_index # For acceleration - ) - # self.period_index[field].update({period: now_period_index}) # For acceleration - # NOTE: the index is period_list; So it may result in unexpected values(e.g. nan) - # when calculation between different features and only part of its financial indicator is published - series = pd.Series(value, index=period_list, dtype=VALUE_DTYPE) - - # {For acceleration - # if cur_index == end_index: - # self.all_fields.remove(field) - # if not len(self.all_fields): - # del self.all_fields - # del self.period_index - # For acceleration} - - return series + period_list = get_period_list_by_offset(df_ret.index[-1], -start_offset, quarterly) + retur = df_ret["value"].reindex(period_list, fill_value=np.nan) + return retur class LocalExpressionProvider(ExpressionProvider): diff --git a/qlib/data/pit.py b/qlib/data/pit.py index 33d5e0c5cc..9b5b7a88c0 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -24,31 +24,43 @@ class P(ElemOperator): def _load_internal(self, instrument, start_index, end_index, freq): _calendar = Cal.calendar(freq=freq) resample_data = np.empty(end_index - start_index + 1, dtype="float32") - - for cur_index in range(start_index, end_index + 1): - cur_time = _calendar[cur_index] - # To load expression accurately, more historical data are required - start_ws, end_ws = self.feature.get_extended_window_size() - if end_ws > 0: - raise ValueError( - "PIT database does not support referring to future period (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported" - ) - - # The calculated value will always the last element, so the end_offset is zero. + # To load expression accurately, more historical data are required + start_ws, end_ws = self.feature.get_extended_window_size() + # if start_ws = 0, means expression use only current data, so pit history data is not required + if start_ws == 0 and end_ws == 0: try: - s = self._load_feature(instrument, -start_ws, 0, cur_time) - resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan + # get start and end date + s = self._load_feature(instrument, 0, 0, _calendar[end_index], None, _calendar[start_index]) + # index in s may not in calendar, so we need to reindex it to continue date first + s = s.reindex(pd.date_range(start=s.iloc[0], end=_calendar[end_index])).fillna(method="ffill") + resample_data = s.reindex(_calendar[start_index : end_index + 1]).fillna(method="ffill").values except FileNotFoundError: get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") return pd.Series(dtype="float32", name=str(self)) + else: + for cur_index in range(start_index, end_index + 1): + cur_time = _calendar[cur_index] + + if end_ws > 0: + raise ValueError( + "PIT database does not support referring to future period (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported" + ) + + # The calculated value will always the last element, so the end_offset is zero. + try: + s = self._load_feature(instrument, -start_ws, 0, cur_time) + resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan + except FileNotFoundError: + get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") + return pd.Series(dtype="float32", name=str(self)) resample_series = pd.Series( resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self) ) return resample_series - def _load_feature(self, instrument, start_index, end_index, cur_time): - return self.feature.load(instrument, start_index, end_index, cur_time) + def _load_feature(self, instrument, start_index, end_index, cur_time, period=None, start_time=None): + return self.feature.load(instrument, start_index, end_index, cur_time, period, start_time) def get_longest_back_rolling(self): # The period data will collapse as a normal feature. So no extending and looking back @@ -67,5 +79,5 @@ def __init__(self, feature, period): def __str__(self): return f"{super().__str__()}[{self.period}]" - def _load_feature(self, instrument, start_index, end_index, cur_time): + def _load_feature(self, instrument, start_index, end_index, cur_time, period=None, start_time=None): return self.feature.load(instrument, start_index, end_index, cur_time, self.period) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 9e63c104a1..5bd51922d5 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -88,6 +88,39 @@ def get_period_list(first: int, last: int, quarterly: bool) -> List[int]: return res +def get_period_list_by_offset(last: int, offset: int, quarterly: bool) -> List[int]: + """ + This method will be used in PIT database. + It return all the possible values between `first(offset-last)` and `end` (first and end is included) + + Parameters + ---------- + offset: int + offset quarter or year from last + quarterly : bool + will it return quarterly index or yearly index. + + Returns + ------- + List[int] + the possible index between [first, last] + """ + + if not quarterly: + assert all(1900 <= x <= 2099 for x in (last,)), "invalid arguments" + return list(range(last - offset, last + 1)) + else: + assert all(190000 <= x <= 209904 for x in (last,)), "invalid arguments" + res = [] + # last minus offset quarters + for year in range(int(last // 100 - (offset // 4 + 1)), int(last // 100 + 1)): + for q in range(1, 5): + period = year * 100 + q + if period <= last: + res.append(year * 100 + q) + return res[len(res) - offset - 1 :] + + def get_period_offset(first_year, period, quarterly): if quarterly: offset = (period // 100 - first_year) * 4 + period % 100 - 1 From b53bae6e18299d95bee21e093bd39cb6013337f8 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 20 Oct 2023 11:14:54 +0800 Subject: [PATCH 19/34] improve pit cache --- qlib/data/data.py | 30 ++++++++++++++--------- qlib/utils/__init__.py | 55 +++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index 22b2f9c47b..e9e0c803da 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -787,10 +787,10 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, # self.period_index[field] = {} # For acceleration} - key = f"{instrument}.{field}" + key = (instrument, field) quarterly = field.endswith("_q") - if key in H["f"]: - df = H["f"][key] + if key in H["p"]: + df = H["p"][key] else: if not field.endswith("_q") and not field.endswith("_a"): raise ValueError("period field must ends with '_q' or '_a'") @@ -809,12 +809,6 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, df["date"] = pd.to_datetime(df["date"].astype(str)) H["f"][key] = df - df_ret = df[(df["date"] <= cur_time)] - if df_ret.empty: - return pd.Series(dtype=VALUE_DTYPE) - # keep only the latest period value - df_ret = df_ret.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") - df_ret = df_ret.set_index("period") # return df if period is not None: retur = df[df["period"] == period].set_index("date")["value"] @@ -834,8 +828,22 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, s_part = pd.concat([pre_value, s_part]) return s_part else: - period_list = get_period_list_by_offset(df_ret.index[-1], -start_offset, quarterly) - retur = df_ret["value"].reindex(period_list, fill_value=np.nan) + df_remain = df[(df["date"] <= cur_time)] + if df_remain.empty: + return pd.Series(dtype=VALUE_DTYPE) + last_observe_date = df_remain["date"].iloc[-1] + # keep only the latest period value + df_remain = df_remain.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") + df_remain = df_remain.set_index("period") + + cache_key = (instrument, field, last_observe_date, start_offset, end_offset, quarterly) # f"{instrument}.{field}.{last_observe_date}.{start_offset}.{end_offset}.{quarterly}" + if cache_key in H["p"]: + retur = H["p"][cache_key] + else: + last_period = df_remain.index[-1] + period_list = get_period_list_by_offset(last_period, start_offset, end_offset, quarterly) + retur = df_remain["value"].reindex(period_list, fill_value=np.nan) + H["p"][cache_key] = retur return retur diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 5bd51922d5..b04b459f1d 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -25,7 +25,12 @@ from pathlib import Path from typing import List, Union, Optional, Callable from packaging import version -from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer +from .file import ( + get_or_create_path, + save_multiple_parts_file, + unpack_archive_with_buffer, + get_tmp_file_with_buffer, +) from ..config import C from ..log import get_module_logger, set_log_with_config @@ -37,7 +42,9 @@ #################### Server #################### def get_redis_connection(): """get redis connection instance.""" - return redis.StrictRedis(host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password) + return redis.StrictRedis( + host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password + ) #################### Data #################### @@ -88,7 +95,9 @@ def get_period_list(first: int, last: int, quarterly: bool) -> List[int]: return res -def get_period_list_by_offset(last: int, offset: int, quarterly: bool) -> List[int]: +def get_period_list_by_offset( + last: int, start_offset: int, end_offset: int, quarterly: bool +) -> List[int]: """ This method will be used in PIT database. It return all the possible values between `first(offset-last)` and `end` (first and end is included) @@ -105,20 +114,22 @@ def get_period_list_by_offset(last: int, offset: int, quarterly: bool) -> List[i List[int] the possible index between [first, last] """ - + assert end_offset <= 0 if not quarterly: assert all(1900 <= x <= 2099 for x in (last,)), "invalid arguments" - return list(range(last - offset, last + 1)) + return list(range(last + start_offset, last + 1 + end_offset)) else: assert all(190000 <= x <= 209904 for x in (last,)), "invalid arguments" res = [] # last minus offset quarters - for year in range(int(last // 100 - (offset // 4 + 1)), int(last // 100 + 1)): + for year in range( + int(last // 100 + start_offset // 4 - 1), int(last // 100 + 1) + end_offset + ): for q in range(1, 5): period = year * 100 + q if period <= last: res.append(year * 100 + q) - return res[len(res) - offset - 1 :] + return res[len(res) + start_offset - 1 : len(res) + end_offset + 1] def get_period_offset(first_year, period, quarterly): @@ -129,7 +140,9 @@ def get_period_offset(first_year, period, quarterly): return offset -def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None): +def read_period_data( + index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None +): """ At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). Only the updating info before cur_date or at cur_date will be used. @@ -180,7 +193,9 @@ def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly with open(data_path, "rb") as fd: while _next != NAN_INDEX: fd.seek(_next) - date, period, value, new_next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) + date, period, value, new_next = struct.unpack( + DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE)) + ) if date > cur_date_int: break prev_next = _next @@ -416,7 +431,9 @@ def get_date_range(trading_date, left_shift=0, right_shift=0, future=False): return calendar -def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None): +def get_date_by_shift( + trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None +): """get trading date with shift bias will cur_date e.g. : shift == 1, return next trading date shift == -1, return previous trading date @@ -449,7 +466,9 @@ def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq=" if clip_shift: shift_index = np.clip(shift_index, 0, len(cal) - 1) else: - raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range") + raise IndexError( + f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range" + ) return cal[shift_index] @@ -486,7 +505,11 @@ def transform_end_date(end_date=None, freq="day"): from ..data import D # pylint: disable=C0415 last_date = D.calendar(freq=freq)[-1] - if end_date is None or (str(end_date) == "-1") or (pd.Timestamp(last_date) < pd.Timestamp(end_date)): + if ( + end_date is None + or (str(end_date) == "-1") + or (pd.Timestamp(last_date) < pd.Timestamp(end_date)) + ): log.warning( "\nInfo: the end_date in the configuration file is {}, " "so the default last date {} is used.".format(end_date, last_date) @@ -602,7 +625,9 @@ def exists_qlib_data(qlib_dir): # check instruments code_names = set(map(lambda x: fname_to_code(x.name.lower()), features_dir.iterdir())) _instrument = instruments_dir.joinpath("all.txt") - miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names) + miss_code = set( + pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower) + ) - set(code_names) if miss_code and any(map(lambda x: "sht" not in x, miss_code)): return False @@ -838,7 +863,9 @@ def register(self, provider): self._provider = provider def __repr__(self): - return "{name}(provider={provider})".format(name=self.__class__.__name__, provider=self._provider) + return "{name}(provider={provider})".format( + name=self.__class__.__name__, provider=self._provider + ) def __getattr__(self, key): if self.__dict__.get("_provider", None) is None: From 23f16b9d874b64cc0e13b0517fbd79504a212fbe Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 20 Oct 2023 11:16:36 +0800 Subject: [PATCH 20/34] lint --- qlib/data/data.py | 11 +++++++++-- qlib/utils/__init__.py | 42 ++++++++++-------------------------------- 2 files changed, 19 insertions(+), 34 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index e9e0c803da..bbfaeb9c12 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -835,8 +835,15 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, # keep only the latest period value df_remain = df_remain.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") df_remain = df_remain.set_index("period") - - cache_key = (instrument, field, last_observe_date, start_offset, end_offset, quarterly) # f"{instrument}.{field}.{last_observe_date}.{start_offset}.{end_offset}.{quarterly}" + + cache_key = ( + instrument, + field, + last_observe_date, + start_offset, + end_offset, + quarterly, + ) # f"{instrument}.{field}.{last_observe_date}.{start_offset}.{end_offset}.{quarterly}" if cache_key in H["p"]: retur = H["p"][cache_key] else: diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index b04b459f1d..66aa2ef07d 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -42,9 +42,7 @@ #################### Server #################### def get_redis_connection(): """get redis connection instance.""" - return redis.StrictRedis( - host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password - ) + return redis.StrictRedis(host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password) #################### Data #################### @@ -95,9 +93,7 @@ def get_period_list(first: int, last: int, quarterly: bool) -> List[int]: return res -def get_period_list_by_offset( - last: int, start_offset: int, end_offset: int, quarterly: bool -) -> List[int]: +def get_period_list_by_offset(last: int, start_offset: int, end_offset: int, quarterly: bool) -> List[int]: """ This method will be used in PIT database. It return all the possible values between `first(offset-last)` and `end` (first and end is included) @@ -122,9 +118,7 @@ def get_period_list_by_offset( assert all(190000 <= x <= 209904 for x in (last,)), "invalid arguments" res = [] # last minus offset quarters - for year in range( - int(last // 100 + start_offset // 4 - 1), int(last // 100 + 1) + end_offset - ): + for year in range(int(last // 100 + start_offset // 4 - 1), int(last // 100 + 1) + end_offset): for q in range(1, 5): period = year * 100 + q if period <= last: @@ -140,9 +134,7 @@ def get_period_offset(first_year, period, quarterly): return offset -def read_period_data( - index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None -): +def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None): """ At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). Only the updating info before cur_date or at cur_date will be used. @@ -193,9 +185,7 @@ def read_period_data( with open(data_path, "rb") as fd: while _next != NAN_INDEX: fd.seek(_next) - date, period, value, new_next = struct.unpack( - DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE)) - ) + date, period, value, new_next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) if date > cur_date_int: break prev_next = _next @@ -431,9 +421,7 @@ def get_date_range(trading_date, left_shift=0, right_shift=0, future=False): return calendar -def get_date_by_shift( - trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None -): +def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None): """get trading date with shift bias will cur_date e.g. : shift == 1, return next trading date shift == -1, return previous trading date @@ -466,9 +454,7 @@ def get_date_by_shift( if clip_shift: shift_index = np.clip(shift_index, 0, len(cal) - 1) else: - raise IndexError( - f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range" - ) + raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range") return cal[shift_index] @@ -505,11 +491,7 @@ def transform_end_date(end_date=None, freq="day"): from ..data import D # pylint: disable=C0415 last_date = D.calendar(freq=freq)[-1] - if ( - end_date is None - or (str(end_date) == "-1") - or (pd.Timestamp(last_date) < pd.Timestamp(end_date)) - ): + if end_date is None or (str(end_date) == "-1") or (pd.Timestamp(last_date) < pd.Timestamp(end_date)): log.warning( "\nInfo: the end_date in the configuration file is {}, " "so the default last date {} is used.".format(end_date, last_date) @@ -625,9 +607,7 @@ def exists_qlib_data(qlib_dir): # check instruments code_names = set(map(lambda x: fname_to_code(x.name.lower()), features_dir.iterdir())) _instrument = instruments_dir.joinpath("all.txt") - miss_code = set( - pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower) - ) - set(code_names) + miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names) if miss_code and any(map(lambda x: "sht" not in x, miss_code)): return False @@ -863,9 +843,7 @@ def register(self, provider): self._provider = provider def __repr__(self): - return "{name}(provider={provider})".format( - name=self.__class__.__name__, provider=self._provider - ) + return "{name}(provider={provider})".format(name=self.__class__.__name__, provider=self._provider) def __getattr__(self, key): if self.__dict__.get("_provider", None) is None: From 1a349d05feb7b41371bcf0121953a0986ec8d6bd Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 20 Oct 2023 13:29:01 +0800 Subject: [PATCH 21/34] deal with empty data --- qlib/data/data.py | 2 ++ qlib/data/pit.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index bbfaeb9c12..c15ff60885 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -822,6 +822,8 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, max_p = df["period"].iloc[i] df_sim = df[s_sign].drop_duplicates(subset=["date"], keep="last") s_part = df_sim.set_index("date")[start_time:]["value"] + if s_part.empty: + return pd.Series(dtype=VALUE_DTYPE) if start_time != s_part.index[0] and start_time >= df["date"].iloc[0]: # add previous value to result to avoid nan in the first period pre_value = pd.Series(df[df["date"] < start_time]["value"].iloc[-1], index=[start_time]) diff --git a/qlib/data/pit.py b/qlib/data/pit.py index 9b5b7a88c0..97a6dff938 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -31,8 +31,10 @@ def _load_internal(self, instrument, start_index, end_index, freq): try: # get start and end date s = self._load_feature(instrument, 0, 0, _calendar[end_index], None, _calendar[start_index]) + if len(s) == 0: + return pd.Series(dtype="float32", name=str(self)) # index in s may not in calendar, so we need to reindex it to continue date first - s = s.reindex(pd.date_range(start=s.iloc[0], end=_calendar[end_index])).fillna(method="ffill") + s = s.reindex(pd.date_range(start=s.index[0], end=_calendar[end_index])).fillna(method="ffill") resample_data = s.reindex(_calendar[start_index : end_index + 1]).fillna(method="ffill").values except FileNotFoundError: get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") From f340776606865e2cecc2a601999985e647955c7a Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 26 Oct 2023 10:58:25 +0800 Subject: [PATCH 22/34] add pit backend: FilePITStorage --- qlib/data/data.py | 34 +++-- qlib/data/storage/file_storage.py | 209 +++++++++++++++++++++++++++++- qlib/data/storage/storage.py | 126 ++++++++++++++++++ tests/test_pit.py | 148 ++++++++++++++++----- 4 files changed, 463 insertions(+), 54 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index c15ff60885..86ddf893c5 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -47,7 +47,10 @@ class ProviderBackendMixin: def get_default_backend(self): backend = {} - provider_name: str = re.findall("[A-Z][^A-Z]*", self.__class__.__name__)[-2] + if hasattr(self, "provider_name"): + provider_name = getattr(self, "provider_name") + else: + provider_name: str = re.findall("[A-Z][^A-Z]*", self.__class__.__name__)[-2] # set default storage class backend.setdefault("class", f"File{provider_name}Storage") # set default storage module @@ -335,6 +338,10 @@ def feature(self, instrument, field, start_time, end_time, freq): class PITProvider(abc.ABC): + @property + def provider_name(self): + return "PIT" + @abc.abstractmethod def period_feature( self, @@ -741,10 +748,15 @@ def feature(self, instrument, field, start_index, end_index, freq): return self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1] -class LocalPITProvider(PITProvider): +class LocalPITProvider(PITProvider, ProviderBackendMixin): # TODO: Add PIT backend file storage # NOTE: This class is not multi-threading-safe!!!! + def __init__(self, remote=False, backend={}): + super().__init__() + self.remote = remote + self.backend = backend + def period_feature(self, instrument, field, start_offset, end_offset, cur_time, period=None, start_time=None): """get raw data from PIT we have 3 modes to query data from PIT, all method need current datetime @@ -764,17 +776,11 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, assert end_offset <= 0 # PIT don't support querying future data - DATA_RECORDS = [ - ("date", C.pit_record_type["date"]), - ("period", C.pit_record_type["period"]), - ("value", C.pit_record_type["value"]), - ("_next", C.pit_record_type["index"]), - ] - VALUE_DTYPE = C.pit_record_type["value"] - field = str(field).lower()[2:] instrument = code_to_fname(instrument) + backend_obj = self.backend_obj(instrument=instrument, field=field) + # {For acceleration # start_index, end_index, cur_index = kwargs["info"] # if cur_index == start_index: @@ -803,8 +809,8 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, ## so we cannot findout the offset by given date ## stop using index in this version # start_point = get_pitdata_offset(index_path, period, ) - data = np.fromfile(data_path, dtype=DATA_RECORDS) - df = pd.DataFrame(data, columns=[i[0] for i in DATA_RECORDS]) + data = backend_obj.np_data() + df = pd.DataFrame(data) df.sort_values(by=["date", "period"], inplace=True) df["date"] = pd.to_datetime(df["date"].astype(str)) H["f"][key] = df @@ -823,7 +829,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, df_sim = df[s_sign].drop_duplicates(subset=["date"], keep="last") s_part = df_sim.set_index("date")[start_time:]["value"] if s_part.empty: - return pd.Series(dtype=VALUE_DTYPE) + return pd.Series(index=backend_obj.columns, dtype="float64") if start_time != s_part.index[0] and start_time >= df["date"].iloc[0]: # add previous value to result to avoid nan in the first period pre_value = pd.Series(df[df["date"] < start_time]["value"].iloc[-1], index=[start_time]) @@ -832,7 +838,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, else: df_remain = df[(df["date"] <= cur_time)] if df_remain.empty: - return pd.Series(dtype=VALUE_DTYPE) + return pd.Series(index=backend_obj.columns, dtype="float64") last_observe_date = df_remain["date"].iloc[-1] # keep only the latest period value df_remain = df_remain.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 8a100a2d19..2d36fe3bef 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -7,13 +7,21 @@ import numpy as np import pandas as pd +from qlib.data.storage.storage import PITStorage from qlib.utils.time import Freq from qlib.utils.resam import resam_calendar from qlib.config import C from qlib.data.cache import H from qlib.log import get_module_logger -from qlib.data.storage import CalendarStorage, InstrumentStorage, FeatureStorage, CalVT, InstKT, InstVT +from qlib.data.storage import ( + CalendarStorage, + InstrumentStorage, + FeatureStorage, + CalVT, + InstKT, + InstVT, +) logger = get_module_logger("file_storage") @@ -48,7 +56,10 @@ def support_freq(self) -> List[str]: if len(self.provider_uri) == 1 and C.DEFAULT_FREQ in self.provider_uri: freq_l = filter( lambda _freq: not _freq.endswith("_future"), - map(lambda x: x.stem, self.dpm.get_data_uri(C.DEFAULT_FREQ).joinpath("calendars").glob("*.txt")), + map( + lambda x: x.stem, + self.dpm.get_data_uri(C.DEFAULT_FREQ).joinpath("calendars").glob("*.txt"), + ), ) else: freq_l = self.provider_uri.keys() @@ -140,7 +151,10 @@ def data(self) -> List[CalVT]: _calendar = self._read_calendar() if Freq(self._freq_file) != Freq(self.freq): _calendar = resam_calendar( - np.array(list(map(pd.Timestamp, _calendar))), self._freq_file, self.freq, self.region + np.array(list(map(pd.Timestamp, _calendar))), + self._freq_file, + self.freq, + self.region, ) return _calendar @@ -287,6 +301,7 @@ def __init__(self, instrument: str, field: str, freq: str, provider_uri: dict = super(FileFeatureStorage, self).__init__(instrument, field, freq, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.file_name = f"{instrument.lower()}/{field.lower()}.{freq.lower()}.bin" + self._start_index = None def clear(self): with self.uri.open("wb") as _: @@ -303,6 +318,7 @@ def write(self, data_array: Union[List, np.ndarray], index: int = None) -> None: "if you need to clear the FeatureStorage, please execute: FeatureStorage.clear" ) return + self._start_index = None if not self.uri.exists(): # write index = 0 if index is None else index @@ -320,7 +336,9 @@ def write(self, data_array: Union[List, np.ndarray], index: int = None) -> None: _old_data = np.fromfile(fp, dtype=" None: def start_index(self) -> Union[int, None]: if not self.uri.exists(): return None - with self.uri.open("rb") as fp: - index = int(np.frombuffer(fp.read(4), dtype=" Union[int, None]: @@ -377,3 +396,179 @@ def __getitem__(self, i: Union[int, slice]) -> Union[Tuple[int, float], pd.Serie def __len__(self) -> int: self.check() return self.uri.stat().st_size // 4 - 1 + + +class FilePITStorage(FileStorageMixin, PITStorage): + """PIT data is a special case of Feature data, it looks like + + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + + It is sorted by [date, period]. + + next field currently is not used. just for forward compatible. + """ + + # NOTE: + # PIT data should have two files, one is the index file, the other is the data file. + + # pesudo code: + # date_index = calendar.index(date) + # data_start_index, data_end_index = index_file[date_index] + # data = data_file[data_start_index:data_end_index] + + # the index file is like feature's data file, but given a start index in index file, it will return the first and the last observe index of the data file. + # the data file has tree columns, the first column is observe date, the second column is financial period, the third column is the value. + + # so given start and end date, we can get the start_index and end_index from calendar. + # use it to read two line from index file, then we can get the start and end index of the data file. + + # but consider this implementation, we will create a index file which will have 50 times lines than the data file. Is it a good idea? + # if we just create a index file the same line with data file, we have to read the whole index file for any time slice search, so why not read whole data file? + + def __init__(self, instrument: str, field: str, freq: str = "day", provider_uri: dict = None, **kwargs): + super(FilePITStorage, self).__init__(instrument, field, freq, **kwargs) + + if not field.endswith("_q") and not field.endswith("_a"): + raise ValueError("period field must ends with '_q' or '_a'") + self.quarterly = field.endswith("_q") + + self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) + self.file_name = f"{instrument.lower()}/{field.lower()}.data" + self.raw_dtype = [ + ("date", C.pit_record_type["date"]), + ("period", C.pit_record_type["period"]), + ("value", C.pit_record_type["value"]), + ("_next", C.pit_record_type["index"]), # not used in current implementation + ] + self.dtypes = np.dtype(self.raw_dtype) + self.itemsize = self.dtypes.itemsize + self.dtype_string = "".join([i[1] for i in self.raw_dtype]) + self.columns = [i[0] for i in self.raw_dtype] + + @property + def uri(self) -> Path: + if self.freq not in self.support_freq: + raise ValueError(f"{self.storage_name}: {self.provider_uri} does not contain data for {self.freq}") + return self.dpm.get_data_uri(self.freq).joinpath(f"{self.storage_name}", self.file_name) + + def clear(self): + with self.uri.open("wb") as _: + pass + + @property + def data(self) -> pd.DataFrame: + return self[:] + + def update(self, data_array: np.ndarray) -> None: + """update data to storage, replace current data from start_date to end_date with given data_array + + Args: + data_array: Structured arrays contains date, period, value and next. same with self.raw_dtype + """ + if not self.uri.exists(): + # write + index = 0 + else: + # sort it + data_array = np.sort(data_array, order=["date", "period"]) + # get index + update_start_date = data_array[0][0] + update_end_date = data_array[-1][0] + current_data = self.np_data() + index = (current_data["date"] >= update_start_date).argmax() + end_index = (current_data["date"] > update_end_date).argmax() + new_data = np.concatenate([data_array, current_data[end_index:]]) + self.write(new_data, index) + + def write(self, data_array: np.ndarray, index: int = None) -> None: + """write data to storage at specific index + + Args: + data_array: Structured arrays contains date, period, value and next + index: _description_. Defaults to None. + """ + + if len(data_array) == 0: + logger.info( + "len(data_array) == 0, write" + "if you need to clear the FeatureStorage, please execute: FeatureStorage.clear" + ) + return + + # sort data_array with first 2 columns + data_array = np.sort(data_array, order=["date", "period"]) + + if not self.uri.exists(): + # write + index = 0 if index is None else index + with self.uri.open("wb") as fp: + data_array.tofile(self.uri) + else: + with self.uri.open("rb+") as fp: + fp.seek(index * self.itemsize) + data_array.tofile(fp) + + @property + def start_index(self) -> Union[int, None]: + return 0 + + @property + def end_index(self) -> Union[int, None]: + if not self.uri.exists(): + return None + # The next data appending index point will be `end_index + 1` + return self.start_index + len(self) - 1 + + def np_data(self, i: Union[int, slice] = None) -> np.ndarray: + if not self.uri.exists(): + if isinstance(i, int): + return None, None + elif isinstance(i, slice): + return pd.Series(dtype=np.float32) + else: + raise TypeError(f"type(i) = {type(i)}") + + if i is None: + i = slice(None, None) + storage_start_index = self.start_index + storage_end_index = self.end_index + with self.uri.open("rb") as fp: + if isinstance(i, int): + if storage_start_index > i: + raise IndexError(f"{i}: start index is {storage_start_index}") + fp.seek(i * self.itemsize) + return np.array([struct.unpack(self.dtype_string, fp.read(self.itemsize))], dtype=self.dtypes) + elif isinstance(i, slice): + start_index = storage_start_index if i.start is None else i.start + end_index = storage_end_index if i.stop is None else i.stop - 1 + si = max(start_index, storage_start_index) + if si > end_index: + return pd.Series(dtype=np.float32) + fp.seek(start_index * self.itemsize) + # read n bytes + count = end_index - si + 1 + data = np.frombuffer(fp.read(self.itemsize * count), dtype=self.dtypes) + return data + else: + raise TypeError(f"type(i) = {type(i)}") + + def __getitem__(self, i: Union[int, slice]) -> Union[Tuple[int, float], pd.DataFrame]: + if isinstance(i, int): + return pd.Series(self.np_data(i), index=self.columns, name=i) + elif isinstance(i, slice): + data = self.np_data(i) + si = self.start_index if i.start is None else i.start + if si < 0: + si = len(self) + si + return pd.DataFrame(data, index=pd.RangeIndex(si, si + len(data)), columns=self.columns) + else: + raise TypeError(f"type(i) = {type(i)}") + + def __len__(self) -> int: + self.check() + return self.uri.stat().st_size // self.itemsize diff --git a/qlib/data/storage/storage.py b/qlib/data/storage/storage.py index 2eb7da1de6..0d0ee0e7eb 100644 --- a/qlib/data/storage/storage.py +++ b/qlib/data/storage/storage.py @@ -492,3 +492,129 @@ def __len__(self) -> int: """ raise NotImplementedError("Subclass of FeatureStorage must implement `__len__` method") + + +class PITStorage(FeatureStorage): + @property + def storage_name(self) -> str: + return "financial" # for compatibility + + @property + def data(self) -> pd.DataFrame: + """get all data + + dataframe index is date, columns are report_period and value + + Notes + ------ + if data(storage) does not exist, return empty pd.DataFrame: `return pd.DataFrame(dtype=np.float32)` + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `data` method") + + def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): + """Write data_array to FeatureStorage starting from index. + + Notes + ------ + If index is None, append data_array to feature. + + If len(data_array) == 0; return + + If (index - self.end_index) >= 1, self[end_index+1: index] will be filled with np.nan + + Examples + --------- + .. code-block:: + + feature: + 3 4 + 4 5 + 5 6 + + + >>> self.write([6, 7], index=6) + + feature: + 3 4 + 4 5 + 5 6 + 6 6 + 7 7 + + >>> self.write([8], index=9) + + feature: + 3 4 + 4 5 + 5 6 + 6 6 + 7 7 + 8 np.nan + 9 8 + + >>> self.write([1, np.nan], index=3) + + feature: + 3 1 + 4 np.nan + 5 6 + 6 6 + 7 7 + 8 np.nan + 9 8 + + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") + + def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): + """overwrite all data in FeatureStorage with data + + Parameters + ---------- + data: Union[List, np.ndarray, Tuple] + data + index: int + data start index + """ + self.clear() + self.write(data, index) + + @overload + def __getitem__(self, s: slice) -> pd.Series: + """x.__getitem__(slice(start: int, stop: int, step: int)) <==> x[start:stop:step] + + Returns + ------- + pd.Series(values, index=pd.RangeIndex(start, len(values)) + """ + + @overload + def __getitem__(self, i: int) -> Tuple[int, float]: + """x.__getitem__(y) <==> x[y]""" + + def __getitem__(self, i) -> Union[Tuple[int, float], pd.Series]: + """x.__getitem__(y) <==> x[y] + + Notes + ------- + if data(storage) does not exist: + if isinstance(i, int): + return (None, None) + if isinstance(i, slice): + # return empty pd.Series + return pd.Series(dtype=np.float32) + """ + raise NotImplementedError( + "Subclass of FeatureStorage must implement `__getitem__(i: int)`/`__getitem__(s: slice)` method" + ) + + def __len__(self) -> int: + """ + + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `__len__` method") diff --git a/tests/test_pit.py b/tests/test_pit.py index 8320e1d361..26655b85ab 100644 --- a/tests/test_pit.py +++ b/tests/test_pit.py @@ -3,6 +3,8 @@ import sys + +import numpy as np import qlib import shutil import unittest @@ -12,6 +14,7 @@ from pathlib import Path from qlib.data import D +from qlib.data.storage.file_storage import FilePITStorage from qlib.tests.data import GetData sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) @@ -32,37 +35,37 @@ class TestPIT(unittest.TestCase): - @classmethod - def tearDownClass(cls) -> None: - shutil.rmtree(str(DATA_DIR.resolve())) - - @classmethod - def setUpClass(cls) -> None: - cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) - pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) - pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) - GetData().qlib_data( - name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True - ) - GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) - - # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. - # bs.login() - # Run( - # source_dir=pit_dir, - # interval="quarterly", - # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") - # bs.logout() - - Run( - source_dir=pit_dir, - normalize_dir=pit_normalized_dir, - interval="quarterly", - ).normalize_data() - DumpPitData( - csv_path=pit_normalized_dir, - qlib_dir=cn_data_dir, - ).dump(interval="quarterly") + # @classmethod + # def tearDownClass(cls) -> None: + # shutil.rmtree(str(DATA_DIR.resolve())) + + # @classmethod + # def setUpClass(cls) -> None: + # cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) + # pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) + # pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) + # GetData().qlib_data( + # name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True + # ) + # GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) + + # # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. + # # bs.login() + # # Run( + # # source_dir=pit_dir, + # # interval="quarterly", + # # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") + # # bs.logout() + + # Run( + # source_dir=pit_dir, + # normalize_dir=pit_normalized_dir, + # interval="quarterly", + # ).normalize_data() + # DumpPitData( + # csv_path=pit_normalized_dir, + # qlib_dir=cn_data_dir, + # ).dump(interval="quarterly") def setUp(self): # qlib.init(kernels=1) # NOTE: set kernel to 1 to make it debug easier @@ -70,11 +73,84 @@ def setUp(self): qlib.init(provider_uri=provider_uri) def to_str(self, obj): - return "".join(str(obj).split()) + return "\n".join(str(obj).split()) def check_same(self, a, b): self.assertEqual(self.to_str(a), self.to_str(b)) + def test_storage_read(self): + s = FilePITStorage("sh600519", "roewa_q") + np_data = s.np_data(1) + self.assertEqual(np_data.shape, (1,)) + data = s.data + self.check_same( + data.head(), + """ + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + """, + ) + + def test_storage_write(self): + base = FilePITStorage("sh600519", "roewa_q") + s = FilePITStorage("sh600519", "roewa2_q") + + shutil.copy(base.uri, s.uri) + s.write( + np.array([(20070917, 200703, 0.239330, 0)], dtype=s.raw_dtype), + 1, + ) + data = s.data + self.check_same( + data.head(), + """ + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070917 200703 0.239330 0 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + """, + ) + + def test_storage_slice(self): + s = FilePITStorage("sh600519", "roewa_q") + data = s[1:4] + self.check_same( + data, + """ + date period value _next + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + """, + ) + + def test_storage_update(self): + base = FilePITStorage("sh600519", "roewa_q") + s = FilePITStorage("sh600519", "roewa3_q") + + shutil.copy(base.uri, s.uri) + s.update( + np.array([(20070917, 200703, 0.111111, 0), (20100314, 200703, 0.111111, 0)], dtype=s.raw_dtype), + ) + data = s.data + self.check_same( + data.head(), + """ + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20070917 200703 0.111111 0 + 3 20100314 200703 0.111111 0 + 4 20100402 200904 0.335461 4294967295 + """, + ) + def test_query(self): instruments = ["sh600519"] fields = ["P($$roewa_q)", "P($$yoyni_q)"] @@ -107,7 +183,13 @@ def test_query(self): def test_no_exist_data(self): fields = ["P($$roewa_q)", "P($$yoyni_q)", "$close"] - data = D.features(["sh600519", "sh601988"], fields, start_time="2019-01-01", end_time="2019-07-19", freq="day") + data = D.features( + ["sh600519", "sh601988"], + fields, + start_time="2019-01-01", + end_time="2019-07-19", + freq="day", + ) data["$close"] = 1 # in case of different dataset gives different values expect = """ P($$roewa_q) P($$yoyni_q) $close From 38a04b6ed41200a58a84880cc10fe10ffa878f46 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 26 Oct 2023 11:25:56 +0800 Subject: [PATCH 23/34] improve docstring --- qlib/data/storage/file_storage.py | 12 +++- qlib/data/storage/storage.py | 102 ++++++++++++++++++++---------- tests/test_pit.py | 62 +++++++++--------- 3 files changed, 111 insertions(+), 65 deletions(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 2d36fe3bef..a36d1cca60 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -490,7 +490,7 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: Args: data_array: Structured arrays contains date, period, value and next - index: _description_. Defaults to None. + index: target index to start writing. Defaults to None. """ if len(data_array) == 0: @@ -509,6 +509,8 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: with self.uri.open("wb") as fp: data_array.tofile(self.uri) else: + if index is None or index > self.end_index: + index = self.end_index + 1 with self.uri.open("rb+") as fp: fp.seek(index * self.itemsize) data_array.tofile(fp) @@ -525,6 +527,14 @@ def end_index(self) -> Union[int, None]: return self.start_index + len(self) - 1 def np_data(self, i: Union[int, slice] = None) -> np.ndarray: + """return numpy structured array + + Args: + i: index or slice. Defaults to None. + + Returns: + np.ndarray + """ if not self.uri.exists(): if isinstance(i, int): return None, None diff --git a/qlib/data/storage/storage.py b/qlib/data/storage/storage.py index 0d0ee0e7eb..acd6172ab5 100644 --- a/qlib/data/storage/storage.py +++ b/qlib/data/storage/storage.py @@ -495,10 +495,36 @@ def __len__(self) -> int: class PITStorage(FeatureStorage): + """PIT data is a special case of Feature data, it looks like + + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + + It is sorted by [date, period]. + + next field currently is not used. just for forward compatible. + """ + @property def storage_name(self) -> str: return "financial" # for compatibility + def np_data(self, i: Union[int, slice] = None) -> np.ndarray: + """return numpy structured array + + Args: + i: index or slice. Defaults to None. + + Returns: + np.ndarray + """ + + raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") + @property def data(self) -> pd.DataFrame: """get all data @@ -511,7 +537,7 @@ def data(self) -> pd.DataFrame: """ raise NotImplementedError("Subclass of FeatureStorage must implement `data` method") - def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): + def write(self, data_array: np.ndarray, index: int = None): """Write data_array to FeatureStorage starting from index. Notes @@ -526,42 +552,24 @@ def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): --------- .. code-block:: - feature: - 3 4 - 4 5 - 5 6 + pit data: + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 - >>> self.write([6, 7], index=6) + >>> s.write(np.array([(20070917, 200703, 0.239330, 0)], dtype=s.raw_dtype), 1) feature: - 3 4 - 4 5 - 5 6 - 6 6 - 7 7 - - >>> self.write([8], index=9) - - feature: - 3 4 - 4 5 - 5 6 - 6 6 - 7 7 - 8 np.nan - 9 8 - - >>> self.write([1, np.nan], index=3) - - feature: - 3 1 - 4 np.nan - 5 6 - 6 6 - 7 7 - 8 np.nan - 9 8 + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070917 200703 0.239330 0 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 """ raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") @@ -579,6 +587,34 @@ def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): self.clear() self.write(data, index) + def update(self, data_array: np.ndarray) -> None: + """update data to storage, replace current data from start_date to end_date with given data_array + + Args: + data_array: Structured arrays contains date, period, value and next. same with self.raw_dtype + + Examples + --------- + .. code-block:: + + pit data: + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + + >>> s.update(np.array([(20070917, 200703, 0.111111, 0), (20100314, 200703, 0.111111, 0)], dtype=s.raw_dtype)) + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20070917 200703 0.111111 0 + 3 20100314 200703 0.111111 0 + + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `update` method") + @overload def __getitem__(self, s: slice) -> pd.Series: """x.__getitem__(slice(start: int, stop: int, step: int)) <==> x[start:stop:step] diff --git a/tests/test_pit.py b/tests/test_pit.py index 26655b85ab..359be618dd 100644 --- a/tests/test_pit.py +++ b/tests/test_pit.py @@ -35,37 +35,37 @@ class TestPIT(unittest.TestCase): - # @classmethod - # def tearDownClass(cls) -> None: - # shutil.rmtree(str(DATA_DIR.resolve())) - - # @classmethod - # def setUpClass(cls) -> None: - # cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) - # pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) - # pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) - # GetData().qlib_data( - # name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True - # ) - # GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) - - # # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. - # # bs.login() - # # Run( - # # source_dir=pit_dir, - # # interval="quarterly", - # # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") - # # bs.logout() - - # Run( - # source_dir=pit_dir, - # normalize_dir=pit_normalized_dir, - # interval="quarterly", - # ).normalize_data() - # DumpPitData( - # csv_path=pit_normalized_dir, - # qlib_dir=cn_data_dir, - # ).dump(interval="quarterly") + @classmethod + def tearDownClass(cls) -> None: + shutil.rmtree(str(DATA_DIR.resolve())) + + @classmethod + def setUpClass(cls) -> None: + cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) + pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) + pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) + GetData().qlib_data( + name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True + ) + GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) + + # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. + # bs.login() + # Run( + # source_dir=pit_dir, + # interval="quarterly", + # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") + # bs.logout() + + Run( + source_dir=pit_dir, + normalize_dir=pit_normalized_dir, + interval="quarterly", + ).normalize_data() + DumpPitData( + csv_path=pit_normalized_dir, + qlib_dir=cn_data_dir, + ).dump(interval="quarterly") def setUp(self): # qlib.init(kernels=1) # NOTE: set kernel to 1 to make it debug easier From 07cff6bfa1271f90d0d3ded227d27c76b10592d3 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 26 Oct 2023 11:34:05 +0800 Subject: [PATCH 24/34] remove index file check --- qlib/data/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index 86ddf893c5..ac853fda64 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -800,9 +800,9 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, else: if not field.endswith("_q") and not field.endswith("_a"): raise ValueError("period field must ends with '_q' or '_a'") - index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" + # index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" - if not (index_path.exists() and data_path.exists()): + if not data_path.exists(): raise FileNotFoundError("No file is found.") ## get first period offset ## NOTE: current index file return offset from a given period not date From bdf80605a6872865f4a40701d2e5a4bfb315c231 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 26 Oct 2023 11:40:07 +0800 Subject: [PATCH 25/34] pit rewrite does not need index --- qlib/data/storage/storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/storage/storage.py b/qlib/data/storage/storage.py index acd6172ab5..d5151c7d3d 100644 --- a/qlib/data/storage/storage.py +++ b/qlib/data/storage/storage.py @@ -574,7 +574,7 @@ def write(self, data_array: np.ndarray, index: int = None): """ raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") - def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): + def rewrite(self, data: Union[List, np.ndarray, Tuple]): """overwrite all data in FeatureStorage with data Parameters @@ -585,7 +585,7 @@ def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): data start index """ self.clear() - self.write(data, index) + self.write(data, 0) def update(self, data_array: np.ndarray) -> None: """update data to storage, replace current data from start_date to end_date with given data_array From e3fff653c31266e25095118b1c91c5e587f96b58 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 31 Oct 2023 14:07:14 +0800 Subject: [PATCH 26/34] fix typo --- qlib/data/storage/file_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index a36d1cca60..f78a2f104e 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -507,7 +507,7 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: # write index = 0 if index is None else index with self.uri.open("wb") as fp: - data_array.tofile(self.uri) + data_array.tofile(fp) else: if index is None or index > self.end_index: index = self.end_index + 1 From c7542901547c8061585934ff7597d92b77d48258 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 31 Oct 2023 14:22:48 +0800 Subject: [PATCH 27/34] make sure dir exist --- qlib/data/storage/file_storage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index f78a2f104e..a94b6ff1f6 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -439,6 +439,7 @@ def __init__(self, instrument: str, field: str, freq: str = "day", provider_uri: self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.file_name = f"{instrument.lower()}/{field.lower()}.data" + self.uri.parent.mkdir(exist_ok=True) self.raw_dtype = [ ("date", C.pit_record_type["date"]), ("period", C.pit_record_type["period"]), From 74fd9cba3d21a392323c1c2e57c7e1caf8e61429 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 31 Oct 2023 14:44:59 +0800 Subject: [PATCH 28/34] fix parents not exist --- qlib/data/storage/file_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index a94b6ff1f6..fe091a89ce 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -439,7 +439,7 @@ def __init__(self, instrument: str, field: str, freq: str = "day", provider_uri: self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.file_name = f"{instrument.lower()}/{field.lower()}.data" - self.uri.parent.mkdir(exist_ok=True) + self.uri.parent.mkdir(parents=True, exist_ok=True) self.raw_dtype = [ ("date", C.pit_record_type["date"]), ("period", C.pit_record_type["period"]), From 41648b9e61d5d78f37b892f01c2d5abfc1ba16c2 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 31 Oct 2023 17:08:19 +0800 Subject: [PATCH 29/34] fix pitstorage update --- qlib/data/storage/file_storage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index fe091a89ce..a22c083fd5 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -474,6 +474,7 @@ def update(self, data_array: np.ndarray) -> None: if not self.uri.exists(): # write index = 0 + self.write(data_array, index) else: # sort it data_array = np.sort(data_array, order=["date", "period"]) From ca0d4bbedca74a0c884f6a3885332165e37d186a Mon Sep 17 00:00:00 2001 From: John Lyu Date: Wed, 1 Nov 2023 11:07:47 +0800 Subject: [PATCH 30/34] check dtype --- qlib/data/storage/file_storage.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index a22c083fd5..f6d0ec517d 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -501,7 +501,10 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: "if you need to clear the FeatureStorage, please execute: FeatureStorage.clear" ) return - + # check data_array dtype + if data_array.dtype != self.dtypes: + raise ValueError(f"data_array.dtype = {data_array.dtype}, self.dtypes = {self.dtypes}") + # sort data_array with first 2 columns data_array = np.sort(data_array, order=["date", "period"]) From de9e6cf772fad2a9c3a75caecfac6d96974fe9c6 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Wed, 1 Nov 2023 13:12:21 +0800 Subject: [PATCH 31/34] fix empty data --- qlib/data/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index ac853fda64..a07a48beff 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -829,7 +829,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, df_sim = df[s_sign].drop_duplicates(subset=["date"], keep="last") s_part = df_sim.set_index("date")[start_time:]["value"] if s_part.empty: - return pd.Series(index=backend_obj.columns, dtype="float64") + return pd.Series(dtype="float64") if start_time != s_part.index[0] and start_time >= df["date"].iloc[0]: # add previous value to result to avoid nan in the first period pre_value = pd.Series(df[df["date"] < start_time]["value"].iloc[-1], index=[start_time]) @@ -838,7 +838,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_time, else: df_remain = df[(df["date"] <= cur_time)] if df_remain.empty: - return pd.Series(index=backend_obj.columns, dtype="float64") + return pd.Series(dtype="float64") last_observe_date = df_remain["date"].iloc[-1] # keep only the latest period value df_remain = df_remain.sort_values(by=["period"]).drop_duplicates(subset=["period"], keep="last") From e093a83ab12e331db99407b5431105b936856dd8 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 2 Nov 2023 13:46:20 +0800 Subject: [PATCH 32/34] lint --- qlib/data/storage/file_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index f6d0ec517d..d335bb22be 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -504,7 +504,7 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: # check data_array dtype if data_array.dtype != self.dtypes: raise ValueError(f"data_array.dtype = {data_array.dtype}, self.dtypes = {self.dtypes}") - + # sort data_array with first 2 columns data_array = np.sort(data_array, order=["date", "period"]) From 52c5cbae20e15ebcdc810a36f14b5a245b6d6455 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 7 Nov 2023 19:31:57 +0800 Subject: [PATCH 33/34] deal with empty data file --- qlib/data/storage/file_storage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index d335bb22be..125cd35a2d 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -471,7 +471,7 @@ def update(self, data_array: np.ndarray) -> None: Args: data_array: Structured arrays contains date, period, value and next. same with self.raw_dtype """ - if not self.uri.exists(): + if not self.uri.exists() or len(self) == 0: # write index = 0 self.write(data_array, index) @@ -544,7 +544,7 @@ def np_data(self, i: Union[int, slice] = None) -> np.ndarray: if isinstance(i, int): return None, None elif isinstance(i, slice): - return pd.Series(dtype=np.float32) + return np.array(dtype=self.dtypes) else: raise TypeError(f"type(i) = {type(i)}") @@ -563,7 +563,7 @@ def np_data(self, i: Union[int, slice] = None) -> np.ndarray: end_index = storage_end_index if i.stop is None else i.stop - 1 si = max(start_index, storage_start_index) if si > end_index: - return pd.Series(dtype=np.float32) + return np.array(dtype=self.dtypes) fp.seek(start_index * self.itemsize) # read n bytes count = end_index - si + 1 From 8dfc3930d7ab0e0dbbd1e6369720df79ed02453a Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 28 Nov 2023 18:38:06 +0800 Subject: [PATCH 34/34] remove useless function --- qlib/utils/__init__.py | 60 ------------------------------------------ 1 file changed, 60 deletions(-) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 66aa2ef07d..3c0723138f 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -134,66 +134,6 @@ def get_period_offset(first_year, period, quarterly): return offset -def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None): - """ - At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). - Only the updating info before cur_date or at cur_date will be used. - - Parameters - ---------- - period: int - date period represented by interger, e.g. 201901 corresponds to the first quarter in 2019 - cur_date_int: int - date which represented by interger, e.g. 20190102 - last_period_index: int - it is a optional parameter; it is designed to avoid repeatedly access the .index data of PIT database when - sequentially observing the data (Because the latest index of a specific period of data certainly appear in after the one in last observation). - - Returns - ------- - the query value and byte index the index value - """ - DATA_DTYPE = "".join( - [ - C.pit_record_type["date"], - C.pit_record_type["period"], - C.pit_record_type["value"], - C.pit_record_type["index"], - ] - ) - - PERIOD_DTYPE = C.pit_record_type["period"] - INDEX_DTYPE = C.pit_record_type["index"] - - NAN_VALUE = C.pit_record_nan["value"] - NAN_INDEX = C.pit_record_nan["index"] - - # find the first index of linked revisions - if last_period_index is None: - with open(index_path, "rb") as fi: - (first_year,) = struct.unpack(PERIOD_DTYPE, fi.read(struct.calcsize(PERIOD_DTYPE))) - all_periods = np.fromfile(fi, dtype=INDEX_DTYPE) - offset = get_period_offset(first_year, period, quarterly) - _next = all_periods[offset] - else: - _next = last_period_index - - # load data following the `_next` link - prev_value = NAN_VALUE - prev_next = _next - - with open(data_path, "rb") as fd: - while _next != NAN_INDEX: - fd.seek(_next) - date, period, value, new_next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) - if date > cur_date_int: - break - prev_next = _next - _next = new_next - prev_value = value - return prev_value, prev_next - - def np_ffill(arr: np.array): """ forward fill a 1D numpy array