From d745b00662df9305617bc8f68daceb0a75178bc5 Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Thu, 4 Jan 2024 11:52:38 +0400
Subject: [PATCH 01/38] started ftse100 strategy

---
 cvxportfolio/data.py                 |   4 +-
 examples/strategies/ftse100_daily.py | 114 +++++++++++++++++++++++++++
 examples/universes.py                |  37 +++++++--
 3 files changed, 148 insertions(+), 7 deletions(-)
 create mode 100644 examples/strategies/ftse100_daily.py

diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py
index c19e1f404..e96958bce 100644
--- a/cvxportfolio/data.py
+++ b/cvxportfolio/data.py
@@ -543,9 +543,9 @@ class Fred(SymbolData):
 
     def _internal_download(self, symbol):
         try:
-            return pd.read_csv(
+            return pd.to_numeric(pd.read_csv(
                 self.URL + f'?id={symbol}',
-                index_col=0, parse_dates=[0])[symbol]
+                index_col=0, parse_dates=[0])[symbol], errors='coerce')
         except URLError as exc:
             raise DataError(f"Download of {symbol}"
                 + f" from {self.__class__.__name__} failed."
diff --git a/examples/strategies/ftse100_daily.py b/examples/strategies/ftse100_daily.py
new file mode 100644
index 000000000..0f0b976db
--- /dev/null
+++ b/examples/strategies/ftse100_daily.py
@@ -0,0 +1,114 @@
+# Copyright 2023 Enzo Busseti
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This is a simple example strategy which we run every day.
+
+It is a long-only, unit-leverage allocation on the FTSE 100 universe.
+
+We will see how it performs online.
+
+Run it from the root of the repository, in the development environment, with:
+
+.. code:: bash
+
+    python -m examples.strategies.ftse100_daily
+"""
+
+import cvxportfolio as cvx
+
+from ..universes import FTSE100
+
+HYPERPAR_OPTIMIZE_START = '2012-01-01'
+
+OBJECTIVE = 'sharpe_ratio'
+
+
+def policy(gamma_risk, gamma_trade):
+    """Create a fresh policy object; also return handles to hyper-parameters.
+
+    :param gamma_risk: Risk aversion multiplier.
+    :type gamma_risk: float
+    :param gamma_trade: Transaction cost aversion multiplier.
+    :type gamma_trade: float
+
+    :return: Policy object and dictionary mapping hyper-parameter names (which
+        must match the arguments of this function) to their respective objects.
+    :rtype: tuple
+    """
+    gamma_risk_hp = cvx.Gamma(initial_value=gamma_risk)
+    gamma_trade_hp = cvx.Gamma(initial_value=gamma_trade)
+    return cvx.SinglePeriodOptimization(
+        cvx.ReturnsForecast()
+        - gamma_risk_hp * cvx.FullCovariance()
+        - gamma_trade_hp * cvx.StocksTransactionCost(),
+        [cvx.LongOnly(), cvx.LeverageLimit(1)],
+        benchmark=cvx.MarketBenchmark(),
+    ), {'gamma_risk': gamma_risk_hp, 'gamma_trade': gamma_trade_hp}
+
+
+if __name__ == '__main__':
+
+    RESEARCH = True
+
+    if not RESEARCH:
+        from .strategy_executor import main
+        main(policy=policy, hyperparameter_opt_start=HYPERPAR_OPTIMIZE_START,
+            objective=OBJECTIVE, universe=FTSE100, cash_key='GBPOUND')
+
+    else:
+        import matplotlib.pyplot as plt
+        #INDEX_ETF = 'DIA'
+
+        research_sim = cvx.StockMarketSimulator(FTSE100, cash_key='GBPOUND')
+
+        research_policy, _ = policy(1., 1.)
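+
+        # (Hedged editorial sketch, not in the original patch.) The dict
+        # returned by policy() maps hyper-parameter names to the cvx.Gamma
+        # handles, so after optimize_hyperparameters() below one could read
+        # back the tuned values, e.g.:
+        #
+        #     _, hyperpars = policy(1., 1.)
+        #     print({k: g.current_value for k, g in hyperpars.items()})
+        #
+        # (`current_value` is assumed from cvxportfolio's hyper-parameter
+        # API; verify before relying on it.)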
+ + result_unif = research_sim.backtest( + cvx.Uniform(), start_time=HYPERPAR_OPTIMIZE_START) + print('uniform') + print(result_unif) + + result_market = research_sim.backtest( + cvx.MarketBenchmark(), start_time=HYPERPAR_OPTIMIZE_START) + print('market') + print(result_market) + + exit(0) + + # result_etf = cvx.StockMarketSimulator([INDEX_ETF]).backtest( + # cvx.Uniform(), start_time=HYPERPAR_OPTIMIZE_START) + # print(INDEX_ETF) + # print(result_etf) + + research_sim.optimize_hyperparameters( + research_policy, start_time=HYPERPAR_OPTIMIZE_START, + objective='sharpe_ratio') + + result_opt = research_sim.backtest( + research_policy, start_time=HYPERPAR_OPTIMIZE_START) + print('optimized') + print(result_opt) + + result_unif.plot() + result_opt.plot() + result_market.plot() + #result_etf.plot() + + plt.figure() + result_opt.growth_rates.iloc[-252*4:].cumsum().plot(label='optimized') + result_unif.growth_rates.iloc[-252*4:].cumsum().plot(label='uniform') + result_market.growth_rates.iloc[-252*4:].cumsum().plot(label='market') + #result_etf.growth_rates.iloc[-252*4:].cumsum().plot(label='market etf') + plt.legend() + + plt.show() diff --git a/examples/universes.py b/examples/universes.py index fb2b6348c..4bd34e050 100644 --- a/examples/universes.py +++ b/examples/universes.py @@ -20,7 +20,7 @@ We could also save each universe in a ``json`` file. """ -# This was generated on 2023-12-27 06:55:30.344592+00:00 +# This was generated on 2024-01-04 06:18:49.851642+00:00 SP500 = \ ['A', 'AAL', 'AAPL', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', @@ -89,6 +89,21 @@ 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'V', 'VZ', 'WBA', 'WMT'] +FTSE100 = \ +['AAF.L', 'AAL.L', 'ABF.L', 'ADM.L', 'AHT.L', 'ANTO.L', 'AUTO.L', 'AV.L', + 'AZN.L', 'BA.L', 'BARC.L', 'BATS.L', 'BDEV.L', 'BEZ.L', 'BKG.L', 'BME.L', + 'BNZL.L', 'BP.L', 'BRBY.L', 'BT-A.L', 'CCH.L', 'CNA.L', 'CPG.L', 'CRDA.L', + 'CTEC.L', 'DCC.L', 'DGE.L', 'DPH.L', 'DPLM.L', 'EDV.L', 'ENT.L', 'EXPN.L', + 'FCIT.L', 'FLTR.L', 'FRAS.L', 'FRES.L', 'GLEN.L', 'GSK.L', 'HIK.L', 'HLMA.L', + 'HLN.L', 'HSBA.L', 'HWDN.L', 'IAG.L', 'ICP.L', 'IHG.L', 'III.L', 'IMB.L', + 'IMI.L', 'INF.L', 'ITRK.L', 'JD.L', 'KGF.L', 'LAND.L', 'LGEN.L', 'LLOY.L', + 'LSEG.L', 'MKS.L', 'MNDI.L', 'MNG.L', 'MRO.L', 'NG.L', 'NWG.L', 'NXT.L', + 'OCDO.L', 'PHNX.L', 'PRU.L', 'PSH.L', 'PSON.L', 'REL.L', 'RIO.L', 'RKT.L', + 'RMV.L', 'RR.L', 'RS1.L', 'RTO.L', 'SBRY.L', 'SDR.L', 'SGE.L', 'SGRO.L', + 'SHEL.L', 'SKG.L', 'SMDS.L', 'SMIN.L', 'SMT.L', 'SN.L', 'SPX.L', 'SSE.L', + 'STAN.L', 'STJ.L', 'SVT.L', 'TSCO.L', 'TW.L', 'ULVR.L', 'UTG.L', 'UU.L', + 'VOD.L', 'WEIR.L', 'WPP.L', 'WTB.L'] + if __name__ == '__main__': # import json @@ -113,10 +128,16 @@ 'page': "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average", 'table_number': 0, 'column_number': 1, + }, + 'ftse100': { + 'page': 'https://en.wikipedia.org/wiki/FTSE_100_Index', + 'table_number': -1, + 'column_number':1, + 'suffix':'.L', } } - def get_column_wikipedia_page(page, table_number, column_number): + def get_column_wikipedia_page(page, table_number, column_number, **kwargs): """Get a column as list of strings from a table on wikipedia. This is adapted from: @@ -129,6 +150,8 @@ def get_column_wikipedia_page(page, table_number, column_number): :type table_number: int :param column_number: Which column to extract. :type column_number: int + :param kwargs: Unused arguments. + :type kwargs: dict :returns: Sorted strings of the column. 
         :rtype: list
 
@@ -143,17 +166,21 @@ def get_column_wikipedia_page(page, table_number, column_number):
             column.append(element.strip())
         return sorted(column)
 
-    def adapt_for_yahoo_finance(tickers_list):
+    def adapt_for_yahoo_finance(tickers_list, suffix='', **kwargs):
         """Change tickers to match the spelling of Yahoo Finance.
 
         :param tickers_list: Tickers from Wikipedia.
         :type tickers_list: list
+        :param suffix: Suffix to add to each ticker, default empty string.
+        :type suffix: str
+        :param kwargs: Unused arguments.
+        :type kwargs: dict
 
         :returns: Adapted tickers.
         :rtype: list
         """
-        return [el.replace('.', '-') for el in tickers_list]
+        return [el.replace('.', '-') + suffix for el in tickers_list]
 
     # re-write this file
 
@@ -177,7 +204,7 @@ def adapt_for_yahoo_finance(tickers_list):
 
         for key, value in universes.items():
             tickers = adapt_for_yahoo_finance(
-                get_column_wikipedia_page(**value))
+                get_column_wikipedia_page(**value), **value)
             f.write(f'\n{key.upper()} = \\\n')
             pprint(tickers, compact=True, width=79, stream=f)

From 1ac90d410d808458a7ac0eadc965935b885330ba Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Wed, 17 Jan 2024 21:05:07 +0400
Subject: [PATCH 02/38] data quality check

---
 cvxportfolio/data.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py
index e96958bce..6b5dcfc1e 100644
--- a/cvxportfolio/data.py
+++ b/cvxportfolio/data.py
@@ -498,6 +498,47 @@ def _download(self, symbol, current=None,
             new = self._clean(new)
         return pd.concat([current.iloc[:-overlap], new])
 
+    def _quality_check(self, data):
+        """Analyze quality of the OHLCV-TR data."""
+
+        # zero volume
+        zerovol_idx = data.index[data.volume == 0]
+        if len(zerovol_idx) > 0:
+            logger.warning(
+                '%s("%s") has volume equal to zero for timestamps: %s',
+                self.__class__.__name__, self.symbol, zerovol_idx)
+
+        def print_extreme(logreturns, name, sigmas=50):
+
+            # TODO: choose
+            m, s = logreturns.median(), np.sqrt((logreturns**2).median())
+            normalized = (logreturns - m)/s
+
+            # normalized = logreturns / logreturns.rolling(252).std().shift(1)
+
+            extremereturn_idx = normalized.index[np.abs(normalized) > sigmas]
+            if len(extremereturn_idx) > 0:
+                logger.warning(
+                    '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s',
+                    self.__class__.__name__, self.symbol, name, sigmas,
+                    extremereturn_idx)
+
+        # extreme logreturns
+        logreturns = np.log(1 + data['return']).dropna()
+        print_extreme(logreturns, 'total returns')
+
+        # extreme open2close
+        open2close = (np.log(data['close']) - np.log(data['open'])).dropna()
+        print_extreme(open2close, 'open to close returns')
+
+        # extreme open2high
+        open2high = (np.log(data['high']) - np.log(data['open'])).dropna()
+        print_extreme(open2high, 'open to high returns')
+
+        # extreme open2low
+        open2low = (np.log(data['low']) - np.log(data['open'])).dropna()
+        print_extreme(open2low, 'open to low returns')
+
     def _preload(self, data):
         """Prepare data for use by Cvxportfolio.
 
@@ -505,6 +546,8 @@ def _preload(self, data):
         replace it with `valuevolume` which is an estimate of the (e.g.,
         US dollar) value of the volume exchanged on the day.
""" + + self._quality_check(data) data["valuevolume"] = data["volume"] * data["open"] del data["volume"] From b45b9d7a915edc969a597cc56b8a26c26174928d Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 11:58:46 +0400 Subject: [PATCH 03/38] Split history cvxportfolio/data.py to cvxportfolio/market_data.py --- cvxportfolio/{data.py => market_data.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cvxportfolio/{data.py => market_data.py} (100%) diff --git a/cvxportfolio/data.py b/cvxportfolio/market_data.py similarity index 100% rename from cvxportfolio/data.py rename to cvxportfolio/market_data.py From 7988b64f69b67317d10f22648b860885c3501acb Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 11:58:46 +0400 Subject: [PATCH 04/38] Split history cvxportfolio/data.py to cvxportfolio/market_data.py --- cvxportfolio/data.py => temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cvxportfolio/data.py => temp (100%) diff --git a/cvxportfolio/data.py b/temp similarity index 100% rename from cvxportfolio/data.py rename to temp From 4f4fa684b5fda79eecbc42438075b8cf22837908 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 11:58:46 +0400 Subject: [PATCH 05/38] Split history cvxportfolio/data.py to cvxportfolio/market_data.py --- temp => cvxportfolio/data.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => cvxportfolio/data.py (100%) diff --git a/temp b/cvxportfolio/data.py similarity index 100% rename from temp rename to cvxportfolio/data.py From ade1db5d93c706f2e9f58ba75a4eccab7edd1a9a Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:00:16 +0400 Subject: [PATCH 06/38] moved both files --- cvxportfolio/data/__init__.py | 25 +++++++++++++++++++ cvxportfolio/{ => data}/market_data.py | 0 cvxportfolio/{data.py => data/symbol_data.py} | 0 3 files changed, 25 insertions(+) create mode 100644 cvxportfolio/data/__init__.py rename cvxportfolio/{ => data}/market_data.py (100%) rename cvxportfolio/{data.py => data/symbol_data.py} (100%) diff --git a/cvxportfolio/data/__init__.py b/cvxportfolio/data/__init__.py new file mode 100644 index 000000000..df2459ce4 --- /dev/null +++ b/cvxportfolio/data/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2023 Enzo Busseti +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This module include classes that download, store, and serve market data. +The two main abstractions are :class:`SymbolData` and :class:`MarketData`. +Neither are exposed outside this module. Their derived classes instead are. +If you want to interface cvxportfolio with financial data source other +than the ones we provide, you should derive from either of those two classes. 
+""" + +from .market_data import * +from .symbol_data import * + +__all__ = [ + "YahooFinance", "Fred", "UserProvidedMarketData", "DownloadedMarketData"] \ No newline at end of file diff --git a/cvxportfolio/market_data.py b/cvxportfolio/data/market_data.py similarity index 100% rename from cvxportfolio/market_data.py rename to cvxportfolio/data/market_data.py diff --git a/cvxportfolio/data.py b/cvxportfolio/data/symbol_data.py similarity index 100% rename from cvxportfolio/data.py rename to cvxportfolio/data/symbol_data.py From fb9d0dbefc9d40916ec73fe4adaaf09a6c64aedc Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:15:44 +0400 Subject: [PATCH 07/38] completed split of data.py into data/ , no logic changes used script git-split.sh #!/bin/sh # used https://stackoverflow.com/questions/3887736/keep-git-history-when-splitting-a-file if [[ $# -ne 2 ]] ; then echo "Usage: git-split.sh original copy" exit 0 fi git mv $1 $2 git commit -n -m "Split history $1 to $2" REV=`git rev-parse HEAD` git reset --hard HEAD^ git mv $1 temp git commit -n -m "Split history $1 to $2" git merge $REV git commit -a -n -m "Split history $1 to $2" git mv temp $1 git commit -n -m "Split history $1 to $2" --- cvxportfolio/data/__init__.py | 1 + cvxportfolio/data/market_data.py | 755 +------------------------------ cvxportfolio/data/symbol_data.py | 701 +--------------------------- 3 files changed, 16 insertions(+), 1441 deletions(-) diff --git a/cvxportfolio/data/__init__.py b/cvxportfolio/data/__init__.py index df2459ce4..97aad902d 100644 --- a/cvxportfolio/data/__init__.py +++ b/cvxportfolio/data/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """This module include classes that download, store, and serve market data. + The two main abstractions are :class:`SymbolData` and :class:`MarketData`. Neither are exposed outside this module. Their derived classes instead are. If you want to interface cvxportfolio with financial data source other diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py index d75936d1b..be723a618 100644 --- a/cvxportfolio/data/market_data.py +++ b/cvxportfolio/data/market_data.py @@ -11,766 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""This module include classes that download, store, and serve market data. +"""This module defines the :class:`MarketData` abstraction and derived classes.""" -The two main abstractions are :class:`SymbolData` and :class:`MarketData`. -Neither are exposed outside this module. Their derived classes instead are. - -If you want to interface cvxportfolio with financial data source other -than the ones we provide, you should derive from either of those two classes. 
-""" - -import datetime import logging -import sqlite3 import sys -import warnings from pathlib import Path -from urllib.error import URLError import numpy as np import pandas as pd -import requests -import requests.exceptions - -from .errors import DataError -from .utils import (hash_, periods_per_year_from_datetime_index, - resample_returns) -__all__ = ["YahooFinance", "Fred", - "UserProvidedMarketData", "DownloadedMarketData"] +from ..errors import DataError +from ..utils import (hash_, periods_per_year_from_datetime_index, + resample_returns) +from .symbol_data import * logger = logging.getLogger(__name__) -BASE_LOCATION = Path.home() / "cvxportfolio_data" - -def now_timezoned(): - """Return current timestamp with local timezone. - - :returns: Current timestamp with local timezone. - :rtype: pandas.Timestamp - """ - return pd.Timestamp( - datetime.datetime.now(datetime.timezone.utc).astimezone()) - -class SymbolData: - """Base class for a single symbol time series data. - - The data is either in the form of a Pandas Series or DataFrame - and has datetime index. - - This class needs to be derived. At a minimum, - one should redefine the ``_download`` method, which - implements the downloading of the symbol's time series - from an external source. The method takes the current (already - downloaded and stored) data and is supposed to **only append** to it. - In this way we only store new data and don't modify already downloaded - data. - - Additionally one can redefine the ``_preload`` method, which prepares - data to serve to the user (so the data is stored in a different format - than what the user sees.) We found that this separation can be useful. - - This class interacts with module-level functions named ``_loader_BACKEND`` - and ``_storer_BACKEND``, where ``BACKEND`` is the name of the storage - system used. We define ``pickle``, ``csv``, and ``sqlite`` backends. - These may have limitations. See their docstrings for more information. - - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param base_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. By default it's one day. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded data for the symbol. - """ - - def __init__(self, symbol, - storage_backend='pickle', - base_location=BASE_LOCATION, - grace_period=pd.Timedelta('1d')): - self._symbol = symbol - self._storage_backend = storage_backend - self._base_location = base_location - self.update(grace_period) - self._data = self.load() - - @property - def storage_location(self): - """Storage location. Directory is created if not existent. - - :rtype: pathlib.Path - """ - loc = self._base_location / f"{self.__class__.__name__}" - loc.mkdir(parents=True, exist_ok=True) - return loc - - @property - def symbol(self): - """The symbol whose data this instance contains. - - :rtype: str - """ - return self._symbol - - @property - def data(self): - """Time series data, updated to the most recent observation. 
- - :rtype: pandas.Series or pandas.DataFrame - """ - return self._data - - def _load_raw(self): - """Load raw data from database.""" - # we could implement multiprocess safety here - loader = globals()['_loader_' + self._storage_backend] - try: - logger.info( - f"{self.__class__.__name__} is trying to load {self.symbol}" - + f" with {self._storage_backend} backend" - + f" from {self.storage_location}") - return loader(self.symbol, self.storage_location) - except FileNotFoundError: - return None - - def load(self): - """Load data from database using `self.preload` function to process. - - :returns: Loaded time-series data for the symbol. - :rtype: pandas.Series or pandas.DataFrame - """ - return self._preload(self._load_raw()) - - def _store(self, data): - """Store data in database. - - :param data: Time-series data to store. - :type data: pandas.Series or pandas.DataFrame - """ - # we could implement multiprocess safety here - storer = globals()['_storer_' + self._storage_backend] - logger.info( - f"{self.__class__.__name__} is storing {self.symbol}" - + f" with {self._storage_backend} backend" - + f" in {self.storage_location}") - storer(self.symbol, data, self.storage_location) - - def _print_difference(self, current, new): - """Helper method to print difference if update is not append-only. - - This is temporary and will be re-factored. - """ - print("TEMPORARY: Diff between overlap of downloaded and stored") - print((new - current).dropna(how='all').tail(5)) - - def update(self, grace_period): - """Update current stored data for symbol. - - :param grace_period: If the time between now and the last value stored - is less than this, we don't update the data already stored. - :type grace_period: pandas.Timedelta - """ - current = self._load_raw() - logger.info( - f"Downloading {self.symbol}" - + f" from {self.__class__.__name__}") - updated = self._download( - self.symbol, current, grace_period=grace_period) - - if np.any(updated.iloc[:-1].isnull()): - logger.warning( - " cvxportfolio.%s('%s').data contains NaNs." - + " You may want to inspect it. If you want, you can delete the" - + " data file in %s to force re-download from the start.", - self.__class__.__name__, self.symbol, self.storage_location) - - try: - if current is not None: - if not np.all( - # we use numpy.isclose because returns may be computed - # via logreturns and numerical errors can sift through - np.isclose(updated.loc[current.index[:-1]], - current.iloc[:-1], equal_nan=True, - rtol=1e-08, atol=1e-08)): - logger.error(f"{self.__class__.__name__} update" - + f" of {self.symbol} is not append-only!") - self._print_difference(current, updated) - if hasattr(current, 'columns'): - # the first column is open price - if not current.iloc[-1, 0] == updated.loc[ - current.index[-1]].iloc[0]: - logger.error( - f"{self.__class__.__name__} update " - + f" of {self.symbol} changed last open price!") - self._print_difference(current, updated) - else: - if not current.iloc[-1] == updated.loc[current.index[-1]]: - logger.error( - f"{self.__class__.__name__} update" - + f" of {self.symbol} changed last value!") - self._print_difference(current, updated) - except KeyError: - logger.error("%s update of %s could not be checked for" - + " append-only edits. Was there a DST change?", - self.__class__.__name__, self.symbol) - self._store(updated) - - def _download(self, symbol, current, grace_period, **kwargs): - """Download data from external source given already downloaded data. - - This method must be redefined by derived classes. 
- - :param symbol: The symbol we download. - :type symbol: str - :param current: The data already downloaded. We are supposed to - **only append** to it. If None, no data is present. - :type current: pandas.Series or pandas.DataFrame or None - :rtype: pandas.Series or pandas.DataFrame - """ - raise NotImplementedError #pragma: no cover - - def _preload(self, data): - """Prepare data to serve to the user. - - This method can be redefined by derived classes. - - :param data: The data returned by the storage backend. - :type data: pandas.Series or pandas.DataFrame - :rtype: pandas.Series or pandas.DataFrame - """ - return data - - -# -# Yahoo Finance. -# - -def _timestamp_convert(unix_seconds_ts): - """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" - return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') - - -class YahooFinance(SymbolData): - """Yahoo Finance symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded, and cleaned, data for the symbol. - :type data: pandas.DataFrame - """ - - # is open-high-low-close-volume-(total)return - IS_OHLCVR = True - - @staticmethod - def _clean(data): - """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" - - # print(data) - # print(data.isnull().sum()) - - # nan-out nonpositive prices - data.loc[data["open"] <= 0, 'open'] = np.nan - data.loc[data["close"] <= 0, "close"] = np.nan - data.loc[data["high"] <= 0, "high"] = np.nan - data.loc[data["low"] <= 0, "low"] = np.nan - data.loc[data["adjclose"] <= 0, "adjclose"] = np.nan - - # nan-out negative volumes - data.loc[data["volume"] < 0, 'volume'] = np.nan - - # all infinity values are nans - data.iloc[:, :] = np.nan_to_num( - data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - - # print(data) - # print(data.isnull().sum()) - - # if low is not the lowest, set it to nan - data['low'].loc[ - data['low'] > data[['open', 'high', 'close']].min(1)] = np.nan - - # if high is not the highest, set it to nan - data['high'].loc[ - data['high'] < data[['open', 'high', 'close']].max(1)] = np.nan - - # print(data) - # print(data.isnull().sum()) - - # - # fills - # - - # fill volumes with zeros (safest choice) - data['volume'] = data['volume'].fillna(0.) 
- - # fill close price with open price - data['close'] = data['close'].fillna(data['open']) - - # fill open price with close from day(s) before - # repeat as long as it helps (up to 1 year) - for shifter in range(252): - orig_missing_opens = data['open'].isnull().sum() - data['open'] = data['open'].fillna(data['close'].shift( - shifter+1)) - new_missing_opens = data['open'].isnull().sum() - if orig_missing_opens == new_missing_opens: - break - logger.info( - "Filled missing open prices with close from %s periods before", - shifter+1) - - # fill close price with same day's open - data['close'] = data['close'].fillna(data['open']) - - # fill high price with max - data['high'] = data['high'].fillna(data[['open', 'close']].max(1)) - - # fill low price with max - data['low'] = data['low'].fillna(data[['open', 'close']].min(1)) - - # print(data) - # print(data.isnull().sum()) - - # - # Compute returns - # - - # compute log of ratio between adjclose and close - log_adjustment_ratio = np.log(data['adjclose'] / data['close']) - - # forward fill adjustment ratio - log_adjustment_ratio = log_adjustment_ratio.ffill() - - # non-market log returns (dividends, splits) - non_market_lr = log_adjustment_ratio.diff().shift(-1) - - # full open-to-open returns - open_to_open = np.log(data["open"]).diff().shift(-1) - data['return'] = np.exp(open_to_open + non_market_lr) - 1 - - # print(data) - # print(data.isnull().sum()) - - # intraday_logreturn = np.log(data["close"]) - np.log(data["open"]) - # close_to_close_logreturn = np.log(data["adjclose"]).diff().shift(-1) - # open_to_open_logreturn = ( - # close_to_close_logreturn + intraday_logreturn - - # intraday_logreturn.shift(-1) - # ) - # data["return"] = np.exp(open_to_open_logreturn) - 1 - del data["adjclose"] - - # eliminate last period's intraday data - data.loc[data.index[-1], - ["high", "low", "close", "return", "volume"]] = np.nan - - # print(data) - # print(data.isnull().sum()) - - return data - - @staticmethod - def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): - """Get 1 day OHLC from Yahoo finance. - - Result is timestamped with the open time (time-zoned) of the - instrument. - """ - - base_url = 'https://query2.finance.yahoo.com' - - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)' - ' AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/39.0.2171.95 Safari/537.36'} - - # print(HEADERS) - start = int(pd.Timestamp(start).timestamp()) - end = int(pd.Timestamp(end).timestamp()) - - try: - res = requests.get( - url=f"{base_url}/v8/finance/chart/{ticker}", - params={'interval': '1d', - "period1": start, - "period2": end}, - headers=headers, - timeout=10) # seconds - except requests.ConnectionError as exc: - raise DataError( - f"Download of {ticker} from YahooFinance failed." - + " Are you connected to the Internet?") from exc - - # print(res) - - if res.status_code == 404: - raise DataError( - f'Data for symbol {ticker} is not available.' - + 'Json output:', str(res.json())) - - if res.status_code != 200: - raise DataError(f'Yahoo finance download of {ticker} failed. Json:', - str(res.json())) # pragma: no cover - - data = res.json()['chart']['result'][0] - - try: - index = pd.DatetimeIndex( - [_timestamp_convert(el) for el in data['timestamp']]) - - df_result = pd.DataFrame( - data['indicators']['quote'][0], index=index) - df_result['adjclose'] = data[ - 'indicators']['adjclose'][0]['adjclose'] - except KeyError: - raise DataError(f'Yahoo finance download of {ticker} failed.' 
- + ' Json:', str(res.json())) # pragma: no cover - - # last timestamp is probably broken (not timed to market open) - # we set its time to same as the day before, but this is wrong - # on days of DST switch. It's fine though because that line will be - # overwritten next update - if df_result.index[-1].time() != df_result.index[-2].time(): - tm1 = df_result.index[-2].time() - newlast = df_result.index[-1].replace( - hour=tm1.hour, minute=tm1.minute, second=tm1.second) - df_result.index = pd.DatetimeIndex( - list(df_result.index[:-1]) + [newlast]) - - return df_result[ - ['open', 'low', 'high', 'close', 'adjclose', 'volume']] - - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Download single stock from Yahoo Finance. - - If data was already downloaded we only download - the most recent missing portion. - - Args: - - symbol (str): yahoo name of the instrument - current (pandas.DataFrame or None): current data present locally - overlap (int): how many lines of current data will be overwritten - by newly downloaded data - kwargs (dict): extra arguments passed to yfinance.download - - Returns: - updated (pandas.DataFrame): updated DataFrame for the symbol - """ - if overlap < 2: - raise SyntaxError( - f'{self.__class__.__name__} with overlap smaller than 2' - + ' could have issues with DST.') - if (current is None) or (len(current) < overlap): - updated = self._get_data_yahoo(symbol, **kwargs) - logger.info('Downloading from the start.') - result = self._clean(updated) - # we remove first row if it contains NaNs - if np.any(result.iloc[0].isnull()): - result = result.iloc[1:] - return result - if (now_timezoned() - current.index[-1] - ) < pd.Timedelta(grace_period): - logger.info( - 'Skipping download because stored data is recent enough.') - return current - new = self._get_data_yahoo(symbol, start=current.index[-overlap]) - new = self._clean(new) - return pd.concat([current.iloc[:-overlap], new]) - - def _quality_check(self, data): - """Analyze quality of the OHLCV-TR data.""" - - # zero volume - zerovol_idx = data.index[data.volume == 0] - if len(zerovol_idx) > 0: - logger.warning( - '%s("%s") has volume equal to zero for timestamps: %s', - self.__class__.__name__, self.symbol, zerovol_idx) - - def print_extreme(logreturns, name, sigmas=50): - - # TODO: choose - m, s = logreturns.median(), np.sqrt((logreturns**2).median()) - normalized = (logreturns - m)/s - - # normalized = logreturns / logreturns.rolling(252).std().shift(1) - - extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] - if len(extremereturn_idx) > 0: - logger.warning( - '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', - self.__class__.__name__, self.symbol, name, sigmas, - extremereturn_idx) - - # extreme logreturns - logreturns = np.log(1 + data['return']).dropna() - print_extreme(logreturns, 'total returns') - - # extreme open2close - open2close = np.log(data['close']) - np.log(data['open']).dropna() - print_extreme(open2close, 'open to close returns') - - # extreme open2high - open2high = np.log(data['high']) - np.log(data['open']).dropna() - print_extreme(open2high, 'open to high returns') - - # extreme open2low - open2low = np.log(data['low']) - np.log(data['open']).dropna() - print_extreme(open2low, 'open to low returns') - - def _preload(self, data): - """Prepare data for use by Cvxportfolio. 
- - We drop the `volume` column expressed in number of stocks and - replace it with `valuevolume` which is an estimate of the (e.g., - US dollar) value of the volume exchanged on the day. - """ - - self._quality_check(data) - data["valuevolume"] = data["volume"] * data["open"] - del data["volume"] - - return data - -# -# Fred. -# - -class Fred(SymbolData): - """Fred single-symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. By default it's one day. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded data for the symbol. - """ - - URL = "https://fred.stlouisfed.org/graph/fredgraph.csv" - - # TODO: implement Fred point-in-time - # example: - # https://alfred.stlouisfed.org/graph/alfredgraph.csv?id=CES0500000003&vintage_date=2023-07-06 - # hourly wages time series **as it appeared** on 2023-07-06 - # store using pd.Series() of diff'ed values only. - - def _internal_download(self, symbol): - try: - return pd.to_numeric(pd.read_csv( - self.URL + f'?id={symbol}', - index_col=0, parse_dates=[0])[symbol], errors='coerce') - except URLError as exc: - raise DataError(f"Download of {symbol}" - + f" from {self.__class__.__name__} failed." - + " Are you connected to the Internet?") from exc - - def _download( - self, symbol="DFF", current=None, grace_period='5d', **kwargs): - """Download or update pandas Series from Fred. - - If already downloaded don't change data stored locally and only - add new entries at the end. - - Additionally, we allow for a `grace period`, if the data already - downloaded has a last entry not older than the grace period, we - don't download new data. - """ - if current is None: - return self._internal_download(symbol) - if (pd.Timestamp.today() - current.index[-1] - ) < pd.Timedelta(grace_period): - logger.info( - 'Skipping download because stored data is recent enough.') - return current - - new = self._internal_download(symbol) - new = new.loc[new.index > current.index[-1]] - - if new.empty: - logger.info('New downloaded data is empty!') - return current - - assert new.index[0] > current.index[-1] - return pd.concat([current, new]) - - def _preload(self, data): - """Add UTC timezone.""" - data.index = data.index.tz_localize('UTC') - return data - -# -# Sqlite storage backend. -# - -def _open_sqlite(storage_location): - return sqlite3.connect(storage_location/"db.sqlite") - -def _close_sqlite(connection): - connection.close() - -def _loader_sqlite(symbol, storage_location): - """Load data in sqlite format. - - We separately store dtypes for data consistency and safety. - - .. note:: If your pandas object's index has a name it will be lost, - the index is renamed 'index'. If you pass timestamp data (including - the index) it must have explicit timezone. 
- """ - try: - connection = _open_sqlite(storage_location) - dtypes = pd.read_sql_query( - f"SELECT * FROM {symbol}___dtypes", - connection, index_col="index", - dtype={"index": "str", "0": "str"}) - - parse_dates = 'index' - my_dtypes = dict(dtypes["0"]) - - tmp = pd.read_sql_query( - f"SELECT * FROM {symbol}", connection, - index_col="index", parse_dates=parse_dates, dtype=my_dtypes) - - _close_sqlite(connection) - multiindex = [] - for col in tmp.columns: - if col[:8] == "___level": - multiindex.append(col) - else: - break - if len(multiindex) > 0: - multiindex = [tmp.index.name] + multiindex - tmp = tmp.reset_index().set_index(multiindex) - return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp - except pd.errors.DatabaseError: - return None - -def _storer_sqlite(symbol, data, storage_location): - """Store data in sqlite format. - - We separately store dtypes for data consistency and safety. - - .. note:: If your pandas object's index has a name it will be lost, - the index is renamed 'index'. If you pass timestamp data (including - the index) it must have explicit timezone. - """ - connection = _open_sqlite(storage_location) - exists = pd.read_sql_query( - f"SELECT name FROM sqlite_master WHERE type='table' AND name='{symbol}'", - connection) - - if len(exists): - _ = connection.cursor().execute(f"DROP TABLE '{symbol}'") - _ = connection.cursor().execute(f"DROP TABLE '{symbol}___dtypes'") - connection.commit() - - if hasattr(data.index, "levels"): - data.index = data.index.set_names( - ["index"] + - [f"___level{i}" for i in range(1, len(data.index.levels))] - ) - data = data.reset_index().set_index("index") - else: - data.index.name = "index" - - if data.index[0].tzinfo is None: - warnings.warn('Index has not timezone, setting to UTC') - data.index = data.index.tz_localize('UTC') - - data.to_sql(f"{symbol}", connection) - pd.DataFrame(data).dtypes.astype("string").to_sql( - f"{symbol}___dtypes", connection) - _close_sqlite(connection) - - -# -# Pickle storage backend. -# - -def _loader_pickle(symbol, storage_location): - """Load data in pickle format.""" - return pd.read_pickle(storage_location / f"{symbol}.pickle") - -def _storer_pickle(symbol, data, storage_location): - """Store data in pickle format.""" - data.to_pickle(storage_location / f"{symbol}.pickle") - -# -# Csv storage backend. 
-# - -def _loader_csv(symbol, storage_location): - """Load data in csv format.""" - - index_dtypes = pd.read_csv( - storage_location / f"{symbol}___index_dtypes.csv", - index_col=0)["0"] - - dtypes = pd.read_csv( - storage_location / f"{symbol}___dtypes.csv", index_col=0, - dtype={"index": "str", "0": "str"}) - dtypes = dict(dtypes["0"]) - new_dtypes = {} - parse_dates = [] - for i, level in enumerate(index_dtypes): - if "datetime64[ns" in level: # includes all timezones - parse_dates.append(i) - for i, el in enumerate(dtypes): - if "datetime64[ns" in dtypes[el]: # includes all timezones - parse_dates += [i + len(index_dtypes)] - else: - new_dtypes[el] = dtypes[el] - - tmp = pd.read_csv(storage_location / f"{symbol}.csv", - index_col=list(range(len(index_dtypes))), - parse_dates=parse_dates, dtype=new_dtypes) - - return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp - - -def _storer_csv(symbol, data, storage_location): - """Store data in csv format.""" - pd.DataFrame(data.index.dtypes if hasattr(data.index, 'levels') - else [data.index.dtype]).astype("string").to_csv( - storage_location / f"{symbol}___index_dtypes.csv") - pd.DataFrame(data).dtypes.astype("string").to_csv( - storage_location / f"{symbol}___dtypes.csv") - data.to_csv(storage_location / f"{symbol}.csv") - -# -# Market Data -# +__all__ = ['DownloadedMarketData', 'MarketData', 'UserProvidedMarketData'] class MarketData: """Prepare, hold, and serve market data. diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index d75936d1b..142d5ff41 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -11,19 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""This module include classes that download, store, and serve market data. - -The two main abstractions are :class:`SymbolData` and :class:`MarketData`. -Neither are exposed outside this module. Their derived classes instead are. - -If you want to interface cvxportfolio with financial data source other -than the ones we provide, you should derive from either of those two classes. -""" - +"""This module defines the :class:`SymbolData` abstraction and derived classes.""" import datetime import logging import sqlite3 -import sys import warnings from pathlib import Path from urllib.error import URLError @@ -33,17 +24,17 @@ import requests import requests.exceptions -from .errors import DataError -from .utils import (hash_, periods_per_year_from_datetime_index, - resample_returns) - -__all__ = ["YahooFinance", "Fred", - "UserProvidedMarketData", "DownloadedMarketData"] +from ..errors import DataError logger = logging.getLogger(__name__) BASE_LOCATION = Path.home() / "cvxportfolio_data" +__all__ = [ + '_loader_csv', '_loader_pickle', '_loader_sqlite', + '_storer_csv', '_storer_pickle', '_storer_sqlite', + 'Fred', 'SymbolData', 'YahooFinance', 'BASE_LOCATION'] + def now_timezoned(): """Return current timestamp with local timezone. 
@@ -329,15 +320,14 @@ def _clean(data): # fill open price with close from day(s) before # repeat as long as it helps (up to 1 year) for shifter in range(252): + logger.info( + "Filling opens with close from %s days before", shifter) orig_missing_opens = data['open'].isnull().sum() data['open'] = data['open'].fillna(data['close'].shift( shifter+1)) new_missing_opens = data['open'].isnull().sum() if orig_missing_opens == new_missing_opens: break - logger.info( - "Filled missing open prices with close from %s periods before", - shifter+1) # fill close price with same day's open data['close'] = data['close'].fillna(data['open']) @@ -767,676 +757,3 @@ def _storer_csv(symbol, data, storage_location): pd.DataFrame(data).dtypes.astype("string").to_csv( storage_location / f"{symbol}___dtypes.csv") data.to_csv(storage_location / f"{symbol}.csv") - -# -# Market Data -# - -class MarketData: - """Prepare, hold, and serve market data. - - :method serve: Serve data for policy and simulator at time :math:`t`. - """ - - def serve(self, t): - """Serve data for policy and simulator at time :math:`t`. - - :param t: Trading time. It must be included in the timestamps returned - by :meth:`trading_calendar`. - :type t: pandas.Timestamp - - :returns: past_returns, current_returns, past_volumes, current_volumes, - current_prices - :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame, - pandas.Series, pandas.Series) - """ - raise NotImplementedError # pragma: no cover - - # pylint: disable=redundant-returns-doc - def trading_calendar( - self, start_time=None, end_time=None, include_end=True): - """Get trading calendar between times. - - :param start_time: Initial time of the trading calendar. Always - inclusive if present. If None, use the first available time. - :type start_time: pandas.Timestamp - :param end_time: Final time of the trading calendar. If None, - use the last available time. - :type end_time: pandas.Timestamp - :param include_end: Include end time. - :type include_end: bool - - :returns: Trading calendar. - :rtype: pandas.DatetimeIndex - """ - raise NotImplementedError # pragma: no cover - - @property - def periods_per_year(self): - """Average trading periods per year. - - :rtype: int - """ - raise NotImplementedError # pragma: no cover - - @property - def full_universe(self): # pylint: disable=redundant-returns-doc - """Full universe, which might not be available for trading. - - :returns: Full universe. - :rtype: pandas.Index - """ - raise NotImplementedError # pragma: no cover - - # pylint: disable=unused-argument, redundant-returns-doc - def partial_universe_signature(self, partial_universe): - """Unique signature of this instance with a partial universe. - - A partial universe is a subset of the full universe that is - available at some time for trading. - - This is used in cvxportfolio.cache to sign back-test caches that - are saved on disk. If not redefined it returns None which disables - on-disk caching. - - :param partial_universe: A subset of the full universe. - :type partial_universe: pandas.Index - - :returns: Signature. 
- :rtype: str - """ - return None - -# compiled based on Interactive Brokers benchmark rates choices -# (see https://www.ibkrguides.com/kb/article-2949.htm) -# and their FRED codes -RATES = { - 'USDOLLAR': 'DFF', # Federal funds effective rate - 'EURO': 'ECBESTRVOLWGTTRMDMNRT', # BCE short term rate - 'GBPOUND': 'IUDSOIA', # SONIA - 'JPYEN': 'IRSTCB01JPM156N', # updated monthly - } - -class MarketDataInMemory(MarketData): - """Market data that is stored in memory when initialized.""" - - # this is overwritten in the derived classes' initializers - returns = None - - def __init__( - self, trading_frequency, base_location, cash_key, min_history, - online_usage = False): - """This must be called by the derived classes.""" - if (self.returns.index[-1] - self.returns.index[0]) < min_history: - raise DataError( - "The provided returns have less history " - + f"than the min_history {min_history}") - if trading_frequency: - self._downsample(trading_frequency) - self.trading_frequency = trading_frequency - - self._set_read_only() - self._check_sizes() - self._mask = None - self._masked_returns = None - self._masked_volumes = None - self._masked_prices = None - self.base_location = Path(base_location) - self.cash_key = cash_key - self._min_history_timedelta = min_history - self.online_usage = online_usage - - def _mask_dataframes(self, mask): - """Mask internal dataframes if necessary.""" - if (self._mask is None) or not np.all(self._mask == mask): - logger.info("Masking internal %s dataframes.", - self.__class__.__name__) - colmask = self.returns.columns[mask] - # self._masked_returns = self._df_or_ser_set_read_only( - # pd.DataFrame(self.returns.iloc[:, mask], copy=True)) - self._masked_returns = self._df_or_ser_set_read_only( - pd.DataFrame(self.returns.loc[:, colmask], copy=True)) - # self._masked_returns = self._df_or_ser_set_read_only( - # pd.DataFrame(np.array(self.returns.values[:, mask]), - # index=self.returns.index, columns=colmask)) - if not self.volumes is None: - # self._masked_volumes = self._df_or_ser_set_read_only( - # pd.DataFrame(self.volumes.iloc[:, mask[:-1]], copy=True)) - self._masked_volumes = self._df_or_ser_set_read_only( - pd.DataFrame(self.volumes.loc[:, colmask[:-1]], copy=True)) - # self._masked_volumes = self._df_or_ser_set_read_only( - # pd.DataFrame(np.array(self.volumes.values[:, mask[:-1]]), - # index=self.volumes.index, columns=colmask[:-1])) - if not self.prices is None: - # self._masked_prices = self._df_or_ser_set_read_only( - # pd.DataFrame(self.prices.iloc[:, mask[:-1]], copy=True)) - self._masked_prices = self._df_or_ser_set_read_only( - pd.DataFrame(self.prices.loc[:, colmask[:-1]], copy=True)) - self._mask = mask - - @property - def full_universe(self): - """Full universe, which might not be available for trading. - - :returns: Full universe. - :rtype: pandas.Index - """ - return self.returns.columns - - def serve(self, t): - """Serve data for policy and simulator at time :math:`t`. - - :param t: Time of execution, *e.g.*, stock market open of a given day. 
- :type t: pandas.Timestamp - - :returns: (past_returns, current_returns, past_volumes, - current_volumes, current_prices) - :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame or None, - pandas.Series or None, pandas.Series or None) - """ - - mask = self._universe_mask_at_time(t).values - self._mask_dataframes(mask) - - tidx = self.returns.index.get_loc(t) - past_returns = self._df_or_ser_set_read_only( - pd.DataFrame(self._masked_returns.iloc[:tidx])) - current_returns = self._df_or_ser_set_read_only( - pd.Series(self._masked_returns.iloc[tidx])) - - if not self.volumes is None: - tidx = self.volumes.index.get_loc(t) - past_volumes = self._df_or_ser_set_read_only( - pd.DataFrame(self._masked_volumes.iloc[:tidx])) - current_volumes = self._df_or_ser_set_read_only( - pd.Series(self._masked_volumes.iloc[tidx])) - else: - past_volumes = None - current_volumes = None - - if not self.prices is None: - tidx = self.prices.index.get_loc(t) - current_prices = self._df_or_ser_set_read_only( - pd.Series(self._masked_prices.iloc[tidx])) - else: - current_prices = None - - return (past_returns, current_returns, past_volumes, current_volumes, - current_prices) - - def _add_cash_column(self, cash_key, grace_period): - """Add the cash column to an already formed returns dataframe. - - This assumes that the trading periods are about equally spaced. - If, say, you have trading periods with very different lengths you - should redefine this method **and** replace the :class:`CashReturn` - objective term. - """ - - if not cash_key in RATES: - raise NotImplementedError( - 'Currently the only data pipelines built are for cash_key' - f' in {list(RATES)}') - - if self.returns.index.tz is None: - raise DataError( - 'Your provided dataframes are not timezone aware.' - + " This is not recommended, and doesn't allow to add the cash" - + " returns' column internally." - + " You can fix this by adding a timezone manually " - + "using pandas.DataFrame.tz_localize to the dataframes before" - + " you pass them, or you can provide" - + " the cash returns' column as the last column of the returns" - + " dataframe (so it has one more column than volumes and" - + " prices, if provided), and set the cash_key parameter to" - + " its name.") - - data = Fred( - RATES[cash_key], base_location=self.base_location, - grace_period=grace_period) - - cash_returns_per_period = resample_returns( - data.data/100, periods=self.periods_per_year) - - # we merge instead of assigning column because indexes might - # be misaligned (e.g., with tz-aware timestamps) - cash_returns_per_period.name = self.cash_key - original_returns_index = self.returns.index - tmp = pd.concat([self.returns, cash_returns_per_period], axis=1) - tmp[cash_key] = tmp[cash_key].ffill() - self.returns = tmp.loc[original_returns_index] - - def trading_calendar( - self, start_time=None, end_time=None, include_end=True): - """Get trading calendar from market data. - - :param start_time: Initial time of the trading calendar. Always - inclusive if present. If None, use the first available time. - :type start_time: pandas.Timestamp - :param end_time: Final time of the trading calendar. If None, - use the last available time. - :type end_time: pandas.Timestamp - :param include_end: Include end time. - :type include_end: bool - - :returns: Trading calendar. 
- :rtype: pandas.DatetimeIndex - """ - result = self.returns.index - result = result[result >= self._earliest_backtest_start] - if start_time: - result = result[result >= start_time] - if end_time: - result = result[(result <= end_time)] - if not include_end: - result = result[:-1] - return result - - def _universe_mask_at_time(self, t): - """Return the valid universe mask at time t.""" - past_returns = self.returns.loc[self.returns.index < t] - if self.online_usage: - valid_universe_mask = past_returns.count() >= self.min_history - else: - valid_universe_mask = ((past_returns.count() >= self.min_history) & - (~self.returns.loc[t].isnull())) - if sum(valid_universe_mask) <= 1: - raise DataError( - f'The trading universe at time {t} has size less or equal' - + ' than one, i.e., only the cash account. There are probably ' - + ' issues with missing data in the provided market returns.') - return valid_universe_mask - - @staticmethod - def _df_or_ser_set_read_only(df_or_ser): - """Set numpy array contained in dataframe to read only. - - This is done on data store internally before it is served to the - policy or the simulator to ensure data consistency in case some - element of the pipeline accidentally corrupts the data. - - This is enough to prevent direct assignement to the resulting - dataframe. However it could still be accidentally corrupted by - assigning to columns or indices that are not present in the - original. We avoid that case as well by returning a wrapped - dataframe (which doesn't copy data on creation) in - serve_data_policy and serve_data_simulator. - """ - data = df_or_ser.values - data.flags.writeable = False - if hasattr(df_or_ser, 'columns'): - return pd.DataFrame(data, index=df_or_ser.index, - columns=df_or_ser.columns) - return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name) - - def _set_read_only(self): - """Set internal dataframes to read-only.""" - - self.returns = self._df_or_ser_set_read_only(self.returns) - - if not self.prices is None: - self.prices = self._df_or_ser_set_read_only(self.prices) - - if not self.volumes is None: - self.volumes = self._df_or_ser_set_read_only(self.volumes) - - @property - def _earliest_backtest_start(self): - """Earliest date at which we can start a backtest.""" - return self.returns.iloc[:, :-1].dropna(how='all').index[ - self.min_history] - - sampling_intervals = { - 'weekly': 'W-MON', 'monthly': 'MS', 'quarterly': 'QS', 'annual': 'AS'} - - # @staticmethod - # def _is_first_interval_small(datetimeindex): - # """Check if post-resampling the first interval is small. - # - # We have no way of knowing exactly if the first interval - # needs to be dropped. We drop it if its length is smaller - # than the average of all others, minus 2 standard deviation. 
- # """ - # first_interval = (datetimeindex[1] - datetimeindex[0]) - # all_others = (datetimeindex[2:] - datetimeindex[1:-1]) - # return first_interval < (all_others.mean() - 2 * all_others.std()) - - def _downsample(self, interval): - """_downsample market data.""" - if not interval in self.sampling_intervals: - raise SyntaxError( - 'Unsopported trading interval for down-sampling.') - interval = self.sampling_intervals[interval] - new_returns_index = pd.Series(self.returns.index, self.returns.index - ).resample(interval, closed='left', - label='left').first().values - # print(new_returns_index) - self.returns = np.exp(np.log( - 1+self.returns).resample(interval, closed='left', label='left' - ).sum(min_count=1))-1 - self.returns.index = new_returns_index - - # last row is always unknown - self.returns.iloc[-1] = np.nan - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.returns.index): - # self.returns = self.returns.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.returns.columns[:-1]: - self.returns[col].loc[ - (~(self.returns[col].isnull())).idxmax() - ] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.returns = self.returns.iloc[1:] - - if self.volumes is not None: - new_volumes_index = pd.Series( - self.volumes.index, self.volumes.index - ).resample(interval, closed='left', - label='left').first().values - self.volumes = self.volumes.resample( - interval, closed='left', label='left').sum(min_count=1) - self.volumes.index = new_volumes_index - - # last row is always unknown - self.volumes.iloc[-1] = np.nan - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.volumes.index): - # self.volumes = self.volumes.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.volumes.columns: - self.volumes[col].loc[ - (~(self.volumes[col].isnull())).idxmax() - ] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.volumes = self.volumes.iloc[1:] - - if self.prices is not None: - new_prices_index = pd.Series( - self.prices.index, self.prices.index - ).resample( - interval, closed='left', label='left').first().values - self.prices = self.prices.resample( - interval, closed='left', label='left').first() - self.prices.index = new_prices_index - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.prices.index): - # self.prices = self.prices.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.prices.columns: - self.prices[col].loc[ - (~(self.prices[col].isnull())).idxmax() - ] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.prices = self.prices.iloc[1:] - - def _check_sizes(self): - """Check sizes of user-provided dataframes.""" - - if (not self.volumes is None) and ( - not (self.volumes.shape[1] == self.returns.shape[1] - 1) - or not all(self.volumes.columns == self.returns.columns[:-1])): - raise SyntaxError( - 'Volumes should have same columns as returns, minus cash_key.') - - if (not self.prices is None) and ( - not (self.prices.shape[1] == self.returns.shape[1] - 1) - or not all(self.prices.columns == self.returns.columns[:-1])): - raise SyntaxError( - 'Prices should have same columns as returns, minus cash_key.') - - @property - def periods_per_year(self): - """Average trading periods per year inferred from the data. - - :returns: Average periods per year. 
- :rtype: int - """ - return periods_per_year_from_datetime_index(self.returns.index) - - @property - def min_history(self): - """Min history expressed in periods. - - :returns: How many non-null elements of the past returns for a given - name are required to include it. - :rtype: int - """ - return int(np.round(self.periods_per_year * ( - self._min_history_timedelta / pd.Timedelta('365.24d')))) - - -class UserProvidedMarketData(MarketDataInMemory): - """User-provided market data. - - :param returns: Historical open-to-open returns. The return - at time :math:`t` is :math:`r_t = p_{t+1}/p_t -1` where - :math:`p_t` is the (open) price at time :math:`t`. Must - have datetime index. You can also include cash - returns as its last column, and set ``cash_key`` below to the last - column's name. - :type returns: pandas.DataFrame - :param volumes: Historical market volumes, expressed in units - of value (*e.g.*, US dollars). - :type volumes: pandas.DataFrame or None - :param prices: Historical open prices (*e.g.*, used for rounding - trades in the :class:`MarketSimulator`). - :type prices: pandas.DataFrame or None - :param trading_frequency: Instead of using frequency implied by - the index of the returns, down-sample all dataframes. - We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and - ``'annual'``. By default (None) don't down-sample. - :type trading_frequency: str or None - :param min_history: Minimum amount of time for which the returns - are not ``np.nan`` before each assets enters in a back-test. - :type min_history: pandas.Timedelta - :param base_location: The location of the storage, only used - in case it downloads the cash returns. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param cash_key: Name of the cash account. If not the last column - of the provided returns, it will be downloaded. In that case you should - make sure your provided dataframes have a timezone aware datetime - index. Its returns are the risk-free rate. - :type cash_key: str - :param online_usage: Disable removal of assets that have ``np.nan`` returns - for the given time. Default False. - :type online_usage: bool - """ - - # pylint: disable=too-many-arguments - def __init__(self, returns, volumes=None, prices=None, - copy_dataframes=True, trading_frequency=None, - min_history=pd.Timedelta('365.24d'), - base_location=BASE_LOCATION, - grace_period=pd.Timedelta('1d'), - cash_key='USDOLLAR', - online_usage=False): - - if returns is None: - raise SyntaxError( - "If you don't specify a universe you should pass `returns`.") - - self.base_location = Path(base_location) - self.cash_key = cash_key - - self.returns = pd.DataFrame(returns, copy=copy_dataframes) - self.volumes = volumes if volumes is None else\ - pd.DataFrame(volumes, copy=copy_dataframes) - self.prices = prices if prices is None else\ - pd.DataFrame(prices, copy=copy_dataframes) - - if cash_key != returns.columns[-1]: - self._add_cash_column(cash_key, grace_period=grace_period) - - # this is mandatory - super().__init__( - trading_frequency=trading_frequency, - base_location=base_location, - cash_key=cash_key, - min_history=min_history, - online_usage=online_usage) - - -class DownloadedMarketData(MarketDataInMemory): - """Market data that is downloaded. - - :param universe: List of names as understood by the data source - used, *e.g.*, ``['AAPL', 'GOOG']`` if using the default - Yahoo Finance data source. - :type universe: list - :param datasource: The data source used. 
- :type datasource: str or :class:`SymbolData` class - :param cash_key: Name of the cash account, its rates will be downloaded - and added as last columns of the returns. Its returns are the - risk-free rate. - :type cash_key: str - :param base_location: The location of the storage. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param min_history: Minimum amount of time for which the returns - are not ``np.nan`` before each assets enters in a back-test. - :type min_history: pandas.Timedelta - :param grace_period: If the most recent observation of each symbol's - data is less old than this we do not download new data. - By default it's one day. - :type grace_period: pandas.Timedelta - :param trading_frequency: Instead of using frequency implied by - the index of the returns, down-sample all dataframes. - We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and - ``'annual'``. By default (None) don't down-sample. - :type trading_frequency: str or None - :param online_usage: Disable removal of assets that have ``np.nan`` returns - for the given time. Default False. - :type online_usage: bool - """ - - # pylint: disable=too-many-arguments - def __init__(self, - universe=(), - datasource='YahooFinance', - cash_key='USDOLLAR', - base_location=BASE_LOCATION, - storage_backend='pickle', - min_history=pd.Timedelta('365.24d'), - grace_period=pd.Timedelta('1d'), - trading_frequency=None, - online_usage=False): - """Initializer.""" - - # drop duplicates and ensure ordering - universe = sorted(set(universe)) - - self.base_location = Path(base_location) - self.cash_key = cash_key - if isinstance(datasource, type): - self.datasource = datasource - else: # try to load in current module - self.datasource = globals()[datasource] - self._get_market_data( - universe, grace_period=grace_period, - storage_backend=storage_backend) - self._add_cash_column(self.cash_key, grace_period=grace_period) - self._remove_missing_recent() - - # this is mandatory - super().__init__( - trading_frequency=trading_frequency, - base_location=base_location, - cash_key=cash_key, - min_history=min_history, - online_usage=online_usage) - - def _get_market_data(self, universe, grace_period, storage_backend): - """Download market data.""" - database_accesses = {} - print('Updating data', end='') - sys.stdout.flush() - - for stock in universe: - logger.info( - 'Updating %s with %s.', stock, self.datasource.__name__) - print('.', end='') - sys.stdout.flush() - database_accesses[stock] = self.datasource( - stock, base_location=self.base_location, - grace_period=grace_period, storage_backend=storage_backend) - print() - - if hasattr(self.datasource, 'IS_OHLCVR') and self.datasource.IS_OHLCVR: - self.returns = pd.DataFrame( - {stock: database_accesses[stock].data['return'] - for stock in universe}) - self.volumes = pd.DataFrame( - {stock: database_accesses[stock].data['valuevolume'] - for stock in universe}) - self.prices = pd.DataFrame( - {stock: database_accesses[stock].data['open'] - for stock in universe}) - else: # for now only Fred for indexes, we assume prices! 
- assert isinstance(database_accesses[universe[0]].data, pd.Series) - self.prices = pd.DataFrame( - # open prices - {stock: database_accesses[stock].data for stock in universe}) - self.returns = 1 - self.prices / self.prices.shift(-1) - self.volumes = None - - def _remove_missing_recent(self): - """Clean recent data. - - Yahoo Finance may has issues with most recent data; we remove - recent days if there are NaNs. - """ - - if self.prices.iloc[-5:].isnull().any().any(): - logger.debug( - 'Removing some recent lines because there are missing values.') - drop_at = self.prices.iloc[-5:].isnull().any(axis=1).idxmax() - logger.debug('Dropping at index %s', drop_at) - self.returns = self.returns.loc[self.returns.index < drop_at] - if self.prices is not None: - self.prices = self.prices.loc[self.prices.index < drop_at] - if self.volumes is not None: - self.volumes = self.volumes.loc[self.volumes.index < drop_at] - - # for consistency we must also nan-out the last row - # of returns and volumes - self.returns.iloc[-1] = np.nan - if self.volumes is not None: - self.volumes.iloc[-1] = np.nan - - def partial_universe_signature(self, partial_universe): - """Unique signature of this instance with a partial universe. - - A partial universe is a subset of the full universe that is - available at some time for trading. - - This is used in cvxportfolio.cache to sign back-test caches that - are saved on disk. See its implementation below for details. If - not redefined it returns None which disables on-disk caching. - - :param partial_universe: A subset of the full universe. - :type partial_universe: pandas.Index - - :returns: Signature. - :rtype: str - """ - assert isinstance(partial_universe, pd.Index) - assert np.all(partial_universe.isin(self.full_universe)) - result = f'{self.__class__.__name__}(' - result += f'datasource={self.datasource.__name__}, ' - result += f'partial_universe_hash={hash_(np.array(partial_universe))},' - result += f' trading_frequency={self.trading_frequency})' - return result From 87ba504ac990864e670a7e7db85734806f9b1d8a Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:21:38 +0400 Subject: [PATCH 08/38] Applied changes of commit 564e1fe --- cvxportfolio/data/__init__.py | 2 +- cvxportfolio/data/market_data.py | 3 +- cvxportfolio/data/symbol_data.py | 129 +++++++++++++++++-------------- 3 files changed, 71 insertions(+), 63 deletions(-) diff --git a/cvxportfolio/data/__init__.py b/cvxportfolio/data/__init__.py index 97aad902d..0c2bc403a 100644 --- a/cvxportfolio/data/__init__.py +++ b/cvxportfolio/data/__init__.py @@ -23,4 +23,4 @@ from .symbol_data import * __all__ = [ - "YahooFinance", "Fred", "UserProvidedMarketData", "DownloadedMarketData"] \ No newline at end of file + "YahooFinance", "Fred", "UserProvidedMarketData", "DownloadedMarketData"] diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py index be723a618..d382abdc9 100644 --- a/cvxportfolio/data/market_data.py +++ b/cvxportfolio/data/market_data.py @@ -49,7 +49,6 @@ def serve(self, t): """ raise NotImplementedError # pragma: no cover - # pylint: disable=redundant-returns-doc def trading_calendar( self, start_time=None, end_time=None, include_end=True): """Get trading calendar between times. @@ -77,7 +76,7 @@ def periods_per_year(self): raise NotImplementedError # pragma: no cover @property - def full_universe(self): # pylint: disable=redundant-returns-doc + def full_universe(self): """Full universe, which might not be available for trading. :returns: Full universe. 
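For context while reviewing the refactor that follows, here is a minimal, untested sketch of how these market-data classes are driven downstream. The two symbols and the start date are placeholders; ``serve`` returns the five objects documented in ``MarketData.serve``, and the calendar timestamp is made timezone-aware because the stored returns index is.

.. code:: python

    import pandas as pd
    import cvxportfolio as cvx

    # placeholder two-name universe; default datasource is YahooFinance
    md = cvx.DownloadedMarketData(['AAPL', 'GOOG'])

    # trading calendar from a (timezone-aware) start time onwards
    cal = md.trading_calendar(
        start_time=pd.Timestamp('2023-01-03', tz='UTC'))

    # past/current returns and volumes, and current prices, at cal[0]
    (past_returns, current_returns, past_volumes, current_volumes,
        current_prices) = md.serve(cal[0])
    print(current_prices)

The same calls are what the back-test simulator makes internally at each point of the trading calendar.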
diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 142d5ff41..29ddcb634 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -249,27 +249,17 @@ def _timestamp_convert(unix_seconds_ts):
     """Convert a UNIX timestamp in seconds to a pandas.Timestamp."""
     return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC')
 
+class OHLCV(SymbolData): # pylint: disable=abstract-method
+    """Base class for Open-High-Low-Close-Volume symbol data."""
 
-class YahooFinance(SymbolData):
-    """Yahoo Finance symbol data.
+    # TODO: factor quality check and clean into total-return related and non-
 
-    :param symbol: The symbol that we downloaded.
-    :type symbol: str
-    :param storage_backend: The storage backend, implemented ones are
-        ``'pickle'``, ``'csv'``, and ``'sqlite'``.
-    :type storage_backend: str
-    :param base_storage_location: The location of the storage. We store in a
-        subdirectory named after the class which derives from this.
-    :type base_storage_location: pathlib.Path
-    :param grace_period: If the most recent observation in the data is less
-        old than this we do not download new data.
-    :type grace_period: pandas.Timedelta
+class OHLCVTR(OHLCV): # pylint: disable=abstract-method
+    """Base class for Open-High-Low-Close-Volume-Total Return symbol data."""
 
-    :attribute data: The downloaded, and cleaned, data for the symbol.
-    :type data: pandas.DataFrame
-    """
+    # TODO: consider creating an OHLCVAC (adjusted closes) subclass
 
-    # is open-high-low-close-volume-(total)return
+    # is open-high-low-close-volume-total return
     IS_OHLCVR = True
 
     @staticmethod
@@ -379,6 +369,66 @@ def _clean(data):
 
         return data
 
+    def _quality_check(self, data):
+        """Analyze quality of the OHLCV-TR data."""
+
+        # zero volume
+        zerovol_idx = data.index[data.volume == 0]
+        if len(zerovol_idx) > 0:
+            logger.warning(
+                '%s("%s") has volume equal to zero for timestamps: %s',
+                self.__class__.__name__, self.symbol, zerovol_idx)
+
+        def print_extreme(logreturns, name, sigmas=50):
+
+            # TODO: choose
+            m, s = logreturns.median(), np.sqrt((logreturns**2).median())
+            normalized = (logreturns - m)/s
+
+            # normalized = logreturns / logreturns.rolling(252).std().shift(1)
+
+            extremereturn_idx = normalized.index[np.abs(normalized) > sigmas]
+            if len(extremereturn_idx) > 0:
+                logger.warning(
+                    '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s',
+                    self.__class__.__name__, self.symbol, name, sigmas,
+                    extremereturn_idx)
+
+        # extreme logreturns
+        logreturns = np.log(1 + data['return']).dropna()
+        print_extreme(logreturns, 'total returns')
+
+        # extreme open2close
+        open2close = (np.log(data['close']) - np.log(data['open'])).dropna()
+        print_extreme(open2close, 'open to close returns')
+
+        # extreme open2high
+        open2high = (np.log(data['high']) - np.log(data['open'])).dropna()
+        print_extreme(open2high, 'open to high returns')
+
+        # extreme open2low
+        open2low = (np.log(data['low']) - np.log(data['open'])).dropna()
+        print_extreme(open2low, 'open to low returns')
+
+class YahooFinance(OHLCVTR):
+    """Yahoo Finance symbol data.
+
+    :param symbol: The symbol that we downloaded.
+    :type symbol: str
+    :param storage_backend: The storage backend, implemented ones are
+        ``'pickle'``, ``'csv'``, and ``'sqlite'``.
+    :type storage_backend: str
+    :param base_storage_location: The location of the storage. We store in a
+        subdirectory named after the class which derives from this.
+ :type base_storage_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded, and cleaned, data for the symbol. + :type data: pandas.DataFrame + """ + @staticmethod def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): """Get 1 day OHLC from Yahoo finance. @@ -432,9 +482,9 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): data['indicators']['quote'][0], index=index) df_result['adjclose'] = data[ 'indicators']['adjclose'][0]['adjclose'] - except KeyError: + except KeyError as exc: raise DataError(f'Yahoo finance download of {ticker} failed.' - + ' Json:', str(res.json())) # pragma: no cover + + ' Json:', str(res.json())) from exc # pragma: no cover # last timestamp is probably broken (not timed to market open) # we set its time to same as the day before, but this is wrong @@ -489,47 +539,6 @@ def _download(self, symbol, current=None, new = self._clean(new) return pd.concat([current.iloc[:-overlap], new]) - def _quality_check(self, data): - """Analyze quality of the OHLCV-TR data.""" - - # zero volume - zerovol_idx = data.index[data.volume == 0] - if len(zerovol_idx) > 0: - logger.warning( - '%s("%s") has volume equal to zero for timestamps: %s', - self.__class__.__name__, self.symbol, zerovol_idx) - - def print_extreme(logreturns, name, sigmas=50): - - # TODO: choose - m, s = logreturns.median(), np.sqrt((logreturns**2).median()) - normalized = (logreturns - m)/s - - # normalized = logreturns / logreturns.rolling(252).std().shift(1) - - extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] - if len(extremereturn_idx) > 0: - logger.warning( - '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', - self.__class__.__name__, self.symbol, name, sigmas, - extremereturn_idx) - - # extreme logreturns - logreturns = np.log(1 + data['return']).dropna() - print_extreme(logreturns, 'total returns') - - # extreme open2close - open2close = np.log(data['close']) - np.log(data['open']).dropna() - print_extreme(open2close, 'open to close returns') - - # extreme open2high - open2high = np.log(data['high']) - np.log(data['open']).dropna() - print_extreme(open2high, 'open to high returns') - - # extreme open2low - open2low = np.log(data['low']) - np.log(data['open']).dropna() - print_extreme(open2low, 'open to low returns') - def _preload(self, data): """Prepare data for use by Cvxportfolio. 
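The robust normalization used in ``print_extreme`` above (median for location, root median square for scale) is easy to sanity-check in isolation. A minimal standalone sketch, on synthetic log-returns with one injected data error (all numbers here are made up for illustration):

.. code:: python

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    lrets = pd.Series(
        rng.normal(0., 0.01, 1000),
        index=pd.date_range('2020-01-01', periods=1000, tz='UTC'))
    lrets.iloc[500] = 1.5  # injected error, e.g., an unrecorded split

    # same normalization as in print_extreme
    m, s = lrets.median(), np.sqrt((lrets**2).median())
    normalized = (lrets - m) / s

    # with the default sigmas=50 only the injected error is flagged
    print(lrets.index[np.abs(normalized) > 50])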
From 7d86c9ca4511ade3d5da9e28dafc2e641040b0dc Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:23:57 +0400 Subject: [PATCH 09/38] Applied changes of commit 13f119f --- cvxportfolio/data/symbol_data.py | 40 ++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 29ddcb634..f60a0f131 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -262,9 +262,8 @@ class OHLCVTR(OHLCV): # pylint: disable=abstract-method # is open-high-low-close-volume-total return IS_OHLCVR = True - @staticmethod - def _clean(data): - """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" + def _nan_impossible(self, data): + """Set impossible values to NaN.""" # print(data) # print(data.isnull().sum()) @@ -286,6 +285,8 @@ def _clean(data): # print(data) # print(data.isnull().sum()) + # TODO: these can be made smarter (sometimes the open is clearly wrong) + # if low is not the lowest, set it to nan data['low'].loc[ data['low'] > data[['open', 'high', 'close']].min(1)] = np.nan @@ -297,9 +298,11 @@ def _clean(data): # print(data) # print(data.isnull().sum()) - # - # fills - # + def _fill_easy(self, data): + """Make easy fills.""" + + # print(data) + # print(data.isnull().sum()) # fill volumes with zeros (safest choice) data['volume'] = data['volume'].fillna(0.) @@ -331,9 +334,11 @@ def _clean(data): # print(data) # print(data.isnull().sum()) - # - # Compute returns - # + def _compute_total_returns(self, data): + """Compute total open-to-open returns.""" + + # print(data) + # print(data.isnull().sum()) # compute log of ratio between adjclose and close log_adjustment_ratio = np.log(data['adjclose'] / data['close']) @@ -358,15 +363,26 @@ def _clean(data): # intraday_logreturn.shift(-1) # ) # data["return"] = np.exp(open_to_open_logreturn) - 1 + + # print(data) + # print(data.isnull().sum()) + + def _clean(self, data): + """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" + + self._nan_impossible(data) + + self._fill_easy(data) + + self._compute_total_returns(data) + + # eliminate adjclose column del data["adjclose"] # eliminate last period's intraday data data.loc[data.index[-1], ["high", "low", "close", "return", "volume"]] = np.nan - # print(data) - # print(data.isnull().sum()) - return data def _quality_check(self, data): From 60b1459fdbc69e4113dc0e2db7924511e8e1fa38 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:26:15 +0400 Subject: [PATCH 10/38] Applied changes of commit b794c7dc --- cvxportfolio/data/symbol_data.py | 98 ++++++++++++++++++++++++++++---- 1 file changed, 86 insertions(+), 12 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index f60a0f131..0047e0953 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -262,6 +262,54 @@ class OHLCVTR(OHLCV): # pylint: disable=abstract-method # is open-high-low-close-volume-total return IS_OHLCVR = True + # rolstd windows for finding wrong logreturns + _ROLSTD_WINDOWS = [20, 60, 252] + + # threshold for finding wrong logreturns + _WRONG_LOGRET_THRESHOLD = 15 + + def _indexes_extreme_logrets_wrt_rolstddev(self, lrets, window, treshold): + """Get indexes of logreturns that are extreme wrt trailing stddev.""" + trailing_stdev = np.sqrt((lrets**2).rolling(window).median().shift(1)) + bad_indexes = lrets.index[np.abs(lrets / trailing_stdev) > treshold] + return bad_indexes + + def 
_find_wrong_daily_logreturns(self, lrets): + """Find indexes of logreturns that are most probably data errors.""" + bad_indexes = [] + for window in self._ROLSTD_WINDOWS: + bad_indexes.append( + set(self._indexes_extreme_logrets_wrt_rolstddev( + lrets, window=window, treshold=self._WRONG_LOGRET_THRESHOLD))) + bad_indexes.append( + set(self._indexes_extreme_logrets_wrt_rolstddev( + lrets.iloc[::-1], window=window, + treshold=self._WRONG_LOGRET_THRESHOLD))) + bad_indexes = set.intersection(*bad_indexes) + return bad_indexes + + # TODO: plan + # ffill adj closes & compute adj close logreturns + # use code above to get indexes of wrong ones, raise warnings, set to 0 + # + # check close vs adj close, there should be only dividends (with y finance) + # + # throw out opens that are not in [low, high] + # + # apply similar logic (perhaps using total lrets for the stddev) for + # open-close , close-high , close-low, throw out open/low/close not OK + # + # fill + # + # compute open-open total returns, then check with same logic for errors + # + # when doing append, make past data adhere to same format: recompute adj close + # + # could use volumes as well, if there are jumps in price due to + # splits not recorded, then price * volume should be more stable + # + # + def _nan_impossible(self, data): """Set impossible values to NaN.""" @@ -269,18 +317,32 @@ def _nan_impossible(self, data): # print(data.isnull().sum()) # nan-out nonpositive prices - data.loc[data["open"] <= 0, 'open'] = np.nan - data.loc[data["close"] <= 0, "close"] = np.nan - data.loc[data["high"] <= 0, "high"] = np.nan - data.loc[data["low"] <= 0, "low"] = np.nan - data.loc[data["adjclose"] <= 0, "adjclose"] = np.nan + for column in ["open", "close", "high", "low", "adjclose"]: + bad_indexes = data.index[data[column] <= 0] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s") has non-positive %s prices on timestamps: %s,' + + ' setting to nan', + self.__class__.__name__, self.symbol, column, bad_indexes) + data.loc[bad_indexes, column] = np.nan # nan-out negative volumes - data.loc[data["volume"] < 0, 'volume'] = np.nan + bad_indexes = data.index[data["volume"] < 0] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s") has negative volumes on timestamps: %s,' + + ' setting to nan', + self.__class__.__name__, self.symbol, bad_indexes) + data.loc[bad_indexes, "volume"] = np.nan # all infinity values are nans - data.iloc[:, :] = np.nan_to_num( - data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) + if np.isinf(data).sum().sum() > 0: + logger.warning( + '%s("%s") has +/- infinity values, setting those to nan', + self.__class__.__name__, self.symbol) + data.iloc[:, :] = np.nan_to_num( + data.values, copy=True, nan=np.nan, posinf=np.nan, + neginf=np.nan) # print(data) # print(data.isnull().sum()) @@ -288,12 +350,24 @@ def _nan_impossible(self, data): # TODO: these can be made smarter (sometimes the open is clearly wrong) # if low is not the lowest, set it to nan - data['low'].loc[ - data['low'] > data[['open', 'high', 'close']].min(1)] = np.nan + bad_indexes = data.index[ + data['low'] > data[['open', 'high', 'close']].min(1)] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s") low prices are not the lowest on timestamps: %s,' + + ' setting to nan', + self.__class__.__name__, self.symbol, bad_indexes) + data.loc[bad_indexes, "low"] = np.nan # if high is not the highest, set it to nan - data['high'].loc[ - data['high'] < data[['open', 'high', 'close']].max(1)] = np.nan + bad_indexes = data.index[ + data['high'] < 
data[['open', 'high', 'close']].max(1)] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s") high prices are not the highest on timestamps: %s,' + + ' setting to nan', + self.__class__.__name__, self.symbol, bad_indexes) + data.loc[bad_indexes, "high"] = np.nan # print(data) # print(data.isnull().sum()) From 24489eb4447ccd5be8a5ff5f4080c210fc7bf469 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 7 Feb 2024 11:05:25 +0400 Subject: [PATCH 11/38] trying different approach for timestamping last row in yahoofinance --- cvxportfolio/data/symbol_data.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 0047e0953..bca12bd44 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -559,7 +559,8 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): + 'Json output:', str(res.json())) if res.status_code != 200: - raise DataError(f'Yahoo finance download of {ticker} failed. Json:', + raise DataError( + f'Yahoo finance download of {ticker} failed. Json:', str(res.json())) # pragma: no cover data = res.json()['chart']['result'][0] @@ -576,16 +577,25 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): raise DataError(f'Yahoo finance download of {ticker} failed.' + ' Json:', str(res.json())) from exc # pragma: no cover - # last timestamp is probably broken (not timed to market open) - # we set its time to same as the day before, but this is wrong - # on days of DST switch. It's fine though because that line will be - # overwritten next update - if df_result.index[-1].time() != df_result.index[-2].time(): - tm1 = df_result.index[-2].time() - newlast = df_result.index[-1].replace( - hour=tm1.hour, minute=tm1.minute, second=tm1.second) - df_result.index = pd.DatetimeIndex( - list(df_result.index[:-1]) + [newlast]) + # last timestamp could be not timed to market open + this_periods_open_time = _timestamp_convert( + data['meta']['currentTradingPeriod']['regular']['start']) + + if df_result.index[-1] > this_periods_open_time: + index = df_result.index.to_numpy() + index[-1] = this_periods_open_time + df_result.index = pd.DatetimeIndex(index) + + # # last timestamp is probably broken (not timed to market open) + # # we set its time to same as the day before, but this is wrong + # # on days of DST switch. It's fine though because that line will be + # # overwritten next update + # if df_result.index[-1].time() != df_result.index[-2].time(): + # tm1 = df_result.index[-2].time() + # newlast = df_result.index[-1].replace( + # hour=tm1.hour, minute=tm1.minute, second=tm1.second) + # df_result.index = pd.DatetimeIndex( + # list(df_result.index[:-1]) + [newlast]) return df_result[ ['open', 'low', 'high', 'close', 'adjclose', 'volume']] From 66e7b903106e67009d52487784b2220ee9cb5baa Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Fri, 9 Feb 2024 10:38:11 +0400 Subject: [PATCH 12/38] minor --- cvxportfolio/data/symbol_data.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index bca12bd44..a42694cd4 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""This module defines the :class:`SymbolData` abstraction and derived classes.""" +"""This module defines :class:`SymbolData` and derived classes.""" + import datetime import logging import sqlite3 @@ -303,8 +304,8 @@ def _find_wrong_daily_logreturns(self, lrets): # # compute open-open total returns, then check with same logic for errors # - # when doing append, make past data adhere to same format: recompute adj close - # + # when doing append, make past data adhere to same format: recompute adj + # close # could use volumes as well, if there are jumps in price due to # splits not recorded, then price * volume should be more stable # @@ -521,7 +522,20 @@ class YahooFinance(OHLCVTR): @staticmethod def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): - """Get 1 day OHLC from Yahoo finance. + """Get 1-day OHLC-AC-V from Yahoo finance. + + This is roughly equivalent to + + .. code-block:: + + import yfinance as yf + yf.download(ticker) + + But it does no caching of any sort; only a single request call, + error checking (which result in exceptions going all the way to the + user, in the current design), json parsing, and a minimal effort to + restore the last timestamp. All processing and cleaning is done + elsewhere. Result is timestamped with the open time (time-zoned) of the instrument. @@ -597,6 +611,7 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): # df_result.index = pd.DatetimeIndex( # list(df_result.index[:-1]) + [newlast]) + # these are all the columns, we simply re-order them return df_result[ ['open', 'low', 'high', 'close', 'adjclose', 'volume']] From 788e72e2abac42806f715ec18f6c0c734c392767 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Fri, 9 Feb 2024 11:38:12 +0400 Subject: [PATCH 13/38] symbol_data --- cvxportfolio/data/symbol_data.py | 148 +++++++++++++++++++-------- examples/strategies/ftse100_daily.py | 1 + examples/universes.py | 4 +- 3 files changed, 107 insertions(+), 46 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index a42694cd4..ad51db479 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -250,44 +250,105 @@ def _timestamp_convert(unix_seconds_ts): """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') +# Windows for filtering extreme logreturns +_WINDOWS = (10, 20, 50, 100, 200) + +def _median_scale_around(lrets, window): + """Median absolute logreturn in a window around each timestamp.""" + return np.abs(lrets).rolling(window, center=True, min_periods=1).median() + +def _mean_scale_around(lrets, window): + """Root mean squared logreturn in a window around each timestamp.""" + return np.sqrt( + (lrets**2).rolling(window, center=True, min_periods=1).mean()) + +def _unlikeliness_score( + test_logreturns, reference_logreturns, scaler=_median_scale_around, + windows=_WINDOWS): + """Find problematic indexes for test logreturns compared w/ reference.""" + scaled = [ + np.abs(test_logreturns) / scaler(reference_logreturns, window) + for window in windows] + scaled = pd.DataFrame(scaled).T + return scaled.min(axis=1), scaled + + class OHLCV(SymbolData): # pylint: disable=abstract-method - """Base class for Open-High-Low-Close-Volume symbol data.""" + """Base class for Open-High-Low-Close-Volume symbol data. + + This operates on a dataframe with columns + + .. code-block:: + + ['open', 'low', 'high', 'close', 'volume'] + + or + + .. 
code-block:: + + ['open', 'low', 'high', 'close', 'volume', 'return'] + + in which case the ``'return'`` column is not processed. It only matters in + the :meth:`_preload`, method: if open-to-open returns are not present, + we compute them there. Otherwise these may be total returns (including + dividends, ...) and they're dealt with in derived classes. + """ # TODO: factor quality check and clean into total-return related and non- + def _preload(self, data): + """Prepare data for use by Cvxportfolio. + + We drop the `volume` column expressed in number of shares and + replace it with `valuevolume` which is an estimate of the (e.g., + US dollar) value of the volume exchanged on the day. + """ + + # this is not used currently, but if we implement an interface to a + # pure OHLCV data source there is no need to store the open-to-open + # returns, they can be computed here + if not 'return' in data.columns: + data['return'] = data['open'].pct_change().shift(-1) + + self._quality_check(data) + data["valuevolume"] = data["volume"] * data["open"] + del data["volume"] + + return data + class OHLCVTR(OHLCV): # pylint: disable=abstract-method - """Base class for Open-High-Low-Close-Volume-Total Return symbol data.""" + """Open-High-Low-Close-Volume-TotalReturn symbol data.""" # TODO: consider creating a OHLCVAC (adjusted closes) subclass # is open-high-low-close-volume-total return IS_OHLCVR = True - # rolstd windows for finding wrong logreturns - _ROLSTD_WINDOWS = [20, 60, 252] - - # threshold for finding wrong logreturns - _WRONG_LOGRET_THRESHOLD = 15 - - def _indexes_extreme_logrets_wrt_rolstddev(self, lrets, window, treshold): - """Get indexes of logreturns that are extreme wrt trailing stddev.""" - trailing_stdev = np.sqrt((lrets**2).rolling(window).median().shift(1)) - bad_indexes = lrets.index[np.abs(lrets / trailing_stdev) > treshold] - return bad_indexes - - def _find_wrong_daily_logreturns(self, lrets): - """Find indexes of logreturns that are most probably data errors.""" - bad_indexes = [] - for window in self._ROLSTD_WINDOWS: - bad_indexes.append( - set(self._indexes_extreme_logrets_wrt_rolstddev( - lrets, window=window, treshold=self._WRONG_LOGRET_THRESHOLD))) - bad_indexes.append( - set(self._indexes_extreme_logrets_wrt_rolstddev( - lrets.iloc[::-1], window=window, - treshold=self._WRONG_LOGRET_THRESHOLD))) - bad_indexes = set.intersection(*bad_indexes) - return bad_indexes + # # rolstd windows for finding wrong logreturns + # _ROLSTD_WINDOWS = [20, 60, 252] + + # # threshold for finding wrong logreturns + # _WRONG_LOGRET_THRESHOLD = 15 + + # def _indexes_extreme_logrets_wrt_rolstddev(self, lrets, window, treshold): + # """Get indexes of logreturns that are extreme wrt trailing stddev.""" + # trailing_stdev = np.sqrt((lrets**2).rolling(window).median().shift(1)) + # bad_indexes = lrets.index[np.abs(lrets / trailing_stdev) > treshold] + # return bad_indexes + + # def _find_wrong_daily_logreturns(self, lrets): + # """Find indexes of logreturns that are most probably data errors.""" + # bad_indexes = [] + # for window in self._ROLSTD_WINDOWS: + # bad_indexes.append( + # set(self._indexes_extreme_logrets_wrt_rolstddev( + # lrets, window=window, treshold=self._WRONG_LOGRET_THRESHOLD))) + # bad_indexes.append( + # set(self._indexes_extreme_logrets_wrt_rolstddev( + # lrets.iloc[::-1], window=window, + # treshold=self._WRONG_LOGRET_THRESHOLD))) + # bad_indexes = set.intersection(*bad_indexes) + # return bad_indexes # TODO: plan # ffill adj closes & compute adj close logreturns @@ -442,7 
+503,7 @@ def _compute_total_returns(self, data): # print(data) # print(data.isnull().sum()) - def _clean(self, data): + def _process(self, data): """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" self._nan_impossible(data) @@ -501,7 +562,18 @@ def print_extreme(logreturns, name, sigmas=50): open2low = np.log(data['low']) - np.log(data['open']).dropna() print_extreme(open2low, 'open to low returns') -class YahooFinance(OHLCVTR): + +class OHLCVAC(OHLCVTR): + """Open-High-Low-Close-Volume-AdjustedClose data. + + This is modeled after the data returned by Yahoo Finance. It implements + the transformation required to conform to the + Open-High-Low-Close-Volume-TotalReturn model, that is, compute + returns from the adjusted closes, and do some error checks. + """ + + +class YahooFinance(OHLCVAC): """Yahoo Finance symbol data. :param symbol: The symbol that we downloaded. @@ -633,6 +705,7 @@ def _download(self, symbol, current=None, Returns: updated (pandas.DataFrame): updated DataFrame for the symbol """ + # TODO this could be put at a much lower class hierarchy if overlap < 2: raise SyntaxError( f'{self.__class__.__name__} with overlap smaller than 2' @@ -640,7 +713,7 @@ def _download(self, symbol, current=None, if (current is None) or (len(current) < overlap): updated = self._get_data_yahoo(symbol, **kwargs) logger.info('Downloading from the start.') - result = self._clean(updated) + result = self._process(updated) # we remove first row if it contains NaNs if np.any(result.iloc[0].isnull()): result = result.iloc[1:] @@ -651,22 +724,9 @@ def _download(self, symbol, current=None, 'Skipping download because stored data is recent enough.') return current new = self._get_data_yahoo(symbol, start=current.index[-overlap]) - new = self._clean(new) + new = self._process(new) return pd.concat([current.iloc[:-overlap], new]) - def _preload(self, data): - """Prepare data for use by Cvxportfolio. - - We drop the `volume` column expressed in number of stocks and - replace it with `valuevolume` which is an estimate of the (e.g., - US dollar) value of the volume exchanged on the day. - """ - - self._quality_check(data) - data["valuevolume"] = data["volume"] * data["open"] - del data["volume"] - - return data # # Fred. 
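The centered-window scoring introduced by ``_unlikeliness_score`` above can be previewed on its own. A minimal sketch on synthetic data, with the window sizes copied from ``_WINDOWS`` and an injected error; because the per-window scores are combined with a minimum, a point scores high only if it is extreme at every window size:

.. code:: python

    import numpy as np
    import pandas as pd

    def median_scale_around(lrets, window):
        # same scaler as _median_scale_around in the diff above
        return np.abs(lrets).rolling(
            window, center=True, min_periods=1).median()

    rng = np.random.default_rng(1)
    reference = pd.Series(rng.normal(0., 0.01, 500))
    test = reference.copy()
    test.iloc[250] = 0.5  # suspicious log-return

    # one column of scaled scores per window, as in _unlikeliness_score
    scaled = pd.DataFrame([
        np.abs(test) / median_scale_around(reference, w)
        for w in (10, 20, 50, 100, 200)]).T
    score = scaled.min(axis=1)

    print(score.idxmax())  # 250, the injected point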
diff --git a/examples/strategies/ftse100_daily.py b/examples/strategies/ftse100_daily.py
index 0f0b976db..bc99dd472 100644
--- a/examples/strategies/ftse100_daily.py
+++ b/examples/strategies/ftse100_daily.py
@@ -67,6 +67,7 @@ def policy(gamma_risk, gamma_trade):
 
     else:
         import matplotlib.pyplot as plt
+
         #INDEX_ETF = 'DIA'
 
         research_sim = cvx.StockMarketSimulator(FTSE100, cash_key='GBPOUND')
diff --git a/examples/universes.py b/examples/universes.py
index 8fd2ca747..3687a1112 100644
--- a/examples/universes.py
+++ b/examples/universes.py
@@ -132,8 +132,8 @@
     'ftse100': {
         'page': 'https://en.wikipedia.org/wiki/FTSE_100_Index',
         'table_number': -1,
-        'column_number':1,
-        'suffix':'.L',
+        'column_number': 1,
+        'suffix': '.L',
     }
 }
 

From 96b239a329482104b8cb101c61c76478894e85db Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Mon, 12 Feb 2024 11:25:40 +0400
Subject: [PATCH 14/38] refactoring _process of OLHCV

---
 cvxportfolio/data.py             | 1398 ++++++++++++++++++++++++++++++
 cvxportfolio/data/market_data.py |    2 +-
 cvxportfolio/data/symbol_data.py |  325 ++++---
 cvxportfolio/tests/test_data.py  |    2 +-
 4 files changed, 1604 insertions(+), 123 deletions(-)
 create mode 100644 cvxportfolio/data.py

diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py
new file mode 100644
index 000000000..3e2be8232
--- /dev/null
+++ b/cvxportfolio/data.py
@@ -0,0 +1,1398 @@
+# Copyright 2023 Enzo Busseti
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This module includes classes that download, store, and serve market data.
+
+The two main abstractions are :class:`SymbolData` and :class:`MarketData`.
+Neither is exposed outside this module; their derived classes instead are.
+
+If you want to interface cvxportfolio with financial data sources other
+than the ones we provide, you should derive from either of those two classes.
+"""
+
+import datetime
+import logging
+import sqlite3
+import sys
+import warnings
+from pathlib import Path
+from urllib.error import URLError
+
+import numpy as np
+import pandas as pd
+import requests
+import requests.exceptions
+
+from .errors import DataError
+from .utils import (hash_, make_numeric, periods_per_year_from_datetime_index,
+                    resample_returns)
+
+__all__ = ["YahooFinance", "Fred",
+           "UserProvidedMarketData", "DownloadedMarketData"]
+
+logger = logging.getLogger(__name__)
+
+BASE_LOCATION = Path.home() / "cvxportfolio_data"
+
+def now_timezoned():
+    """Return current timestamp with local timezone.
+
+    :returns: Current timestamp with local timezone.
+    :rtype: pandas.Timestamp
+    """
+    return pd.Timestamp(
+        datetime.datetime.now(datetime.timezone.utc).astimezone())
+
+class SymbolData:
+    """Base class for a single symbol's time series data.
+
+    The data is either in the form of a Pandas Series or DataFrame
+    and has a datetime index.
+
+    This class needs to be derived. At a minimum,
+    one should redefine the ``_download`` method, which
+    implements the downloading of the symbol's time series
+    from an external source. 
The method takes the current (already + downloaded and stored) data and is supposed to **only append** to it. + In this way we only store new data and don't modify already downloaded + data. + + Additionally one can redefine the ``_preload`` method, which prepares + data to serve to the user (so the data is stored in a different format + than what the user sees.) We found that this separation can be useful. + + This class interacts with module-level functions named ``_loader_BACKEND`` + and ``_storer_BACKEND``, where ``BACKEND`` is the name of the storage + system used. We define ``pickle``, ``csv``, and ``sqlite`` backends. + These may have limitations. See their docstrings for more information. + + + :param symbol: The symbol that we downloaded. + :type symbol: str + :param storage_backend: The storage backend, implemented ones are + ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. + :type storage_backend: str + :param base_location: The location of the storage. We store in a + subdirectory named after the class which derives from this. By default + it's a directory named ``cvxportfolio_data`` in your home folder. + :type base_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. By default it's one day. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded data for the symbol. + """ + + def __init__(self, symbol, + storage_backend='pickle', + base_location=BASE_LOCATION, + grace_period=pd.Timedelta('1d')): + self._symbol = symbol + self._storage_backend = storage_backend + self._base_location = base_location + self.update(grace_period) + self._data = self.load() + + @property + def storage_location(self): + """Storage location. Directory is created if not existent. + + :rtype: pathlib.Path + """ + loc = self._base_location / f"{self.__class__.__name__}" + loc.mkdir(parents=True, exist_ok=True) + return loc + + @property + def symbol(self): + """The symbol whose data this instance contains. + + :rtype: str + """ + return self._symbol + + @property + def data(self): + """Time series data, updated to the most recent observation. + + :rtype: pandas.Series or pandas.DataFrame + """ + return self._data + + def _load_raw(self): + """Load raw data from database.""" + # we could implement multiprocess safety here + loader = globals()['_loader_' + self._storage_backend] + try: + logger.info( + f"{self.__class__.__name__} is trying to load {self.symbol}" + + f" with {self._storage_backend} backend" + + f" from {self.storage_location}") + return loader(self.symbol, self.storage_location) + except FileNotFoundError: + return None + + def load(self): + """Load data from database using `self.preload` function to process. + + :returns: Loaded time-series data for the symbol. + :rtype: pandas.Series or pandas.DataFrame + """ + return self._preload(self._load_raw()) + + def _store(self, data): + """Store data in database. + + :param data: Time-series data to store. + :type data: pandas.Series or pandas.DataFrame + """ + # we could implement multiprocess safety here + storer = globals()['_storer_' + self._storage_backend] + logger.info( + f"{self.__class__.__name__} is storing {self.symbol}" + + f" with {self._storage_backend} backend" + + f" in {self.storage_location}") + storer(self.symbol, data, self.storage_location) + + def _print_difference(self, current, new): + """Helper method to print difference if update is not append-only. 
+ + This is temporary and will be re-factored. + """ + print("TEMPORARY: Diff between overlap of downloaded and stored") + print((new - current).dropna(how='all').tail(5)) + + def update(self, grace_period): + """Update current stored data for symbol. + + :param grace_period: If the time between now and the last value stored + is less than this, we don't update the data already stored. + :type grace_period: pandas.Timedelta + """ + current = self._load_raw() + logger.info( + f"Downloading {self.symbol}" + + f" from {self.__class__.__name__}") + updated = self._download( + self.symbol, current, grace_period=grace_period) + + if np.any(updated.iloc[:-1].isnull()): + logger.warning( + " cvxportfolio.%s('%s').data contains NaNs." + + " You may want to inspect it. If you want, you can delete the" + + " data file in %s to force re-download from the start.", + self.__class__.__name__, self.symbol, self.storage_location) + + try: + if current is not None: + if not np.all( + # we use numpy.isclose because returns may be computed + # via logreturns and numerical errors can sift through + np.isclose(updated.loc[current.index[:-1]], + current.iloc[:-1], equal_nan=True, + rtol=1e-08, atol=1e-08)): + logger.error(f"{self.__class__.__name__} update" + + f" of {self.symbol} is not append-only!") + self._print_difference(current, updated) + if hasattr(current, 'columns'): + # the first column is open price + if not current.iloc[-1, 0] == updated.loc[ + current.index[-1]].iloc[0]: + logger.error( + f"{self.__class__.__name__} update " + + f" of {self.symbol} changed last open price!") + self._print_difference(current, updated) + else: + if not current.iloc[-1] == updated.loc[current.index[-1]]: + logger.error( + f"{self.__class__.__name__} update" + + f" of {self.symbol} changed last value!") + self._print_difference(current, updated) + except KeyError: + logger.error("%s update of %s could not be checked for" + + " append-only edits. Was there a DST change?", + self.__class__.__name__, self.symbol) + self._store(updated) + + def _download(self, symbol, current, grace_period, **kwargs): + """Download data from external source given already downloaded data. + + This method must be redefined by derived classes. + + :param symbol: The symbol we download. + :type symbol: str + :param current: The data already downloaded. We are supposed to + **only append** to it. If None, no data is present. + :type current: pandas.Series or pandas.DataFrame or None + :rtype: pandas.Series or pandas.DataFrame + """ + raise NotImplementedError #pragma: no cover + + def _preload(self, data): + """Prepare data to serve to the user. + + This method can be redefined by derived classes. + + :param data: The data returned by the storage backend. + :type data: pandas.Series or pandas.DataFrame + :rtype: pandas.Series or pandas.DataFrame + """ + return data + + +# +# Yahoo Finance. +# + +def _timestamp_convert(unix_seconds_ts): + """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" + return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') + + +class YahooFinance(SymbolData): + """Yahoo Finance symbol data. + + :param symbol: The symbol that we downloaded. + :type symbol: str + :param storage_backend: The storage backend, implemented ones are + ``'pickle'``, ``'csv'``, and ``'sqlite'``. + :type storage_backend: str + :param base_storage_location: The location of the storage. We store in a + subdirectory named after the class which derives from this. 
+ :type base_storage_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded, and cleaned, data for the symbol. + :type data: pandas.DataFrame + """ + + # is open-high-low-close-volume-(total)return + IS_OLHCVR = True + + @staticmethod + def _clean(data): + """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" + + # print(data) + # print(data.isnull().sum()) + + # nan-out nonpositive prices + data.loc[data["open"] <= 0, 'open'] = np.nan + data.loc[data["close"] <= 0, "close"] = np.nan + data.loc[data["high"] <= 0, "high"] = np.nan + data.loc[data["low"] <= 0, "low"] = np.nan + data.loc[data["adjclose"] <= 0, "adjclose"] = np.nan + + # nan-out negative volumes + data.loc[data["volume"] < 0, 'volume'] = np.nan + + # all infinity values are nans + data.iloc[:, :] = np.nan_to_num( + data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) + + # print(data) + # print(data.isnull().sum()) + + # if low is not the lowest, set it to nan + data.loc[data['low'] > data[['open', 'high', 'close']].min(1), + 'low'] = np.nan + + # if high is not the highest, set it to nan + data.loc[data['high'] < data[['open', 'high', 'close']].max(1), + 'high'] = np.nan + + # print(data) + # print(data.isnull().sum()) + + # + # fills + # + + # fill volumes with zeros (safest choice) + data['volume'] = data['volume'].fillna(0.) + + # fill close price with open price + data['close'] = data['close'].fillna(data['open']) + + # fill open price with close from day(s) before + # repeat as long as it helps (up to 1 year) + for shifter in range(252): + orig_missing_opens = data['open'].isnull().sum() + data['open'] = data['open'].fillna(data['close'].shift( + shifter+1)) + new_missing_opens = data['open'].isnull().sum() + if orig_missing_opens == new_missing_opens: + break + logger.info( + "Filled missing open prices with close from %s periods before", + shifter+1) + + # fill close price with same day's open + data['close'] = data['close'].fillna(data['open']) + + # fill high price with max + data['high'] = data['high'].fillna(data[['open', 'close']].max(1)) + + # fill low price with max + data['low'] = data['low'].fillna(data[['open', 'close']].min(1)) + + # print(data) + # print(data.isnull().sum()) + + # + # Compute returns + # + + # compute log of ratio between adjclose and close + log_adjustment_ratio = np.log(data['adjclose'] / data['close']) + + # forward fill adjustment ratio + log_adjustment_ratio = log_adjustment_ratio.ffill() + + # non-market log returns (dividends, splits) + non_market_lr = log_adjustment_ratio.diff().shift(-1) + + # full open-to-open returns + open_to_open = np.log(data["open"]).diff().shift(-1) + data['return'] = np.exp(open_to_open + non_market_lr) - 1 + + # print(data) + # print(data.isnull().sum()) + + # intraday_logreturn = np.log(data["close"]) - np.log(data["open"]) + # close_to_close_logreturn = np.log(data["adjclose"]).diff().shift(-1) + # open_to_open_logreturn = ( + # close_to_close_logreturn + intraday_logreturn - + # intraday_logreturn.shift(-1) + # ) + # data["return"] = np.exp(open_to_open_logreturn) - 1 + del data["adjclose"] + + # eliminate last period's intraday data + data.loc[data.index[-1], + ["high", "low", "close", "return", "volume"]] = np.nan + + # print(data) + # print(data.isnull().sum()) + + return data + + @staticmethod + def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): + 
"""Get 1 day OLHC from Yahoo finance. + + Result is timestamped with the open time (time-zoned) of the + instrument. + """ + + base_url = 'https://query2.finance.yahoo.com' + + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)' + ' AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/39.0.2171.95 Safari/537.36'} + + # print(HEADERS) + start = int(pd.Timestamp(start).timestamp()) + end = int(pd.Timestamp(end).timestamp()) + + try: + res = requests.get( + url=f"{base_url}/v8/finance/chart/{ticker}", + params={'interval': '1d', + "period1": start, + "period2": end}, + headers=headers, + timeout=10) # seconds + except requests.ConnectionError as exc: + raise DataError( + f"Download of {ticker} from YahooFinance failed." + + " Are you connected to the Internet?") from exc + + # print(res) + + if res.status_code == 404: + raise DataError( + f'Data for symbol {ticker} is not available.' + + 'Json output:', str(res.json())) + + if res.status_code != 200: + raise DataError(f'Yahoo finance download of {ticker} failed. Json:', + str(res.json())) # pragma: no cover + + data = res.json()['chart']['result'][0] + + try: + index = pd.DatetimeIndex( + [_timestamp_convert(el) for el in data['timestamp']]) + + df_result = pd.DataFrame( + data['indicators']['quote'][0], index=index) + df_result['adjclose'] = data[ + 'indicators']['adjclose'][0]['adjclose'] + except KeyError: + raise DataError(f'Yahoo finance download of {ticker} failed.' + + ' Json:', str(res.json())) # pragma: no cover + + # last timestamp is probably broken (not timed to market open) + # we set its time to same as the day before, but this is wrong + # on days of DST switch. It's fine though because that line will be + # overwritten next update + if df_result.index[-1].time() != df_result.index[-2].time(): + tm1 = df_result.index[-2].time() + newlast = df_result.index[-1].replace( + hour=tm1.hour, minute=tm1.minute, second=tm1.second) + df_result.index = pd.DatetimeIndex( + list(df_result.index[:-1]) + [newlast]) + + return df_result[ + ['open', 'low', 'high', 'close', 'adjclose', 'volume']] + + def _download(self, symbol, current=None, + overlap=5, grace_period='5d', **kwargs): + """Download single stock from Yahoo Finance. + + If data was already downloaded we only download + the most recent missing portion. + + Args: + + symbol (str): yahoo name of the instrument + current (pandas.DataFrame or None): current data present locally + overlap (int): how many lines of current data will be overwritten + by newly downloaded data + kwargs (dict): extra arguments passed to yfinance.download + + Returns: + updated (pandas.DataFrame): updated DataFrame for the symbol + """ + if overlap < 2: + raise SyntaxError( + f'{self.__class__.__name__} with overlap smaller than 2' + + ' could have issues with DST.') + if (current is None) or (len(current) < overlap): + updated = self._get_data_yahoo(symbol, **kwargs) + logger.info('Downloading from the start.') + result = self._clean(updated) + # we remove first row if it contains NaNs + if np.any(result.iloc[0].isnull()): + result = result.iloc[1:] + return result + if (now_timezoned() - current.index[-1] + ) < pd.Timedelta(grace_period): + logger.info( + 'Skipping download because stored data is recent enough.') + return current + new = self._get_data_yahoo(symbol, start=current.index[-overlap]) + new = self._clean(new) + return pd.concat([current.iloc[:-overlap], new]) + + def _preload(self, data): + """Prepare data for use by Cvxportfolio. 
+ + We drop the `volume` column expressed in number of stocks and + replace it with `valuevolume` which is an estimate of the (e.g., + US dollar) value of the volume exchanged on the day. + """ + data["valuevolume"] = data["volume"] * data["open"] + del data["volume"] + + return data + +# +# Fred. +# + +class Fred(SymbolData): + """Fred single-symbol data. + + :param symbol: The symbol that we downloaded. + :type symbol: str + :param storage_backend: The storage backend, implemented ones are + ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. + :type storage_backend: str + :param base_storage_location: The location of the storage. We store in a + subdirectory named after the class which derives from this. By default + it's a directory named ``cvxportfolio_data`` in your home folder. + :type base_storage_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. By default it's one day. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded data for the symbol. + """ + + URL = "https://fred.stlouisfed.org/graph/fredgraph.csv" + + # TODO: implement Fred point-in-time + # example: + # https://alfred.stlouisfed.org/graph/alfredgraph.csv?id=CES0500000003&vintage_date=2023-07-06 + # hourly wages time series **as it appeared** on 2023-07-06 + # store using pd.Series() of diff'ed values only. + + def _internal_download(self, symbol): + try: + return pd.read_csv( + self.URL + f'?id={symbol}', + index_col=0, parse_dates=[0])[symbol] + except URLError as exc: + raise DataError(f"Download of {symbol}" + + f" from {self.__class__.__name__} failed." + + " Are you connected to the Internet?") from exc + + def _download( + self, symbol="DFF", current=None, grace_period='5d', **kwargs): + """Download or update pandas Series from Fred. + + If already downloaded don't change data stored locally and only + add new entries at the end. + + Additionally, we allow for a `grace period`, if the data already + downloaded has a last entry not older than the grace period, we + don't download new data. + """ + if current is None: + return self._internal_download(symbol) + if (pd.Timestamp.today() - current.index[-1] + ) < pd.Timedelta(grace_period): + logger.info( + 'Skipping download because stored data is recent enough.') + return current + + new = self._internal_download(symbol) + new = new.loc[new.index > current.index[-1]] + + if new.empty: + logger.info('New downloaded data is empty!') + return current + + assert new.index[0] > current.index[-1] + return pd.concat([current, new]) + + def _preload(self, data): + """Add UTC timezone.""" + data.index = data.index.tz_localize('UTC') + return data + +# +# Sqlite storage backend. +# + +def _open_sqlite(storage_location): + return sqlite3.connect(storage_location/"db.sqlite") + +def _close_sqlite(connection): + connection.close() + +def _loader_sqlite(symbol, storage_location): + """Load data in sqlite format. + + We separately store dtypes for data consistency and safety. + + .. note:: If your pandas object's index has a name it will be lost, + the index is renamed 'index'. If you pass timestamp data (including + the index) it must have explicit timezone. 
+    """
+    try:
+        connection = _open_sqlite(storage_location)
+        dtypes = pd.read_sql_query(
+            f"SELECT * FROM {symbol}___dtypes",
+            connection, index_col="index",
+            dtype={"index": "str", "0": "str"})
+
+        parse_dates = 'index'
+        my_dtypes = dict(dtypes["0"])
+
+        tmp = pd.read_sql_query(
+            f"SELECT * FROM {symbol}", connection,
+            index_col="index", parse_dates=parse_dates, dtype=my_dtypes)
+
+        _close_sqlite(connection)
+        multiindex = []
+        for col in tmp.columns:
+            if col[:8] == "___level":
+                multiindex.append(col)
+            else:
+                break
+        if len(multiindex) > 0:
+            multiindex = [tmp.index.name] + multiindex
+            tmp = tmp.reset_index().set_index(multiindex)
+        return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp
+    except pd.errors.DatabaseError:
+        return None
+
+def _storer_sqlite(symbol, data, storage_location):
+    """Store data in sqlite format.
+
+    We separately store dtypes for data consistency and safety.
+
+    .. note:: If your pandas object's index has a name it will be lost;
+        the index is renamed 'index'. If you pass timestamp data (including
+        the index) it must have an explicit timezone.
+    """
+    connection = _open_sqlite(storage_location)
+    exists = pd.read_sql_query(
+        f"SELECT name FROM sqlite_master WHERE type='table' AND name='{symbol}'",
+        connection)
+
+    if len(exists):
+        _ = connection.cursor().execute(f"DROP TABLE '{symbol}'")
+        _ = connection.cursor().execute(f"DROP TABLE '{symbol}___dtypes'")
+        connection.commit()
+
+    if hasattr(data.index, "levels"):
+        data.index = data.index.set_names(
+            ["index"] +
+            [f"___level{i}" for i in range(1, len(data.index.levels))]
+        )
+        data = data.reset_index().set_index("index")
+    else:
+        data.index.name = "index"
+
+    if data.index[0].tzinfo is None:
+        warnings.warn('Index has no timezone, setting to UTC')
+        data.index = data.index.tz_localize('UTC')
+
+    data.to_sql(f"{symbol}", connection)
+    pd.DataFrame(data).dtypes.astype("string").to_sql(
+        f"{symbol}___dtypes", connection)
+    _close_sqlite(connection)
+
+
+#
+# Pickle storage backend.
+#
+
+def _loader_pickle(symbol, storage_location):
+    """Load data in pickle format."""
+    return pd.read_pickle(storage_location / f"{symbol}.pickle")
+
+def _storer_pickle(symbol, data, storage_location):
+    """Store data in pickle format."""
+    data.to_pickle(storage_location / f"{symbol}.pickle")
+
+#
+# Csv storage backend.
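+#
+
+# A round-trip sketch of the files-plus-dtypes idea used by these backends
+# (the helper name below is hypothetical, for illustration only; it is not
+# part of the module):
+
+def _example_csv_roundtrip(): # pragma: no cover
+    """Store and re-load a tz-aware series with the csv backend (sketch)."""
+    import tempfile
+    storage_location = Path(tempfile.mkdtemp())
+    ser = pd.Series(
+        [1., 2.],
+        pd.DatetimeIndex(['2023-01-01', '2023-01-02'], tz='UTC'))
+    _storer_csv('TEST', ser, storage_location)
+    # three files are written: TEST.csv, TEST___dtypes.csv,
+    # and TEST___index_dtypes.csv
+    return _loader_csv('TEST', storage_location)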
+# + +def _loader_csv(symbol, storage_location): + """Load data in csv format.""" + + index_dtypes = pd.read_csv( + storage_location / f"{symbol}___index_dtypes.csv", + index_col=0)["0"] + + dtypes = pd.read_csv( + storage_location / f"{symbol}___dtypes.csv", index_col=0, + dtype={"index": "str", "0": "str"}) + dtypes = dict(dtypes["0"]) + new_dtypes = {} + parse_dates = [] + for i, level in enumerate(index_dtypes): + if "datetime64[ns" in level: # includes all timezones + parse_dates.append(i) + for i, el in enumerate(dtypes): + if "datetime64[ns" in dtypes[el]: # includes all timezones + parse_dates += [i + len(index_dtypes)] + else: + new_dtypes[el] = dtypes[el] + + tmp = pd.read_csv(storage_location / f"{symbol}.csv", + index_col=list(range(len(index_dtypes))), + parse_dates=parse_dates, dtype=new_dtypes) + + return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp + + +def _storer_csv(symbol, data, storage_location): + """Store data in csv format.""" + pd.DataFrame(data.index.dtypes if hasattr(data.index, 'levels') + else [data.index.dtype]).astype("string").to_csv( + storage_location / f"{symbol}___index_dtypes.csv") + pd.DataFrame(data).dtypes.astype("string").to_csv( + storage_location / f"{symbol}___dtypes.csv") + data.to_csv(storage_location / f"{symbol}.csv") + +# +# Market Data +# + +class MarketData: + """Prepare, hold, and serve market data. + + :method serve: Serve data for policy and simulator at time :math:`t`. + """ + + def serve(self, t): + """Serve data for policy and simulator at time :math:`t`. + + :param t: Trading time. It must be included in the timestamps returned + by :meth:`trading_calendar`. + :type t: pandas.Timestamp + + :returns: past_returns, current_returns, past_volumes, current_volumes, + current_prices + :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame, + pandas.Series, pandas.Series) + """ + raise NotImplementedError # pragma: no cover + + # pylint: disable=redundant-returns-doc + def trading_calendar( + self, start_time=None, end_time=None, include_end=True): + """Get trading calendar between times. + + :param start_time: Initial time of the trading calendar. Always + inclusive if present. If None, use the first available time. + :type start_time: pandas.Timestamp + :param end_time: Final time of the trading calendar. If None, + use the last available time. + :type end_time: pandas.Timestamp + :param include_end: Include end time. + :type include_end: bool + + :returns: Trading calendar. + :rtype: pandas.DatetimeIndex + """ + raise NotImplementedError # pragma: no cover + + @property + def periods_per_year(self): + """Average trading periods per year. + + :rtype: int + """ + raise NotImplementedError # pragma: no cover + + @property + def full_universe(self): # pylint: disable=redundant-returns-doc + """Full universe, which might not be available for trading. + + :returns: Full universe. + :rtype: pandas.Index + """ + raise NotImplementedError # pragma: no cover + + # pylint: disable=unused-argument, redundant-returns-doc + def partial_universe_signature(self, partial_universe): + """Unique signature of this instance with a partial universe. + + A partial universe is a subset of the full universe that is + available at some time for trading. + + This is used in cvxportfolio.cache to sign back-test caches that + are saved on disk. If not redefined it returns None which disables + on-disk caching. + + :param partial_universe: A subset of the full universe. + :type partial_universe: pandas.Index + + :returns: Signature. 
+        :rtype: str
+        """
+        return None
+
+# compiled based on Interactive Brokers benchmark rates choices
+# (see https://www.ibkrguides.com/kb/article-2949.htm)
+# and their FRED codes
+RATES = {
+    'USDOLLAR': 'DFF', # Federal funds effective rate
+    'EURO': 'ECBESTRVOLWGTTRMDMNRT', # ECB euro short-term rate
+    'GBPOUND': 'IUDSOIA', # SONIA
+    'JPYEN': 'IRSTCB01JPM156N', # updated monthly
+    }
+
+class MarketDataInMemory(MarketData):
+    """Market data that is stored in memory when initialized."""
+
+    # this is overwritten in the derived classes' initializers
+    returns = None
+
+    def __init__(
+            self, trading_frequency, base_location, cash_key, min_history,
+            online_usage=False):
+        """This must be called by the derived classes."""
+        if (self.returns.index[-1] - self.returns.index[0]) < min_history:
+            raise DataError(
+                "The provided returns have less history "
+                + f"than the min_history {min_history}")
+        if trading_frequency:
+            self._downsample(trading_frequency)
+        self.trading_frequency = trading_frequency
+
+        self._set_read_only()
+        self._check_sizes()
+        self._mask = None
+        self._masked_returns = None
+        self._masked_volumes = None
+        self._masked_prices = None
+        self.base_location = Path(base_location)
+        self.cash_key = cash_key
+        self._min_history_timedelta = min_history
+        self.online_usage = online_usage
+
+    def _mask_dataframes(self, mask):
+        """Mask internal dataframes if necessary."""
+        if (self._mask is None) or not np.all(self._mask == mask):
+            logger.info("Masking internal %s dataframes.",
+                        self.__class__.__name__)
+            colmask = self.returns.columns[mask]
+            # self._masked_returns = self._df_or_ser_set_read_only(
+            #     pd.DataFrame(self.returns.iloc[:, mask], copy=True))
+            self._masked_returns = self._df_or_ser_set_read_only(
+                pd.DataFrame(self.returns.loc[:, colmask], copy=True))
+            # self._masked_returns = self._df_or_ser_set_read_only(
+            #     pd.DataFrame(np.array(self.returns.values[:, mask]),
+            #     index=self.returns.index, columns=colmask))
+            if not self.volumes is None:
+                # self._masked_volumes = self._df_or_ser_set_read_only(
+                #     pd.DataFrame(self.volumes.iloc[:, mask[:-1]], copy=True))
+                self._masked_volumes = self._df_or_ser_set_read_only(
+                    pd.DataFrame(self.volumes.loc[:, colmask[:-1]], copy=True))
+                # self._masked_volumes = self._df_or_ser_set_read_only(
+                #     pd.DataFrame(np.array(self.volumes.values[:, mask[:-1]]),
+                #     index=self.volumes.index, columns=colmask[:-1]))
+            if not self.prices is None:
+                # self._masked_prices = self._df_or_ser_set_read_only(
+                #     pd.DataFrame(self.prices.iloc[:, mask[:-1]], copy=True))
+                self._masked_prices = self._df_or_ser_set_read_only(
+                    pd.DataFrame(self.prices.loc[:, colmask[:-1]], copy=True))
+            self._mask = mask
+
+    @property
+    def full_universe(self):
+        """Full universe, which might not be available for trading.
+
+        :returns: Full universe.
+        :rtype: pandas.Index
+        """
+        return self.returns.columns
+
+    def serve(self, t):
+        """Serve data for policy and simulator at time :math:`t`.
+
+        :param t: Time of execution, *e.g.*, stock market open of a given day.
+        :type t: pandas.Timestamp
+
+        :returns: (past_returns, current_returns, past_volumes,
+            current_volumes, current_prices)
+        :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame or None,
+            pandas.Series or None, pandas.Series or None)
+        """
+
+        mask = self._universe_mask_at_time(t).values
+        self._mask_dataframes(mask)
+
+        tidx = self.returns.index.get_loc(t)
+        past_returns = self._df_or_ser_set_read_only(
+            pd.DataFrame(self._masked_returns.iloc[:tidx]))
+        current_returns = self._df_or_ser_set_read_only(
+            pd.Series(self._masked_returns.iloc[tidx]))
+
+        if not self.volumes is None:
+            tidx = self.volumes.index.get_loc(t)
+            past_volumes = self._df_or_ser_set_read_only(
+                pd.DataFrame(self._masked_volumes.iloc[:tidx]))
+            current_volumes = self._df_or_ser_set_read_only(
+                pd.Series(self._masked_volumes.iloc[tidx]))
+        else:
+            past_volumes = None
+            current_volumes = None
+
+        if not self.prices is None:
+            tidx = self.prices.index.get_loc(t)
+            current_prices = self._df_or_ser_set_read_only(
+                pd.Series(self._masked_prices.iloc[tidx]))
+        else:
+            current_prices = None
+
+        return (past_returns, current_returns, past_volumes, current_volumes,
+            current_prices)
+
+    def _add_cash_column(self, cash_key, grace_period):
+        """Add the cash column to an already formed returns dataframe.
+
+        This assumes that the trading periods are about equally spaced.
+        If, say, you have trading periods with very different lengths you
+        should redefine this method **and** replace the :class:`CashReturn`
+        objective term.
+        """
+
+        if not cash_key in RATES:
+            raise NotImplementedError(
+                'Currently the only data pipelines built are for cash_key'
+                f' in {list(RATES)}')
+
+        if self.returns.index.tz is None:
+            raise DataError(
+                'Your provided dataframes are not timezone aware.'
+                + " This is not recommended, and doesn't allow adding the"
+                + " cash returns' column internally."
+                + " You can fix this by adding a timezone manually "
+                + "using pandas.DataFrame.tz_localize to the dataframes before"
+                + " you pass them, or you can provide"
+                + " the cash returns' column as the last column of the returns"
+                + " dataframe (so it has one more column than volumes and"
+                + " prices, if provided), and set the cash_key parameter to"
+                + " its name.")
+
+        data = Fred(
+            RATES[cash_key], base_location=self.base_location,
+            grace_period=grace_period)
+
+        cash_returns_per_period = resample_returns(
+            data.data/100, periods=self.periods_per_year)
+
+        # we merge instead of assigning column because indexes might
+        # be misaligned (e.g., with tz-aware timestamps)
+        cash_returns_per_period.name = self.cash_key
+        original_returns_index = self.returns.index
+        tmp = pd.concat(
+            [self.returns, cash_returns_per_period], sort=True, axis=1)
+        tmp[cash_key] = tmp[cash_key].ffill()
+        self.returns = tmp.loc[original_returns_index]
+
+    def trading_calendar(
+            self, start_time=None, end_time=None, include_end=True):
+        """Get trading calendar from market data.
+
+        :param start_time: Initial time of the trading calendar. Always
+            inclusive if present. If None, use the first available time.
+        :type start_time: pandas.Timestamp
+        :param end_time: Final time of the trading calendar. If None,
+            use the last available time.
+        :type end_time: pandas.Timestamp
+        :param include_end: Include end time.
+        :type include_end: bool
+
+        :returns: Trading calendar.
+        :rtype: pandas.DatetimeIndex
+        """
+        result = self.returns.index
+        result = result[result >= self._earliest_backtest_start]
+        if start_time:
+            result = result[result >= start_time]
+        if end_time:
+            result = result[(result <= end_time)]
+        if not include_end:
+            result = result[:-1]
+        return result
+
+    def _universe_mask_at_time(self, t):
+        """Return the valid universe mask at time t."""
+        past_returns = self.returns.loc[self.returns.index < t]
+        if self.online_usage:
+            valid_universe_mask = past_returns.count() >= self.min_history
+        else:
+            valid_universe_mask = ((past_returns.count() >= self.min_history) &
+                (~self.returns.loc[t].isnull()))
+        if sum(valid_universe_mask) <= 1:
+            raise DataError(
+                f'The trading universe at time {t} has size less than or'
+                + ' equal to one, i.e., only the cash account. There are'
+                + ' probably issues with missing data in the provided market'
+                + ' returns.')
+        return valid_universe_mask
+
+    @staticmethod
+    def _df_or_ser_set_read_only(df_or_ser):
+        """Set numpy array contained in dataframe to read only.
+
+        This is done on data stored internally before it is served to the
+        policy or the simulator to ensure data consistency in case some
+        element of the pipeline accidentally corrupts the data.
+
+        This is enough to prevent direct assignment to the resulting
+        dataframe. However, it could still be accidentally corrupted by
+        assigning to columns or indices that are not present in the
+        original. We avoid that case as well by returning a wrapped
+        dataframe (which doesn't copy data on creation) in
+        serve_data_policy and serve_data_simulator.
+        """
+        data = df_or_ser.values
+        data.flags.writeable = False
+        if hasattr(df_or_ser, 'columns'):
+            return pd.DataFrame(data, index=df_or_ser.index,
+                                columns=df_or_ser.columns)
+        return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name)
+
+    def _set_read_only(self):
+        """Set internal dataframes to read-only."""
+
+        self.returns = self._df_or_ser_set_read_only(self.returns)
+
+        if not self.prices is None:
+            self.prices = self._df_or_ser_set_read_only(self.prices)
+
+        if not self.volumes is None:
+            self.volumes = self._df_or_ser_set_read_only(self.volumes)
+
+    @property
+    def _earliest_backtest_start(self):
+        """Earliest date at which we can start a backtest."""
+        return self.returns.iloc[:, :-1].dropna(how='all').index[
+            self.min_history]
+
+    sampling_intervals = {
+        'weekly': 'W-MON', 'monthly': 'MS', 'quarterly': 'QS', 'annual': 'AS'}
+
+    # @staticmethod
+    # def _is_first_interval_small(datetimeindex):
+    #     """Check if post-resampling the first interval is small.
+    #
+    #     We have no way of knowing exactly if the first interval
+    #     needs to be dropped. We drop it if its length is smaller
+    #     than the average of all others, minus 2 standard deviations.
+    #     """
+    #     first_interval = (datetimeindex[1] - datetimeindex[0])
+    #     all_others = (datetimeindex[2:] - datetimeindex[1:-1])
+    #     return first_interval < (all_others.mean() - 2 * all_others.std())
+
+    def _downsample(self, interval):
+        """Downsample market data."""
+        if not interval in self.sampling_intervals:
+            raise SyntaxError(
+                'Unsupported trading interval for down-sampling.')
+        interval = self.sampling_intervals[interval]
+        new_returns_index = pd.Series(self.returns.index, self.returns.index
+            ).resample(interval, closed='left',
+                       label='left').first().values
+        # print(new_returns_index)
+        self.returns = np.exp(np.log(
+            1+self.returns).resample(interval, closed='left', label='left'
+            ).sum(min_count=1))-1
+        self.returns.index = new_returns_index
+
+        # last row is always unknown
+        self.returns.iloc[-1] = np.nan
+
+        # # we drop the first row if its interval is small
+        # if self._is_first_interval_small(self.returns.index):
+        #     self.returns = self.returns.iloc[1:]
+
+        # we nan-out the first non-nan element of every col
+        for col in self.returns.columns[:-1]:
+            self.returns.loc[
+                (~(self.returns[col].isnull())).idxmax(), col] = np.nan
+
+        # and we drop the first row, which is mostly NaNs anyway
+        self.returns = self.returns.iloc[1:]
+
+        if self.volumes is not None:
+            new_volumes_index = pd.Series(
+                self.volumes.index, self.volumes.index
+                ).resample(interval, closed='left',
+                           label='left').first().values
+            self.volumes = self.volumes.resample(
+                interval, closed='left', label='left').sum(min_count=1)
+            self.volumes.index = new_volumes_index
+
+            # last row is always unknown
+            self.volumes.iloc[-1] = np.nan
+
+            # # we drop the first row if its interval is small
+            # if self._is_first_interval_small(self.volumes.index):
+            #     self.volumes = self.volumes.iloc[1:]
+
+            # we nan-out the first non-nan element of every col
+            for col in self.volumes.columns:
+                self.volumes.loc[
+                    (~(self.volumes[col].isnull())).idxmax(), col] = np.nan
+
+            # and we drop the first row, which is mostly NaNs anyway
+            self.volumes = self.volumes.iloc[1:]
+
+        if self.prices is not None:
+            new_prices_index = pd.Series(
+                self.prices.index, self.prices.index
+                ).resample(
+                    interval, closed='left', label='left').first().values
+            self.prices = self.prices.resample(
+                interval, closed='left', label='left').first()
+            self.prices.index = new_prices_index
+
+            # # we drop the first row if its interval is small
+            # if self._is_first_interval_small(self.prices.index):
+            #     self.prices = self.prices.iloc[1:]
+
+            # we nan-out the first non-nan element of every col
+            for col in self.prices.columns:
+                self.prices.loc[
+                    (~(self.prices[col].isnull())).idxmax(), col] = np.nan
+
+            # and we drop the first row, which is mostly NaNs anyway
+            self.prices = self.prices.iloc[1:]
+
+    def _check_sizes(self):
+        """Check sizes of user-provided dataframes."""
+
+        if (not self.volumes is None) and (
+            not (self.volumes.shape[1] == self.returns.shape[1] - 1)
+                or not all(self.volumes.columns == self.returns.columns[:-1])):
+            raise SyntaxError(
+                'Volumes should have same columns as returns, minus cash_key.')
+
+        if (not self.prices is None) and (
+            not (self.prices.shape[1] == self.returns.shape[1] - 1)
+                or not all(self.prices.columns == self.returns.columns[:-1])):
+            raise SyntaxError(
+                'Prices should have same columns as returns, minus cash_key.')
+
+    @property
+    def periods_per_year(self):
+        """Average trading periods per year inferred from the data.
+
+        :returns: Average periods per year.
+        :rtype: int
+        """
+        return periods_per_year_from_datetime_index(self.returns.index)
+
+    @property
+    def min_history(self):
+        """Min history expressed in periods.
+
+        :returns: How many non-null elements of the past returns for a given
+            name are required to include it.
+        :rtype: int
+        """
+        return int(np.round(self.periods_per_year * (
+            self._min_history_timedelta / pd.Timedelta('365.24d'))))
+
+
+class UserProvidedMarketData(MarketDataInMemory):
+    """User-provided market data.
+
+    :param returns: Historical open-to-open returns. The return
+        at time :math:`t` is :math:`r_t = p_{t+1}/p_t - 1` where
+        :math:`p_t` is the (open) price at time :math:`t`. Must
+        have datetime index. You can also include cash
+        returns as its last column, and set ``cash_key`` below to the last
+        column's name.
+    :type returns: pandas.DataFrame
+    :param volumes: Historical market volumes, expressed in units
+        of value (*e.g.*, US dollars).
+    :type volumes: pandas.DataFrame or None
+    :param prices: Historical open prices (*e.g.*, used for rounding
+        trades in the :class:`MarketSimulator`).
+    :type prices: pandas.DataFrame or None
+    :param trading_frequency: Instead of using frequency implied by
+        the index of the returns, down-sample all dataframes.
+        We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and
+        ``'annual'``. By default (None) don't down-sample.
+    :type trading_frequency: str or None
+    :param min_history: Minimum amount of time for which the returns
+        are not ``np.nan`` before each asset enters a back-test.
+    :type min_history: pandas.Timedelta
+    :param base_location: The location of the storage, only used
+        in case it downloads the cash returns. By default
+        it's a directory named ``cvxportfolio_data`` in your home folder.
+    :type base_location: pathlib.Path
+    :param cash_key: Name of the cash account. If not the last column
+        of the provided returns, it will be downloaded. In that case you should
+        make sure your provided dataframes have a timezone aware datetime
+        index. Its returns are the risk-free rate.
+    :type cash_key: str
+    :param online_usage: Disable removal of assets that have ``np.nan`` returns
+        for the given time. Default False.
+    :type online_usage: bool
+    """
+
+    # pylint: disable=too-many-arguments
+    def __init__(self, returns, volumes=None, prices=None,
+                 copy_dataframes=True, trading_frequency=None,
+                 min_history=pd.Timedelta('365.24d'),
+                 base_location=BASE_LOCATION,
+                 grace_period=pd.Timedelta('1d'),
+                 cash_key='USDOLLAR',
+                 online_usage=False):
+
+        if returns is None:
+            raise SyntaxError(
+                "If you don't specify a universe you should pass `returns`.")
+
+        self.base_location = Path(base_location)
+        self.cash_key = cash_key
+
+        self.returns = pd.DataFrame(
+            make_numeric(returns), copy=copy_dataframes)
+        self.volumes = volumes if volumes is None else\
+            pd.DataFrame(make_numeric(volumes), copy=copy_dataframes)
+        self.prices = prices if prices is None else\
+            pd.DataFrame(make_numeric(prices), copy=copy_dataframes)
+
+        if cash_key != returns.columns[-1]:
+            self._add_cash_column(cash_key, grace_period=grace_period)
+
+        # this is mandatory
+        super().__init__(
+            trading_frequency=trading_frequency,
+            base_location=base_location,
+            cash_key=cash_key,
+            min_history=min_history,
+            online_usage=online_usage)
+
+
+class DownloadedMarketData(MarketDataInMemory):
+    """Market data that is downloaded.
+
+    :param universe: List of names as understood by the data source
+        used, *e.g.*, ``['AAPL', 'GOOG']`` if using the default
+        Yahoo Finance data source.
+    :type universe: list
+    :param datasource: The data source used.
+    :type datasource: str or :class:`SymbolData` class
+    :param cash_key: Name of the cash account, its rates will be downloaded
+        and added as the last column of the returns. Its returns are the
+        risk-free rate.
+    :type cash_key: str
+    :param base_location: The location of the storage. By default
+        it's a directory named ``cvxportfolio_data`` in your home folder.
+    :type base_location: pathlib.Path
+    :param storage_backend: The storage backend, implemented ones are
+        ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``.
+    :type storage_backend: str
+    :param min_history: Minimum amount of time for which the returns
+        are not ``np.nan`` before each asset enters a back-test.
+    :type min_history: pandas.Timedelta
+    :param grace_period: If the most recent observation of each symbol's
+        data is less old than this we do not download new data.
+        By default it's one day.
+    :type grace_period: pandas.Timedelta
+    :param trading_frequency: Instead of using frequency implied by
+        the index of the returns, down-sample all dataframes.
+        We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and
+        ``'annual'``. By default (None) don't down-sample.
+    :type trading_frequency: str or None
+    :param online_usage: Disable removal of assets that have ``np.nan`` returns
+        for the given time. Default False.
+    :type online_usage: bool
+    """
+
+    # pylint: disable=too-many-arguments
+    def __init__(self,
+                 universe=(),
+                 datasource='YahooFinance',
+                 cash_key='USDOLLAR',
+                 base_location=BASE_LOCATION,
+                 storage_backend='pickle',
+                 min_history=pd.Timedelta('365.24d'),
+                 grace_period=pd.Timedelta('1d'),
+                 trading_frequency=None,
+                 online_usage=False):
+        """Initializer."""
+
+        # drop duplicates and ensure ordering
+        universe = sorted(set(universe))
+
+        self.base_location = Path(base_location)
+        self.cash_key = cash_key
+        if isinstance(datasource, type):
+            self.datasource = datasource
+        else: # try to load in current module
+            self.datasource = globals()[datasource]
+        self._get_market_data(
+            universe, grace_period=grace_period,
+            storage_backend=storage_backend)
+        self._add_cash_column(self.cash_key, grace_period=grace_period)
+        self._remove_missing_recent()
+
+        # this is mandatory
+        super().__init__(
+            trading_frequency=trading_frequency,
+            base_location=base_location,
+            cash_key=cash_key,
+            min_history=min_history,
+            online_usage=online_usage)
+
+    def _get_market_data(self, universe, grace_period, storage_backend):
+        """Download market data."""
+        database_accesses = {}
+        print('Updating data', end='')
+        sys.stdout.flush()
+
+        for stock in universe:
+            logger.info(
+                'Updating %s with %s.', stock, self.datasource.__name__)
+            print('.', end='')
+            sys.stdout.flush()
+            database_accesses[stock] = self.datasource(
+                stock, base_location=self.base_location,
+                grace_period=grace_period, storage_backend=storage_backend)
+        print()
+
+        if hasattr(self.datasource, 'IS_OLHCVR') and self.datasource.IS_OLHCVR:
+            self.returns = pd.DataFrame(
+                {stock: database_accesses[stock].data['return']
+                    for stock in universe})
+            self.volumes = pd.DataFrame(
+                {stock: database_accesses[stock].data['valuevolume']
+                    for stock in universe})
+            self.prices = pd.DataFrame(
+                {stock: database_accesses[stock].data['open']
+                    for stock in universe})
+        else: # for now only Fred for indexes, we assume prices!
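+            # e.g., with the Fred data source a series like 'SP500' (the
+            # index level) provides prices only; returns are derived below
+            # and volumes are unavailable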
+            assert isinstance(database_accesses[universe[0]].data, pd.Series)
+            self.prices = pd.DataFrame(
+                # open prices
+                {stock: database_accesses[stock].data for stock in universe})
+            self.returns = 1 - self.prices / self.prices.shift(-1)
+            self.volumes = None
+
+    def _remove_missing_recent(self):
+        """Clean recent data.
+
+        Yahoo Finance may have issues with most recent data; we remove
+        recent days if there are NaNs.
+        """
+
+        if self.prices.iloc[-5:].isnull().any().any():
+            logger.debug(
+                'Removing some recent lines because there are missing values.')
+            drop_at = self.prices.iloc[-5:].isnull().any(axis=1).idxmax()
+            logger.debug('Dropping at index %s', drop_at)
+            self.returns = self.returns.loc[self.returns.index < drop_at]
+            if self.prices is not None:
+                self.prices = self.prices.loc[self.prices.index < drop_at]
+            if self.volumes is not None:
+                self.volumes = self.volumes.loc[self.volumes.index < drop_at]
+
+        # for consistency we must also nan-out the last row
+        # of returns and volumes
+        self.returns.iloc[-1] = np.nan
+        if self.volumes is not None:
+            self.volumes.iloc[-1] = np.nan
+
+    def partial_universe_signature(self, partial_universe):
+        """Unique signature of this instance with a partial universe.
+
+        A partial universe is a subset of the full universe that is
+        available at some time for trading.
+
+        This is used in cvxportfolio.cache to sign back-test caches that
+        are saved on disk. See its implementation below for details. If
+        not redefined it returns None which disables on-disk caching.
+
+        :param partial_universe: A subset of the full universe.
+        :type partial_universe: pandas.Index
+
+        :returns: Signature.
+        :rtype: str
+        """
+        assert isinstance(partial_universe, pd.Index)
+        assert np.all(partial_universe.isin(self.full_universe))
+        result = f'{self.__class__.__name__}('
+        result += f'datasource={self.datasource.__name__}, '
+        result += f'partial_universe_hash={hash_(np.array(partial_universe))},'
+        result += f' trading_frequency={self.trading_frequency})'
+        return result
diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py
index ecaef70c9..1502d4dd5 100644
--- a/cvxportfolio/data/market_data.py
+++ b/cvxportfolio/data/market_data.py
@@ -631,7 +631,7 @@ def _get_market_data(self, universe, grace_period, storage_backend):
             grace_period=grace_period, storage_backend=storage_backend)
         print()
 
-        if hasattr(self.datasource, 'IS_OHLCVR') and self.datasource.IS_OHLCVR:
+        if hasattr(self.datasource, 'IS_OLHCVR') and self.datasource.IS_OLHCVR:
             self.returns = pd.DataFrame(
                 {stock: database_accesses[stock].data['return']
                     for stock in universe})
diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index ad51db479..b85b56cff 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -273,8 +273,8 @@ def _unlikeliness_score(
     return scaled.min(axis=1), scaled
 
 
-class OHLCV(SymbolData): # pylint: disable=abstract-method
-    """Base class for Open-High-Low-Close-Volume symbol data.
+class OLHCV(SymbolData): # pylint: disable=abstract-method
+    """Base class for Open-Low-High-Close-Volume symbol data.
 
     This operates on a dataframe with columns
 
@@ -294,6 +294,159 @@ class OHLCV(SymbolData): # pylint: disable=abstract-method
     dividends, ...) and they're dealt with in derived classes.
     """
 
+    def _process(self, new_data, saved_data=None):
+        """Base method for processing (cleaning) data.
+
+        It operates on the ``new_data`` dataframe, which is the newly
+        downloaded data. The ``saved_data`` dataframe is provided as well
+        (None if there is none). It has the same columns, older timestamps
+        with possible overlap with new_data at its end, and is **read only**:
+        it is used as reference to help with the cleaning, it has already
+        been cleaned.
+
+        The method is composed of the following steps, split between child
+        classes at the appropriate hierarchy level.
+
+        #. :meth:`_nan_impossible`: Nan-out impossible values in ``new_data``.
+        #. :meth:`_specific_process`: Do processing specific to the class,
+           before the following step (*e.g.*, because we might want unlikely
+           values to still be there).
+        #. :meth:`_nan_unlikely`: Nan-out values that are (highly) unlikely,
+           with threshold-based testing.
+        #. :meth:`_fill`: Fill nans.
+        #. :meth:`_post_process`: Do final processing specific to the class.
+
+        With this factoring we should have the flexibility to handle various
+        data sources, by choosing at each level whether each method calls
+        the parent's before or after its own processing.
+        """
+
+        self._nan_impossible(new_data, saved_data=saved_data)
+        self._specific_process(new_data, saved_data=saved_data)
+        self._nan_unlikely(new_data, saved_data=saved_data)
+        self._fill(new_data, saved_data=saved_data)
+        self._post_process(new_data, saved_data=saved_data)
+
+        return new_data
+
+    def _specific_process(self, new_data, saved_data=None):
+        """Specific process, do nothing."""
+        # return new_data
+
+    def _post_process(self, new_data, saved_data=None):
+        """Post process, do nothing."""
+        # return new_data
+
+    def _nan_unlikely(self, new_data, saved_data=None):
+        """Nan-out unlikely values."""
+        # return new_data
+
+    def _fill(self, new_data, saved_data=None):
+        """Make easy fills."""
+
+        # TODO: simplify
+
+        # print(data)
+        # print(data.isnull().sum())
+
+        # fill volumes with zeros (safest choice)
+        new_data['volume'] = new_data['volume'].fillna(0.)
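+
+        # a sketch of the fills below: with closes [10., NaN, 11.] and all
+        # opens missing, the first pass fills the day-2 open from the day-1
+        # close (10.), the second pass fills the day-3 open from the close
+        # two days before (also 10.), and the loop stops as soon as a pass
+        # fills nothing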
+
+        # fill close price with open price
+        new_data['close'] = new_data['close'].fillna(new_data['open'])
+
+        # fill open price with close from day(s) before
+        # repeat as long as it helps (up to 1 year)
+        for shifter in range(252):
+            logger.info(
+                "Filling opens with close from %s days before", shifter)
+            orig_missing_opens = new_data['open'].isnull().sum()
+            new_data['open'] = new_data['open'].fillna(new_data['close'].shift(
+                shifter+1))
+            new_missing_opens = new_data['open'].isnull().sum()
+            if orig_missing_opens == new_missing_opens:
+                break
+
+        # fill close price with same day's open
+        new_data['close'] = new_data['close'].fillna(new_data['open'])
+
+        # fill high price with max of open and close
+        new_data['high'] = new_data['high'].fillna(new_data[['open', 'close']].max(1))
+
+        # fill low price with min of open and close
+        new_data['low'] = new_data['low'].fillna(new_data[['open', 'close']].min(1))
+
+        # print(data)
+        # print(data.isnull().sum())
+
+    def _nan_nonpositive_prices(self, data, prices_name):
+        """Set non-positive prices (chosen column) to NaN, in-place."""
+
+        bad_indexes = data.index[data[prices_name] <= 0]
+        if len(bad_indexes) > 0:
+            logger.warning(
+                '%s("%s") has non-positive %s prices on timestamps: %s,'
+                + ' setting to nan',
+                self.__class__.__name__, self.symbol, prices_name, bad_indexes)
+            data.loc[bad_indexes, prices_name] = np.nan
+
+    def _nan_negative_volumes(self, data):
+        """Set negative volumes to NaN, in-place."""
+
+        bad_indexes = data.index[data["volume"] < 0]
+        if len(bad_indexes) > 0:
+            logger.warning(
+                '%s("%s") has negative volumes on timestamps: %s,'
+                + ' setting to nan',
+                self.__class__.__name__, self.symbol, bad_indexes)
+            data.loc[bad_indexes, "volume"] = np.nan
+
+    def _set_infty_to_nan(self, data):
+        """Set all +/- infty elements of data to NaN, in-place."""
+
+        if np.isinf(data).sum().sum() > 0:
+            logger.warning(
+                '%s("%s") has +/- infinity values, setting those to nan',
+                self.__class__.__name__, self.symbol)
+            data.iloc[:, :] = np.nan_to_num(
+                data.values, copy=True, nan=np.nan, posinf=np.nan,
+                neginf=np.nan)
+
+    def _nan_impossible(self, new_data, saved_data=None):
+        """Set some impossible values of new_data to NaN, in-place."""
+
+        # nan-out nonpositive prices
+        for column in ["open", "close", "high", "low"]:
+            self._nan_nonpositive_prices(new_data, column)
+
+        # nan-out negative volumes
+        self._nan_negative_volumes(new_data)
+
+        # all infinity values are nans
+        self._set_infty_to_nan(new_data)
+
+        # TODO: these can be made smarter (sometimes the open is clearly wrong)
+
+        # if low is not the lowest, set it to nan
+        bad_indexes = new_data.index[
+            new_data['low'] > new_data[['open', 'high', 'close']].min(1)]
+        if len(bad_indexes) > 0:
+            logger.warning(
+                '%s("%s") low prices are not the lowest on timestamps: %s,'
+                + ' setting to nan',
+                self.__class__.__name__, self.symbol, bad_indexes)
+            new_data.loc[bad_indexes, "low"] = np.nan
+
+        # if high is not the highest, set it to nan
+        bad_indexes = new_data.index[
+            new_data['high'] < new_data[['open', 'high', 'close']].max(1)]
+        if len(bad_indexes) > 0:
+            logger.warning(
+                '%s("%s") high prices are not the highest on timestamps: %s,'
+                + ' setting to nan',
+                self.__class__.__name__, self.symbol, bad_indexes)
+            new_data.loc[bad_indexes, "high"] = np.nan
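+
+    # worked example of the two checks above: a row with open=10.0,
+    # low=9.8, high=9.5, close=10.2 has low > min(open, high, close) = 9.5,
+    # so the low is NaN'd, and high < max(open, high, close) = 10.2, so the
+    # high is NaN'd; both are then repaired by _fill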
the open-to-open + # pure OLHCV data source there is no need to store the open-to-open # returns, they can be computed here if not 'return' in data.columns: data['return'] = data['open'].pct_change().shift(-1) @@ -316,13 +469,12 @@ def _preload(self, data): return data -class OHLCVTR(OHLCV): # pylint: disable=abstract-method - """Open-High-Low-Close-Volume-TotalReturn symbol data.""" - - # TODO: consider creating a OHLCVAC (adjusted closes) subclass +class OLHCVTR(OLHCV): # pylint: disable=abstract-method + """Open-Low-High-Close-Volume-TotalReturn symbol data.""" + # TODO: this becomes a isinstance(OLHC) in the caller # is open-high-low-close-volume-total return - IS_OHLCVR = True + IS_OLHCVR = True # # rolstd windows for finding wrong logreturns # _ROLSTD_WINDOWS = [20, 60, 252] @@ -372,104 +524,6 @@ class OHLCVTR(OHLCV): # pylint: disable=abstract-method # # - def _nan_impossible(self, data): - """Set impossible values to NaN.""" - - # print(data) - # print(data.isnull().sum()) - - # nan-out nonpositive prices - for column in ["open", "close", "high", "low", "adjclose"]: - bad_indexes = data.index[data[column] <= 0] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") has non-positive %s prices on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, column, bad_indexes) - data.loc[bad_indexes, column] = np.nan - - # nan-out negative volumes - bad_indexes = data.index[data["volume"] < 0] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") has negative volumes on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - data.loc[bad_indexes, "volume"] = np.nan - - # all infinity values are nans - if np.isinf(data).sum().sum() > 0: - logger.warning( - '%s("%s") has +/- infinity values, setting those to nan', - self.__class__.__name__, self.symbol) - data.iloc[:, :] = np.nan_to_num( - data.values, copy=True, nan=np.nan, posinf=np.nan, - neginf=np.nan) - - # print(data) - # print(data.isnull().sum()) - - # TODO: these can be made smarter (sometimes the open is clearly wrong) - - # if low is not the lowest, set it to nan - bad_indexes = data.index[ - data['low'] > data[['open', 'high', 'close']].min(1)] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") low prices are not the lowest on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - data.loc[bad_indexes, "low"] = np.nan - - # if high is not the highest, set it to nan - bad_indexes = data.index[ - data['high'] < data[['open', 'high', 'close']].max(1)] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") high prices are not the highest on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - data.loc[bad_indexes, "high"] = np.nan - - # print(data) - # print(data.isnull().sum()) - - def _fill_easy(self, data): - """Make easy fills.""" - - # print(data) - # print(data.isnull().sum()) - - # fill volumes with zeros (safest choice) - data['volume'] = data['volume'].fillna(0.) 
- - # fill close price with open price - data['close'] = data['close'].fillna(data['open']) - - # fill open price with close from day(s) before - # repeat as long as it helps (up to 1 year) - for shifter in range(252): - logger.info( - "Filling opens with close from %s days before", shifter) - orig_missing_opens = data['open'].isnull().sum() - data['open'] = data['open'].fillna(data['close'].shift( - shifter+1)) - new_missing_opens = data['open'].isnull().sum() - if orig_missing_opens == new_missing_opens: - break - - # fill close price with same day's open - data['close'] = data['close'].fillna(data['open']) - - # fill high price with max - data['high'] = data['high'].fillna(data[['open', 'close']].max(1)) - - # fill low price with max - data['low'] = data['low'].fillna(data[['open', 'close']].min(1)) - - # print(data) - # print(data.isnull().sum()) - def _compute_total_returns(self, data): """Compute total open-to-open returns.""" @@ -503,26 +557,38 @@ def _compute_total_returns(self, data): # print(data) # print(data.isnull().sum()) - def _process(self, data): - """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" - - self._nan_impossible(data) + def _post_process(self, new_data, saved_data=None): + """Temporary.""" - self._fill_easy(data) - - self._compute_total_returns(data) + self._compute_total_returns(new_data) # eliminate adjclose column - del data["adjclose"] + del new_data["adjclose"] # eliminate last period's intraday data - data.loc[data.index[-1], + new_data.loc[new_data.index[-1], ["high", "low", "close", "return", "volume"]] = np.nan - return data + # def _process(self, data): + # """Clean Yahoo Finance open-low-high-close-volume-adjclose data.""" + + # self._nan_impossible(data) + + # self._fill(data) + + # self._compute_total_returns(data) + + # # eliminate adjclose column + # del data["adjclose"] + + # # eliminate last period's intraday data + # data.loc[data.index[-1], + # ["high", "low", "close", "return", "volume"]] = np.nan + + # return data def _quality_check(self, data): - """Analyze quality of the OHLCV-TR data.""" + """Analyze quality of the OLHCV-TR data.""" # zero volume zerovol_idx = data.index[data.volume == 0] @@ -563,7 +629,7 @@ def print_extreme(logreturns, name, sigmas=50): print_extreme(open2low, 'open to low returns') -class OHLCVAC(OHLCVTR): +class OLHCVAC(OLHCVTR): """Open-High-Low-Close-Volume-AdjustedClose data. This is modeled after the data returned by Yahoo Finance. It implements @@ -572,8 +638,25 @@ class OHLCVAC(OHLCVTR): returns from the adjusted closes, and do some error checks. """ + def _nan_impossible(self, new_data, saved_data=None): + """Set impossible values to NaN.""" + + # call the OLHCV method + super()._nan_impossible(new_data) + + # also do it on adjclose + self._nan_nonpositive_prices(new_data, "adjclose") + + # def _process(self, data): + # """Obtain total returns and call parent's method.""" + + # # data['total_return'] = data['adjclose'].ffill().pct_change() + + # # Then continue with OLHCVTR processing + # return super()._process(data) + -class YahooFinance(OHLCVAC): +class YahooFinance(OLHCVAC): """Yahoo Finance symbol data. :param symbol: The symbol that we downloaded. @@ -594,7 +677,7 @@ class YahooFinance(OHLCVAC): @staticmethod def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): - """Get 1-day OHLC-AC-V from Yahoo finance. + """Get 1-day OLHC-AC-V from Yahoo finance. 
This is roughly equivalent to @@ -705,7 +788,7 @@ def _download(self, symbol, current=None, Returns: updated (pandas.DataFrame): updated DataFrame for the symbol """ - # TODO this could be put at a much lower class hierarchy + # TODO this could be put at a lower class hierarchy if overlap < 2: raise SyntaxError( f'{self.__class__.__name__} with overlap smaller than 2' diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index bb23f2b50..a1d4d048b 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -144,7 +144,7 @@ def test_yahoo_finance(self): base_location=self.datadir) def test_yahoo_finance_removefirstline(self): - """Test that the first line of OHLCV is removed if there are NaNs.""" + """Test that the first line of OLHCV is removed if there are NaNs.""" # this symbol was found to have NaNs in the first line _ = YahooFinance( From fddac3e534ea28aa9b6706216b639a48a740ccde Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 12 Feb 2024 12:07:42 +0400 Subject: [PATCH 15/38] refactoring --- cvxportfolio/data/symbol_data.py | 32 ++++++++++++++++++++++++++------ cvxportfolio/tests/test_data.py | 2 +- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index b85b56cff..8e020986f 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -300,7 +300,7 @@ def _process(self, new_data, saved_data=None): It operates on the ``new_data`` dataframe, which is the newly downloaded data. The ``saved_data`` dataframe is provided as well (None if there is none). It has the same columns, older timestamps - with possible overlap with new_data at its end, and is **read only**: + (possibly overlapping with new_data at the end), and is **read only**: it is used as reference to help with the cleaning, it has already been cleaned. @@ -562,6 +562,21 @@ def _post_process(self, new_data, saved_data=None): self._compute_total_returns(new_data) + # close2close_total = np.log(1 + new_data['total_return']) + # open2close = np.log(new_data['close']) - np.log(new_data['open']) + # open2open_total = close2close_total - open2close + open2close.shift(1) + # alt = (np.exp(open2open_total) - 1).shift(-1) + + close_div_open = new_data['close'] / new_data['open'] + alt = ((1 + new_data['total_return']) / close_div_open) * close_div_open.shift(1) - 1 + alt = alt.shift(-1) + + # import code; code.interact(local=locals()) + + assert np.allclose(new_data['return'].dropna(), alt.dropna()) + + new_data['return'] = alt + # eliminate adjclose column del new_data["adjclose"] @@ -647,13 +662,18 @@ def _nan_impossible(self, new_data, saved_data=None): # also do it on adjclose self._nan_nonpositive_prices(new_data, "adjclose") - # def _process(self, data): - # """Obtain total returns and call parent's method.""" + def _specific_process(self, new_data, saved_data=None): + """Specific process, compute total returns.""" - # # data['total_return'] = data['adjclose'].ffill().pct_change() + # Close-to-close total return, so we can delegate to parent class. + # Note that this uses different time alignment than Cvxportfolio, + # Here today's return uses yesterday close and today close, while + # today's returns in Cvxportfolio use today open and tomorrow open. + # However this is the format more common among data vendors. 
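+        # sketch: an adjclose series [100., 102., 101.] gives total_return
+        # [NaN, 0.02, -0.0098...], each entry using the previous and
+        # current (adjusted) closes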
+ new_data['total_return'] = new_data['adjclose'].ffill().pct_change() - # # Then continue with OLHCVTR processing - # return super()._process(data) + # We don't need this any more. + # del new_data['adjclose'] class YahooFinance(OLHCVAC): diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index a1d4d048b..bb23f2b50 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -144,7 +144,7 @@ def test_yahoo_finance(self): base_location=self.datadir) def test_yahoo_finance_removefirstline(self): - """Test that the first line of OLHCV is removed if there are NaNs.""" + """Test that the first line of OHLCV is removed if there are NaNs.""" # this symbol was found to have NaNs in the first line _ = YahooFinance( From a5988541f08fdc2fd73026522764a230fc2eab84 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 12 Feb 2024 16:28:16 +0400 Subject: [PATCH 16/38] refactoring, test cvxportfolio/tests/test_data.py TestData.test_yfinance_download became fragile, need to understand why --- cvxportfolio/data/market_data.py | 3 +- cvxportfolio/data/symbol_data.py | 48 ++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py index 1502d4dd5..6ba1c406e 100644 --- a/cvxportfolio/data/market_data.py +++ b/cvxportfolio/data/market_data.py @@ -24,6 +24,7 @@ from ..utils import (hash_, make_numeric, periods_per_year_from_datetime_index, resample_returns) from .symbol_data import * +from .symbol_data import OLHCV logger = logging.getLogger(__name__) @@ -631,7 +632,7 @@ def _get_market_data(self, universe, grace_period, storage_backend): grace_period=grace_period, storage_backend=storage_backend) print() - if hasattr(self.datasource, 'IS_OLHCVR') and self.datasource.IS_OLHCVR: + if issubclass(self.datasource, OLHCV): self.returns = pd.DataFrame( {stock: database_accesses[stock].data['return'] for stock in universe}) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 8e020986f..440410522 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -379,6 +379,9 @@ def _fill(self, new_data, saved_data=None): # print(data) # print(data.isnull().sum()) + def _nan_indexes(self, data, bad_indexes, columns, message): + pass + def _nan_nonpositive_prices(self, data, prices_name): """Set non-positive prices (chosen column) to NaN, in-place.""" @@ -460,8 +463,8 @@ def _preload(self, data): # this is not used currently, but if we implement an interface to a # pure OLHCV data source there is no need to store the open-to-open # returns, they can be computed here - if not 'return' in data.columns: - data['return'] = data['open'].pct_change().shift(-1) + #if not 'return' in data.columns: + # data['return'] = data['open'].pct_change().shift(-1) self._quality_check(data) data["valuevolume"] = data["volume"] * data["open"] @@ -469,12 +472,22 @@ def _preload(self, data): return data -class OLHCVTR(OLHCV): # pylint: disable=abstract-method - """Open-Low-High-Close-Volume-TotalReturn symbol data.""" +class OLHCVAC(OLHCV): + """Open-High-Low-Close-Volume-AdjustedClose data. + + This is modeled after the data returned by Yahoo Finance. + """ +# It implements +# the transformation required to conform to the +# Open-High-Low-Close-Volume-TotalReturn model, that is, compute +# returns from the adjusted closes, and do some error checks. 
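+
+    # For intuition: Yahoo Finance's adjclose folds dividends and splits
+    # into the closing price, so adjclose.pct_change() is a close-to-close
+    # *total* return, while close.pct_change() is the price-only return.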
+# """ +# class OLHCVTR(OLHCV): # pylint: disable=abstract-method +# """Open-Low-High-Close-Volume-TotalReturn symbol data.""" # TODO: this becomes a isinstance(OLHC) in the caller # is open-high-low-close-volume-total return - IS_OLHCVR = True + # IS_OLHCVR = True # # rolstd windows for finding wrong logreturns # _ROLSTD_WINDOWS = [20, 60, 252] @@ -567,15 +580,18 @@ def _post_process(self, new_data, saved_data=None): # open2open_total = close2close_total - open2close + open2close.shift(1) # alt = (np.exp(open2open_total) - 1).shift(-1) - close_div_open = new_data['close'] / new_data['open'] - alt = ((1 + new_data['total_return']) / close_div_open) * close_div_open.shift(1) - 1 - alt = alt.shift(-1) + # close_div_open = new_data['close'] / new_data['open'] + # open_to_open_total = ( + # (1 + new_data['total_return']) / close_div_open + # ) * close_div_open.shift(1) - 1 # import code; code.interact(local=locals()) - assert np.allclose(new_data['return'].dropna(), alt.dropna()) + # assert np.allclose(new_data['return'].dropna(), open_to_open_total.shift(-1).dropna()) + + # new_data['return'] = open_to_open_total.shift(-1) - new_data['return'] = alt + # del new_data['total_return'] # eliminate adjclose column del new_data["adjclose"] @@ -643,16 +659,6 @@ def print_extreme(logreturns, name, sigmas=50): open2low = np.log(data['low']) - np.log(data['open']).dropna() print_extreme(open2low, 'open to low returns') - -class OLHCVAC(OLHCVTR): - """Open-High-Low-Close-Volume-AdjustedClose data. - - This is modeled after the data returned by Yahoo Finance. It implements - the transformation required to conform to the - Open-High-Low-Close-Volume-TotalReturn model, that is, compute - returns from the adjusted closes, and do some error checks. - """ - def _nan_impossible(self, new_data, saved_data=None): """Set impossible values to NaN.""" @@ -670,7 +676,7 @@ def _specific_process(self, new_data, saved_data=None): # Here today's return uses yesterday close and today close, while # today's returns in Cvxportfolio use today open and tomorrow open. # However this is the format more common among data vendors. - new_data['total_return'] = new_data['adjclose'].ffill().pct_change() + # new_data['total_return'] = new_data['adjclose'].ffill().pct_change() # We don't need this any more. 
# del new_data['adjclose'] From af76cb997f81ddf8e260737a73588b02e626f3e5 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 12 Feb 2024 23:08:15 +0400 Subject: [PATCH 17/38] more, cleaning needed --- cvxportfolio/data/symbol_data.py | 184 +++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 57 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 440410522..516d95de8 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -322,20 +322,20 @@ def _process(self, new_data, saved_data=None): """ self._nan_impossible(new_data, saved_data=saved_data) - self._specific_process(new_data, saved_data=saved_data) + # self._specific_process(new_data, saved_data=saved_data) self._nan_unlikely(new_data, saved_data=saved_data) self._fill(new_data, saved_data=saved_data) - self._post_process(new_data, saved_data=saved_data) + # self._post_process(new_data, saved_data=saved_data) return new_data - def _specific_process(self, new_data, saved_data=None): - """Specific process, do nothing.""" - # return new_data + # def _specific_process(self, new_data, saved_data=None): + # """Specific process, do nothing.""" + # # return new_data - def _post_process(self, new_data, saved_data=None): - """Post process, do nothing.""" - # return new_data + # def _post_process(self, new_data, saved_data=None): + # """Post process, do nothing.""" + # # return new_data def _nan_unlikely(self, new_data, saved_data=None): """Nan-out unlikely values.""" @@ -379,30 +379,86 @@ def _fill(self, new_data, saved_data=None): # print(data) # print(data.isnull().sum()) - def _nan_indexes(self, data, bad_indexes, columns, message): - pass - - def _nan_nonpositive_prices(self, data, prices_name): - """Set non-positive prices (chosen column) to NaN, in-place.""" + def _nan_values(self, data, condition, columns_to_nan, message): + """Set to NaN in-place on indexing condition chosen columns.""" - bad_indexes = data.index[data[prices_name] <= 0] + bad_indexes = data.index[condition] if len(bad_indexes) > 0: logger.warning( - '%s("%s") has non-positive %s prices on timestamps: %s,' + '%s("%s") has %s on timestamps: %s,' + ' setting to nan', - self.__class__.__name__, self.symbol, prices_name, bad_indexes) - data.loc[bad_indexes, prices_name] = np.nan + self.__class__.__name__, self.symbol, message, bad_indexes) + data.loc[bad_indexes, columns_to_nan] = np.nan + + def _nan_nonpositive_prices(self, data, prices_name): + """Set non-positive prices (chosen price name) to NaN, in-place.""" + self._nan_values( + data=data, condition = data[prices_name] <= 0, + columns_to_nan = prices_name, + message = f'non-positive {prices_name} prices') def _nan_negative_volumes(self, data): """Set negative volumes to NaN, in-place.""" - - bad_indexes = data.index[data["volume"] < 0] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") has negative volumes on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - data.loc[bad_indexes, "volume"] = np.nan + self._nan_values( + data=data, condition = data["volume"] < 0, + columns_to_nan = "volume", message = 'negative volumes') + + def _nan_open_lower_low(self, data): + """Set open price to NaN if lower than low, in-place.""" + self._nan_values( + data=data, condition = data['open'] < data['low'], + columns_to_nan = "open", + message = 'open price lower than low price') + + def _nan_open_higher_high(self, data): + """Set open price to NaN if higher than high, in-place.""" + 
self._nan_values( + data=data, condition = data['open'] > data['high'], + columns_to_nan = "open", + message = 'open price higher than high price') + + def _nan_incompatible_low_high(self, data): + """Set low and high to NaN if low is higher, in-place.""" + self._nan_values( + data=data, condition = data['low'] > data['high'], + columns_to_nan = ["low", "high"], + message = 'low price higher than high price') + + def _nan_high_lower_close(self, data): + """Set high price to NaN if lower than close, in-place.""" + self._nan_values( + data=data, condition = data['high'] < data['close'], + columns_to_nan = "high", + message = 'high price lower than close price') + + def _nan_low_higher_close(self, data): + """Set low price to NaN if higher than close, in-place.""" + self._nan_values( + data=data, condition = data['low'] > data['close'], + columns_to_nan = "low", + message = 'low price higher than close price') + + # def _nan_nonpositive_prices(self, data, prices_name): + # """Set non-positive prices (chosen column) to NaN, in-place.""" + + # bad_indexes = data.index[data[prices_name] <= 0] + # if len(bad_indexes) > 0: + # logger.warning( + # '%s("%s") has non-positive %s prices on timestamps: %s,' + # + ' setting to nan', + # self.__class__.__name__, self.symbol, prices_name, bad_indexes) + # data.loc[bad_indexes, prices_name] = np.nan + + # def _nan_negative_volumes(self, data): + # """Set negative volumes to NaN, in-place.""" + + # bad_indexes = data.index[data["volume"] < 0] + # if len(bad_indexes) > 0: + # logger.warning( + # '%s("%s") has negative volumes on timestamps: %s,' + # + ' setting to nan', + # self.__class__.__name__, self.symbol, bad_indexes) + # data.loc[bad_indexes, "volume"] = np.nan def _set_infty_to_nan(self, data): """Set all +/- infty elements of data to NaN, in-place.""" @@ -428,27 +484,34 @@ def _nan_impossible(self, new_data, saved_data=None): # all infinity values are nans self._set_infty_to_nan(new_data) - # TODO: these can be made smarter (sometimes the open is clearly wrong) + # more + self._nan_open_lower_low(new_data) + self._nan_open_higher_high(new_data) + self._nan_incompatible_low_high(new_data) + self._nan_high_lower_close(new_data) + self._nan_low_higher_close(new_data) - # if low is not the lowest, set it to nan - bad_indexes = new_data.index[ - new_data['low'] > new_data[['open', 'high', 'close']].min(1)] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") low prices are not the lowest on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - new_data.loc[bad_indexes, "low"] = np.nan + # TODO: these can be made smarter (sometimes the open is clearly wrong) - # if high is not the highest, set it to nan - bad_indexes = new_data.index[ - new_data['high'] < new_data[['open', 'high', 'close']].max(1)] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") high prices are not the highest on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - new_data.loc[bad_indexes, "high"] = np.nan + # # if low is not the lowest, set it to nan + # bad_indexes = new_data.index[ + # new_data['low'] > new_data[['open', 'high', 'close']].min(1)] + # if len(bad_indexes) > 0: + # logger.warning( + # '%s("%s") low prices are not the lowest on timestamps: %s,' + # + ' setting to nan', + # self.__class__.__name__, self.symbol, bad_indexes) + # new_data.loc[bad_indexes, "low"] = np.nan + + # # if high is not the highest, set it to nan + # bad_indexes = new_data.index[ + # new_data['high'] < 
new_data[['open', 'high', 'close']].max(1)] + # if len(bad_indexes) > 0: + # logger.warning( + # '%s("%s") high prices are not the highest on timestamps: %s,' + # + ' setting to nan', + # self.__class__.__name__, self.symbol, bad_indexes) + # new_data.loc[bad_indexes, "high"] = np.nan # TODO: factor quality check and clean into total-return related and non- @@ -463,8 +526,8 @@ def _preload(self, data): # this is not used currently, but if we implement an interface to a # pure OLHCV data source there is no need to store the open-to-open # returns, they can be computed here - #if not 'return' in data.columns: - # data['return'] = data['open'].pct_change().shift(-1) + if not 'return' in data.columns: + data['return'] = data['open'].pct_change().shift(-1) self._quality_check(data) data["valuevolume"] = data["volume"] * data["open"] @@ -552,6 +615,10 @@ def _compute_total_returns(self, data): # non-market log returns (dividends, splits) non_market_lr = log_adjustment_ratio.diff().shift(-1) + # dividend_return = (data['adjclose'] / data['close']).pct_change().shift(-1) + + # import code; code.interact(local=locals()) + # full open-to-open returns open_to_open = np.log(data["open"]).diff().shift(-1) data['return'] = np.exp(open_to_open + non_market_lr) - 1 @@ -570,9 +637,10 @@ def _compute_total_returns(self, data): # print(data) # print(data.isnull().sum()) - def _post_process(self, new_data, saved_data=None): + def _process(self, new_data, saved_data=None): """Temporary.""" + super()._process(new_data, saved_data=saved_data) self._compute_total_returns(new_data) # close2close_total = np.log(1 + new_data['total_return']) @@ -600,6 +668,8 @@ def _post_process(self, new_data, saved_data=None): new_data.loc[new_data.index[-1], ["high", "low", "close", "return", "volume"]] = np.nan + return new_data + # def _process(self, data): # """Clean Yahoo Finance open-low-high-close-volume-adjclose data.""" @@ -668,18 +738,18 @@ def _nan_impossible(self, new_data, saved_data=None): # also do it on adjclose self._nan_nonpositive_prices(new_data, "adjclose") - def _specific_process(self, new_data, saved_data=None): - """Specific process, compute total returns.""" + # def _specific_process(self, new_data, saved_data=None): + # """Specific process, compute total returns.""" - # Close-to-close total return, so we can delegate to parent class. - # Note that this uses different time alignment than Cvxportfolio, - # Here today's return uses yesterday close and today close, while - # today's returns in Cvxportfolio use today open and tomorrow open. - # However this is the format more common among data vendors. - # new_data['total_return'] = new_data['adjclose'].ffill().pct_change() + # # Close-to-close total return, so we can delegate to parent class. + # # Note that this uses different time alignment than Cvxportfolio, + # # Here today's return uses yesterday close and today close, while + # # today's returns in Cvxportfolio use today open and tomorrow open. + # # However this is the format more common among data vendors. + # # new_data['total_return'] = new_data['adjclose'].ffill().pct_change() - # We don't need this any more. - # del new_data['adjclose'] + # # We don't need this any more. 
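The net effect of the hunks above: several near-identical validation blocks collapse into one generic ``_nan_values`` helper plus a thin wrapper per impossible OLHCV condition (non-positive prices, negative volumes, open outside the low/high band, and so on). A minimal standalone sketch of the same pattern, with toy values that are not from the library:

.. code:: python

    import numpy as np
    import pandas as pd

    def nan_values(data, condition, columns_to_nan, message):
        """Set chosen columns to NaN, in-place, where condition holds."""
        bad_indexes = data.index[condition]
        if len(bad_indexes) > 0:
            print(f'found {message} on timestamps {list(bad_indexes)}')
            data.loc[bad_indexes, columns_to_nan] = np.nan

    # toy frame with one impossible row: open above high
    data = pd.DataFrame({
        'open': [10., 11., 25.], 'low': [9., 10., 11.],
        'high': [11., 12., 13.], 'close': [10.5, 11.5, 12.]})

    nan_values(data, data['open'] > data['high'], 'open',
               'open price higher than high price')
    assert np.isnan(data.loc[2, 'open'])

Each wrapper then reduces to a one-line condition, so the logging format and the nan-ing logic live in a single place.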
+ # # del new_data['adjclose'] class YahooFinance(OLHCVAC): From f085125a758685ab76198738ae99d493eb941c00 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 10:53:17 +0400 Subject: [PATCH 18/38] some cleaning, adding read_only --- cvxportfolio/data.py | 1398 ------------------------------ cvxportfolio/data/market_data.py | 56 +- cvxportfolio/data/symbol_data.py | 86 +- cvxportfolio/tests/test_utils.py | 2 +- cvxportfolio/utils.py | 30 + 5 files changed, 51 insertions(+), 1521 deletions(-) delete mode 100644 cvxportfolio/data.py diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py deleted file mode 100644 index 3e2be8232..000000000 --- a/cvxportfolio/data.py +++ /dev/null @@ -1,1398 +0,0 @@ -# Copyright 2023 Enzo Busseti -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This module include classes that download, store, and serve market data. - -The two main abstractions are :class:`SymbolData` and :class:`MarketData`. -Neither are exposed outside this module. Their derived classes instead are. - -If you want to interface cvxportfolio with financial data source other -than the ones we provide, you should derive from either of those two classes. -""" - -import datetime -import logging -import sqlite3 -import sys -import warnings -from pathlib import Path -from urllib.error import URLError - -import numpy as np -import pandas as pd -import requests -import requests.exceptions - -from .errors import DataError -from .utils import (hash_, make_numeric, periods_per_year_from_datetime_index, - resample_returns) - -__all__ = ["YahooFinance", "Fred", - "UserProvidedMarketData", "DownloadedMarketData"] - -logger = logging.getLogger(__name__) - -BASE_LOCATION = Path.home() / "cvxportfolio_data" - -def now_timezoned(): - """Return current timestamp with local timezone. - - :returns: Current timestamp with local timezone. - :rtype: pandas.Timestamp - """ - return pd.Timestamp( - datetime.datetime.now(datetime.timezone.utc).astimezone()) - -class SymbolData: - """Base class for a single symbol time series data. - - The data is either in the form of a Pandas Series or DataFrame - and has datetime index. - - This class needs to be derived. At a minimum, - one should redefine the ``_download`` method, which - implements the downloading of the symbol's time series - from an external source. The method takes the current (already - downloaded and stored) data and is supposed to **only append** to it. - In this way we only store new data and don't modify already downloaded - data. - - Additionally one can redefine the ``_preload`` method, which prepares - data to serve to the user (so the data is stored in a different format - than what the user sees.) We found that this separation can be useful. - - This class interacts with module-level functions named ``_loader_BACKEND`` - and ``_storer_BACKEND``, where ``BACKEND`` is the name of the storage - system used. We define ``pickle``, ``csv``, and ``sqlite`` backends. - These may have limitations. See their docstrings for more information. 
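The docstring above spells out the backend plug-in convention: storage is a pair of module-level functions resolved by name. Under that convention, a hypothetical new backend (parquet here, purely illustrative, not shipped by the library) would need only the two functions below, mirroring the single-column squeeze of the csv loader:

.. code:: python

    import pandas as pd

    # Hypothetical parquet backend; SymbolData resolves backends by
    # name, as in globals()['_loader_parquet'].
    def _loader_parquet(symbol, storage_location):
        """Load one symbol's data stored in parquet format."""
        tmp = pd.read_parquet(storage_location / f"{symbol}.parquet")
        # return a Series if the stored object had a single column
        return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp

    def _storer_parquet(symbol, data, storage_location):
        """Store one symbol's data in parquet format."""
        pd.DataFrame(data).to_parquet(
            storage_location / f"{symbol}.parquet")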
- - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param base_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. By default it's one day. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded data for the symbol. - """ - - def __init__(self, symbol, - storage_backend='pickle', - base_location=BASE_LOCATION, - grace_period=pd.Timedelta('1d')): - self._symbol = symbol - self._storage_backend = storage_backend - self._base_location = base_location - self.update(grace_period) - self._data = self.load() - - @property - def storage_location(self): - """Storage location. Directory is created if not existent. - - :rtype: pathlib.Path - """ - loc = self._base_location / f"{self.__class__.__name__}" - loc.mkdir(parents=True, exist_ok=True) - return loc - - @property - def symbol(self): - """The symbol whose data this instance contains. - - :rtype: str - """ - return self._symbol - - @property - def data(self): - """Time series data, updated to the most recent observation. - - :rtype: pandas.Series or pandas.DataFrame - """ - return self._data - - def _load_raw(self): - """Load raw data from database.""" - # we could implement multiprocess safety here - loader = globals()['_loader_' + self._storage_backend] - try: - logger.info( - f"{self.__class__.__name__} is trying to load {self.symbol}" - + f" with {self._storage_backend} backend" - + f" from {self.storage_location}") - return loader(self.symbol, self.storage_location) - except FileNotFoundError: - return None - - def load(self): - """Load data from database using `self.preload` function to process. - - :returns: Loaded time-series data for the symbol. - :rtype: pandas.Series or pandas.DataFrame - """ - return self._preload(self._load_raw()) - - def _store(self, data): - """Store data in database. - - :param data: Time-series data to store. - :type data: pandas.Series or pandas.DataFrame - """ - # we could implement multiprocess safety here - storer = globals()['_storer_' + self._storage_backend] - logger.info( - f"{self.__class__.__name__} is storing {self.symbol}" - + f" with {self._storage_backend} backend" - + f" in {self.storage_location}") - storer(self.symbol, data, self.storage_location) - - def _print_difference(self, current, new): - """Helper method to print difference if update is not append-only. - - This is temporary and will be re-factored. - """ - print("TEMPORARY: Diff between overlap of downloaded and stored") - print((new - current).dropna(how='all').tail(5)) - - def update(self, grace_period): - """Update current stored data for symbol. - - :param grace_period: If the time between now and the last value stored - is less than this, we don't update the data already stored. - :type grace_period: pandas.Timedelta - """ - current = self._load_raw() - logger.info( - f"Downloading {self.symbol}" - + f" from {self.__class__.__name__}") - updated = self._download( - self.symbol, current, grace_period=grace_period) - - if np.any(updated.iloc[:-1].isnull()): - logger.warning( - " cvxportfolio.%s('%s').data contains NaNs." 
- + " You may want to inspect it. If you want, you can delete the" - + " data file in %s to force re-download from the start.", - self.__class__.__name__, self.symbol, self.storage_location) - - try: - if current is not None: - if not np.all( - # we use numpy.isclose because returns may be computed - # via logreturns and numerical errors can sift through - np.isclose(updated.loc[current.index[:-1]], - current.iloc[:-1], equal_nan=True, - rtol=1e-08, atol=1e-08)): - logger.error(f"{self.__class__.__name__} update" - + f" of {self.symbol} is not append-only!") - self._print_difference(current, updated) - if hasattr(current, 'columns'): - # the first column is open price - if not current.iloc[-1, 0] == updated.loc[ - current.index[-1]].iloc[0]: - logger.error( - f"{self.__class__.__name__} update " - + f" of {self.symbol} changed last open price!") - self._print_difference(current, updated) - else: - if not current.iloc[-1] == updated.loc[current.index[-1]]: - logger.error( - f"{self.__class__.__name__} update" - + f" of {self.symbol} changed last value!") - self._print_difference(current, updated) - except KeyError: - logger.error("%s update of %s could not be checked for" - + " append-only edits. Was there a DST change?", - self.__class__.__name__, self.symbol) - self._store(updated) - - def _download(self, symbol, current, grace_period, **kwargs): - """Download data from external source given already downloaded data. - - This method must be redefined by derived classes. - - :param symbol: The symbol we download. - :type symbol: str - :param current: The data already downloaded. We are supposed to - **only append** to it. If None, no data is present. - :type current: pandas.Series or pandas.DataFrame or None - :rtype: pandas.Series or pandas.DataFrame - """ - raise NotImplementedError #pragma: no cover - - def _preload(self, data): - """Prepare data to serve to the user. - - This method can be redefined by derived classes. - - :param data: The data returned by the storage backend. - :type data: pandas.Series or pandas.DataFrame - :rtype: pandas.Series or pandas.DataFrame - """ - return data - - -# -# Yahoo Finance. -# - -def _timestamp_convert(unix_seconds_ts): - """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" - return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') - - -class YahooFinance(SymbolData): - """Yahoo Finance symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded, and cleaned, data for the symbol. 
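The update logic above checks that each refresh is append-only: the overlap between stored and freshly downloaded data must agree up to numerical noise, since returns may be recomputed through log returns. A condensed sketch of that overlap check, on made-up series:

.. code:: python

    import numpy as np
    import pandas as pd

    stored = pd.Series([1.0, 2.0, 3.0])
    # a re-download reproduces the overlap (up to noise), appends a row
    downloaded = pd.Series([1.0, 2.0 + 1e-12, 3.0, 4.0])

    append_only = np.all(np.isclose(
        downloaded.loc[stored.index[:-1]], stored.iloc[:-1],
        equal_nan=True, rtol=1e-08, atol=1e-08))
    print('append-only update:', append_only)

The last stored row is deliberately excluded from the comparison, since the most recent observation (an open trading day) is expected to change.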
- :type data: pandas.DataFrame - """ - - # is open-high-low-close-volume-(total)return - IS_OLHCVR = True - - @staticmethod - def _clean(data): - """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" - - # print(data) - # print(data.isnull().sum()) - - # nan-out nonpositive prices - data.loc[data["open"] <= 0, 'open'] = np.nan - data.loc[data["close"] <= 0, "close"] = np.nan - data.loc[data["high"] <= 0, "high"] = np.nan - data.loc[data["low"] <= 0, "low"] = np.nan - data.loc[data["adjclose"] <= 0, "adjclose"] = np.nan - - # nan-out negative volumes - data.loc[data["volume"] < 0, 'volume'] = np.nan - - # all infinity values are nans - data.iloc[:, :] = np.nan_to_num( - data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - - # print(data) - # print(data.isnull().sum()) - - # if low is not the lowest, set it to nan - data.loc[data['low'] > data[['open', 'high', 'close']].min(1), - 'low'] = np.nan - - # if high is not the highest, set it to nan - data.loc[data['high'] < data[['open', 'high', 'close']].max(1), - 'high'] = np.nan - - # print(data) - # print(data.isnull().sum()) - - # - # fills - # - - # fill volumes with zeros (safest choice) - data['volume'] = data['volume'].fillna(0.) - - # fill close price with open price - data['close'] = data['close'].fillna(data['open']) - - # fill open price with close from day(s) before - # repeat as long as it helps (up to 1 year) - for shifter in range(252): - orig_missing_opens = data['open'].isnull().sum() - data['open'] = data['open'].fillna(data['close'].shift( - shifter+1)) - new_missing_opens = data['open'].isnull().sum() - if orig_missing_opens == new_missing_opens: - break - logger.info( - "Filled missing open prices with close from %s periods before", - shifter+1) - - # fill close price with same day's open - data['close'] = data['close'].fillna(data['open']) - - # fill high price with max - data['high'] = data['high'].fillna(data[['open', 'close']].max(1)) - - # fill low price with max - data['low'] = data['low'].fillna(data[['open', 'close']].min(1)) - - # print(data) - # print(data.isnull().sum()) - - # - # Compute returns - # - - # compute log of ratio between adjclose and close - log_adjustment_ratio = np.log(data['adjclose'] / data['close']) - - # forward fill adjustment ratio - log_adjustment_ratio = log_adjustment_ratio.ffill() - - # non-market log returns (dividends, splits) - non_market_lr = log_adjustment_ratio.diff().shift(-1) - - # full open-to-open returns - open_to_open = np.log(data["open"]).diff().shift(-1) - data['return'] = np.exp(open_to_open + non_market_lr) - 1 - - # print(data) - # print(data.isnull().sum()) - - # intraday_logreturn = np.log(data["close"]) - np.log(data["open"]) - # close_to_close_logreturn = np.log(data["adjclose"]).diff().shift(-1) - # open_to_open_logreturn = ( - # close_to_close_logreturn + intraday_logreturn - - # intraday_logreturn.shift(-1) - # ) - # data["return"] = np.exp(open_to_open_logreturn) - 1 - del data["adjclose"] - - # eliminate last period's intraday data - data.loc[data.index[-1], - ["high", "low", "close", "return", "volume"]] = np.nan - - # print(data) - # print(data.isnull().sum()) - - return data - - @staticmethod - def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): - """Get 1 day OLHC from Yahoo finance. - - Result is timestamped with the open time (time-zoned) of the - instrument. 
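The return computation in ``_clean`` above is the subtle part: dividends and splits live entirely in the close-to-adjclose ratio, and get folded into the market open-to-open returns in log space. A compact sketch with made-up prices (the gap between close and adjclose on day 0 plays the role of a distribution between day 0 and day 1):

.. code:: python

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({
        'open': [10.0, 10.2, 10.4],
        'close': [10.1, 10.3, 10.5],
        'adjclose': [9.9, 10.3, 10.5]})

    # non-market log returns (dividends, splits)
    log_adjustment_ratio = np.log(data['adjclose'] / data['close']).ffill()
    non_market_lr = log_adjustment_ratio.diff().shift(-1)

    # market open-to-open log returns, aligned so that the return at
    # time t spans open(t) to open(t+1)
    open_to_open = np.log(data['open']).diff().shift(-1)

    data['return'] = np.exp(open_to_open + non_market_lr) - 1
    print(data['return'])  # last entry is NaN by construction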
- """ - - base_url = 'https://query2.finance.yahoo.com' - - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)' - ' AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/39.0.2171.95 Safari/537.36'} - - # print(HEADERS) - start = int(pd.Timestamp(start).timestamp()) - end = int(pd.Timestamp(end).timestamp()) - - try: - res = requests.get( - url=f"{base_url}/v8/finance/chart/{ticker}", - params={'interval': '1d', - "period1": start, - "period2": end}, - headers=headers, - timeout=10) # seconds - except requests.ConnectionError as exc: - raise DataError( - f"Download of {ticker} from YahooFinance failed." - + " Are you connected to the Internet?") from exc - - # print(res) - - if res.status_code == 404: - raise DataError( - f'Data for symbol {ticker} is not available.' - + 'Json output:', str(res.json())) - - if res.status_code != 200: - raise DataError(f'Yahoo finance download of {ticker} failed. Json:', - str(res.json())) # pragma: no cover - - data = res.json()['chart']['result'][0] - - try: - index = pd.DatetimeIndex( - [_timestamp_convert(el) for el in data['timestamp']]) - - df_result = pd.DataFrame( - data['indicators']['quote'][0], index=index) - df_result['adjclose'] = data[ - 'indicators']['adjclose'][0]['adjclose'] - except KeyError: - raise DataError(f'Yahoo finance download of {ticker} failed.' - + ' Json:', str(res.json())) # pragma: no cover - - # last timestamp is probably broken (not timed to market open) - # we set its time to same as the day before, but this is wrong - # on days of DST switch. It's fine though because that line will be - # overwritten next update - if df_result.index[-1].time() != df_result.index[-2].time(): - tm1 = df_result.index[-2].time() - newlast = df_result.index[-1].replace( - hour=tm1.hour, minute=tm1.minute, second=tm1.second) - df_result.index = pd.DatetimeIndex( - list(df_result.index[:-1]) + [newlast]) - - return df_result[ - ['open', 'low', 'high', 'close', 'adjclose', 'volume']] - - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Download single stock from Yahoo Finance. - - If data was already downloaded we only download - the most recent missing portion. - - Args: - - symbol (str): yahoo name of the instrument - current (pandas.DataFrame or None): current data present locally - overlap (int): how many lines of current data will be overwritten - by newly downloaded data - kwargs (dict): extra arguments passed to yfinance.download - - Returns: - updated (pandas.DataFrame): updated DataFrame for the symbol - """ - if overlap < 2: - raise SyntaxError( - f'{self.__class__.__name__} with overlap smaller than 2' - + ' could have issues with DST.') - if (current is None) or (len(current) < overlap): - updated = self._get_data_yahoo(symbol, **kwargs) - logger.info('Downloading from the start.') - result = self._clean(updated) - # we remove first row if it contains NaNs - if np.any(result.iloc[0].isnull()): - result = result.iloc[1:] - return result - if (now_timezoned() - current.index[-1] - ) < pd.Timedelta(grace_period): - logger.info( - 'Skipping download because stored data is recent enough.') - return current - new = self._get_data_yahoo(symbol, start=current.index[-overlap]) - new = self._clean(new) - return pd.concat([current.iloc[:-overlap], new]) - - def _preload(self, data): - """Prepare data for use by Cvxportfolio. 
- - We drop the `volume` column expressed in number of stocks and - replace it with `valuevolume` which is an estimate of the (e.g., - US dollar) value of the volume exchanged on the day. - """ - data["valuevolume"] = data["volume"] * data["open"] - del data["volume"] - - return data - -# -# Fred. -# - -class Fred(SymbolData): - """Fred single-symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. By default it's one day. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded data for the symbol. - """ - - URL = "https://fred.stlouisfed.org/graph/fredgraph.csv" - - # TODO: implement Fred point-in-time - # example: - # https://alfred.stlouisfed.org/graph/alfredgraph.csv?id=CES0500000003&vintage_date=2023-07-06 - # hourly wages time series **as it appeared** on 2023-07-06 - # store using pd.Series() of diff'ed values only. - - def _internal_download(self, symbol): - try: - return pd.read_csv( - self.URL + f'?id={symbol}', - index_col=0, parse_dates=[0])[symbol] - except URLError as exc: - raise DataError(f"Download of {symbol}" - + f" from {self.__class__.__name__} failed." - + " Are you connected to the Internet?") from exc - - def _download( - self, symbol="DFF", current=None, grace_period='5d', **kwargs): - """Download or update pandas Series from Fred. - - If already downloaded don't change data stored locally and only - add new entries at the end. - - Additionally, we allow for a `grace period`, if the data already - downloaded has a last entry not older than the grace period, we - don't download new data. - """ - if current is None: - return self._internal_download(symbol) - if (pd.Timestamp.today() - current.index[-1] - ) < pd.Timedelta(grace_period): - logger.info( - 'Skipping download because stored data is recent enough.') - return current - - new = self._internal_download(symbol) - new = new.loc[new.index > current.index[-1]] - - if new.empty: - logger.info('New downloaded data is empty!') - return current - - assert new.index[0] > current.index[-1] - return pd.concat([current, new]) - - def _preload(self, data): - """Add UTC timezone.""" - data.index = data.index.tz_localize('UTC') - return data - -# -# Sqlite storage backend. -# - -def _open_sqlite(storage_location): - return sqlite3.connect(storage_location/"db.sqlite") - -def _close_sqlite(connection): - connection.close() - -def _loader_sqlite(symbol, storage_location): - """Load data in sqlite format. - - We separately store dtypes for data consistency and safety. - - .. note:: If your pandas object's index has a name it will be lost, - the index is renamed 'index'. If you pass timestamp data (including - the index) it must have explicit timezone. 
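The Fred updater above encodes a simple contract: within the grace period, serve what is stored; otherwise download, keep only strictly newer rows, and append. A condensed sketch, assuming a tz-naive index as in the raw Fred csv, with ``download`` a stand-in for the actual fetch:

.. code:: python

    import pandas as pd

    def update_series(current, download, grace_period=pd.Timedelta('5d')):
        """Append-only update sketch with a grace period."""
        if current is None:
            return download()
        if pd.Timestamp.today() - current.index[-1] < grace_period:
            return current  # stored data is recent enough, skip download
        new = download()
        new = new.loc[new.index > current.index[-1]]  # strictly newer rows
        return current if new.empty else pd.concat([current, new])

The same grace-period short-circuit appears in the Yahoo Finance updater above, which additionally re-downloads a small overlap window.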
- """ - try: - connection = _open_sqlite(storage_location) - dtypes = pd.read_sql_query( - f"SELECT * FROM {symbol}___dtypes", - connection, index_col="index", - dtype={"index": "str", "0": "str"}) - - parse_dates = 'index' - my_dtypes = dict(dtypes["0"]) - - tmp = pd.read_sql_query( - f"SELECT * FROM {symbol}", connection, - index_col="index", parse_dates=parse_dates, dtype=my_dtypes) - - _close_sqlite(connection) - multiindex = [] - for col in tmp.columns: - if col[:8] == "___level": - multiindex.append(col) - else: - break - if len(multiindex) > 0: - multiindex = [tmp.index.name] + multiindex - tmp = tmp.reset_index().set_index(multiindex) - return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp - except pd.errors.DatabaseError: - return None - -def _storer_sqlite(symbol, data, storage_location): - """Store data in sqlite format. - - We separately store dtypes for data consistency and safety. - - .. note:: If your pandas object's index has a name it will be lost, - the index is renamed 'index'. If you pass timestamp data (including - the index) it must have explicit timezone. - """ - connection = _open_sqlite(storage_location) - exists = pd.read_sql_query( - f"SELECT name FROM sqlite_master WHERE type='table' AND name='{symbol}'", - connection) - - if len(exists): - _ = connection.cursor().execute(f"DROP TABLE '{symbol}'") - _ = connection.cursor().execute(f"DROP TABLE '{symbol}___dtypes'") - connection.commit() - - if hasattr(data.index, "levels"): - data.index = data.index.set_names( - ["index"] + - [f"___level{i}" for i in range(1, len(data.index.levels))] - ) - data = data.reset_index().set_index("index") - else: - data.index.name = "index" - - if data.index[0].tzinfo is None: - warnings.warn('Index has not timezone, setting to UTC') - data.index = data.index.tz_localize('UTC') - - data.to_sql(f"{symbol}", connection) - pd.DataFrame(data).dtypes.astype("string").to_sql( - f"{symbol}___dtypes", connection) - _close_sqlite(connection) - - -# -# Pickle storage backend. -# - -def _loader_pickle(symbol, storage_location): - """Load data in pickle format.""" - return pd.read_pickle(storage_location / f"{symbol}.pickle") - -def _storer_pickle(symbol, data, storage_location): - """Store data in pickle format.""" - data.to_pickle(storage_location / f"{symbol}.pickle") - -# -# Csv storage backend. 
-# - -def _loader_csv(symbol, storage_location): - """Load data in csv format.""" - - index_dtypes = pd.read_csv( - storage_location / f"{symbol}___index_dtypes.csv", - index_col=0)["0"] - - dtypes = pd.read_csv( - storage_location / f"{symbol}___dtypes.csv", index_col=0, - dtype={"index": "str", "0": "str"}) - dtypes = dict(dtypes["0"]) - new_dtypes = {} - parse_dates = [] - for i, level in enumerate(index_dtypes): - if "datetime64[ns" in level: # includes all timezones - parse_dates.append(i) - for i, el in enumerate(dtypes): - if "datetime64[ns" in dtypes[el]: # includes all timezones - parse_dates += [i + len(index_dtypes)] - else: - new_dtypes[el] = dtypes[el] - - tmp = pd.read_csv(storage_location / f"{symbol}.csv", - index_col=list(range(len(index_dtypes))), - parse_dates=parse_dates, dtype=new_dtypes) - - return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp - - -def _storer_csv(symbol, data, storage_location): - """Store data in csv format.""" - pd.DataFrame(data.index.dtypes if hasattr(data.index, 'levels') - else [data.index.dtype]).astype("string").to_csv( - storage_location / f"{symbol}___index_dtypes.csv") - pd.DataFrame(data).dtypes.astype("string").to_csv( - storage_location / f"{symbol}___dtypes.csv") - data.to_csv(storage_location / f"{symbol}.csv") - -# -# Market Data -# - -class MarketData: - """Prepare, hold, and serve market data. - - :method serve: Serve data for policy and simulator at time :math:`t`. - """ - - def serve(self, t): - """Serve data for policy and simulator at time :math:`t`. - - :param t: Trading time. It must be included in the timestamps returned - by :meth:`trading_calendar`. - :type t: pandas.Timestamp - - :returns: past_returns, current_returns, past_volumes, current_volumes, - current_prices - :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame, - pandas.Series, pandas.Series) - """ - raise NotImplementedError # pragma: no cover - - # pylint: disable=redundant-returns-doc - def trading_calendar( - self, start_time=None, end_time=None, include_end=True): - """Get trading calendar between times. - - :param start_time: Initial time of the trading calendar. Always - inclusive if present. If None, use the first available time. - :type start_time: pandas.Timestamp - :param end_time: Final time of the trading calendar. If None, - use the last available time. - :type end_time: pandas.Timestamp - :param include_end: Include end time. - :type include_end: bool - - :returns: Trading calendar. - :rtype: pandas.DatetimeIndex - """ - raise NotImplementedError # pragma: no cover - - @property - def periods_per_year(self): - """Average trading periods per year. - - :rtype: int - """ - raise NotImplementedError # pragma: no cover - - @property - def full_universe(self): # pylint: disable=redundant-returns-doc - """Full universe, which might not be available for trading. - - :returns: Full universe. - :rtype: pandas.Index - """ - raise NotImplementedError # pragma: no cover - - # pylint: disable=unused-argument, redundant-returns-doc - def partial_universe_signature(self, partial_universe): - """Unique signature of this instance with a partial universe. - - A partial universe is a subset of the full universe that is - available at some time for trading. - - This is used in cvxportfolio.cache to sign back-test caches that - are saved on disk. If not redefined it returns None which disables - on-disk caching. - - :param partial_universe: A subset of the full universe. - :type partial_universe: pandas.Index - - :returns: Signature. 
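The csv backend above round-trips dtypes by writing sidecar files, one for the index levels and one for the columns, so that datetimes and numerics re-parse faithfully. A self-contained sketch of the same idea with a single datetime index level (file names are illustrative; the sketch writes to the working directory):

.. code:: python

    import pandas as pd

    df = pd.DataFrame({'price': [1.5, 2.5]},
        index=pd.DatetimeIndex(
            ['2024-01-01', '2024-01-02'], name='index'))

    # store the data and a sidecar of column dtypes
    df.dtypes.astype('string').to_csv('TEST___dtypes.csv')
    df.to_csv('TEST.csv')

    # reload: re-parse the datetime index, apply the stored dtypes
    dtypes = dict(pd.read_csv('TEST___dtypes.csv', index_col=0)['0'])
    reloaded = pd.read_csv(
        'TEST.csv', index_col=0, parse_dates=[0], dtype=dtypes)
    assert (reloaded.dtypes == df.dtypes).all()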
- :rtype: str - """ - return None - -# compiled based on Interactive Brokers benchmark rates choices -# (see https://www.ibkrguides.com/kb/article-2949.htm) -# and their FRED codes -RATES = { - 'USDOLLAR': 'DFF', # Federal funds effective rate - 'EURO': 'ECBESTRVOLWGTTRMDMNRT', # BCE short term rate - 'GBPOUND': 'IUDSOIA', # SONIA - 'JPYEN': 'IRSTCB01JPM156N', # updated monthly - } - -class MarketDataInMemory(MarketData): - """Market data that is stored in memory when initialized.""" - - # this is overwritten in the derived classes' initializers - returns = None - - def __init__( - self, trading_frequency, base_location, cash_key, min_history, - online_usage = False): - """This must be called by the derived classes.""" - if (self.returns.index[-1] - self.returns.index[0]) < min_history: - raise DataError( - "The provided returns have less history " - + f"than the min_history {min_history}") - if trading_frequency: - self._downsample(trading_frequency) - self.trading_frequency = trading_frequency - - self._set_read_only() - self._check_sizes() - self._mask = None - self._masked_returns = None - self._masked_volumes = None - self._masked_prices = None - self.base_location = Path(base_location) - self.cash_key = cash_key - self._min_history_timedelta = min_history - self.online_usage = online_usage - - def _mask_dataframes(self, mask): - """Mask internal dataframes if necessary.""" - if (self._mask is None) or not np.all(self._mask == mask): - logger.info("Masking internal %s dataframes.", - self.__class__.__name__) - colmask = self.returns.columns[mask] - # self._masked_returns = self._df_or_ser_set_read_only( - # pd.DataFrame(self.returns.iloc[:, mask], copy=True)) - self._masked_returns = self._df_or_ser_set_read_only( - pd.DataFrame(self.returns.loc[:, colmask], copy=True)) - # self._masked_returns = self._df_or_ser_set_read_only( - # pd.DataFrame(np.array(self.returns.values[:, mask]), - # index=self.returns.index, columns=colmask)) - if not self.volumes is None: - # self._masked_volumes = self._df_or_ser_set_read_only( - # pd.DataFrame(self.volumes.iloc[:, mask[:-1]], copy=True)) - self._masked_volumes = self._df_or_ser_set_read_only( - pd.DataFrame(self.volumes.loc[:, colmask[:-1]], copy=True)) - # self._masked_volumes = self._df_or_ser_set_read_only( - # pd.DataFrame(np.array(self.volumes.values[:, mask[:-1]]), - # index=self.volumes.index, columns=colmask[:-1])) - if not self.prices is None: - # self._masked_prices = self._df_or_ser_set_read_only( - # pd.DataFrame(self.prices.iloc[:, mask[:-1]], copy=True)) - self._masked_prices = self._df_or_ser_set_read_only( - pd.DataFrame(self.prices.loc[:, colmask[:-1]], copy=True)) - self._mask = mask - - @property - def full_universe(self): - """Full universe, which might not be available for trading. - - :returns: Full universe. - :rtype: pandas.Index - """ - return self.returns.columns - - def serve(self, t): - """Serve data for policy and simulator at time :math:`t`. - - :param t: Time of execution, *e.g.*, stock market open of a given day. 
- :type t: pandas.Timestamp - - :returns: (past_returns, current_returns, past_volumes, - current_volumes, current_prices) - :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame or None, - pandas.Series or None, pandas.Series or None) - """ - - mask = self._universe_mask_at_time(t).values - self._mask_dataframes(mask) - - tidx = self.returns.index.get_loc(t) - past_returns = self._df_or_ser_set_read_only( - pd.DataFrame(self._masked_returns.iloc[:tidx])) - current_returns = self._df_or_ser_set_read_only( - pd.Series(self._masked_returns.iloc[tidx])) - - if not self.volumes is None: - tidx = self.volumes.index.get_loc(t) - past_volumes = self._df_or_ser_set_read_only( - pd.DataFrame(self._masked_volumes.iloc[:tidx])) - current_volumes = self._df_or_ser_set_read_only( - pd.Series(self._masked_volumes.iloc[tidx])) - else: - past_volumes = None - current_volumes = None - - if not self.prices is None: - tidx = self.prices.index.get_loc(t) - current_prices = self._df_or_ser_set_read_only( - pd.Series(self._masked_prices.iloc[tidx])) - else: - current_prices = None - - return (past_returns, current_returns, past_volumes, current_volumes, - current_prices) - - def _add_cash_column(self, cash_key, grace_period): - """Add the cash column to an already formed returns dataframe. - - This assumes that the trading periods are about equally spaced. - If, say, you have trading periods with very different lengths you - should redefine this method **and** replace the :class:`CashReturn` - objective term. - """ - - if not cash_key in RATES: - raise NotImplementedError( - 'Currently the only data pipelines built are for cash_key' - f' in {list(RATES)}') - - if self.returns.index.tz is None: - raise DataError( - 'Your provided dataframes are not timezone aware.' - + " This is not recommended, and doesn't allow to add the cash" - + " returns' column internally." - + " You can fix this by adding a timezone manually " - + "using pandas.DataFrame.tz_localize to the dataframes before" - + " you pass them, or you can provide" - + " the cash returns' column as the last column of the returns" - + " dataframe (so it has one more column than volumes and" - + " prices, if provided), and set the cash_key parameter to" - + " its name.") - - data = Fred( - RATES[cash_key], base_location=self.base_location, - grace_period=grace_period) - - cash_returns_per_period = resample_returns( - data.data/100, periods=self.periods_per_year) - - # we merge instead of assigning column because indexes might - # be misaligned (e.g., with tz-aware timestamps) - cash_returns_per_period.name = self.cash_key - original_returns_index = self.returns.index - tmp = pd.concat( - [self.returns, cash_returns_per_period], sort=True, axis=1) - tmp[cash_key] = tmp[cash_key].ffill() - self.returns = tmp.loc[original_returns_index] - - def trading_calendar( - self, start_time=None, end_time=None, include_end=True): - """Get trading calendar from market data. - - :param start_time: Initial time of the trading calendar. Always - inclusive if present. If None, use the first available time. - :type start_time: pandas.Timestamp - :param end_time: Final time of the trading calendar. If None, - use the last available time. - :type end_time: pandas.Timestamp - :param include_end: Include end time. - :type include_end: bool - - :returns: Trading calendar. 
- :rtype: pandas.DatetimeIndex - """ - result = self.returns.index - result = result[result >= self._earliest_backtest_start] - if start_time: - result = result[result >= start_time] - if end_time: - result = result[(result <= end_time)] - if not include_end: - result = result[:-1] - return result - - def _universe_mask_at_time(self, t): - """Return the valid universe mask at time t.""" - past_returns = self.returns.loc[self.returns.index < t] - if self.online_usage: - valid_universe_mask = past_returns.count() >= self.min_history - else: - valid_universe_mask = ((past_returns.count() >= self.min_history) & - (~self.returns.loc[t].isnull())) - if sum(valid_universe_mask) <= 1: - raise DataError( - f'The trading universe at time {t} has size less or equal' - + ' than one, i.e., only the cash account. There are probably ' - + ' issues with missing data in the provided market returns.') - return valid_universe_mask - - @staticmethod - def _df_or_ser_set_read_only(df_or_ser): - """Set numpy array contained in dataframe to read only. - - This is done on data store internally before it is served to the - policy or the simulator to ensure data consistency in case some - element of the pipeline accidentally corrupts the data. - - This is enough to prevent direct assignement to the resulting - dataframe. However it could still be accidentally corrupted by - assigning to columns or indices that are not present in the - original. We avoid that case as well by returning a wrapped - dataframe (which doesn't copy data on creation) in - serve_data_policy and serve_data_simulator. - """ - data = df_or_ser.values - data.flags.writeable = False - if hasattr(df_or_ser, 'columns'): - return pd.DataFrame(data, index=df_or_ser.index, - columns=df_or_ser.columns) - return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name) - - def _set_read_only(self): - """Set internal dataframes to read-only.""" - - self.returns = self._df_or_ser_set_read_only(self.returns) - - if not self.prices is None: - self.prices = self._df_or_ser_set_read_only(self.prices) - - if not self.volumes is None: - self.volumes = self._df_or_ser_set_read_only(self.volumes) - - @property - def _earliest_backtest_start(self): - """Earliest date at which we can start a backtest.""" - return self.returns.iloc[:, :-1].dropna(how='all').index[ - self.min_history] - - sampling_intervals = { - 'weekly': 'W-MON', 'monthly': 'MS', 'quarterly': 'QS', 'annual': 'AS'} - - # @staticmethod - # def _is_first_interval_small(datetimeindex): - # """Check if post-resampling the first interval is small. - # - # We have no way of knowing exactly if the first interval - # needs to be dropped. We drop it if its length is smaller - # than the average of all others, minus 2 standard deviation. 
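The universe mask above is compact but central to back-tests: a name trades at time t only with at least ``min_history`` non-null past returns and, unless ``online_usage`` is set, a non-null return at t itself. A toy illustration with made-up returns:

.. code:: python

    import numpy as np
    import pandas as pd

    returns = pd.DataFrame({
        'AAA': [0.01, 0.02, 0.01, 0.00],
        'BBB': [np.nan, np.nan, 0.01, 0.02],
        'cash': [0.0001] * 4},
        index=pd.date_range('2024-01-01', periods=4, tz='UTC'))

    t = returns.index[2]
    min_history = 2
    past_returns = returns.loc[returns.index < t]
    valid = ((past_returns.count() >= min_history)
             & (~returns.loc[t].isnull()))
    print(valid)  # AAA and cash qualify; BBB lacks history at t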
- # """ - # first_interval = (datetimeindex[1] - datetimeindex[0]) - # all_others = (datetimeindex[2:] - datetimeindex[1:-1]) - # return first_interval < (all_others.mean() - 2 * all_others.std()) - - def _downsample(self, interval): - """_downsample market data.""" - if not interval in self.sampling_intervals: - raise SyntaxError( - 'Unsopported trading interval for down-sampling.') - interval = self.sampling_intervals[interval] - new_returns_index = pd.Series(self.returns.index, self.returns.index - ).resample(interval, closed='left', - label='left').first().values - # print(new_returns_index) - self.returns = np.exp(np.log( - 1+self.returns).resample(interval, closed='left', label='left' - ).sum(min_count=1))-1 - self.returns.index = new_returns_index - - # last row is always unknown - self.returns.iloc[-1] = np.nan - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.returns.index): - # self.returns = self.returns.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.returns.columns[:-1]: - self.returns.loc[ - (~(self.returns[col].isnull())).idxmax(), col] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.returns = self.returns.iloc[1:] - - if self.volumes is not None: - new_volumes_index = pd.Series( - self.volumes.index, self.volumes.index - ).resample(interval, closed='left', - label='left').first().values - self.volumes = self.volumes.resample( - interval, closed='left', label='left').sum(min_count=1) - self.volumes.index = new_volumes_index - - # last row is always unknown - self.volumes.iloc[-1] = np.nan - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.volumes.index): - # self.volumes = self.volumes.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.volumes.columns: - self.volumes.loc[ - (~(self.volumes[col].isnull())).idxmax(), col] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.volumes = self.volumes.iloc[1:] - - if self.prices is not None: - new_prices_index = pd.Series( - self.prices.index, self.prices.index - ).resample( - interval, closed='left', label='left').first().values - self.prices = self.prices.resample( - interval, closed='left', label='left').first() - self.prices.index = new_prices_index - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.prices.index): - # self.prices = self.prices.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.prices.columns: - self.prices.loc[ - (~(self.prices[col].isnull())).idxmax(), col] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.prices = self.prices.iloc[1:] - - def _check_sizes(self): - """Check sizes of user-provided dataframes.""" - - if (not self.volumes is None) and ( - not (self.volumes.shape[1] == self.returns.shape[1] - 1) - or not all(self.volumes.columns == self.returns.columns[:-1])): - raise SyntaxError( - 'Volumes should have same columns as returns, minus cash_key.') - - if (not self.prices is None) and ( - not (self.prices.shape[1] == self.returns.shape[1] - 1) - or not all(self.prices.columns == self.returns.columns[:-1])): - raise SyntaxError( - 'Prices should have same columns as returns, minus cash_key.') - - @property - def periods_per_year(self): - """Average trading periods per year inferred from the data. - - :returns: Average periods per year. 
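Down-sampling returns, as in ``_downsample`` above, compounds them in log space: the coarse-period return is exp(sum of log(1+r)) - 1, and ``min_count=1`` keeps an all-NaN period as NaN instead of silently becoming zero. Roughly:

.. code:: python

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2024-01-01', periods=10, freq='B')
    daily = pd.Series(0.01, index=idx)  # ten business days of +1%

    weekly = np.exp(np.log(1 + daily).resample(
        'W-MON', closed='left', label='left').sum(min_count=1)) - 1
    print(weekly)  # a full five-day week compounds to about 5.101%

Volumes are instead summed directly and prices take the first observation of each period, matching their semantics.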
- :rtype: int - """ - return periods_per_year_from_datetime_index(self.returns.index) - - @property - def min_history(self): - """Min history expressed in periods. - - :returns: How many non-null elements of the past returns for a given - name are required to include it. - :rtype: int - """ - return int(np.round(self.periods_per_year * ( - self._min_history_timedelta / pd.Timedelta('365.24d')))) - - -class UserProvidedMarketData(MarketDataInMemory): - """User-provided market data. - - :param returns: Historical open-to-open returns. The return - at time :math:`t` is :math:`r_t = p_{t+1}/p_t -1` where - :math:`p_t` is the (open) price at time :math:`t`. Must - have datetime index. You can also include cash - returns as its last column, and set ``cash_key`` below to the last - column's name. - :type returns: pandas.DataFrame - :param volumes: Historical market volumes, expressed in units - of value (*e.g.*, US dollars). - :type volumes: pandas.DataFrame or None - :param prices: Historical open prices (*e.g.*, used for rounding - trades in the :class:`MarketSimulator`). - :type prices: pandas.DataFrame or None - :param trading_frequency: Instead of using frequency implied by - the index of the returns, down-sample all dataframes. - We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and - ``'annual'``. By default (None) don't down-sample. - :type trading_frequency: str or None - :param min_history: Minimum amount of time for which the returns - are not ``np.nan`` before each assets enters in a back-test. - :type min_history: pandas.Timedelta - :param base_location: The location of the storage, only used - in case it downloads the cash returns. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param cash_key: Name of the cash account. If not the last column - of the provided returns, it will be downloaded. In that case you should - make sure your provided dataframes have a timezone aware datetime - index. Its returns are the risk-free rate. - :type cash_key: str - :param online_usage: Disable removal of assets that have ``np.nan`` returns - for the given time. Default False. - :type online_usage: bool - """ - - # pylint: disable=too-many-arguments - def __init__(self, returns, volumes=None, prices=None, - copy_dataframes=True, trading_frequency=None, - min_history=pd.Timedelta('365.24d'), - base_location=BASE_LOCATION, - grace_period=pd.Timedelta('1d'), - cash_key='USDOLLAR', - online_usage=False): - - if returns is None: - raise SyntaxError( - "If you don't specify a universe you should pass `returns`.") - - self.base_location = Path(base_location) - self.cash_key = cash_key - - self.returns = pd.DataFrame( - make_numeric(returns), copy=copy_dataframes) - self.volumes = volumes if volumes is None else\ - pd.DataFrame(make_numeric(volumes), copy=copy_dataframes) - self.prices = prices if prices is None else\ - pd.DataFrame(make_numeric(prices), copy=copy_dataframes) - - if cash_key != returns.columns[-1]: - self._add_cash_column(cash_key, grace_period=grace_period) - - # this is mandatory - super().__init__( - trading_frequency=trading_frequency, - base_location=base_location, - cash_key=cash_key, - min_history=min_history, - online_usage=online_usage) - - -class DownloadedMarketData(MarketDataInMemory): - """Market data that is downloaded. - - :param universe: List of names as understood by the data source - used, *e.g.*, ``['AAPL', 'GOOG']`` if using the default - Yahoo Finance data source. 
- :type universe: list - :param datasource: The data source used. - :type datasource: str or :class:`SymbolData` class - :param cash_key: Name of the cash account, its rates will be downloaded - and added as last columns of the returns. Its returns are the - risk-free rate. - :type cash_key: str - :param base_location: The location of the storage. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param min_history: Minimum amount of time for which the returns - are not ``np.nan`` before each assets enters in a back-test. - :type min_history: pandas.Timedelta - :param grace_period: If the most recent observation of each symbol's - data is less old than this we do not download new data. - By default it's one day. - :type grace_period: pandas.Timedelta - :param trading_frequency: Instead of using frequency implied by - the index of the returns, down-sample all dataframes. - We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and - ``'annual'``. By default (None) don't down-sample. - :type trading_frequency: str or None - :param online_usage: Disable removal of assets that have ``np.nan`` returns - for the given time. Default False. - :type online_usage: bool - """ - - # pylint: disable=too-many-arguments - def __init__(self, - universe=(), - datasource='YahooFinance', - cash_key='USDOLLAR', - base_location=BASE_LOCATION, - storage_backend='pickle', - min_history=pd.Timedelta('365.24d'), - grace_period=pd.Timedelta('1d'), - trading_frequency=None, - online_usage=False): - """Initializer.""" - - # drop duplicates and ensure ordering - universe = sorted(set(universe)) - - self.base_location = Path(base_location) - self.cash_key = cash_key - if isinstance(datasource, type): - self.datasource = datasource - else: # try to load in current module - self.datasource = globals()[datasource] - self._get_market_data( - universe, grace_period=grace_period, - storage_backend=storage_backend) - self._add_cash_column(self.cash_key, grace_period=grace_period) - self._remove_missing_recent() - - # this is mandatory - super().__init__( - trading_frequency=trading_frequency, - base_location=base_location, - cash_key=cash_key, - min_history=min_history, - online_usage=online_usage) - - def _get_market_data(self, universe, grace_period, storage_backend): - """Download market data.""" - database_accesses = {} - print('Updating data', end='') - sys.stdout.flush() - - for stock in universe: - logger.info( - 'Updating %s with %s.', stock, self.datasource.__name__) - print('.', end='') - sys.stdout.flush() - database_accesses[stock] = self.datasource( - stock, base_location=self.base_location, - grace_period=grace_period, storage_backend=storage_backend) - print() - - if hasattr(self.datasource, 'IS_OLHCVR') and self.datasource.IS_OLHCVR: - self.returns = pd.DataFrame( - {stock: database_accesses[stock].data['return'] - for stock in universe}) - self.volumes = pd.DataFrame( - {stock: database_accesses[stock].data['valuevolume'] - for stock in universe}) - self.prices = pd.DataFrame( - {stock: database_accesses[stock].data['open'] - for stock in universe}) - else: # for now only Fred for indexes, we assume prices! 
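After all symbols are updated, ``_get_market_data`` assembles the per-symbol frames column-wise into the returns, volumes, and prices panels. A miniature version of that assembly with two fake symbols:

.. code:: python

    import pandas as pd

    idx = pd.to_datetime(['2024-01-02', '2024-01-03'])
    per_symbol = {  # stand-ins for SymbolData(...).data
        'AAA': pd.DataFrame({'return': [0.01, 0.02],
            'valuevolume': [1e6, 2e6], 'open': [10.0, 10.1]}, index=idx),
        'BBB': pd.DataFrame({'return': [0.00, -0.01],
            'valuevolume': [5e5, 7e5], 'open': [20.0, 20.0]}, index=idx)}

    returns = pd.DataFrame(
        {s: d['return'] for s, d in per_symbol.items()})
    volumes = pd.DataFrame(
        {s: d['valuevolume'] for s, d in per_symbol.items()})
    prices = pd.DataFrame(
        {s: d['open'] for s, d in per_symbol.items()})

Index alignment happens inside the ``pd.DataFrame`` constructor, so symbols with different trading calendars get NaNs rather than shifted rows.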
- assert isinstance(database_accesses[universe[0]].data, pd.Series) - self.prices = pd.DataFrame( - # open prices - {stock: database_accesses[stock].data for stock in universe}) - self.returns = 1 - self.prices / self.prices.shift(-1) - self.volumes = None - - def _remove_missing_recent(self): - """Clean recent data. - - Yahoo Finance may has issues with most recent data; we remove - recent days if there are NaNs. - """ - - if self.prices.iloc[-5:].isnull().any().any(): - logger.debug( - 'Removing some recent lines because there are missing values.') - drop_at = self.prices.iloc[-5:].isnull().any(axis=1).idxmax() - logger.debug('Dropping at index %s', drop_at) - self.returns = self.returns.loc[self.returns.index < drop_at] - if self.prices is not None: - self.prices = self.prices.loc[self.prices.index < drop_at] - if self.volumes is not None: - self.volumes = self.volumes.loc[self.volumes.index < drop_at] - - # for consistency we must also nan-out the last row - # of returns and volumes - self.returns.iloc[-1] = np.nan - if self.volumes is not None: - self.volumes.iloc[-1] = np.nan - - def partial_universe_signature(self, partial_universe): - """Unique signature of this instance with a partial universe. - - A partial universe is a subset of the full universe that is - available at some time for trading. - - This is used in cvxportfolio.cache to sign back-test caches that - are saved on disk. See its implementation below for details. If - not redefined it returns None which disables on-disk caching. - - :param partial_universe: A subset of the full universe. - :type partial_universe: pandas.Index - - :returns: Signature. - :rtype: str - """ - assert isinstance(partial_universe, pd.Index) - assert np.all(partial_universe.isin(self.full_universe)) - result = f'{self.__class__.__name__}(' - result += f'datasource={self.datasource.__name__}, ' - result += f'partial_universe_hash={hash_(np.array(partial_universe))},' - result += f' trading_frequency={self.trading_frequency})' - return result diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py index 6ba1c406e..e69530b5b 100644 --- a/cvxportfolio/data/market_data.py +++ b/cvxportfolio/data/market_data.py @@ -22,7 +22,7 @@ from ..errors import DataError from ..utils import (hash_, make_numeric, periods_per_year_from_datetime_index, - resample_returns) + resample_returns, set_pd_read_only) from .symbol_data import * from .symbol_data import OLHCV @@ -150,25 +150,25 @@ def _mask_dataframes(self, mask): logger.info("Masking internal %s dataframes.", self.__class__.__name__) colmask = self.returns.columns[mask] - # self._masked_returns = self._df_or_ser_set_read_only( + # self._masked_returns = set_pd_read_only( # pd.DataFrame(self.returns.iloc[:, mask], copy=True)) - self._masked_returns = self._df_or_ser_set_read_only( + self._masked_returns = set_pd_read_only( pd.DataFrame(self.returns.loc[:, colmask], copy=True)) - # self._masked_returns = self._df_or_ser_set_read_only( + # self._masked_returns = set_pd_read_only( # pd.DataFrame(np.array(self.returns.values[:, mask]), # index=self.returns.index, columns=colmask)) if not self.volumes is None: - # self._masked_volumes = self._df_or_ser_set_read_only( + # self._masked_volumes = set_pd_read_only( # pd.DataFrame(self.volumes.iloc[:, mask[:-1]], copy=True)) - self._masked_volumes = self._df_or_ser_set_read_only( + self._masked_volumes = set_pd_read_only( pd.DataFrame(self.volumes.loc[:, colmask[:-1]], copy=True)) - # self._masked_volumes = self._df_or_ser_set_read_only( + # 
self._masked_volumes = set_pd_read_only( # pd.DataFrame(np.array(self.volumes.values[:, mask[:-1]]), # index=self.volumes.index, columns=colmask[:-1])) if not self.prices is None: - # self._masked_prices = self._df_or_ser_set_read_only( + # self._masked_prices = set_pd_read_only( # pd.DataFrame(self.prices.iloc[:, mask[:-1]], copy=True)) - self._masked_prices = self._df_or_ser_set_read_only( + self._masked_prices = set_pd_read_only( pd.DataFrame(self.prices.loc[:, colmask[:-1]], copy=True)) self._mask = mask @@ -197,16 +197,16 @@ def serve(self, t): self._mask_dataframes(mask) tidx = self.returns.index.get_loc(t) - past_returns = self._df_or_ser_set_read_only( + past_returns = set_pd_read_only( pd.DataFrame(self._masked_returns.iloc[:tidx])) - current_returns = self._df_or_ser_set_read_only( + current_returns = set_pd_read_only( pd.Series(self._masked_returns.iloc[tidx])) if not self.volumes is None: tidx = self.volumes.index.get_loc(t) - past_volumes = self._df_or_ser_set_read_only( + past_volumes = set_pd_read_only( pd.DataFrame(self._masked_volumes.iloc[:tidx])) - current_volumes = self._df_or_ser_set_read_only( + current_volumes = set_pd_read_only( pd.Series(self._masked_volumes.iloc[tidx])) else: past_volumes = None @@ -214,7 +214,7 @@ def serve(self, t): if not self.prices is None: tidx = self.prices.index.get_loc(t) - current_prices = self._df_or_ser_set_read_only( + current_prices = set_pd_read_only( pd.Series(self._masked_prices.iloc[tidx])) else: current_prices = None @@ -306,38 +306,16 @@ def _universe_mask_at_time(self, t): + ' issues with missing data in the provided market returns.') return valid_universe_mask - @staticmethod - def _df_or_ser_set_read_only(df_or_ser): - """Set numpy array contained in dataframe to read only. - - This is done on data store internally before it is served to the - policy or the simulator to ensure data consistency in case some - element of the pipeline accidentally corrupts the data. - - This is enough to prevent direct assignement to the resulting - dataframe. However it could still be accidentally corrupted by - assigning to columns or indices that are not present in the - original. We avoid that case as well by returning a wrapped - dataframe (which doesn't copy data on creation) in - serve_data_policy and serve_data_simulator. - """ - data = df_or_ser.values - data.flags.writeable = False - if hasattr(df_or_ser, 'columns'): - return pd.DataFrame(data, index=df_or_ser.index, - columns=df_or_ser.columns) - return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name) - def _set_read_only(self): """Set internal dataframes to read-only.""" - self.returns = self._df_or_ser_set_read_only(self.returns) + self.returns = set_pd_read_only(self.returns) if not self.prices is None: - self.prices = self._df_or_ser_set_read_only(self.prices) + self.prices = set_pd_read_only(self.prices) if not self.volumes is None: - self.volumes = self._df_or_ser_set_read_only(self.volumes) + self.volumes = set_pd_read_only(self.volumes) @property def _earliest_backtest_start(self): diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 516d95de8..0419c645f 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -26,6 +26,7 @@ import requests.exceptions from ..errors import DataError +from ..utils import set_pd_read_only logger = logging.getLogger(__name__) @@ -140,7 +141,7 @@ def load(self): :returns: Loaded time-series data for the symbol. 
:rtype: pandas.Series or pandas.DataFrame """ - return self._preload(self._load_raw()) + return set_pd_read_only(self._preload(self._load_raw())) def _store(self, data): """Store data in database. @@ -329,14 +330,6 @@ def _process(self, new_data, saved_data=None): return new_data - # def _specific_process(self, new_data, saved_data=None): - # """Specific process, do nothing.""" - # # return new_data - - # def _post_process(self, new_data, saved_data=None): - # """Post process, do nothing.""" - # # return new_data - def _nan_unlikely(self, new_data, saved_data=None): """Nan-out unlikely values.""" # return new_data @@ -380,7 +373,7 @@ def _fill(self, new_data, saved_data=None): # print(data.isnull().sum()) def _nan_values(self, data, condition, columns_to_nan, message): - """Set to NaN in-place on indexing condition chosen columns.""" + """Set to NaN in-place for indexing condition and chosen columns.""" bad_indexes = data.index[condition] if len(bad_indexes) > 0: @@ -438,28 +431,6 @@ def _nan_low_higher_close(self, data): columns_to_nan = "low", message = 'low price higher than close price') - # def _nan_nonpositive_prices(self, data, prices_name): - # """Set non-positive prices (chosen column) to NaN, in-place.""" - - # bad_indexes = data.index[data[prices_name] <= 0] - # if len(bad_indexes) > 0: - # logger.warning( - # '%s("%s") has non-positive %s prices on timestamps: %s,' - # + ' setting to nan', - # self.__class__.__name__, self.symbol, prices_name, bad_indexes) - # data.loc[bad_indexes, prices_name] = np.nan - - # def _nan_negative_volumes(self, data): - # """Set negative volumes to NaN, in-place.""" - - # bad_indexes = data.index[data["volume"] < 0] - # if len(bad_indexes) > 0: - # logger.warning( - # '%s("%s") has negative volumes on timestamps: %s,' - # + ' setting to nan', - # self.__class__.__name__, self.symbol, bad_indexes) - # data.loc[bad_indexes, "volume"] = np.nan - def _set_infty_to_nan(self, data): """Set all +/- infty elements of data to NaN, in-place.""" @@ -491,28 +462,6 @@ def _nan_impossible(self, new_data, saved_data=None): self._nan_high_lower_close(new_data) self._nan_low_higher_close(new_data) - # TODO: these can be made smarter (sometimes the open is clearly wrong) - - # # if low is not the lowest, set it to nan - # bad_indexes = new_data.index[ - # new_data['low'] > new_data[['open', 'high', 'close']].min(1)] - # if len(bad_indexes) > 0: - # logger.warning( - # '%s("%s") low prices are not the lowest on timestamps: %s,' - # + ' setting to nan', - # self.__class__.__name__, self.symbol, bad_indexes) - # new_data.loc[bad_indexes, "low"] = np.nan - - # # if high is not the highest, set it to nan - # bad_indexes = new_data.index[ - # new_data['high'] < new_data[['open', 'high', 'close']].max(1)] - # if len(bad_indexes) > 0: - # logger.warning( - # '%s("%s") high prices are not the highest on timestamps: %s,' - # + ' setting to nan', - # self.__class__.__name__, self.symbol, bad_indexes) - # new_data.loc[bad_indexes, "high"] = np.nan - # TODO: factor quality check and clean into total-return related and non- def _preload(self, data): @@ -540,17 +489,6 @@ class OLHCVAC(OLHCV): This is modeled after the data returned by Yahoo Finance. """ -# It implements -# the transformation required to conform to the -# Open-High-Low-Close-Volume-TotalReturn model, that is, compute -# returns from the adjusted closes, and do some error checks. 
-# """
-# class OLHCVTR(OLHCV): # pylint: disable=abstract-method
-#     """Open-Low-High-Close-Volume-TotalReturn symbol data."""
-
-    # TODO: this becomes a isinstance(OLHC) in the caller
-    # is open-high-low-close-volume-total return
-    # IS_OLHCVR = True
-
     # # rolstd windows for finding wrong logreturns
     # _ROLSTD_WINDOWS = [20, 60, 252]
@@ -670,24 +608,6 @@ def _process(self, new_data, saved_data=None):
 
         return new_data
 
-    # def _process(self, data):
-    #     """Clean Yahoo Finance open-low-high-close-volume-adjclose data."""
-
-    #     self._nan_impossible(data)
-
-    #     self._fill(data)
-
-    #     self._compute_total_returns(data)
-
-    #     # eliminate adjclose column
-    #     del data["adjclose"]
-
-    #     # eliminate last period's intraday data
-    #     data.loc[data.index[-1],
-    #         ["high", "low", "close", "return", "volume"]] = np.nan
-
-    #     return data
-
     def _quality_check(self, data):
         """Analyze quality of the OLHCV-TR data."""
 
diff --git a/cvxportfolio/tests/test_utils.py b/cvxportfolio/tests/test_utils.py
index efdb7ba01..f4d3d3f4c 100644
--- a/cvxportfolio/tests/test_utils.py
+++ b/cvxportfolio/tests/test_utils.py
@@ -80,7 +80,7 @@ def test_make_numeric(self):
             np.array(['1', 2], dtype=object),
             pd.Series([1, '2', 3], dtype=object),
             pd.DataFrame([[1, '2.', 3], [4, '5.', 6]], dtype=object)]:
-            make_numeric(data)
+            self.assertTrue(np.all(data.astype(float) == make_numeric(data)))
 
         for data in [
             np.array(['1a', 2], dtype=object),
diff --git a/cvxportfolio/utils.py b/cvxportfolio/utils.py
index 0f5d3a25d..1454966fb 100644
--- a/cvxportfolio/utils.py
+++ b/cvxportfolio/utils.py
@@ -29,6 +29,35 @@
     'average_periods_per_year']
 
 
+def set_pd_read_only(df_or_ser):
+    """Set numpy array contained in dataframe or series to read only.
+
+    This is done on data stored internally before it is served to the
+    policy or the simulator to ensure data consistency in case some
+    element of the pipeline accidentally corrupts the data.
+
+    This is enough to prevent direct assignment to the resulting
+    dataframe. However it could still be accidentally corrupted by
+    assigning to columns or indices that are not present in the
+    original. We avoid that case as well by returning a wrapped
+    dataframe (which doesn't copy data on creation) in
+    serve_data_policy and serve_data_simulator.
+
+    :param df_or_ser: Series or Dataframe, only numeric (better if
+        homogeneous) dtype.
+    :type df_or_ser: pd.Series or pd.DataFrame
+
+    :returns: Pandas object set to read only.
+    :rtype: pd.Series or pd.DataFrame
+    """
+    data = df_or_ser.values
+    data.flags.writeable = False
+    if hasattr(df_or_ser, 'columns'):
+        return pd.DataFrame(data, index=df_or_ser.index,
+            columns=df_or_ser.columns)
+    return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name)
+
+
 def average_periods_per_year(num_periods, first_time, last_time):
     """Average periods per year of a datetime index (unpacked), rounded to int.
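The ``set_pd_read_only`` helper moved into utils.py above freezes the numpy
buffer backing a series or dataframe, then wraps it in a fresh pandas object
without copying. A minimal standalone sketch of the protection this buys (the
example frame and the try/except are illustrative only; with pandas'
copy-on-write mode enabled the assignment may copy instead of raising):

.. code:: python

    import numpy as np
    import pandas as pd

    # standalone copy of the set_pd_read_only added above
    def set_pd_read_only(df_or_ser):
        data = df_or_ser.values
        data.flags.writeable = False  # freeze the shared numpy buffer
        if hasattr(df_or_ser, 'columns'):
            return pd.DataFrame(data, index=df_or_ser.index,
                columns=df_or_ser.columns)
        return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name)

    frozen = set_pd_read_only(
        pd.DataFrame(np.zeros((3, 2)), columns=['a', 'b']))
    try:
        frozen.iloc[0, 0] = 1.  # write lands on the read-only buffer
    except ValueError as error:
        print(error)  # "assignment destination is read-only"

Because the wrapper shares the frozen buffer instead of copying it, serving
read-only views of returns, volumes, and prices at every backtest timestamp
stays cheap.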
From 4072d546a1616c05b5d181e287a85bb629596031 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 11:40:23 +0400 Subject: [PATCH 19/38] testing --- cvxportfolio/data/symbol_data.py | 18 +- cvxportfolio/tests/test_data.py | 304 ++++++++++++++++++------------- 2 files changed, 191 insertions(+), 131 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 0419c645f..f94d49409 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -410,12 +410,12 @@ def _nan_open_higher_high(self, data): columns_to_nan = "open", message = 'open price higher than high price') - def _nan_incompatible_low_high(self, data): - """Set low and high to NaN if low is higher, in-place.""" - self._nan_values( - data=data, condition = data['low'] > data['high'], - columns_to_nan = ["low", "high"], - message = 'low price higher than high price') + # def _nan_incompatible_low_high(self, data): + # """Set low and high to NaN if low is higher, in-place.""" + # self._nan_values( + # data=data, condition = data['low'] > data['high'], + # columns_to_nan = ["low", "high"], + # message = 'low price higher than high price') def _nan_high_lower_close(self, data): """Set high price to NaN if lower than close, in-place.""" @@ -456,11 +456,11 @@ def _nan_impossible(self, new_data, saved_data=None): self._set_infty_to_nan(new_data) # more - self._nan_open_lower_low(new_data) - self._nan_open_higher_high(new_data) - self._nan_incompatible_low_high(new_data) self._nan_high_lower_close(new_data) self._nan_low_higher_close(new_data) + self._nan_open_lower_low(new_data) + self._nan_open_higher_high(new_data) + # self._nan_incompatible_low_high(new_data) # TODO: factor quality check and clean into total-return related and non- diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index bb23f2b50..e5d920e98 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -327,6 +327,188 @@ def _base_test_multiindex(self, loader, storer): self.assertTrue(all(data.index.dtypes == data1.index.dtypes)) self.assertTrue(all(data.dtypes == data1.dtypes)) + def test_download_errors(self): + """Test single-symbol download error.""" + + storer = YahooFinance( + 'AAPL', grace_period=self.data_grace_period, + base_location=self.datadir) + with self.assertRaises(SyntaxError): + # pylint: disable=protected-access + storer._download('AAPL', overlap=1) + + class YahooFinanceErroneous(YahooFinance): + """Modified YF that nans last open price.""" + def _download(self, symbol, current=None, + overlap=5, grace_period='5d', **kwargs): + """Modified download method.""" + res = super()._download(symbol, current, + grace_period=grace_period) + res.iloc[-1, 0 ] = np.nan + return res + + _ = YahooFinanceErroneous('AMZN', base_location=self.datadir) + with self.assertLogs(level='ERROR') as _: + _ = YahooFinanceErroneous( + 'AMZN', base_location=self.datadir) + + class YahooFinanceErroneous2(YahooFinance): + """Modified YF that nans some line.""" + def _download(self, symbol, current=None, + overlap=5, grace_period='5d', **kwargs): + """Modified download method.""" + res = super()._download(symbol, current, + grace_period=grace_period) + res.iloc[-20] = np.nan + return res + with self.assertLogs(level='WARNING') as _: + _ = YahooFinanceErroneous2('GOOGL', + base_location=self.datadir) + with self.assertLogs(level='WARNING') as _: + _ = YahooFinanceErroneous2( + 'GOOGL', base_location=self.datadir) + + class FredErroneous(Fred): + 
"""Modified FRED SymbolData that gives a NaN in the last entry.""" + + def _download(self, symbol, current, grace_period): + """Modified download method.""" + res = super()._download(symbol, current, + grace_period=grace_period) + res.iloc[-1] = np.nan + return res + + _ = FredErroneous('DFF', base_location=self.datadir) + with self.assertLogs(level='ERROR') as _: + _ = FredErroneous( + 'DFF', base_location=self.datadir) + + class YahooFinanceErroneous3(YahooFinance): + """Modified YF that is not append-only.""" + counter = 0 + def _download(self, symbol, current=None, + overlap=5, grace_period='5d', **kwargs): + """Modified download method.""" + res = super()._download(symbol, current, + grace_period=grace_period) + if self.counter > 0: + res.iloc[-2] = 0. + self.counter += 1 + return res + storer = YahooFinanceErroneous3('GOOGL', base_location=self.datadir) + with self.assertLogs(level='ERROR') as _: + storer.update(pd.Timedelta('0d')) + + def test_no_internet(self): + """Test errors thrown when not connected to the internet.""" + + with NoInternet(): + with self.assertRaises(DataError): + cvx.YahooFinance('BABA', base_location=self.datadir) + + with NoInternet(): + with self.assertRaises(DataError): + cvx.Fred('CES0500000003', base_location=self.datadir) + + def test_yahoo_finance_errors(self): + """Test errors with Yahoo Finance.""" + + with self.assertRaises(DataError): + YahooFinance("DOESNTEXIST", base_location=self.datadir) + + def test_yahoo_finance_cleaning(self): + """Test our logic to clean Yahoo Finance data.""" + + # this stock was found to have NaN issues + data = YahooFinance("ENI.MI", base_location=self.datadir).data + self.assertTrue((data.valuevolume == 0).sum() > 0) + self.assertTrue(data.iloc[:-1].isnull().sum().sum() == 0) + + def test_yahoo_finance_cleaning_granular(self): + """Test each step of cleaning.""" + + # pylint: disable=protected-access + raw_data = YahooFinance._get_data_yahoo('ZM') + print(raw_data) + empty_instance = YahooFinance.__new__(YahooFinance) + empty_instance._symbol = 'ZM' # because the warnings use the symbol + + def _test_warning(data_transformation, part_of_message): + """Test that warning is raised w/ message containing some word.""" + data = pd.DataFrame(raw_data, copy=True) + exec(data_transformation) # pylint: disable=exec-used + with self.assertLogs(level='WARNING') as _: + _cleaned = empty_instance._process(data, None) + self.assertTrue(part_of_message in _.output[0]) + # check all NaNs have been filled + self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0) + + # infty + _test_warning( + 'data.iloc[2,2] = np.inf', + 'infinity') + + # non-pos price + _test_warning( + 'data.iloc[2,0] = -1', + 'non-positive open') + _test_warning( + 'data.iloc[2,0] = 0', + 'non-positive open') + _test_warning( + 'data.iloc[4,2] = 0', + 'non-positive high') + + # neg volume + _test_warning( + 'data.iloc[2,-1] = -1', + 'negative volumes') + + # open lower low + _test_warning( + 'data.iloc[1,0] = data.iloc[1,1]*.9', + 'open price lower than low price') + + # open higher high + _test_warning( + 'data.iloc[1,0] = data.iloc[1,2]*1.1', + 'open price higher than high price') + + # low higher close + _test_warning( + 'data.iloc[3,1] = data.iloc[3].close * 1.1', + 'low price higher than close price') + + # high lower close + _test_warning( + 'data.iloc[3,2] = data.iloc[3].close * .9', + 'high price lower than close price') + + # def test_yahoo_finance_wrong_last_time(self): + # """Test that we correct last time if intraday.""" + # + # class 
YahooFinanceErroneous4(YahooFinance): + # """Modified YF that sets last time wrong.""" + # counter = 0 + # + # @staticmethod + # def _get_data_yahoo( + # ticker, start='1900-01-01', end='2100-01-01'): + # """Modified download method.""" + # res = YahooFinance._get_data_yahoo( + # ticker, start=start, end=end) + # if self.counter > 0: + # res.index = list(res.index)[:-1] + [ + # res.index[-1] - pd.Timedelta('3h')] + # self.counter += 1 + # print(res) + # return res + # + # storer = YahooFinanceErroneous4('GOOGL', base_location=self.datadir) + # print(storer.data) + # #storer.update(pd.Timedelta('0d')) + # #print(storer.data) + class TestMarketData(CvxportfolioTest): """Test MarketData methods and interface.""" @@ -536,128 +718,6 @@ def test_signature(self): print(md.partial_universe_signature(md.full_universe)) - def test_download_errors(self): - """Test single-symbol download error.""" - - storer = YahooFinance( - 'AAPL', grace_period=self.data_grace_period, - base_location=self.datadir) - with self.assertRaises(SyntaxError): - # pylint: disable=protected-access - storer._download('AAPL', overlap=1) - - class YahooFinanceErroneous(YahooFinance): - """Modified YF that nans last open price.""" - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Modified download method.""" - res = super()._download(symbol, current, - grace_period=grace_period) - res.iloc[-1, 0 ] = np.nan - return res - - _ = YahooFinanceErroneous('AMZN', base_location=self.datadir) - with self.assertLogs(level='ERROR') as _: - _ = YahooFinanceErroneous( - 'AMZN', base_location=self.datadir) - - class YahooFinanceErroneous2(YahooFinance): - """Modified YF that nans some line.""" - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Modified download method.""" - res = super()._download(symbol, current, - grace_period=grace_period) - res.iloc[-20] = np.nan - return res - with self.assertLogs(level='WARNING') as _: - _ = YahooFinanceErroneous2('GOOGL', - base_location=self.datadir) - with self.assertLogs(level='WARNING') as _: - _ = YahooFinanceErroneous2( - 'GOOGL', base_location=self.datadir) - - class FredErroneous(Fred): - """Modified FRED SymbolData that gives a NaN in the last entry.""" - - def _download(self, symbol, current, grace_period): - """Modified download method.""" - res = super()._download(symbol, current, - grace_period=grace_period) - res.iloc[-1] = np.nan - return res - - _ = FredErroneous('DFF', base_location=self.datadir) - with self.assertLogs(level='ERROR') as _: - _ = FredErroneous( - 'DFF', base_location=self.datadir) - - class YahooFinanceErroneous3(YahooFinance): - """Modified YF that is not append-only.""" - counter = 0 - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Modified download method.""" - res = super()._download(symbol, current, - grace_period=grace_period) - if self.counter > 0: - res.iloc[-2] = 0. 
- self.counter += 1 - return res - storer = YahooFinanceErroneous3('GOOGL', base_location=self.datadir) - with self.assertLogs(level='ERROR') as _: - storer.update(pd.Timedelta('0d')) - - def test_no_internet(self): - """Test errors thrown when not connected to the internet.""" - - with NoInternet(): - with self.assertRaises(DataError): - cvx.YahooFinance('BABA', base_location=self.datadir) - - with NoInternet(): - with self.assertRaises(DataError): - cvx.Fred('CES0500000003', base_location=self.datadir) - - def test_yahoo_finance_errors(self): - """Test errors with Yahoo Finance.""" - - with self.assertRaises(DataError): - YahooFinance("DOESNTEXIST", base_location=self.datadir) - - def test_yahoo_finance_cleaning(self): - """Test our logic to clean Yahoo Finance data.""" - - # this stock was found to have NaN issues - data = YahooFinance("ENI.MI", base_location=self.datadir).data - self.assertTrue((data.valuevolume == 0).sum() > 0) - self.assertTrue(data.iloc[:-1].isnull().sum().sum() == 0) - - # def test_yahoo_finance_wrong_last_time(self): - # """Test that we correct last time if intraday.""" - # - # class YahooFinanceErroneous4(YahooFinance): - # """Modified YF that sets last time wrong.""" - # counter = 0 - # - # @staticmethod - # def _get_data_yahoo( - # ticker, start='1900-01-01', end='2100-01-01'): - # """Modified download method.""" - # res = YahooFinance._get_data_yahoo( - # ticker, start=start, end=end) - # if self.counter > 0: - # res.index = list(res.index)[:-1] + [ - # res.index[-1] - pd.Timedelta('3h')] - # self.counter += 1 - # print(res) - # return res - # - # storer = YahooFinanceErroneous4('GOOGL', base_location=self.datadir) - # print(storer.data) - # #storer.update(pd.Timedelta('0d')) - # #print(storer.data) - if __name__ == '__main__': unittest.main(warnings='error') # pragma: no cover From aea2f7647b2f0552223a8f638ef91666e6961ce6 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 12:51:20 +0400 Subject: [PATCH 20/38] basic anomalous cleaning --- cvxportfolio/data/symbol_data.py | 50 ++++++++++++++++++++++++++--- cvxportfolio/tests/test_data.py | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 5 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index f94d49409..6297cb399 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -252,7 +252,6 @@ def _timestamp_convert(unix_seconds_ts): return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') # Windows for filtering extreme logreturns -_WINDOWS = (10, 20, 50, 100, 200) def _median_scale_around(lrets, window): """Median absolute logreturn in a window around each timestamp.""" @@ -264,14 +263,13 @@ def _mean_scale_around(lrets, window): (lrets**2).rolling(window, center=True, min_periods=1).mean()) def _unlikeliness_score( - test_logreturns, reference_logreturns, scaler=_median_scale_around, - windows=_WINDOWS): + test_logreturns, reference_logreturns, scaler, windows): """Find problematic indexes for test logreturns compared w/ reference.""" scaled = [ np.abs(test_logreturns) / scaler(reference_logreturns, window) for window in windows] scaled = pd.DataFrame(scaled).T - return scaled.min(axis=1), scaled + return scaled.min(axis=1) class OLHCV(SymbolData): # pylint: disable=abstract-method @@ -295,6 +293,16 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method dividends, ...) and they're dealt with in derived classes. 
""" + FILTERING_WINDOWS = (10, 20, 50, 100, 200) + + # remove open prices when open to close abs logreturn is larger than + # this time the median absolute ones in FILTERING_WINDOWS around it + THRESHOLD_OPEN_TO_CLOSE = 15 + + # remove low/high prices when low/high to close abs logreturn larger than + # this time the median absolute ones in FILTERING_WINDOWS around it + THRESHOLD_LOWHIGH_TO_CLOSE = 20 + def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. @@ -330,9 +338,33 @@ def _process(self, new_data, saved_data=None): return new_data + def _nan_anomalous_prices(self, data, price_name, threshold): + """Set to NaN given price name on its anomalous logrets to close.""" + lr_to_close = np.log(data['close']) - np.log(data[price_name]) + # with this we skip over exact zeros (which come from some upstream + # cleaning) and would throw the median off + lr_to_close.loc[lr_to_close == 0] = np.nan + score = _unlikeliness_score( + lr_to_close, lr_to_close, scaler=_median_scale_around, + windows=self.FILTERING_WINDOWS) + self._nan_values( + data, condition = score > threshold, + columns_to_nan=price_name, message=f'anomalous {price_name} price') + def _nan_unlikely(self, new_data, saved_data=None): """Nan-out unlikely values.""" - # return new_data + + # NaN anomalous open prices + self._nan_anomalous_prices( + new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE) + + # NaN anomalous high prices + self._nan_anomalous_prices( + new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) + + # NaN anomalous low prices + self._nan_anomalous_prices( + new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) def _fill(self, new_data, saved_data=None): """Make easy fills.""" @@ -460,6 +492,14 @@ def _nan_impossible(self, new_data, saved_data=None): self._nan_low_higher_close(new_data) self._nan_open_lower_low(new_data) self._nan_open_higher_high(new_data) + + assert np.all( + new_data['low'].fillna(0.) 
<= new_data[ + ['open', 'high', 'close']].min(1)) + assert np.all( + new_data['high'].fillna(np.inf) >= new_data[ + ['open', 'low', 'close']].max(1)) + # self._nan_incompatible_low_high(new_data) # TODO: factor quality check and clean into total-return related and non- diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index e5d920e98..4561fa94b 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -424,6 +424,11 @@ def test_yahoo_finance_cleaning(self): self.assertTrue((data.valuevolume == 0).sum() > 0) self.assertTrue(data.iloc[:-1].isnull().sum().sum() == 0) + # this stock was found to have phony open/low/high prices + data = YahooFinance('NWG.L', base_location=self.datadir).data + self.assertGreater(data['return'].min(), -0.75) + self.assertLess(data['return'].max(), 0.75) + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" @@ -484,6 +489,56 @@ def _test_warning(data_transformation, part_of_message): 'data.iloc[3,2] = data.iloc[3].close * .9', 'high price lower than close price') + # extreme low price + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .01', + 'anomalous low price') + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .02', + 'anomalous low price') + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .05', + 'anomalous low price') + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .1', + 'anomalous low price') + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .2', + 'anomalous low price') + _test_warning( # changed dtindex until found one that works + 'data.iloc[20,1] = data.iloc[20,1] * .5', + 'anomalous low price') + + # extreme high price + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 100', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 50', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 20', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 10', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 5', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 2', + 'anomalous high price') + + # extreme open price + _test_warning( + 'data.iloc[3,0] = data.iloc[3,0] * 1.75;' + + 'data.iloc[3,2] = data.iloc[3,0]', + 'anomalous open price') + _test_warning( + 'data.iloc[20,0] = data.iloc[20,0] * 0.5;' + + 'data.iloc[20,1] = data.iloc[20,0]', + 'anomalous open price') + # def test_yahoo_finance_wrong_last_time(self): # """Test that we correct last time if intraday.""" # From b61eb91e3e38928e4e7366b8a202f87ff5c0d148 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 14:41:38 +0400 Subject: [PATCH 21/38] basic pipeline, needs improvement --- cvxportfolio/data/symbol_data.py | 255 ++++++++++++++++++++++--------- cvxportfolio/tests/test_data.py | 7 +- 2 files changed, 186 insertions(+), 76 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 6297cb399..b47aabe29 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -330,14 +330,119 @@ def _process(self, new_data, saved_data=None): the parent's before or after its own processing. 
""" - self._nan_impossible(new_data, saved_data=saved_data) - # self._specific_process(new_data, saved_data=saved_data) - self._nan_unlikely(new_data, saved_data=saved_data) - self._fill(new_data, saved_data=saved_data) - # self._post_process(new_data, saved_data=saved_data) + + ## Preliminaries + ## Eliminate non-positive prices, infinity values. + + # NaN nonpositive prices + for column in ["open", "close", "high", "low"]: + self._nan_nonpositive_prices(new_data, column) + + # all infinity values to NaN + self._set_infty_to_nan(new_data) + + + ## Close price. + ## We believe them (for now). We forward fill them if unavailable. + + # forward-fill close + self._fillna_and_message( + new_data, 'close', 'last available', filler='ffill') + + + ## Volumes. + ## We set negative to NaN, and fill with zeros. + + # NaN negative volumes + self._nan_negative_volumes(new_data) + + # fill with zeros + self._fillna_and_message( + new_data, 'volume', 'zeros', filler='fillna', filler_arg=0.) + + + ## Open price. + ## We remove if lower than low, higher than high, or open to close + ## logreturn is anomalous. Then we fill with close from day before. + + # NaN open if lower than low + self._nan_open_lower_low(new_data) + + # NaN open if higher than high + self._nan_open_higher_high(new_data) + + # NaN anomalous open prices + self._nan_anomalous_prices( + new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE) + + # fill open with close from day before + self._fillna_and_message( + new_data, 'open', 'close from period before', filler='fillna', + filler_arg=new_data['close'].shift(1)) + + + ## Low price. + ## We remove if higher than close or anomalous low to close logreturn. + ## We fill them with min of open and close. + + # NaN low if higher than close + self._nan_low_higher_close(new_data) + + # NaN low if higher than open (cleaned) + self._nan_low_higher_open(new_data) + + # NaN anomalous low prices + self._nan_anomalous_prices( + new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) + + # fill low with min of open and close + self._fillna_and_message( + new_data, 'low', 'min of open and close', filler='fillna', + filler_arg=new_data[['open', 'close']].min(axis=1)) + + + ## High price. + ## We remove if lower than close or anomalous low to close logreturn. + ## We fill them with max of open and close. + + # NaN high if lower than close + self._nan_high_lower_close(new_data) + + # NaN high if lower than open (cleaned) + self._nan_high_lower_open(new_data) + + # NaN anomalous high prices + self._nan_anomalous_prices( + new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) + + # fill high with max of open and close + self._fillna_and_message( + new_data, 'high', 'max of open and close', filler='fillna', + filler_arg=new_data[['open', 'close']].max(axis=1)) + + + ## Some asserts + assert new_data.iloc[1:].isnull().sum().sum() == 0 + assert np.all( + new_data['low'].fillna(0.) 
<= new_data[ + ['open', 'high', 'close']].min(1)) + assert np.all( + new_data['high'].fillna(np.inf) >= new_data[ + ['open', 'low', 'close']].max(1)) return new_data + def _fillna_and_message( + self, data, col_name, message, filler='fillna', filler_arg=None): + """Fill NaNs in column with chosen method and arg.""" + bad_indexes = data.index[data[col_name].isnull()] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s").data["%s"] has NaNs on timestamps: %s,' + + ' filling them with %s.', self.__class__.__name__, + self.symbol, col_name, bad_indexes, message) + data[col_name] = getattr(data[col_name], filler)(filler_arg) + def _nan_anomalous_prices(self, data, price_name, threshold): """Set to NaN given price name on its anomalous logrets to close.""" lr_to_close = np.log(data['close']) - np.log(data[price_name]) @@ -351,58 +456,44 @@ def _nan_anomalous_prices(self, data, price_name, threshold): data, condition = score > threshold, columns_to_nan=price_name, message=f'anomalous {price_name} price') - def _nan_unlikely(self, new_data, saved_data=None): - """Nan-out unlikely values.""" - - # NaN anomalous open prices - self._nan_anomalous_prices( - new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE) - - # NaN anomalous high prices - self._nan_anomalous_prices( - new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) - - # NaN anomalous low prices - self._nan_anomalous_prices( - new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) - def _fill(self, new_data, saved_data=None): - """Make easy fills.""" + # def _fill(self, new_data, saved_data=None): + # """Make easy fills.""" - # TODO: simplify + # # TODO: simplify - # print(data) - # print(data.isnull().sum()) + # # print(data) + # # print(data.isnull().sum()) - # fill volumes with zeros (safest choice) - new_data['volume'] = new_data['volume'].fillna(0.) + # # fill volumes with zeros (safest choice) + # new_data['volume'] = new_data['volume'].fillna(0.) 
- # fill close price with open price - new_data['close'] = new_data['close'].fillna(new_data['open']) + # # fill close price with open price + # new_data['close'] = new_data['close'].fillna(new_data['open']) - # fill open price with close from day(s) before - # repeat as long as it helps (up to 1 year) - for shifter in range(252): - logger.info( - "Filling opens with close from %s days before", shifter) - orig_missing_opens = new_data['open'].isnull().sum() - new_data['open'] = new_data['open'].fillna(new_data['close'].shift( - shifter+1)) - new_missing_opens = new_data['open'].isnull().sum() - if orig_missing_opens == new_missing_opens: - break + # # fill open price with close from day(s) before + # # repeat as long as it helps (up to 1 year) + # for shifter in range(252): + # logger.info( + # "Filling opens with close from %s days before", shifter) + # orig_missing_opens = new_data['open'].isnull().sum() + # new_data['open'] = new_data['open'].fillna(new_data['close'].shift( + # shifter+1)) + # new_missing_opens = new_data['open'].isnull().sum() + # if orig_missing_opens == new_missing_opens: + # break - # fill close price with same day's open - new_data['close'] = new_data['close'].fillna(new_data['open']) + # # fill close price with same day's open + # new_data['close'] = new_data['close'].fillna(new_data['open']) - # fill high price with max - new_data['high'] = new_data['high'].fillna(new_data[['open', 'close']].max(1)) + # # fill high price with max + # new_data['high'] = new_data['high'].fillna(new_data[['open', 'close']].max(1)) - # fill low price with max - new_data['low'] = new_data['low'].fillna(new_data[['open', 'close']].min(1)) + # # fill low price with max + # new_data['low'] = new_data['low'].fillna(new_data[['open', 'close']].min(1)) - # print(data) - # print(data.isnull().sum()) + # # print(data) + # # print(data.isnull().sum()) def _nan_values(self, data, condition, columns_to_nan, message): """Set to NaN in-place for indexing condition and chosen columns.""" @@ -456,6 +547,13 @@ def _nan_high_lower_close(self, data): columns_to_nan = "high", message = 'high price lower than close price') + def _nan_high_lower_open(self, data): + """Set high price to NaN if lower than open, in-place.""" + self._nan_values( + data=data, condition = data['high'] < data['open'], + columns_to_nan = "high", + message = 'high price lower than open price') + def _nan_low_higher_close(self, data): """Set low price to NaN if higher than close, in-place.""" self._nan_values( @@ -463,6 +561,13 @@ def _nan_low_higher_close(self, data): columns_to_nan = "low", message = 'low price higher than close price') + def _nan_low_higher_open(self, data): + """Set low price to NaN if higher than open, in-place.""" + self._nan_values( + data=data, condition = data['low'] > data['open'], + columns_to_nan = "low", + message = 'low price higher than open price') + def _set_infty_to_nan(self, data): """Set all +/- infty elements of data to NaN, in-place.""" @@ -474,33 +579,33 @@ def _set_infty_to_nan(self, data): data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - def _nan_impossible(self, new_data, saved_data=None): - """Set some impossible values of new_data to NaN, in-place.""" + # def _nan_impossible(self, new_data, saved_data=None): + # """Set some impossible values of new_data to NaN, in-place.""" - # nan-out nonpositive prices - for column in ["open", "close", "high", "low"]: - self._nan_nonpositive_prices(new_data, column) + # # nan-out nonpositive prices + # for column in ["open", "close", 
"high", "low"]: + # self._nan_nonpositive_prices(new_data, column) - # nan-out negative volumes - self._nan_negative_volumes(new_data) + # # nan-out negative volumes + # self._nan_negative_volumes(new_data) - # all infinity values are nans - self._set_infty_to_nan(new_data) + # # all infinity values are nans + # self._set_infty_to_nan(new_data) - # more - self._nan_high_lower_close(new_data) - self._nan_low_higher_close(new_data) - self._nan_open_lower_low(new_data) - self._nan_open_higher_high(new_data) + # # more + # self._nan_high_lower_close(new_data) + # self._nan_low_higher_close(new_data) + # self._nan_open_lower_low(new_data) + # self._nan_open_higher_high(new_data) - assert np.all( - new_data['low'].fillna(0.) <= new_data[ - ['open', 'high', 'close']].min(1)) - assert np.all( - new_data['high'].fillna(np.inf) >= new_data[ - ['open', 'low', 'close']].max(1)) + # assert np.all( + # new_data['low'].fillna(0.) <= new_data[ + # ['open', 'high', 'close']].min(1)) + # assert np.all( + # new_data['high'].fillna(np.inf) >= new_data[ + # ['open', 'low', 'close']].max(1)) - # self._nan_incompatible_low_high(new_data) + # # self._nan_incompatible_low_high(new_data) # TODO: factor quality check and clean into total-return related and non- @@ -618,6 +723,8 @@ def _compute_total_returns(self, data): def _process(self, new_data, saved_data=None): """Temporary.""" + self._nan_nonpositive_prices(new_data, "adjclose") + super()._process(new_data, saved_data=saved_data) self._compute_total_returns(new_data) @@ -689,14 +796,14 @@ def print_extreme(logreturns, name, sigmas=50): open2low = np.log(data['low']) - np.log(data['open']).dropna() print_extreme(open2low, 'open to low returns') - def _nan_impossible(self, new_data, saved_data=None): - """Set impossible values to NaN.""" + # def _nan_impossible(self, new_data, saved_data=None): + # """Set impossible values to NaN.""" - # call the OLHCV method - super()._nan_impossible(new_data) + # # call the OLHCV method + # super()._nan_impossible(new_data) - # also do it on adjclose - self._nan_nonpositive_prices(new_data, "adjclose") + # # also do it on adjclose + # self._nan_nonpositive_prices(new_data, "adjclose") # def _specific_process(self, new_data, saved_data=None): # """Specific process, compute total returns.""" diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 4561fa94b..f0f2519a6 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -485,8 +485,11 @@ def _test_warning(data_transformation, part_of_message): 'low price higher than close price') # high lower close - _test_warning( - 'data.iloc[3,2] = data.iloc[3].close * .9', + _test_warning( # had to fix it otherwise open cleaner kicks in + 'close = data.iloc[3].close;' + 'data.iloc[3,0] = close * .95;' # open + 'data.iloc[3,1] = close * .95;' # low + 'data.iloc[3,2] = close * .975', # high 'high price lower than close price') # extreme low price From f4f1f3ee91204ddcf0a54358c9f4a371c335de5f Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 15:35:34 +0400 Subject: [PATCH 22/38] better --- cvxportfolio/data/symbol_data.py | 228 ++++++++++++------------------- cvxportfolio/tests/test_data.py | 6 + 2 files changed, 94 insertions(+), 140 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index b47aabe29..01bdebe89 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -303,6 +303,10 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method # 
this time the median absolute ones in FILTERING_WINDOWS around it THRESHOLD_LOWHIGH_TO_CLOSE = 20 + # log warning on _preload for abs logreturns (of 4 types) larger than this + # time the root squared mean absolute ones in FILTERING_WINDOWS around it + THRESHOLD_WARN_EXTREME_LOGRETS = 5 + def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. @@ -312,28 +316,11 @@ def _process(self, new_data, saved_data=None): (possibly overlapping with new_data at the end), and is **read only**: it is used as reference to help with the cleaning, it has already been cleaned. - - The method is composed of the following steps, split between child - classes at the appropriate hierarchy level. - - #. :meth:`_nan_impossible`: Nan-out impossible values in ``new_data``. - #. :meth:`_specific_process`: Do processing specific to the class, - before the following step (*e.g.,*, because we might want unlikely - values to still be there). - #. :meth:`_nan_unlikely`: Nan-out values that are (highly) unlikely, - with threshold-based testing. - #. :meth:`_fill`: Fill nans. - #. :meth:`_post_process`: Do final processing specific to the class. - - With this factoring we should have the flexibility to handle various - data sources, by choosing at each level if each method calls - the parent's before or after its own processing. """ - ## Preliminaries ## Eliminate non-positive prices, infinity values. - + # NaN nonpositive prices for column in ["open", "close", "high", "low"]: self._nan_nonpositive_prices(new_data, column) @@ -341,7 +328,6 @@ def _process(self, new_data, saved_data=None): # all infinity values to NaN self._set_infty_to_nan(new_data) - ## Close price. ## We believe them (for now). We forward fill them if unavailable. @@ -349,7 +335,6 @@ def _process(self, new_data, saved_data=None): self._fillna_and_message( new_data, 'close', 'last available', filler='ffill') - ## Volumes. ## We set negative to NaN, and fill with zeros. @@ -360,7 +345,6 @@ def _process(self, new_data, saved_data=None): self._fillna_and_message( new_data, 'volume', 'zeros', filler='fillna', filler_arg=0.) - ## Open price. ## We remove if lower than low, higher than high, or open to close ## logreturn is anomalous. Then we fill with close from day before. @@ -380,7 +364,6 @@ def _process(self, new_data, saved_data=None): new_data, 'open', 'close from period before', filler='fillna', filler_arg=new_data['close'].shift(1)) - ## Low price. ## We remove if higher than close or anomalous low to close logreturn. ## We fill them with min of open and close. @@ -400,7 +383,6 @@ def _process(self, new_data, saved_data=None): new_data, 'low', 'min of open and close', filler='fillna', filler_arg=new_data[['open', 'close']].min(axis=1)) - ## High price. ## We remove if lower than close or anomalous low to close logreturn. ## We fill them with max of open and close. 
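Condensed into one place, the fill policy this pipeline applies after the
NaN-ing steps is roughly the following sketch (same column names, but not the
actual class method, which also logs a warning for every fill it makes):

.. code:: python

    import pandas as pd

    def fill_olhcv(data: pd.DataFrame) -> pd.DataFrame:
        """Sketch of the fill order: close, volume, open, then low/high."""
        data = data.copy()
        # closes are trusted: carry the last available one forward
        data['close'] = data['close'].ffill()
        # zero is the safest guess for a missing volume
        data['volume'] = data['volume'].fillna(0.)
        # missing opens become the previous period's (already filled) close
        data['open'] = data['open'].fillna(data['close'].shift(1))
        # missing lows and highs become the tightest values consistent
        # with the already filled open and close
        data['low'] = data['low'].fillna(
            data[['open', 'close']].min(axis=1))
        data['high'] = data['high'].fillna(
            data[['open', 'close']].max(axis=1))
        return data

The ordering matters: close is filled first because the open, low, and high
fills all reference it, and open before low and high for the same reason.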
@@ -420,7 +402,6 @@ def _process(self, new_data, saved_data=None): new_data, 'high', 'max of open and close', filler='fillna', filler_arg=new_data[['open', 'close']].max(axis=1)) - ## Some asserts assert new_data.iloc[1:].isnull().sum().sum() == 0 assert np.all( @@ -456,45 +437,6 @@ def _nan_anomalous_prices(self, data, price_name, threshold): data, condition = score > threshold, columns_to_nan=price_name, message=f'anomalous {price_name} price') - - # def _fill(self, new_data, saved_data=None): - # """Make easy fills.""" - - # # TODO: simplify - - # # print(data) - # # print(data.isnull().sum()) - - # # fill volumes with zeros (safest choice) - # new_data['volume'] = new_data['volume'].fillna(0.) - - # # fill close price with open price - # new_data['close'] = new_data['close'].fillna(new_data['open']) - - # # fill open price with close from day(s) before - # # repeat as long as it helps (up to 1 year) - # for shifter in range(252): - # logger.info( - # "Filling opens with close from %s days before", shifter) - # orig_missing_opens = new_data['open'].isnull().sum() - # new_data['open'] = new_data['open'].fillna(new_data['close'].shift( - # shifter+1)) - # new_missing_opens = new_data['open'].isnull().sum() - # if orig_missing_opens == new_missing_opens: - # break - - # # fill close price with same day's open - # new_data['close'] = new_data['close'].fillna(new_data['open']) - - # # fill high price with max - # new_data['high'] = new_data['high'].fillna(new_data[['open', 'close']].max(1)) - - # # fill low price with max - # new_data['low'] = new_data['low'].fillna(new_data[['open', 'close']].min(1)) - - # # print(data) - # # print(data.isnull().sum()) - def _nan_values(self, data, condition, columns_to_nan, message): """Set to NaN in-place for indexing condition and chosen columns.""" @@ -579,35 +521,49 @@ def _set_infty_to_nan(self, data): data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - # def _nan_impossible(self, new_data, saved_data=None): - # """Set some impossible values of new_data to NaN, in-place.""" - - # # nan-out nonpositive prices - # for column in ["open", "close", "high", "low"]: - # self._nan_nonpositive_prices(new_data, column) + def _warn_on_extreme_logreturns(self, logreturns, threshold, what): + """Log warning if logreturns are extreme.""" + # with this we skip over exact zeros (which we assume come from some + # cleaning) and would bias the mean down + logreturns.loc[logreturns == 0] = np.nan + score = _unlikeliness_score( + logreturns, logreturns, scaler=_mean_scale_around, + windows=self.FILTERING_WINDOWS) + dubious_indexes = logreturns.index[score > threshold] + if len(dubious_indexes) > 0: + logger.warning( + '%s("%s") has dubious %s for timestamps: %s', + self.__class__.__name__, self.symbol, what, dubious_indexes) - # # nan-out negative volumes - # self._nan_negative_volumes(new_data) + def _quality_check(self, data): + """Log issues with the quality of data given to the user.""" - # # all infinity values are nans - # self._set_infty_to_nan(new_data) + # zero volume + zerovol_idx = data.index[data.volume == 0] + if len(zerovol_idx) > 0: + logger.info( + '%s("%s") has volume equal to zero for timestamps: %s', + self.__class__.__name__, self.symbol, zerovol_idx) - # # more - # self._nan_high_lower_close(new_data) - # self._nan_low_higher_close(new_data) - # self._nan_open_lower_low(new_data) - # self._nan_open_higher_high(new_data) + # warn on extreme logreturns + self._warn_on_extreme_logreturns( + np.log(1 + data['return']), 
self.THRESHOLD_WARN_EXTREME_LOGRETS, + 'total open-to-open returns') - # assert np.all( - # new_data['low'].fillna(0.) <= new_data[ - # ['open', 'high', 'close']].min(1)) - # assert np.all( - # new_data['high'].fillna(np.inf) >= new_data[ - # ['open', 'low', 'close']].max(1)) + # extreme open2close + self._warn_on_extreme_logreturns( + np.log(data['close']) - np.log(data['open']), + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to close returns') - # # self._nan_incompatible_low_high(new_data) + # extreme open2high + self._warn_on_extreme_logreturns( + np.log(data['high']) - np.log(data['open']), + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to high returns') - # TODO: factor quality check and clean into total-return related and non- + # extreme open2low + self._warn_on_extreme_logreturns( + np.log(data['low']) - np.log(data['open']), + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to low returns') def _preload(self, data): """Prepare data for use by Cvxportfolio. @@ -723,9 +679,23 @@ def _compute_total_returns(self, data): def _process(self, new_data, saved_data=None): """Temporary.""" + ## Here we only deal with the adjusted close prices. + ## All other operations are in the _process method of OLHCV + + # all infinity values to NaN (repeat, but for adjclose) + self._set_infty_to_nan(new_data) + + # NaN non-positive adj close self._nan_nonpositive_prices(new_data, "adjclose") + # forward-fill adj close + self._fillna_and_message( + new_data, 'adjclose', 'last available', filler='ffill') + + ## OLHCV._process treats all other than adjclose super()._process(new_data, saved_data=saved_data) + + # Compute total open-to-open returns self._compute_total_returns(new_data) # close2close_total = np.log(1 + new_data['total_return']) @@ -750,74 +720,52 @@ def _process(self, new_data, saved_data=None): del new_data["adjclose"] # eliminate last period's intraday data + # TODO this operation needs to be moved in preload new_data.loc[new_data.index[-1], ["high", "low", "close", "return", "volume"]] = np.nan return new_data - def _quality_check(self, data): - """Analyze quality of the OLHCV-TR data.""" + # def _quality_check(self, data): + # """Analyze quality of the OLHCV-TR data.""" - # zero volume - zerovol_idx = data.index[data.volume == 0] - if len(zerovol_idx) > 0: - logger.warning( - '%s("%s") has volume equal to zero for timestamps: %s', - self.__class__.__name__, self.symbol, zerovol_idx) - - def print_extreme(logreturns, name, sigmas=50): - - # TODO: choose - m, s = logreturns.median(), np.sqrt((logreturns**2).median()) - normalized = (logreturns - m)/s - - # normalized = logreturns / logreturns.rolling(252).std().shift(1) + # # zero volume + # zerovol_idx = data.index[data.volume == 0] + # if len(zerovol_idx) > 0: + # logger.warning( + # '%s("%s") has volume equal to zero for timestamps: %s', + # self.__class__.__name__, self.symbol, zerovol_idx) - extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] - if len(extremereturn_idx) > 0: - logger.warning( - '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', - self.__class__.__name__, self.symbol, name, sigmas, - extremereturn_idx) - - # extreme logreturns - logreturns = np.log(1 + data['return']).dropna() - print_extreme(logreturns, 'total returns') - - # extreme open2close - open2close = np.log(data['close']) - np.log(data['open']).dropna() - print_extreme(open2close, 'open to close returns') - - # extreme open2high - open2high = np.log(data['high']) - np.log(data['open']).dropna() - print_extreme(open2high, 'open to high returns') - 
- # extreme open2low - open2low = np.log(data['low']) - np.log(data['open']).dropna() - print_extreme(open2low, 'open to low returns') + # def print_extreme(logreturns, name, sigmas=50): - # def _nan_impossible(self, new_data, saved_data=None): - # """Set impossible values to NaN.""" + # # TODO: choose + # m, s = logreturns.median(), np.sqrt((logreturns**2).median()) + # normalized = (logreturns - m)/s - # # call the OLHCV method - # super()._nan_impossible(new_data) + # # normalized = logreturns / logreturns.rolling(252).std().shift(1) - # # also do it on adjclose - # self._nan_nonpositive_prices(new_data, "adjclose") + # extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] + # if len(extremereturn_idx) > 0: + # logger.warning( + # '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', + # self.__class__.__name__, self.symbol, name, sigmas, + # extremereturn_idx) - # def _specific_process(self, new_data, saved_data=None): - # """Specific process, compute total returns.""" + # # extreme logreturns + # logreturns = np.log(1 + data['return']).dropna() + # print_extreme(logreturns, 'total returns') - # # Close-to-close total return, so we can delegate to parent class. - # # Note that this uses different time alignment than Cvxportfolio, - # # Here today's return uses yesterday close and today close, while - # # today's returns in Cvxportfolio use today open and tomorrow open. - # # However this is the format more common among data vendors. - # # new_data['total_return'] = new_data['adjclose'].ffill().pct_change() + # # extreme open2close + # open2close = np.log(data['close']) - np.log(data['open']).dropna() + # print_extreme(open2close, 'open to close returns') - # # We don't need this any more. - # # del new_data['adjclose'] + # # extreme open2high + # open2high = np.log(data['high']) - np.log(data['open']).dropna() + # print_extreme(open2high, 'open to high returns') + # # extreme open2low + # open2low = np.log(data['low']) - np.log(data['open']).dropna() + # print_extreme(open2low, 'open to low returns') class YahooFinance(OLHCVAC): """Yahoo Finance symbol data. 
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index f0f2519a6..3c5f3490f 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -429,6 +429,12 @@ def test_yahoo_finance_cleaning(self): self.assertGreater(data['return'].min(), -0.75) self.assertLess(data['return'].max(), 0.75) + # this stock had some extreme returns but they were legitimate + with self.assertNoLogs(level='WARNING'): + data = YahooFinance('GME', base_location=self.datadir).data + self.assertGreater(data['return'].min(), -0.75) + self.assertGreater(data['return'].max(), 3) + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" From 39e8939e3cd1930e3789c4527e2935cbac2e0542 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 16:28:00 +0400 Subject: [PATCH 23/38] testing --- cvxportfolio/data/symbol_data.py | 243 +++++++++---------------------- cvxportfolio/tests/test_data.py | 52 +++++++ 2 files changed, 119 insertions(+), 176 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 01bdebe89..b0875223c 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -229,7 +229,7 @@ def _download(self, symbol, current, grace_period, **kwargs): :type current: pandas.Series or pandas.DataFrame or None :rtype: pandas.Series or pandas.DataFrame """ - raise NotImplementedError #pragma: no cover + raise NotImplementedError # pragma: no cover def _preload(self, data): """Prepare data to serve to the user. @@ -240,7 +240,7 @@ def _preload(self, data): :type data: pandas.Series or pandas.DataFrame :rtype: pandas.Series or pandas.DataFrame """ - return data + return data # pragma: no cover # @@ -251,7 +251,7 @@ def _timestamp_convert(unix_seconds_ts): """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') -# Windows for filtering extreme logreturns +# Anomalous, extreme, dubious logreturns filtering. def _median_scale_around(lrets, window): """Median absolute logreturn in a window around each timestamp.""" @@ -289,7 +289,7 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method in which case the ``'return'`` column is not processed. It only matters in the :meth:`_preload`, method: if open-to-open returns are not present, - we compute them there. Otherwise these may be total returns (including + we compute them there. Otherwise these may be total returns (which include dividends, ...) and they're dealt with in derived classes. """ @@ -304,8 +304,8 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method THRESHOLD_LOWHIGH_TO_CLOSE = 20 # log warning on _preload for abs logreturns (of 4 types) larger than this - # time the root squared mean absolute ones in FILTERING_WINDOWS around it - THRESHOLD_WARN_EXTREME_LOGRETS = 5 + # time the median absolute ones in FILTERING_WINDOWS around it + THRESHOLD_WARN_EXTREME_LOGRETS = 17.5 def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. 
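These thresholds act on the windowed unlikeliness score defined earlier in
this module. A self-contained sketch, on synthetic data, of how the score
isolates a single corrupted value while tolerating a genuinely volatile
stretch (``median_scale_around`` is a standalone copy of the module's
``_median_scale_around``):

.. code:: python

    import numpy as np
    import pandas as pd

    def median_scale_around(lrets, window):
        """Median absolute logreturn in a window around each timestamp."""
        return np.abs(lrets).rolling(
            window, center=True, min_periods=1).median()

    rng = np.random.default_rng(0)
    lrets = pd.Series(rng.normal(scale=0.01, size=500))
    lrets.iloc[250] = 0.5  # a data error, far above the local scale

    windows = (10, 20, 50, 100, 200)
    score = pd.DataFrame([
        np.abs(lrets) / median_scale_around(lrets, w)
        for w in windows]).T.min(axis=1)

    # taking the min across windows means a point is flagged only if it
    # is extreme at every scale, so an isolated error scores high while
    # values inside a volatile period (whose windowed medians are also
    # large) do not
    print(score.idxmax(), score.max() > 17.5)  # 250 True

This robustness is also why the GME test above expects no warnings: its large
returns sit in stretches where the surrounding median absolute logreturn is
itself large.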
@@ -524,10 +524,10 @@ def _set_infty_to_nan(self, data): def _warn_on_extreme_logreturns(self, logreturns, threshold, what): """Log warning if logreturns are extreme.""" # with this we skip over exact zeros (which we assume come from some - # cleaning) and would bias the mean down + # cleaning) and would bias the median down logreturns.loc[logreturns == 0] = np.nan score = _unlikeliness_score( - logreturns, logreturns, scaler=_mean_scale_around, + logreturns, logreturns, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) dubious_indexes = logreturns.index[score > threshold] if len(dubious_indexes) > 0: @@ -585,102 +585,56 @@ def _preload(self, data): return data -class OLHCVAC(OLHCV): - """Open-High-Low-Close-Volume-AdjustedClose data. +# TODO: plan +# ffill adj closes & compute adj close logreturns +# use code above to get indexes of wrong ones, raise warnings, set to 0 +# +# check close vs adj close, there should be only dividends (with y finance) +# +# throw out opens that are not in [low, high] +# +# apply similar logic (perhaps using total lrets for the stddev) for +# open-close , close-high , close-low, throw out open/low/close not OK +# +# fill +# +# compute open-open total returns, then check with same logic for errors +# +# when doing append, make past data adhere to same format: recompute adj +# close +# could use volumes as well, if there are jumps in price due to +# splits not recorded, then price * volume should be more stable +# +# - This is modeled after the data returned by Yahoo Finance. - """ - # # rolstd windows for finding wrong logreturns - # _ROLSTD_WINDOWS = [20, 60, 252] - - # # threshold for finding wrong logreturns - # _WRONG_LOGRET_THRESHOLD = 15 - - # def _indexes_extreme_logrets_wrt_rolstddev(self, lrets, window, treshold): - # """Get indexes of logreturns that are extreme wrt trailing stddev.""" - # trailing_stdev = np.sqrt((lrets**2).rolling(window).median().shift(1)) - # bad_indexes = lrets.index[np.abs(lrets / trailing_stdev) > treshold] - # return bad_indexes - - # def _find_wrong_daily_logreturns(self, lrets): - # """Find indexes of logreturns that are most probably data errors.""" - # bad_indexes = [] - # for window in self._ROLSTD_WINDOWS: - # bad_indexes.append( - # set(self._indexes_extreme_logrets_wrt_rolstddev( - # lrets, window=window, treshold=self._WRONG_LOGRET_THRESHOLD))) - # bad_indexes.append( - # set(self._indexes_extreme_logrets_wrt_rolstddev( - # lrets.iloc[::-1], window=window, - # treshold=self._WRONG_LOGRET_THRESHOLD))) - # bad_indexes = set.intersection(*bad_indexes) - # return bad_indexes - - # TODO: plan - # ffill adj closes & compute adj close logreturns - # use code above to get indexes of wrong ones, raise warnings, set to 0 - # - # check close vs adj close, there should be only dividends (with y finance) - # - # throw out opens that are not in [low, high] - # - # apply similar logic (perhaps using total lrets for the stddev) for - # open-close , close-high , close-low, throw out open/low/close not OK - # - # fill - # - # compute open-open total returns, then check with same logic for errors - # - # when doing append, make past data adhere to same format: recompute adj - # close - # could use volumes as well, if there are jumps in price due to - # splits not recorded, then price * volume should be more stable - # - # - - def _compute_total_returns(self, data): - """Compute total open-to-open returns.""" - - # print(data) - # print(data.isnull().sum()) - - # compute log of ratio between adjclose and close - 
log_adjustment_ratio = np.log(data['adjclose'] / data['close']) - - # forward fill adjustment ratio - log_adjustment_ratio = log_adjustment_ratio.ffill() - - # non-market log returns (dividends, splits) - non_market_lr = log_adjustment_ratio.diff().shift(-1) - - # dividend_return = (data['adjclose'] / data['close']).pct_change().shift(-1) - - # import code; code.interact(local=locals()) - - # full open-to-open returns - open_to_open = np.log(data["open"]).diff().shift(-1) - data['return'] = np.exp(open_to_open + non_market_lr) - 1 - - # print(data) - # print(data.isnull().sum()) - - # intraday_logreturn = np.log(data["close"]) - np.log(data["open"]) - # close_to_close_logreturn = np.log(data["adjclose"]).diff().shift(-1) - # open_to_open_logreturn = ( - # close_to_close_logreturn + intraday_logreturn - - # intraday_logreturn.shift(-1) - # ) - # data["return"] = np.exp(open_to_open_logreturn) - 1 - - # print(data) - # print(data.isnull().sum()) +class YahooFinance(OLHCV): + """Yahoo Finance symbol data. + + :param symbol: The symbol that we downloaded. + :type symbol: str + :param storage_backend: The storage backend, implemented ones are + ``'pickle'``, ``'csv'``, and ``'sqlite'``. + :type storage_backend: str + :param base_storage_location: The location of the storage. We store in a + subdirectory named after the class which derives from this. + :type base_storage_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded, and cleaned, data for the symbol. + :type data: pandas.DataFrame + """ def _process(self, new_data, saved_data=None): - """Temporary.""" + """Process Yahoo Finance specific data, call parent's. - ## Here we only deal with the adjusted close prices. - ## All other operations are in the _process method of OLHCV + Here we deal with the adjclose column, call OLHCV._process method, and + compute total open-to-open returns. + """ + + ## Treat adjclose. We believe them (unless impossible). 
# all infinity values to NaN (repeat, but for adjclose) self._set_infty_to_nan(new_data) @@ -692,29 +646,26 @@ def _process(self, new_data, saved_data=None): self._fillna_and_message( new_data, 'adjclose', 'last available', filler='ffill') - ## OLHCV._process treats all other than adjclose + ## OLHCV._process treats all columns other than adjclose super()._process(new_data, saved_data=saved_data) - # Compute total open-to-open returns - self._compute_total_returns(new_data) - - # close2close_total = np.log(1 + new_data['total_return']) - # open2close = np.log(new_data['close']) - np.log(new_data['open']) - # open2open_total = close2close_total - open2close + open2close.shift(1) - # alt = (np.exp(open2open_total) - 1).shift(-1) + ## Compute total open-to-open returns - # close_div_open = new_data['close'] / new_data['open'] - # open_to_open_total = ( - # (1 + new_data['total_return']) / close_div_open - # ) * close_div_open.shift(1) - 1 + # intraday logreturn + intraday_logreturn = np.log( + new_data["close"]) - np.log(new_data["open"]) - # import code; code.interact(local=locals()) + # close to close total logreturn + close_to_close_total_logreturn = np.log( + new_data["adjclose"]).diff().shift(-1) - # assert np.allclose(new_data['return'].dropna(), open_to_open_total.shift(-1).dropna()) + # open to open total logreturn + open_to_open_total_logreturn = \ + close_to_close_total_logreturn + intraday_logreturn \ + - intraday_logreturn.shift(-1) - # new_data['return'] = open_to_open_total.shift(-1) - - # del new_data['total_return'] + # open to open total return + new_data['return'] = np.exp(open_to_open_total_logreturn) - 1 # eliminate adjclose column del new_data["adjclose"] @@ -726,66 +677,6 @@ def _process(self, new_data, saved_data=None): return new_data - # def _quality_check(self, data): - # """Analyze quality of the OLHCV-TR data.""" - - # # zero volume - # zerovol_idx = data.index[data.volume == 0] - # if len(zerovol_idx) > 0: - # logger.warning( - # '%s("%s") has volume equal to zero for timestamps: %s', - # self.__class__.__name__, self.symbol, zerovol_idx) - - # def print_extreme(logreturns, name, sigmas=50): - - # # TODO: choose - # m, s = logreturns.median(), np.sqrt((logreturns**2).median()) - # normalized = (logreturns - m)/s - - # # normalized = logreturns / logreturns.rolling(252).std().shift(1) - - # extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] - # if len(extremereturn_idx) > 0: - # logger.warning( - # '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', - # self.__class__.__name__, self.symbol, name, sigmas, - # extremereturn_idx) - - # # extreme logreturns - # logreturns = np.log(1 + data['return']).dropna() - # print_extreme(logreturns, 'total returns') - - # # extreme open2close - # open2close = np.log(data['close']) - np.log(data['open']).dropna() - # print_extreme(open2close, 'open to close returns') - - # # extreme open2high - # open2high = np.log(data['high']) - np.log(data['open']).dropna() - # print_extreme(open2high, 'open to high returns') - - # # extreme open2low - # open2low = np.log(data['low']) - np.log(data['open']).dropna() - # print_extreme(open2low, 'open to low returns') - -class YahooFinance(OLHCVAC): - """Yahoo Finance symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. 
We store in a - subdirectory named after the class which derives from this. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded, and cleaned, data for the symbol. - :type data: pandas.DataFrame - """ - @staticmethod def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): """Get 1-day OLHC-AC-V from Yahoo finance. diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 3c5f3490f..04d0dda36 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -435,6 +435,58 @@ def test_yahoo_finance_cleaning(self): self.assertGreater(data['return'].min(), -0.75) self.assertGreater(data['return'].max(), 3) + def test_yahoo_finance_preload_warnings(self): + """Test warnings on _preload if data has issues.""" + + # pylint: disable=protected-access + + raw_data = YahooFinance._get_data_yahoo('ZM') + empty_instance = YahooFinance.__new__(YahooFinance) + empty_instance._symbol = 'ZM' # because the warnings use the symbol + cleaned = empty_instance._process(raw_data, None) + + def _test_warning(data_transformation, part_of_message): + """Test that warning is raised w/ message containing some word.""" + data = pd.DataFrame(cleaned, copy=True) + exec(data_transformation) # pylint: disable=exec-used + # print(data) + with self.assertLogs(level='WARNING') as _: + empty_instance._preload(data) + # print(_) + self.assertTrue(part_of_message in _.output[0]) + + # columns are: open low high close volume return + + # high unexpected return + _test_warning( + 'data.iloc[300,-1] = 1', + 'dubious total open-to-open returns') + + # low unexpected return + _test_warning( + 'data.iloc[300,-1] = -0.5', + 'dubious total open-to-open returns') + + # low unexpected open + _test_warning( + 'data.iloc[300,0] = data.iloc[300,0]*0.5', + 'dubious open to close returns') + + # high unexpected open + _test_warning( + 'data.iloc[300,0] = data.iloc[300,0]*2', + 'dubious open to close returns') + + # low unexpected low + _test_warning( + 'data.iloc[300,1] = data.iloc[300,1]*0.5', + 'dubious open to low returns') + + # high unexpected high + _test_warning( + 'data.iloc[300,2] = data.iloc[300,2]*2', + 'dubious open to high returns') + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" From 0488b7ee580e1df8b58328f64f515478ace5da58 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 17:00:18 +0400 Subject: [PATCH 24/38] minor --- cvxportfolio/data/symbol_data.py | 28 +++++++++++++++++++--------- cvxportfolio/tests/test_data.py | 3 ++- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index b0875223c..0c6106eeb 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -168,6 +168,14 @@ def _print_difference(self, current, new): def update(self, grace_period): """Update current stored data for symbol. + Checks (which raise warnings): + + #. Elements of data are NaN (skipping last row) + #. Update is not append-only. For dataframes check all elements other + than last row of the data which was there before, and for that last + row, only the open price. For Series that doesn't matter, check that + last element is the same. 
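+
+        Failed checks only produce log messages, they do not block the
+        update: the newly downloaded data is stored in any case.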
+
         :param grace_period: If the time between now and the last value stored
             is less than this, we don't update the data already stored.
         :type grace_period: pandas.Timedelta
@@ -192,8 +200,7 @@ def update(self, grace_period):
                     # we use numpy.isclose because returns may be computed
                     # via logreturns and numerical errors can sift through
                     np.isclose(updated.loc[current.index[:-1]],
-                        current.iloc[:-1], equal_nan=True,
-                        rtol=1e-08, atol=1e-08)):
+                        current.iloc[:-1], equal_nan=True)):
                 logger.error(f"{self.__class__.__name__} update"
                     + f" of {self.symbol} is not append-only!")
                 self._print_difference(current, updated)
@@ -211,7 +218,7 @@ def update(self, grace_period):
                         f"{self.__class__.__name__} update"
                         + f" of {self.symbol} changed last value!")
                     self._print_difference(current, updated)
-            except KeyError:
+            except KeyError: # this should have become superfluous
                 logger.error("%s update of %s could not be checked for"
                     + " append-only edits. Was there a DST change?",
                     self.__class__.__name__, self.symbol)
@@ -577,9 +584,16 @@ def _preload(self, data):
         # pure OLHCV data source there is no need to store the open-to-open
         # returns, they can be computed here
         if not 'return' in data.columns:
-            data['return'] = data['open'].pct_change().shift(-1)
+            data['return'] = data[
+                'open'].pct_change().shift(-1) # pragma: no cover

         self._quality_check(data)
+
+        # NaN intraday data
+        data.loc[data.index[-1],
+            ["high", "low", "close", "return", "volume"]] = np.nan
+
+        # compute volume in cash units
         data["valuevolume"] = data["volume"] * data["open"]
         del data["volume"]

@@ -670,11 +684,6 @@ def _process(self, new_data, saved_data=None):
         # eliminate adjclose column
         del new_data["adjclose"]

-        # eliminate last period's intraday data
-        # TODO this operation needs to be moved in preload
-        new_data.loc[new_data.index[-1],
-            ["high", "low", "close", "return", "volume"]] = np.nan
-
         return new_data

     @staticmethod
@@ -752,6 +761,7 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'):
         this_periods_open_time = _timestamp_convert(
             data['meta']['currentTradingPeriod']['regular']['start'])

+        # this should be enough, but be careful
         if df_result.index[-1] > this_periods_open_time:
             index = df_result.index.to_numpy()
             index[-1] = this_periods_open_time
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py
index 04d0dda36..957fc72af 100644
--- a/cvxportfolio/tests/test_data.py
+++ b/cvxportfolio/tests/test_data.py
@@ -81,8 +81,8 @@ def test_yfinance_download(self):
             data.loc["2023-04-10 13:30:00+00:00", "return"],
             data.loc["2023-04-11 13:30:00+00:00", "open"] /
             data.loc["2023-04-10 13:30:00+00:00", "open"] - 1,
+            rtol=1e-04, atol=1e-07,
             ))
-        self.assertTrue(np.isnan(data.iloc[-1]["close"]))

     def test_fred(self):
         """Test basic Fred usage."""
@@ -124,6 +124,7 @@ def test_yahoo_finance(self):
             data.loc["2023-04-05 13:30:00+00:00", "return"],
             data.loc["2023-04-06 13:30:00+00:00", "open"] /
             data.loc["2023-04-05 13:30:00+00:00", "open"] - 1,
+            rtol=1e-04, atol=1e-07,
             ))

         store.update(grace_period=pd.Timedelta('1d'))

From 40ff3b5b885ca124155a05d235b5c010f1661b3c Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Tue, 13 Feb 2024 17:32:19 +0400
Subject: [PATCH 25/38] mostly done

---
 cvxportfolio/data/market_data.py |  4 +--
 cvxportfolio/data/symbol_data.py | 53 +++++++++++++++++++++++---------
 cvxportfolio/tests/test_data.py  | 46 ++++++++++++++++++++++++++-
 3 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py
index e69530b5b..245873948
100644
--- a/cvxportfolio/data/market_data.py
+++ b/cvxportfolio/data/market_data.py
@@ -636,10 +636,10 @@ def _remove_missing_recent(self):
         """
         if self.prices.iloc[-5:].isnull().any().any():
-            logger.debug(
+            logger.warning(
                 'Removing some recent lines because there are missing values.')
             drop_at = self.prices.iloc[-5:].isnull().any(axis=1).idxmax()
-            logger.debug('Dropping at index %s', drop_at)
+            logger.warning('Dropping at index %s', drop_at)
             self.returns = self.returns.loc[self.returns.index < drop_at]
             if self.prices is not None:
                 self.prices = self.prices.loc[self.prices.index < drop_at]
diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 0c6106eeb..9d56bb858 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -218,10 +218,11 @@ def update(self, grace_period):
                         f"{self.__class__.__name__} update"
                         + f" of {self.symbol} changed last value!")
                     self._print_difference(current, updated)
-            except KeyError: # this should have become superfluous
+            # this should have become superfluous
+            except KeyError: # pragma: no cover
                 logger.error("%s update of %s could not be checked for"
                     + " append-only edits. Was there a DST change?",
-                    self.__class__.__name__, self.symbol)
+                    self.__class__.__name__, self.symbol) # pragma: no cover

         self._store(updated)

     def _download(self, symbol, current, grace_period, **kwargs):
@@ -264,10 +265,10 @@ def _median_scale_around(lrets, window):
     """Median absolute logreturn in a window around each timestamp."""
     return np.abs(lrets).rolling(window, center=True, min_periods=1).median()

-def _mean_scale_around(lrets, window):
-    """Root mean squared logreturn in a window around each timestamp."""
-    return np.sqrt(
-        (lrets**2).rolling(window, center=True, min_periods=1).mean())
+# def _mean_scale_around(lrets, window):
+#     """Root mean squared logreturn in a window around each timestamp."""
+#     return np.sqrt(
+#         (lrets**2).rolling(window, center=True, min_periods=1).mean())

 def _unlikeliness_score(
         test_logreturns, reference_logreturns, scaler, windows):
@@ -364,7 +365,8 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous open prices
         self._nan_anomalous_prices(
-            new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE)
+            new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE,
+            saved_data=saved_data)

         # fill open with close from day before
         self._fillna_and_message(
@@ -383,7 +385,8 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous low prices
         self._nan_anomalous_prices(
-            new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE)
+            new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE,
+            saved_data=saved_data)

         # fill low with min of open and close
         self._fillna_and_message(
@@ -402,7 +405,8 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous high prices
         self._nan_anomalous_prices(
-            new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE)
+            new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE,
+            saved_data=saved_data)

         # fill high with max of open and close
         self._fillna_and_message(
@@ -431,17 +435,36 @@ def _fillna_and_message(
             self.symbol, col_name, bad_indexes, message)
         data[col_name] = getattr(data[col_name], filler)(filler_arg)

-    def _nan_anomalous_prices(self, data, price_name, threshold):
+    def _nan_anomalous_prices(
+            self, new_data, price_name, threshold, saved_data=None):
         """Set to NaN given price name on its anomalous logrets to close."""
-        lr_to_close = np.log(data['close']) - np.log(data[price_name])
+        new_lr_to_close =\
np.log(new_data['close']) - np.log(new_data[price_name]) + + # if there is saved data, we use it to compute the logrets + # also on the past, but we only NaN (if necessary) elements of + # new data, so the scores computed on the past are not used + if saved_data is None: + all_lr_to_close = new_lr_to_close + else: + old_lr_to_close =\ + np.log(saved_data['close']) - np.log(saved_data[price_name]) + all_lr_to_close = pd.concat( + [old_lr_to_close.loc[ + old_lr_to_close.index < new_lr_to_close.index[0]], + new_lr_to_close]) + # drop old data which we don't need + all_lr_to_close = all_lr_to_close.iloc[ + -len(new_data) - max(self.FILTERING_WINDOWS):] + # with this we skip over exact zeros (which come from some upstream # cleaning) and would throw the median off - lr_to_close.loc[lr_to_close == 0] = np.nan + all_lr_to_close.loc[all_lr_to_close == 0] = np.nan score = _unlikeliness_score( - lr_to_close, lr_to_close, scaler=_median_scale_around, + all_lr_to_close, all_lr_to_close, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) self._nan_values( - data, condition = score > threshold, + new_data, condition = score.loc[new_data.index] > threshold, columns_to_nan=price_name, message=f'anomalous {price_name} price') def _nan_values(self, data, condition, columns_to_nan, message): @@ -753,7 +776,7 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): data['indicators']['quote'][0], index=index) df_result['adjclose'] = data[ 'indicators']['adjclose'][0]['adjclose'] - except KeyError as exc: + except KeyError as exc: # pragma: no cover raise DataError(f'Yahoo finance download of {ticker} failed.' + ' Json:', str(res.json())) from exc # pragma: no cover diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 957fc72af..2b891ea7e 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -156,7 +156,8 @@ def test_yahoo_finance_removefirstline(self): and sys.version_info.minor < 11, "Issues with timezoned timestamps.") def test_sqlite3_store_series(self): """Test storing and retrieving of a Series with datetime index.""" - self._base_test_series(_loader_sqlite, _storer_sqlite) + with self.assertWarns(UserWarning): + self._base_test_series(_loader_sqlite, _storer_sqlite) @unittest.skipIf(sys.version_info.major == 3 and sys.version_info.minor < 11, "Issues with timezoned timestamps.") @@ -196,6 +197,9 @@ def _base_test_series(self, loader, storer): """Test storing and retrieving of a Series with datetime index.""" for data in [ + pd.Series( + 0.0, pd.date_range("2020-01-01", "2020-01-10"), + name="test0"), pd.Series( 0.0, pd.date_range("2020-01-01", "2020-01-10", tz='UTC-05:00'), name="test1"), @@ -507,6 +511,17 @@ def _test_warning(data_transformation, part_of_message): # check all NaNs have been filled self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0) + def _test_warning_update(data_transformation, part_of_message): + """Test that warning is raised w/ message containing some word.""" + new_data = pd.DataFrame(raw_data.iloc[-20:], copy=True) + saved_data = pd.DataFrame(raw_data.iloc[:-15], copy=True) + exec(data_transformation) # pylint: disable=exec-used + with self.assertLogs(level='WARNING') as _: + _cleaned = empty_instance._process(new_data, saved_data) + self.assertTrue(part_of_message in _.output[0]) + # check all NaNs have been filled + self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0) + # infty _test_warning( 'data.iloc[2,2] = np.inf', @@ -601,6 +616,16 @@ def 
_test_warning(data_transformation, part_of_message): + 'data.iloc[20,1] = data.iloc[20,0]', 'anomalous open price') + # extreme open update + _test_warning_update( + 'new_data.iloc[-1,0] = new_data.iloc[-1,0] * 1.75;' + + 'new_data.iloc[-1,2] = new_data.iloc[-1,0]', + 'anomalous open price') + _test_warning_update( + 'new_data.iloc[-1,0] = new_data.iloc[-1,0] * 0.5;' + + 'new_data.iloc[-1,1] = new_data.iloc[-1,0]', + 'anomalous open price') + # def test_yahoo_finance_wrong_last_time(self): # """Test that we correct last time if intraday.""" # @@ -789,6 +814,25 @@ def test_user_provided_market_data(self): prices=self.prices, cash_key='cash', min_history=pd.Timedelta('0d')) + with self.assertRaises(NotImplementedError): + UserProvidedMarketData(returns=self.returns, volumes=used_volumes, + prices=self.prices, cash_key='NOTSUPPORTED', + min_history=pd.Timedelta('0d')) + + with self.assertRaises(ValueError): + UserProvidedMarketData(returns=self.returns, volumes=used_volumes, + prices=self.prices, cash_key='USDOLLAR', + min_history=pd.Timedelta('0d')) + + md = UserProvidedMarketData( + returns=self.returns, volumes=self.volumes, + prices=self.prices, cash_key='cash', + min_history=pd.Timedelta('60d')) + + # try to serve when there's not enough min_history + with self.assertRaises(ValueError): + md.serve(t=self.returns.index[20]) + def test_market_data_full(self): """Test serve method of DownloadedMarketData.""" From c55ab7cf2736b638591643fc93f22e0d492fca6b Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 17:44:54 +0400 Subject: [PATCH 26/38] typo --- cvxportfolio/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cvxportfolio/utils.py b/cvxportfolio/utils.py index 1454966fb..662ba336a 100644 --- a/cvxportfolio/utils.py +++ b/cvxportfolio/utils.py @@ -29,7 +29,6 @@ 'average_periods_per_year'] -@staticmethod def set_pd_read_only(df_or_ser): """Set numpy array contained in dataframe or series to read only. 
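The one-line fix above removes a stray ``@staticmethod`` decorator from a
module-level function. A minimal sketch of why that is a bug (not from the
codebase; on Python older than 3.10, ``staticmethod`` objects are not
directly callable):

.. code:: python

    @staticmethod  # wrong: only meaningful inside a class body
    def double(x):
        return 2 * x

    # On Python < 3.10 this raises:
    # TypeError: 'staticmethod' object is not callable
    double(21)

    class Doubler:
        @staticmethod  # correct use
        def double(x):
            return 2 * x

    Doubler.double(21)  # returns 42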
From d75b0ac53803b67f55a9341a84a077be3850cf31 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 21:31:15 +0400 Subject: [PATCH 27/38] assertNoLogs not available on py < 3.10 --- cvxportfolio/tests/test_data.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 2b891ea7e..1cde51676 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -435,7 +435,11 @@ def test_yahoo_finance_cleaning(self): self.assertLess(data['return'].max(), 0.75) # this stock had some extreme returns but they were legitimate - with self.assertNoLogs(level='WARNING'): + # only available on py<3.10 + if hasattr(self, 'assertNoLogs'): + with self.assertNoLogs(level='WARNING'): + data = YahooFinance('GME', base_location=self.datadir).data + else: data = YahooFinance('GME', base_location=self.datadir).data self.assertGreater(data['return'].min(), -0.75) self.assertGreater(data['return'].max(), 3) From 0aa09be09e29ec89d9f17c799f5253bb29c57ff2 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 22:10:21 +0400 Subject: [PATCH 28/38] preload warning on RMS logreturn not abs mean --- cvxportfolio/data/symbol_data.py | 23 ++++++++++++++--------- cvxportfolio/tests/test_data.py | 6 ++++-- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 9d56bb858..6fb7443c4 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -265,10 +265,14 @@ def _median_scale_around(lrets, window): """Median absolute logreturn in a window around each timestamp.""" return np.abs(lrets).rolling(window, center=True, min_periods=1).median() -# def _mean_scale_around(lrets, window): -# """Root mean squared logreturn in a window around each timestamp.""" -# return np.sqrt( -# (lrets**2).rolling(window, center=True, min_periods=1).mean()) +def _mean_scale_around(lrets, window): + """Root mean squared logreturn in a window around each timestamp. + + We need a few operations because we skip the observation itself + """ + sum = (lrets**2).rolling(window, center=True, min_periods=2).sum() + count = lrets.rolling(window, center=True, min_periods=2).count() + return np.sqrt((sum - lrets**2) / (count - 1)) def _unlikeliness_score( test_logreturns, reference_logreturns, scaler, windows): @@ -308,12 +312,13 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method THRESHOLD_OPEN_TO_CLOSE = 15 # remove low/high prices when low/high to close abs logreturn larger than - # this time the median absolute ones in FILTERING_WINDOWS around it + # this time the median absolute ones in FILTERING_WINDOWS centered on it THRESHOLD_LOWHIGH_TO_CLOSE = 20 # log warning on _preload for abs logreturns (of 4 types) larger than this - # time the median absolute ones in FILTERING_WINDOWS around it - THRESHOLD_WARN_EXTREME_LOGRETS = 17.5 + # time the root mean square in FILTERING_WINDOWS centered on it, without + # the given observation itself + THRESHOLD_WARN_EXTREME_LOGRETS = 10 def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. 
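The new ``_mean_scale_around`` above excludes each observation from its own
scale estimate, using the identity that the leave-one-out root mean square
equals the square root of (sum of squares minus own square) over (count
minus one). A standalone check with toy data (not from the tests):

.. code:: python

    import numpy as np
    import pandas as pd

    lrets = pd.Series([0.01, -0.02, 0.5, 0.01, -0.01])
    window = 5

    total = (lrets**2).rolling(window, center=True, min_periods=2).sum()
    count = lrets.rolling(window, center=True, min_periods=2).count()

    # subtracting each point's own square removes it from its scale
    # estimate, so the 0.5 outlier cannot mask itself
    loo_rms = np.sqrt((total - lrets**2) / (count - 1))
    print(loo_rms)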
@@ -554,10 +559,10 @@ def _set_infty_to_nan(self, data): def _warn_on_extreme_logreturns(self, logreturns, threshold, what): """Log warning if logreturns are extreme.""" # with this we skip over exact zeros (which we assume come from some - # cleaning) and would bias the median down + # cleaning) and would bias the mean down logreturns.loc[logreturns == 0] = np.nan score = _unlikeliness_score( - logreturns, logreturns, scaler=_median_scale_around, + logreturns, logreturns, scaler=_mean_scale_around, windows=self.FILTERING_WINDOWS) dubious_indexes = logreturns.index[score > threshold] if len(dubious_indexes) > 0: diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 1cde51676..17a7a1316 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -435,11 +435,13 @@ def test_yahoo_finance_cleaning(self): self.assertLess(data['return'].max(), 0.75) # this stock had some extreme returns but they were legitimate - # only available on py<3.10 + # only available on py>3.10 if hasattr(self, 'assertNoLogs'): - with self.assertNoLogs(level='WARNING'): + with self.assertNoLogs(level='WARNING'): # pragma: no cover + # pragma: no cover data = YahooFinance('GME', base_location=self.datadir).data else: + # pragma: no cover data = YahooFinance('GME', base_location=self.datadir).data self.assertGreater(data['return'].min(), -0.75) self.assertGreater(data['return'].max(), 3) From e81932abe24dcf22da83038a4e3cce53bf2a2034 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 12:20:30 +0400 Subject: [PATCH 29/38] historical data cleaning --- cvxportfolio/data/symbol_data.py | 41 +++++++++++++++++++++++----- cvxportfolio/tests/test_data.py | 11 ++++++++ examples/strategies/ftse100_daily.py | 5 +++- examples/universes.py | 24 ++++++++-------- 4 files changed, 61 insertions(+), 20 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 6fb7443c4..afc3fed92 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -316,9 +316,8 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method THRESHOLD_LOWHIGH_TO_CLOSE = 20 # log warning on _preload for abs logreturns (of 4 types) larger than this - # time the root mean square in FILTERING_WINDOWS centered on it, without - # the given observation itself - THRESHOLD_WARN_EXTREME_LOGRETS = 10 + # this time the median absolute ones in FILTERING_WINDOWS centered on it + THRESHOLD_WARN_EXTREME_LOGRETS = 50 def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. 
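For reference, the score these thresholds are compared against is, at each
timestamp, the minimum across window lengths of the absolute logreturn
divided by a local scale; a condensed, self-contained sketch of that logic
(the window lengths here are illustrative, not the library's
``FILTERING_WINDOWS``):

.. code:: python

    import numpy as np
    import pandas as pd

    def unlikeliness_score(lrets, windows=(20, 60, 252)):
        """Min over windows of |logreturn| / median |logreturn| around it."""
        scaled = [
            np.abs(lrets)
            / np.abs(lrets).rolling(w, center=True, min_periods=1).median()
            for w in windows]
        # the minimum is conservative: a point is flagged only if it
        # looks extreme at every window length
        return pd.DataFrame(scaled).T.min(axis=1)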
@@ -438,7 +437,10 @@ def _fillna_and_message( '%s("%s").data["%s"] has NaNs on timestamps: %s,' + ' filling them with %s.', self.__class__.__name__, self.symbol, col_name, bad_indexes, message) - data[col_name] = getattr(data[col_name], filler)(filler_arg) + if filler == 'ffill': + data[col_name] = data[col_name].ffill() + else: + data[col_name] = getattr(data[col_name], filler)(filler_arg) def _nan_anomalous_prices( self, new_data, price_name, threshold, saved_data=None): @@ -559,10 +561,10 @@ def _set_infty_to_nan(self, data): def _warn_on_extreme_logreturns(self, logreturns, threshold, what): """Log warning if logreturns are extreme.""" # with this we skip over exact zeros (which we assume come from some - # cleaning) and would bias the mean down + # cleaning) and would bias the scale down logreturns.loc[logreturns == 0] = np.nan score = _unlikeliness_score( - logreturns, logreturns, scaler=_mean_scale_around, + logreturns, logreturns, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) dubious_indexes = logreturns.index[score > threshold] if len(dubious_indexes) > 0: @@ -669,6 +671,28 @@ class YahooFinance(OLHCV): :type data: pandas.DataFrame """ + # Maximum number of contiguous days on which an adjclose price can be + # invalid (e.g., negative); if any such period is found, all data before + # and including it is removed + MAX_CONTIGUOUS_MISSING_ADJCLOSES = 20 + + def _throw_out_all_data_before_many_bad_adjcloses(self, new_data): + """Throw out all data before many NaN on adjclose column.""" + invalid_indexes = new_data.index[ + new_data.adjclose.isnull().rolling( + self.MAX_CONTIGUOUS_MISSING_ADJCLOSES + ).sum() == self.MAX_CONTIGUOUS_MISSING_ADJCLOSES] + if len(invalid_indexes) > 0: + last_invalid_index = invalid_indexes[-1] + logger.warning( + '%s("%s").data has invalid adjclose prices for more than' + + ' %s contiguous days until %s; removing all data until then', + self.__class__.__name__, self.symbol, + self.MAX_CONTIGUOUS_MISSING_ADJCLOSES, last_invalid_index) + new_data = pd.DataFrame( + new_data.loc[new_data.index > last_invalid_index], copy=True) + return new_data + def _process(self, new_data, saved_data=None): """Process Yahoo Finance specific data, call parent's. 
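The rolling-sum test in ``_throw_out_all_data_before_many_bad_adjcloses``
above is a compact way to locate runs of invalid observations; a toy
illustration (made-up data and run length):

.. code:: python

    import numpy as np
    import pandas as pd

    n = 3  # stand-in for MAX_CONTIGUOUS_MISSING_ADJCLOSES
    adjclose = pd.Series([1., np.nan, np.nan, np.nan, 2., np.nan, 3.])

    # True exactly at positions where the last n values are all NaN
    run_ends = adjclose.isnull().rolling(n).sum() == n
    print(adjclose.index[run_ends])  # [3]: the run of three NaNs ends there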
@@ -684,12 +708,15 @@ def _process(self, new_data, saved_data=None): # NaN non-positive adj close self._nan_nonpositive_prices(new_data, "adjclose") + # Throw out data before many NaN on adjclose + new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data) + # forward-fill adj close self._fillna_and_message( new_data, 'adjclose', 'last available', filler='ffill') ## OLHCV._process treats all columns other than adjclose - super()._process(new_data, saved_data=saved_data) + new_data = super()._process(new_data, saved_data=saved_data) ## Compute total open-to-open returns diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 17a7a1316..b5fd8c08d 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -498,6 +498,17 @@ def _test_warning(data_transformation, part_of_message): 'data.iloc[300,2] = data.iloc[300,2]*2', 'dubious open to high returns') + def test_yahoo_finance_remove_on_many_bad_adjcloses(self): + """Test remove old data when many adjcloses are invalid.""" + + # this stock was found to have bad (negative) adjcloses for many + # months at its start + # with self.assertLogs(level='WARNING') as _: + with self.assertLogs(level='WARNING') as _: + YahooFinance('BATS.L', base_location=self.datadir) + self.assertTrue(np.any( + 'contiguous' in el.output for el in _)) + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" diff --git a/examples/strategies/ftse100_daily.py b/examples/strategies/ftse100_daily.py index bc99dd472..07336929c 100644 --- a/examples/strategies/ftse100_daily.py +++ b/examples/strategies/ftse100_daily.py @@ -67,10 +67,13 @@ def policy(gamma_risk, gamma_trade): else: import matplotlib.pyplot as plt + import pandas as pd #INDEX_ETF = 'DIA' - research_sim = cvx.StockMarketSimulator(FTSE100, cash_key='GBPOUND') + md = cvx.DownloadedMarketData( + FTSE100, cash_key='GBPOUND', grace_period=pd.Timedelta('5d')) + research_sim = cvx.StockMarketSimulator(market_data = md) research_policy, _ = policy(1., 1.) diff --git a/examples/universes.py b/examples/universes.py index 3687a1112..139c1b65e 100644 --- a/examples/universes.py +++ b/examples/universes.py @@ -20,7 +20,7 @@ We could also save each universe in a ``json`` file. 
""" -# This was generated on 2024-01-04 06:18:49.851642+00:00 +# This was generated on 2024-02-14 07:15:36.308012+00:00 SP500 = \ ['A', 'AAL', 'AAPL', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', @@ -31,11 +31,11 @@ 'BALL', 'BAX', 'BBWI', 'BBY', 'BDX', 'BEN', 'BF-B', 'BG', 'BIIB', 'BIO', 'BK', 'BKNG', 'BKR', 'BLDR', 'BLK', 'BMY', 'BR', 'BRK-B', 'BRO', 'BSX', 'BWA', 'BX', 'BXP', 'C', 'CAG', 'CAH', 'CARR', 'CAT', 'CB', 'CBOE', 'CBRE', 'CCI', 'CCL', - 'DAY', 'CDNS', 'CDW', 'CE', 'CEG', 'CF', 'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', - 'CINF', 'CL', 'CLX', 'CMA', 'CMCSA', 'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP', - 'COF', 'COO', 'COP', 'COR', 'COST', 'CPB', 'CPRT', 'CPT', 'CRL', 'CRM', - 'CSCO', 'CSGP', 'CSX', 'CTAS', 'CTLT', 'CTRA', 'CTSH', 'CTVA', 'CVS', 'CVX', - 'CZR', 'D', 'DAL', 'DD', 'DE', 'DFS', 'DG', 'DGX', 'DHI', 'DHR', 'DIS', 'DLR', + 'CDNS', 'CDW', 'CE', 'CEG', 'CF', 'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', 'CINF', + 'CL', 'CLX', 'CMA', 'CMCSA', 'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP', 'COF', + 'COO', 'COP', 'COR', 'COST', 'CPB', 'CPRT', 'CPT', 'CRL', 'CRM', 'CSCO', + 'CSGP', 'CSX', 'CTAS', 'CTLT', 'CTRA', 'CTSH', 'CTVA', 'CVS', 'CVX', 'CZR', + 'D', 'DAL', 'DAY', 'DD', 'DE', 'DFS', 'DG', 'DGX', 'DHI', 'DHR', 'DIS', 'DLR', 'DLTR', 'DOV', 'DOW', 'DPZ', 'DRI', 'DTE', 'DUK', 'DVA', 'DVN', 'DXCM', 'EA', 'EBAY', 'ECL', 'ED', 'EFX', 'EG', 'EIX', 'EL', 'ELV', 'EMN', 'EMR', 'ENPH', 'EOG', 'EPAM', 'EQIX', 'EQR', 'EQT', 'ES', 'ESS', 'ETN', 'ETR', 'ETSY', @@ -93,12 +93,12 @@ ['AAF.L', 'AAL.L', 'ABF.L', 'ADM.L', 'AHT.L', 'ANTO.L', 'AUTO.L', 'AV.L', 'AZN.L', 'BA.L', 'BARC.L', 'BATS.L', 'BDEV.L', 'BEZ.L', 'BKG.L', 'BME.L', 'BNZL.L', 'BP.L', 'BRBY.L', 'BT-A.L', 'CCH.L', 'CNA.L', 'CPG.L', 'CRDA.L', - 'CTEC.L', 'DCC.L', 'DGE.L', 'DPH.L', 'DPLM.L', 'EDV.L', 'ENT.L', 'EXPN.L', - 'FCIT.L', 'FLTR.L', 'FRAS.L', 'FRES.L', 'GLEN.L', 'GSK.L', 'HIK.L', 'HLMA.L', - 'HLN.L', 'HSBA.L', 'HWDN.L', 'IAG.L', 'ICP.L', 'IHG.L', 'III.L', 'IMB.L', - 'IMI.L', 'INF.L', 'ITRK.L', 'JD.L', 'KGF.L', 'LAND.L', 'LGEN.L', 'LLOY.L', - 'LSEG.L', 'MKS.L', 'MNDI.L', 'MNG.L', 'MRO.L', 'NG.L', 'NWG.L', 'NXT.L', - 'OCDO.L', 'PHNX.L', 'PRU.L', 'PSH.L', 'PSON.L', 'REL.L', 'RIO.L', 'RKT.L', + 'CTEC.L', 'DCC.L', 'DGE.L', 'DPLM.L', 'EDV.L', 'ENT.L', 'EXPN.L', 'FCIT.L', + 'FLTR.L', 'FRAS.L', 'FRES.L', 'GLEN.L', 'GSK.L', 'HIK.L', 'HLMA.L', 'HLN.L', + 'HSBA.L', 'HWDN.L', 'IAG.L', 'ICP.L', 'IHG.L', 'III.L', 'IMB.L', 'IMI.L', + 'INF.L', 'ITRK.L', 'JD.L', 'KGF.L', 'LAND.L', 'LGEN.L', 'LLOY.L', 'LSEG.L', + 'MKS.L', 'MNDI.L', 'MNG.L', 'MRO.L', 'NG.L', 'NWG.L', 'NXT.L', 'OCDO.L', + 'PHNX.L', 'PRU.L', 'PSH.L', 'PSN.L', 'PSON.L', 'REL.L', 'RIO.L', 'RKT.L', 'RMV.L', 'RR.L', 'RS1.L', 'RTO.L', 'SBRY.L', 'SDR.L', 'SGE.L', 'SGRO.L', 'SHEL.L', 'SKG.L', 'SMDS.L', 'SMIN.L', 'SMT.L', 'SN.L', 'SPX.L', 'SSE.L', 'STAN.L', 'STJ.L', 'SVT.L', 'TSCO.L', 'TW.L', 'ULVR.L', 'UTG.L', 'UU.L', From db7477c6fc044755c37ae4fdfe2f23f174119d8b Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 13:01:35 +0400 Subject: [PATCH 30/38] removing phony adjcloses and data around them --- cvxportfolio/data/symbol_data.py | 33 +++++++++++++++++++++++++++++++- cvxportfolio/tests/test_data.py | 10 +++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index afc3fed92..10adbf2e2 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -676,6 +676,11 @@ class YahooFinance(OLHCV): # and including it is removed MAX_CONTIGUOUS_MISSING_ADJCLOSES = 20 + # remove all 
data (also one day before and after) when logrets implied by
+    # adjcloses are anomalous; abs value larger than median abs value times this
+    # in many windows around it
+    THRESHOLD_BAD_ADJCLOSE = 20
+
     def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
         """Throw out all data before many NaN on adjclose column."""
         invalid_indexes = new_data.index[
@@ -693,6 +698,29 @@ def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
                 new_data.loc[new_data.index > last_invalid_index], copy=True)
         return new_data

+    def _remove_data_on_bad_adjcloses(self, new_data):
+        """Remove adjcloses if implied logreturns are highly anomalous."""
+        while True:
+            logrets = np.log(new_data.adjclose.ffill()).diff()
+            score = _unlikeliness_score(
+                logrets, logrets, scaler=_median_scale_around,
+                windows=self.FILTERING_WINDOWS)
+
+            # we eliminate data 1 day before and after any anomalous event
+            # could be made less aggressive, but better to be safe
+            bad_indexes = logrets.index[
+                (score > self.THRESHOLD_BAD_ADJCLOSE)
+                | (score > self.THRESHOLD_BAD_ADJCLOSE).shift(-1)]
+
+            if len(bad_indexes) == 0:
+                break
+            new_data.loc[bad_indexes] = np.nan
+            logger.warning(
+                '%s("%s").data has anomalous adjclose prices on timestamps'
+                + ' (including one day before and after) %s; removing all'
+                + ' data (not just adjcloses) on those timestamps.',
+                self.__class__.__name__, self.symbol, bad_indexes)
+
     def _process(self, new_data, saved_data=None):
         """Process Yahoo Finance specific data, call parent's.

@@ -736,9 +756,12 @@ def _process(self, new_data, saved_data=None):
         # NaN non-positive adj close
         self._nan_nonpositive_prices(new_data, "adjclose")

-        # Throw out data before many NaN on adjclose
+        # Throw out all data before many NaN on adjclose
         new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data)

+        # Remove all data when highly anomalous adjclose prices are detected
+        self._remove_data_on_bad_adjcloses(new_data)
+
         # forward-fill adj close
         self._fillna_and_message(
             new_data, 'adjclose', 'last available', filler='ffill')
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py
index b5fd8c08d..59ced7de0 100644
--- a/cvxportfolio/tests/test_data.py
+++ b/cvxportfolio/tests/test_data.py
@@ -503,12 +503,20 @@ def test_yahoo_finance_remove_on_many_bad_adjcloses(self):

         # this stock was found to have bad (negative) adjcloses for many
         # months at its start
-        # with self.assertLogs(level='WARNING') as _:
         with self.assertLogs(level='WARNING') as _:
             YahooFinance('BATS.L', base_location=self.datadir)
         self.assertTrue(np.any(
             'contiguous' in el.output for el in _))

+    def test_adjcloses_logrets_removal(self):
+        """Test method to remove adjcloses when their logrets are anomalous."""
+
+        # this stock was found to have phony adjcloses
+        with self.assertLogs(level='WARNING') as _:
+            YahooFinance('BA.L', base_location=self.datadir)
+        self.assertTrue(np.any(
+            'anomalous adjclose prices' in el.output for el in _))
+
     def test_yahoo_finance_cleaning_granular(self):
         """Test each step of cleaning."""

From cde79b55e89d5363b72ded5cfb7adc294454301f Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Wed, 14 Feb 2024 17:15:57 +0400
Subject: [PATCH 31/38] improving cleaning of bad adjcloses, more analysis needed

---
 cvxportfolio/data/symbol_data.py | 41 +++++++++++++++++++++++-----
 cvxportfolio/tests/test_data.py  | 48 ++++++++++++++++++++++----------
 2 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 10adbf2e2..2ae19e6f0
100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -281,7 +281,7 @@ def _unlikeliness_score( np.abs(test_logreturns) / scaler(reference_logreturns, window) for window in windows] scaled = pd.DataFrame(scaled).T - return scaled.min(axis=1) + return scaled.min(axis=1, skipna=True) class OLHCV(SymbolData): # pylint: disable=abstract-method @@ -558,7 +558,8 @@ def _set_infty_to_nan(self, data): data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - def _warn_on_extreme_logreturns(self, logreturns, threshold, what): + def _warn_on_extreme_logreturns( + self, logreturns, threshold, what, level='warning'): """Log warning if logreturns are extreme.""" # with this we skip over exact zeros (which we assume come from some # cleaning) and would bias the scale down @@ -568,7 +569,7 @@ def _warn_on_extreme_logreturns(self, logreturns, threshold, what): windows=self.FILTERING_WINDOWS) dubious_indexes = logreturns.index[score > threshold] if len(dubious_indexes) > 0: - logger.warning( + getattr(logger, level)( '%s("%s") has dubious %s for timestamps: %s', self.__class__.__name__, self.symbol, what, dubious_indexes) @@ -590,17 +591,20 @@ def _quality_check(self, data): # extreme open2close self._warn_on_extreme_logreturns( np.log(data['close']) - np.log(data['open']), - self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to close returns') + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to close returns', + level='info') # extreme open2high self._warn_on_extreme_logreturns( np.log(data['high']) - np.log(data['open']), - self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to high returns') + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to high returns', + level='info') # extreme open2low self._warn_on_extreme_logreturns( np.log(data['low']) - np.log(data['open']), - self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to low returns') + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to low returns', + level='info') def _preload(self, data): """Prepare data for use by Cvxportfolio. @@ -674,12 +678,15 @@ class YahooFinance(OLHCV): # Maximum number of contiguous days on which an adjclose price can be # invalid (e.g., negative); if any such period is found, all data before # and including it is removed - MAX_CONTIGUOUS_MISSING_ADJCLOSES = 20 + MAX_CONTIGUOUS_MISSING_ADJCLOSES = 10 # remove all data (also one day before and after) when logrets implied by - # adjcloses are anomalous; abs value larger than median abs value time this - # in many windows around it - THRESHOLD_BAD_ADJCLOSE = 20 + # adjcloses are anomalous: abs value larger than median abs value time this + # in many windows around it. + # this is redone iteratively up to the MAX_CONTIGUOUS_MISSING_ADJCLOSES, + # so unless the bad adjcloses are only for few days all data up to the + # anomalous event will be deleted + THRESHOLD_BAD_ADJCLOSE = 100 def _throw_out_all_data_before_many_bad_adjcloses(self, new_data): """Throw out all data before many NaN on adjclose column.""" @@ -700,8 +707,13 @@ def _throw_out_all_data_before_many_bad_adjcloses(self, new_data): def _remove_data_on_bad_adjcloses(self, new_data): """Remove adjcloses if implied logreturns are highly anomalous.""" - while True: + # worst case (if it goes to end of for loop) + # we throw out all data before the event + for _ in range(self.MAX_CONTIGUOUS_MISSING_ADJCLOSES + 1): logrets = np.log(new_data.adjclose.ffill()).diff() + # with this we skip over exact zeros (which we assume come from + # some cleaning) and would bias the scale down + # logrets.loc[logrets == 0.] 
= np.nan score = _unlikeliness_score( logrets, logrets, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) @@ -710,7 +722,9 @@ def _remove_data_on_bad_adjcloses(self, new_data): # could be made less aggressive, but better to be safe bad_indexes = logrets.index[ (score > self.THRESHOLD_BAD_ADJCLOSE) - | (score > self.THRESHOLD_BAD_ADJCLOSE).shift(-1)] + | (score > self.THRESHOLD_BAD_ADJCLOSE).shift(-1) + # | score.isnull() # TODO: this is not good, necessary for SMT.L + ] if len(bad_indexes) == 0: break @@ -742,6 +756,9 @@ def _process(self, new_data, saved_data=None): # Remove all data when highly anomalous adjclose prices are detected self._remove_data_on_bad_adjcloses(new_data) + # Repeat throw out all data before many NaN on adjclose + new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data) + # forward-fill adj close self._fillna_and_message( new_data, 'adjclose', 'last available', filler='ffill') diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 59ced7de0..0618f73ed 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -456,12 +456,13 @@ def test_yahoo_finance_preload_warnings(self): empty_instance._symbol = 'ZM' # because the warnings use the symbol cleaned = empty_instance._process(raw_data, None) - def _test_warning(data_transformation, part_of_message): + def _test_warning( + data_transformation, part_of_message, level='WARNING'): """Test that warning is raised w/ message containing some word.""" data = pd.DataFrame(cleaned, copy=True) exec(data_transformation) # pylint: disable=exec-used # print(data) - with self.assertLogs(level='WARNING') as _: + with self.assertLogs(level=level) as _: empty_instance._preload(data) # print(_) self.assertTrue(part_of_message in _.output[0]) @@ -470,33 +471,37 @@ def _test_warning(data_transformation, part_of_message): # high unexpected return _test_warning( - 'data.iloc[300,-1] = 1', + 'data.iloc[300,-1] = 4', 'dubious total open-to-open returns') # low unexpected return _test_warning( - 'data.iloc[300,-1] = -0.5', + 'data.iloc[300,-1] = -0.9', 'dubious total open-to-open returns') # low unexpected open _test_warning( - 'data.iloc[300,0] = data.iloc[300,0]*0.5', - 'dubious open to close returns') + 'data.iloc[300,0] = data.iloc[300,0]*0.1', + 'dubious open to close returns', + level='INFO') # high unexpected open _test_warning( - 'data.iloc[300,0] = data.iloc[300,0]*2', - 'dubious open to close returns') + 'data.iloc[300,0] = data.iloc[300,0]*5', + 'dubious open to close returns', + level='INFO') # low unexpected low _test_warning( - 'data.iloc[300,1] = data.iloc[300,1]*0.5', - 'dubious open to low returns') + 'data.iloc[300,1] = data.iloc[300,1]*0.1', + 'dubious open to low returns', + level='INFO') # high unexpected high _test_warning( - 'data.iloc[300,2] = data.iloc[300,2]*2', - 'dubious open to high returns') + 'data.iloc[300,2] = data.iloc[300,2]*5', + 'dubious open to high returns', + level='INFO') def test_yahoo_finance_remove_on_many_bad_adjcloses(self): """Test remove old data when many adjcloses are invalid.""" @@ -508,14 +513,29 @@ def test_yahoo_finance_remove_on_many_bad_adjcloses(self): self.assertTrue(np.any( 'contiguous' in el.output for el in _)) + with self.assertNoLogs(level='WARNING'): + YahooFinance('BATS.L', base_location=self.datadir) + def test_adjcloses_logrets_removal(self): """Test method to remove adjcloses when its logrets are anomalous.""" + # this stock had anomalous price changes in the 70s + with 
self.assertLogs(level='WARNING') as _: + d = YahooFinance("SMT.L", base_location=self.datadir).data + self.assertTrue(np.any([ + 'anomalous adjclose prices' in el for el in _.output])) + self.assertTrue(d['return'].max() < 2) + # this stock was found to have phony adjcloses with self.assertLogs(level='WARNING') as _: YahooFinance('BA.L', base_location=self.datadir) - self.assertTrue(np.any( - 'anomalous adjclose prices' in el.output for el in _)) + self.assertTrue(np.any([ + 'anomalous adjclose prices' in el for el in _.output])) + + if hasattr(self, 'assertNoLogs'): + with self.assertNoLogs(level='WARNING'): + YahooFinance('BA.L', base_location=self.datadir) + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" From 26f03cdc427cea31faa9bfc17ba823495030987c Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 19:16:27 +0400 Subject: [PATCH 32/38] added more adjclose filtering --- cvxportfolio/data/symbol_data.py | 30 ++++++++++++++++++++++-------- cvxportfolio/tests/test_data.py | 1 - 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 2ae19e6f0..d8baf9500 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -678,7 +678,7 @@ class YahooFinance(OLHCV): # Maximum number of contiguous days on which an adjclose price can be # invalid (e.g., negative); if any such period is found, all data before # and including it is removed - MAX_CONTIGUOUS_MISSING_ADJCLOSES = 10 + MAX_CONTIGUOUS_MISSING_ADJCLOSES = 20 # remove all data (also one day before and after) when logrets implied by # adjcloses are anomalous: abs value larger than median abs value time this @@ -686,7 +686,16 @@ class YahooFinance(OLHCV): # this is redone iteratively up to the MAX_CONTIGUOUS_MISSING_ADJCLOSES, # so unless the bad adjcloses are only for few days all data up to the # anomalous event will be deleted - THRESHOLD_BAD_ADJCLOSE = 100 + THRESHOLD_BAD_ADJCLOSE = 50 + + # assume any adjclose-to-adjclose log10-return larger than this in absolute + # value (1. is 10x) is false and eliminate both adjcloses around it + # this only applies before ASSUME_FALSE_BEFORE + THRESHOLD_FALSE_LOG10RETS = .5 + + # assume logreturns larger in abs value than threshold above are false + # ONLY before this date, otherwise don't filter them + ASSUME_FALSE_BEFORE = pd.Timestamp('2000-01-01', tz='UTC') def _throw_out_all_data_before_many_bad_adjcloses(self, new_data): """Throw out all data before many NaN on adjclose column.""" @@ -710,21 +719,26 @@ def _remove_data_on_bad_adjcloses(self, new_data): # worst case (if it goes to end of for loop) # we throw out all data before the event for _ in range(self.MAX_CONTIGUOUS_MISSING_ADJCLOSES + 1): - logrets = np.log(new_data.adjclose.ffill()).diff() + logrets = np.log10(new_data.adjclose.ffill()).diff() + # with this we skip over exact zeros (which we assume come from # some cleaning) and would bias the scale down - # logrets.loc[logrets == 0.] = np.nan + logrets.loc[logrets == 0.] 
= np.nan + score = _unlikeliness_score( logrets, logrets, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) + bad_score = score > self.THRESHOLD_BAD_ADJCLOSE + + too_large_logreturns = np.abs( + logrets) > self.THRESHOLD_FALSE_LOG10RETS + too_large_logreturns &= logrets.index < self.ASSUME_FALSE_BEFORE # we eliminate data 1 day before and after any anomalous event # could be made less aggressive, but better to be safe bad_indexes = logrets.index[ - (score > self.THRESHOLD_BAD_ADJCLOSE) - | (score > self.THRESHOLD_BAD_ADJCLOSE).shift(-1) - # | score.isnull() # TODO: this is not good, necessary for SMT.L - ] + bad_score | bad_score.shift(-1) | too_large_logreturns + | too_large_logreturns.shift(-1)] if len(bad_indexes) == 0: break diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 0618f73ed..2026a61d5 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -536,7 +536,6 @@ def test_adjcloses_logrets_removal(self): with self.assertNoLogs(level='WARNING'): YahooFinance('BA.L', base_location=self.datadir) - def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" From 7ccdd542530b689b42b088d1dff6373612fc3b3d Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 19:17:36 +0400 Subject: [PATCH 33/38] typo --- cvxportfolio/tests/test_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 2026a61d5..1fc2c9106 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -513,8 +513,9 @@ def test_yahoo_finance_remove_on_many_bad_adjcloses(self): self.assertTrue(np.any( 'contiguous' in el.output for el in _)) - with self.assertNoLogs(level='WARNING'): - YahooFinance('BATS.L', base_location=self.datadir) + if hasattr(self, 'assertNoLogs'): + with self.assertNoLogs(level='WARNING'): + YahooFinance('BATS.L', base_location=self.datadir) def test_adjcloses_logrets_removal(self): """Test method to remove adjcloses when its logrets are anomalous.""" From 5900968591cbc6342322904c66d56e06c0dcd13a Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 22:02:39 +0400 Subject: [PATCH 34/38] tested on current example universes; names that get historical data trimmed down are HUBB, JCI, NVR, and seem reasonable --- cvxportfolio/data/symbol_data.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index d8baf9500..3d2456b81 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -777,6 +777,11 @@ def _process(self, new_data, saved_data=None): self._fillna_and_message( new_data, 'adjclose', 'last available', filler='ffill') + # eliminate (initial) rows where adjclose is NaN + nan_adjcloses = new_data.adjclose.isnull() + if np.any(nan_adjcloses): + new_data = pd.DataFrame(new_data.loc[~nan_adjcloses], copy=True) + ## OLHCV._process treats all columns other than adjclose new_data = super()._process(new_data, saved_data=saved_data) From 572b4df3503a45dbfbfab245f76e9f905f951a91 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Thu, 15 Feb 2024 13:26:02 +0400 Subject: [PATCH 35/38] data cleaning example --- examples/data_cleaning.py | 111 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 examples/data_cleaning.py diff --git a/examples/data_cleaning.py b/examples/data_cleaning.py new file mode 100644 index 000000000..08a503c77 --- 
/dev/null +++ b/examples/data_cleaning.py @@ -0,0 +1,111 @@ +# Copyright 2023 Enzo Busseti +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This script is used to show the data cleaning applied to Yahoo Finance data. + +It is not really an example, and some of the methods shown here are not public, +so not covered by the semantic versioning agreeement (they could change +without notice). +""" + +import logging +import shutil +import tempfile +from pathlib import Path +from time import sleep + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +import cvxportfolio as cvx + +# If you change this to logging.INFO you get more logging output from the +# cleaning procedure +logging.getLogger().setLevel(logging.WARNING) + +# Here put any number of stocks for which you wish to analyze the cleaning +TEST_UNIVERSE = ['AAPL', 'GOOG', 'TSLA'] + +# Some names with known issues: +# TEST_UNIVERSE = ['SMT.L','NVR', 'HUBB', 'NWG.L'] + +ALL_DROPPED_ROWS_PCT = pd.Series(dtype=float) +ALL_MIN_LR = pd.Series(dtype=float) +ALL_MAX_LR = pd.Series(dtype=float) + +PLOT = True +SLEEP = 1 + +for stock in TEST_UNIVERSE: + sleep(SLEEP) + print(f'\n\t{stock}:') + + # This method is not public: + raw_yfinance = cvx.YahooFinance._get_data_yahoo(stock) + print(f'{stock}: YAHOO FINANCE RAW') + print(raw_yfinance) + + tmpdir = Path(tempfile.mkdtemp()) + cvx_cleaned = cvx.YahooFinance(stock, base_location=tmpdir).data + shutil.rmtree(tmpdir) + print(f'{stock}: CVXPORTFOLIO CLEANED') + print(cvx_cleaned) + + yf_log10r = np.log10(raw_yfinance.adjclose).diff().shift(-1) + cvx_log10r = np.log10(1 + cvx_cleaned['return']) + + if PLOT: + fig, axes = plt.subplots( + 3, figsize=(10/1.62, 10), layout='constrained') + + raw_yfinance.iloc[:, :5].plot(ax=axes[0]) + axes[0].set_yscale('log') + axes[0].set_title(f'{stock}: RAW YAHOO FINANCE') + + cvx_cleaned.iloc[:, :4].plot(ax=axes[1]) + axes[1].set_title(f'{stock}: CVXPORTFOLIO CLEANED DATA') + axes[1].set_yscale('log') + + (yf_log10r.cumsum() - yf_log10r.sum()).plot( + label='Yahoo Finance total close-to-close', ax=axes[2]) + (cvx_log10r.cumsum() - cvx_log10r.sum()).plot( + label='Cvxportfolio total open-to-open', ax=axes[2]) + axes[2].set_title(f'{stock}: CUMULATIVE LOG10 RETURNS (SCALED)') + axes[2].legend() + + plt.show() + + assert cvx_cleaned.index[-1] == raw_yfinance.index[-1] + + print() + dropped_rows = len(raw_yfinance) - len(cvx_cleaned) + dropped_rows_pct = dropped_rows / len(raw_yfinance) + ALL_DROPPED_ROWS_PCT.loc[stock] = dropped_rows_pct*100 + print(f'Cvxportfolio dropped {int(dropped_rows_pct*100)}% of rows') + + ALL_MIN_LR.loc[stock] = np.log(1+cvx_cleaned['return']).min() + ALL_MAX_LR.loc[stock] = np.log(1+cvx_cleaned['return']).max() + + print('Max Cvxportfolio logreturn:', ALL_MAX_LR.loc[stock]) + print('Min Cvxportfolio logreturn:', ALL_MIN_LR.loc[stock] ) + print('How many zero volumes:', (cvx_cleaned['valuevolume'] == 0.).mean()) + +print('\nCvxportfolio dropped rows %:') +print(ALL_DROPPED_ROWS_PCT.sort_values().tail()) + 
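+# The summaries below single out the names whose cleaned history lost the
+# most rows, or still contains the most extreme logreturns; those are the
+# ones worth inspecting manually.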
+print('\nCvxportfolio min logreturns:')
+print(ALL_MIN_LR.sort_values().head())
+
+print('\nCvxportfolio max logreturns:')
+print(ALL_MAX_LR.sort_values().tail())

From 4a207c47869aed21d14fc375637b805fd8cf5ad6 Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Thu, 15 Feb 2024 17:21:17 +0400
Subject: [PATCH 36/38] adjusted log level of cleaning to info

---
 cvxportfolio/data/symbol_data.py | 94 ++++++++++++++++++--------------
 cvxportfolio/tests/test_data.py  | 87 +++++++++++++++--------------
 examples/data_cleaning.py        |  6 +-
 3 files changed, 104 insertions(+), 83 deletions(-)

diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 3d2456b81..759af7d66 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -265,14 +265,14 @@ def _median_scale_around(lrets, window):
     """Median absolute logreturn in a window around each timestamp."""
     return np.abs(lrets).rolling(window, center=True, min_periods=1).median()

-def _mean_scale_around(lrets, window):
-    """Root mean squared logreturn in a window around each timestamp.
+# def _mean_scale_around(lrets, window):
+#     """Root mean squared logreturn in a window around each timestamp.

-    We need a few operations because we skip the observation itself
-    """
-    sum = (lrets**2).rolling(window, center=True, min_periods=2).sum()
-    count = lrets.rolling(window, center=True, min_periods=2).count()
-    return np.sqrt((sum - lrets**2) / (count - 1))
+#     We need a few operations because we skip the observation itself
+#     """
+#     sum = (lrets**2).rolling(window, center=True, min_periods=2).sum()
+#     count = lrets.rolling(window, center=True, min_periods=2).count()
+#     return np.sqrt((sum - lrets**2) / (count - 1))

 def _unlikeliness_score(
         test_logreturns, reference_logreturns, scaler, windows):
@@ -338,7 +338,7 @@ def _process(self, new_data, saved_data=None):
             self._nan_nonpositive_prices(new_data, column)

         # all infinity values to NaN
-        self._set_infty_to_nan(new_data)
+        self._set_infty_to_nan(new_data, level='info')

         ## Close price.
         ## We believe them (for now). We forward fill them if unavailable.
@@ -370,12 +370,12 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous open prices
         self._nan_anomalous_prices(
             new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE,
-            saved_data=saved_data)
+            saved_data=saved_data, level='info')

         # fill open with close from day before
         self._fillna_and_message(
             new_data, 'open', 'close from period before', filler='fillna',
-            filler_arg=new_data['close'].shift(1))
+            filler_arg=new_data['close'].shift(1), level='info')

         ## Low price.
         ## We remove if higher than close or anomalous low to close logreturn.
@@ -390,12 +390,12 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous low prices
         self._nan_anomalous_prices(
             new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE,
-            saved_data=saved_data)
+            saved_data=saved_data, level='info')

         # fill low with min of open and close
         self._fillna_and_message(
             new_data, 'low', 'min of open and close', filler='fillna',
-            filler_arg=new_data[['open', 'close']].min(axis=1))
+            filler_arg=new_data[['open', 'close']].min(axis=1), level='info')

         ## High price.
         ## We remove if lower than close or anomalous low to close logreturn.
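This patch threads a ``level`` argument through the cleaning methods and
dispatches on it with ``getattr(logger, level)``; a minimal sketch of that
pattern (demo names, not the library's API):

.. code:: python

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('cleaning-demo')

    def report(message, *args, level='warning'):
        # getattr(logger, 'info') is logger.info, etc., so each call
        # site chooses how loud its message is
        getattr(logger, level)(message, *args)

    report('filled %d missing opens', 3, level='info')
    report('dropped %d rows', 2)  # defaults to warning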
@@ -410,12 +410,12 @@ def _process(self, new_data, saved_data=None): # NaN anomalous high prices self._nan_anomalous_prices( new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE, - saved_data=saved_data) + saved_data=saved_data, level='info') # fill high with max of open and close self._fillna_and_message( new_data, 'high', 'max of open and close', filler='fillna', - filler_arg=new_data[['open', 'close']].max(axis=1)) + filler_arg=new_data[['open', 'close']].max(axis=1), level='info') ## Some asserts assert new_data.iloc[1:].isnull().sum().sum() == 0 @@ -429,11 +429,12 @@ def _process(self, new_data, saved_data=None): return new_data def _fillna_and_message( - self, data, col_name, message, filler='fillna', filler_arg=None): + self, data, col_name, message, filler='fillna', filler_arg=None, + level='warning'): """Fill NaNs in column with chosen method and arg.""" bad_indexes = data.index[data[col_name].isnull()] if len(bad_indexes) > 0: - logger.warning( + getattr(logger, level)( '%s("%s").data["%s"] has NaNs on timestamps: %s,' + ' filling them with %s.', self.__class__.__name__, self.symbol, col_name, bad_indexes, message) @@ -443,7 +444,8 @@ def _fillna_and_message( data[col_name] = getattr(data[col_name], filler)(filler_arg) def _nan_anomalous_prices( - self, new_data, price_name, threshold, saved_data=None): + self, new_data, price_name, threshold, saved_data=None, + level='warning'): """Set to NaN given price name on its anomalous logrets to close.""" new_lr_to_close =\ np.log(new_data['close']) - np.log(new_data[price_name]) @@ -472,14 +474,16 @@ def _nan_anomalous_prices( windows=self.FILTERING_WINDOWS) self._nan_values( new_data, condition = score.loc[new_data.index] > threshold, - columns_to_nan=price_name, message=f'anomalous {price_name} price') + columns_to_nan=price_name, message=f'anomalous {price_name} price', + level=level) - def _nan_values(self, data, condition, columns_to_nan, message): + def _nan_values( + self, data, condition, columns_to_nan, message, level='warning'): """Set to NaN in-place for indexing condition and chosen columns.""" bad_indexes = data.index[condition] if len(bad_indexes) > 0: - logger.warning( + getattr(logger, level)( '%s("%s") has %s on timestamps: %s,' + ' setting to nan', self.__class__.__name__, self.symbol, message, bad_indexes) @@ -490,27 +494,28 @@ def _nan_nonpositive_prices(self, data, prices_name): self._nan_values( data=data, condition = data[prices_name] <= 0, columns_to_nan = prices_name, - message = f'non-positive {prices_name} prices') + message = f'non-positive {prices_name} prices', level='info') def _nan_negative_volumes(self, data): """Set negative volumes to NaN, in-place.""" self._nan_values( data=data, condition = data["volume"] < 0, - columns_to_nan = "volume", message = 'negative volumes') + columns_to_nan = "volume", message = 'negative volumes', + level='info') def _nan_open_lower_low(self, data): """Set open price to NaN if lower than low, in-place.""" self._nan_values( data=data, condition = data['open'] < data['low'], columns_to_nan = "open", - message = 'open price lower than low price') + message = 'open price lower than low price', level='info') def _nan_open_higher_high(self, data): """Set open price to NaN if higher than high, in-place.""" self._nan_values( data=data, condition = data['open'] > data['high'], columns_to_nan = "open", - message = 'open price higher than high price') + message = 'open price higher than high price', level='info') # def _nan_incompatible_low_high(self, data): # """Set low and 
@@ -524,34 +529,34 @@ def _nan_high_lower_close(self, data):
         self._nan_values(
             data=data, condition = data['high'] < data['close'],
             columns_to_nan = "high",
-            message = 'high price lower than close price')
+            message = 'high price lower than close price', level='info')
 
     def _nan_high_lower_open(self, data):
         """Set high price to NaN if lower than open, in-place."""
         self._nan_values(
             data=data, condition = data['high'] < data['open'],
             columns_to_nan = "high",
-            message = 'high price lower than open price')
+            message = 'high price lower than open price', level='info')
 
     def _nan_low_higher_close(self, data):
         """Set low price to NaN if higher than close, in-place."""
         self._nan_values(
             data=data, condition = data['low'] > data['close'],
             columns_to_nan = "low",
-            message = 'low price higher than close price')
+            message = 'low price higher than close price', level='info')
 
     def _nan_low_higher_open(self, data):
         """Set low price to NaN if higher than open, in-place."""
         self._nan_values(
             data=data, condition = data['low'] > data['open'],
             columns_to_nan = "low",
-            message = 'low price higher than open price')
+            message = 'low price higher than open price', level='info')
 
-    def _set_infty_to_nan(self, data):
+    def _set_infty_to_nan(self, data, level='warning'):
         """Set all +/- infty elements of data to NaN, in-place."""
         if np.isinf(data).sum().sum() > 0:
-            logger.warning(
+            getattr(logger, level)(
                 '%s("%s") has +/- infinity values, setting those to nan',
                 self.__class__.__name__, self.symbol)
             data.iloc[:, :] = np.nan_to_num(
@@ -586,7 +591,7 @@ def _quality_check(self, data):
         # warn on extreme logreturns
         self._warn_on_extreme_logreturns(
             np.log(1 + data['return']), self.THRESHOLD_WARN_EXTREME_LOGRETS,
-            'total open-to-open returns')
+            'total open-to-open returns', level='warning')
 
         # extreme open2close
         self._warn_on_extreme_logreturns(
@@ -697,7 +702,8 @@ class YahooFinance(OLHCV):
     # ONLY before this date, otherwise don't filter them
     ASSUME_FALSE_BEFORE = pd.Timestamp('2000-01-01', tz='UTC')
 
-    def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
+    def _throw_out_all_data_before_many_bad_adjcloses(
+            self, new_data, level='warning'):
         """Throw out all data before many NaN on adjclose column."""
         invalid_indexes = new_data.index[
             new_data.adjclose.isnull().rolling(
@@ -705,7 +711,7 @@ def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
             ).sum() == self.MAX_CONTIGUOUS_MISSING_ADJCLOSES]
         if len(invalid_indexes) > 0:
             last_invalid_index = invalid_indexes[-1]
-            logger.warning(
+            getattr(logger, level)(
                 '%s("%s").data has invalid adjclose prices for more than'
                 + ' %s contiguous days until %s; removing all data until then',
                 self.__class__.__name__, self.symbol,
@@ -714,7 +720,7 @@ def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
                 new_data.loc[new_data.index > last_invalid_index], copy=True)
         return new_data
 
-    def _remove_data_on_bad_adjcloses(self, new_data):
+    def _remove_data_on_bad_adjcloses(self, new_data, level='warning'):
         """Remove adjcloses if implied logreturns are highly anomalous."""
         # worst case (if it goes to end of for loop)
         # we throw out all data before the event
@@ -743,7 +749,7 @@ def _remove_data_on_bad_adjcloses(self, new_data):
             if len(bad_indexes) == 0:
                 break
             new_data.loc[bad_indexes] = np.nan
-            logger.warning(
+            getattr(logger, level)(
                 '%s("%s").data has anomalous adjclose prices on timestamps'
                 + '(including one day before and after) %s; removing all'
                 + 'data (not just adjcloses) on those timestamps.',
@@ -759,27 +765,35 @@ def _process(self, new_data, saved_data=None):
         ## Treat adjclose. We believe them (unless impossible).
 
         # all infinity values to NaN (repeat, but for adjclose)
-        self._set_infty_to_nan(new_data)
+        self._set_infty_to_nan(new_data, level='info')
 
         # NaN non-positive adj close
         self._nan_nonpositive_prices(new_data, "adjclose")
 
         # Throw out all data before many NaN on adjclose
-        new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data)
+        new_data = self._throw_out_all_data_before_many_bad_adjcloses(
+            new_data, level='info')
 
         # Remove all data when highly anomalous adjclose prices are detected
-        self._remove_data_on_bad_adjcloses(new_data)
+        self._remove_data_on_bad_adjcloses(new_data, level='info')
 
         # Repeat throw out all data before many NaN on adjclose
-        new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data)
+        new_data = self._throw_out_all_data_before_many_bad_adjcloses(
+            new_data, level='info')
 
         # forward-fill adj close
         self._fillna_and_message(
-            new_data, 'adjclose', 'last available', filler='ffill')
+            new_data, 'adjclose', 'last available', filler='ffill',
+            level='info')
 
         # eliminate (initial) rows where adjclose is NaN
         nan_adjcloses = new_data.adjclose.isnull()
         if np.any(nan_adjcloses):
+            logger.info(
+                '%s("%s") is eliminating data on %s because the adjclose '
+                + 'price is missing.',
+                self.__class__.__name__, self.symbol,
+                new_data.index[nan_adjcloses])
             new_data = pd.DataFrame(new_data.loc[~nan_adjcloses], copy=True)
 
         ## OLHCV._process treats all columns other than adjclose
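[Editor's note: the hunks above all funnel through the same idiom, worth spelling out. A new ``level`` keyword selects the bound logger method at run time via ``getattr(logger, level)``, so routine cleaning messages can be demoted from ``'warning'`` to ``'info'`` without duplicating any call sites. A minimal standalone sketch of that pattern follows; the logger name and function are illustrative, not the library's code.]

.. code:: python

    import logging

    logger = logging.getLogger('cleaning_sketch')  # hypothetical name

    def _report_bad_rows(symbol, bad_indexes, level='warning'):
        """Log rows flagged by a cleaning step, at a configurable level."""
        if len(bad_indexes) > 0:
            # getattr(logger, 'info') is logger.info, and so on; an
            # invalid level name fails loudly with AttributeError
            getattr(logger, level)(
                '%s has bad values on timestamps: %s', symbol, bad_indexes)

    if __name__ == '__main__':
        logging.basicConfig(level=logging.INFO)
        _report_bad_rows('AAPL', ['2024-01-02'], level='info')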
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py
index 1fc2c9106..3fbf87f93 100644
--- a/cvxportfolio/tests/test_data.py
+++ b/cvxportfolio/tests/test_data.py
@@ -440,9 +440,6 @@ def test_yahoo_finance_cleaning(self):
             with self.assertNoLogs(level='WARNING'): # pragma: no cover
                 # pragma: no cover
                 data = YahooFinance('GME', base_location=self.datadir).data
-        else:
-            # pragma: no cover
-            data = YahooFinance('GME', base_location=self.datadir).data
 
         self.assertGreater(data['return'].min(), -0.75)
         self.assertGreater(data['return'].max(), 3)
@@ -521,87 +518,97 @@ def test_adjcloses_logrets_removal(self):
         """Test method to remove adjcloses when its logrets are anomalous."""
 
         # this stock had anomalous price changes in the 70s
-        with self.assertLogs(level='WARNING') as _:
+        with self.assertLogs(level='INFO') as _:
             d = YahooFinance("SMT.L", base_location=self.datadir).data
         self.assertTrue(np.any([
             'anomalous adjclose prices' in el for el in _.output]))
         self.assertTrue(d['return'].max() < 2)
 
         # this stock was found to have phony adjcloses
-        with self.assertLogs(level='WARNING') as _:
+        with self.assertLogs(level='INFO') as _:
             YahooFinance('BA.L', base_location=self.datadir)
         self.assertTrue(np.any([
             'anomalous adjclose prices' in el for el in _.output]))
 
-        if hasattr(self, 'assertNoLogs'):
-            with self.assertNoLogs(level='WARNING'):
-                YahooFinance('BA.L', base_location=self.datadir)
+        with self.assertLogs(level='INFO') as _:
+            YahooFinance('BA.L', base_location=self.datadir)
+        self.assertFalse(np.any([
+            'anomalous adjclose prices' in el for el in _.output]))
 
     def test_yahoo_finance_cleaning_granular(self):
         """Test each step of cleaning."""
         # pylint: disable=protected-access
         raw_data = YahooFinance._get_data_yahoo('ZM')
-        print(raw_data)
+        # print(raw_data)
         empty_instance = YahooFinance.__new__(YahooFinance)
         empty_instance._symbol = 'ZM' # because the warnings use the symbol
 
-        def _test_warning(data_transformation, part_of_message):
+        def _test_warning(
+                data_transformation, part_of_message, level='WARNING'):
             """Test that warning is raised w/ message containing some word."""
             data = pd.DataFrame(raw_data, copy=True)
             exec(data_transformation) # pylint: disable=exec-used
-            with self.assertLogs(level='WARNING') as _:
+            with self.assertLogs(level=level) as _:
                 _cleaned = empty_instance._process(data, None)
-            self.assertTrue(part_of_message in _.output[0])
+            self.assertTrue(
+                np.any([part_of_message in el for el in _.output]))
 
             # check all NaNs have been filled
             self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0)
 
-        def _test_warning_update(data_transformation, part_of_message):
+        def _test_warning_update(
+                data_transformation, part_of_message, level='WARNING'):
             """Test that warning is raised w/ message containing some word."""
             new_data = pd.DataFrame(raw_data.iloc[-20:], copy=True)
             saved_data = pd.DataFrame(raw_data.iloc[:-15], copy=True)
             exec(data_transformation) # pylint: disable=exec-used
-            with self.assertLogs(level='WARNING') as _:
+            with self.assertLogs(level=level) as _:
                 _cleaned = empty_instance._process(new_data, saved_data)
-            self.assertTrue(part_of_message in _.output[0])
+            self.assertTrue(
+                np.any([part_of_message in el for el in _.output]))
 
             # check all NaNs have been filled
             self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0)
 
+        # missing initial adjclose
+        _test_warning(
+            'data.iloc[0,-2] = np.nan',
+            'adjclose price is missing', level='INFO')
+
         # infty
         _test_warning(
             'data.iloc[2,2] = np.inf',
-            'infinity')
+            'infinity', level='INFO')
 
         # non-pos price
         _test_warning(
             'data.iloc[2,0] = -1',
-            'non-positive open')
+            'non-positive open', level='INFO')
         _test_warning(
             'data.iloc[2,0] = 0',
-            'non-positive open')
+            'non-positive open', level='INFO')
         _test_warning(
             'data.iloc[4,2] = 0',
-            'non-positive high')
+            'non-positive high', level='INFO')
 
         # neg volume
         _test_warning(
             'data.iloc[2,-1] = -1',
-            'negative volumes')
+            'negative volumes', level='INFO')
 
         # open lower low
         _test_warning(
             'data.iloc[1,0] = data.iloc[1,1]*.9',
-            'open price lower than low price')
+            'open price lower than low price', level='INFO')
 
         # open higher high
         _test_warning(
             'data.iloc[1,0] = data.iloc[1,2]*1.1',
-            'open price higher than high price')
+            'open price higher than high price', level='INFO')
 
         # low higher close
         _test_warning(
             'data.iloc[3,1] = data.iloc[3].close * 1.1',
-            'low price higher than close price')
+            'low price higher than close price', level='INFO')
 
         # high lower close
         _test_warning( # had to fix it otherwise open cleaner kicks in
@@ -609,67 +616,67 @@ def _test_warning_update(data_transformation, part_of_message):
             'data.iloc[3,0] = close * .95;' # open
             'data.iloc[3,1] = close * .95;' # low
             'data.iloc[3,2] = close * .975', # high
-            'high price lower than close price')
+            'high price lower than close price', level='INFO')
 
         # extreme low price
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .01',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .02',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .05',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .1',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .2',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning( # changed dtindex until found one that works
             'data.iloc[20,1] = data.iloc[20,1] * .5',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
 
         # extreme high price
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 100',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 50',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 20',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 10',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 5',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 2',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
 
         # extreme open price
         _test_warning(
             'data.iloc[3,0] = data.iloc[3,0] * 1.75;'
             + 'data.iloc[3,2] = data.iloc[3,0]',
-            'anomalous open price')
+            'anomalous open price', level='INFO')
         _test_warning(
             'data.iloc[20,0] = data.iloc[20,0] * 0.5;'
             + 'data.iloc[20,1] = data.iloc[20,0]',
-            'anomalous open price')
+            'anomalous open price', level='INFO')
 
         # extreme open update
         _test_warning_update(
             'new_data.iloc[-1,0] = new_data.iloc[-1,0] * 1.75;'
             + 'new_data.iloc[-1,2] = new_data.iloc[-1,0]',
-            'anomalous open price')
+            'anomalous open price', level='INFO')
         _test_warning_update(
             'new_data.iloc[-1,0] = new_data.iloc[-1,0] * 0.5;'
             + 'new_data.iloc[-1,1] = new_data.iloc[-1,0]',
-            'anomalous open price')
+            'anomalous open price', level='INFO')
 
     # def test_yahoo_finance_wrong_last_time(self):
     #     """Test that we correct last time if intraday."""
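[Editor's note: the rewritten test helpers rely on two ``unittest`` behaviors worth spelling out. ``assertLogs`` captures every record at or above the given level, and the captured messages land in ``output`` in arrival order, so scanning the whole list (``np.any([...])``) is more robust than asserting on ``output[0]`` alone. A self-contained sketch of the same pattern, under hypothetical names, not the suite's code.]

.. code:: python

    import logging
    import unittest

    logger = logging.getLogger('cleaning_sketch')

    def clean(value):
        """Toy cleaning step which logs at INFO when it intervenes."""
        if value < 0:
            logger.info('negative value %s, setting to zero', value)
            return 0
        return value

    class TestCleanLogging(unittest.TestCase):

        def test_logs_negative(self):
            with self.assertLogs(level='INFO') as logs:
                self.assertEqual(clean(-1), 0)
            # the relevant record need not be the first one captured
            self.assertTrue(
                any('negative value' in el for el in logs.output))

    if __name__ == '__main__':
        unittest.main()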
diff --git a/examples/data_cleaning.py b/examples/data_cleaning.py
index 08a503c77..23e67d000 100644
--- a/examples/data_cleaning.py
+++ b/examples/data_cleaning.py
@@ -38,7 +38,7 @@
 TEST_UNIVERSE = ['AAPL', 'GOOG', 'TSLA']
 
 # Some names with known issues:
-# TEST_UNIVERSE = ['SMT.L','NVR', 'HUBB', 'NWG.L']
+# TEST_UNIVERSE = ['SMT.L', 'NVR', 'HUBB', 'NWG.L']
 
 ALL_DROPPED_ROWS_PCT = pd.Series(dtype=float)
 ALL_MIN_LR = pd.Series(dtype=float)
@@ -104,8 +104,8 @@
 print('\nCvxportfolio dropped rows %:')
 print(ALL_DROPPED_ROWS_PCT.sort_values().tail())
 
-print('\nnCvxportfolio min logreturns:')
+print('\nCvxportfolio min logreturns:')
 print(ALL_MIN_LR.sort_values().head())
 
-print('\nnCvxportfolio max logreturns:')
+print('\nCvxportfolio max logreturns:')
 print(ALL_MAX_LR.sort_values().tail())
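[Editor's note: for context on the two print-header typos fixed above, ``examples/data_cleaning.py`` accumulates per-symbol statistics in ``pandas`` Series and prints the extremes at the end. A rough sketch of that reporting pattern; the loop body here uses synthetic returns instead of the script's downloaded data, so only the accumulate-and-print structure is faithful.]

.. code:: python

    import numpy as np
    import pandas as pd

    TEST_UNIVERSE = ['AAPL', 'GOOG', 'TSLA']
    ALL_MIN_LR = pd.Series(dtype=float)
    ALL_MAX_LR = pd.Series(dtype=float)

    for symbol in TEST_UNIVERSE:
        # stand-in for the cleaned returns of each downloaded symbol
        returns = pd.Series(
            np.random.default_rng(hash(symbol) % 2**32).normal(0., .02, 250))
        logreturns = np.log(1 + returns)
        ALL_MIN_LR[symbol] = logreturns.min()
        ALL_MAX_LR[symbol] = logreturns.max()

    print('\nCvxportfolio min logreturns:')
    print(ALL_MIN_LR.sort_values().head())

    print('\nCvxportfolio max logreturns:')
    print(ALL_MAX_LR.sort_values().tail())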
From b9d5d015babb597683124394cf9e625d86d5f670 Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Thu, 15 Feb 2024 17:57:14 +0400
Subject: [PATCH 37/38] testcase typo

---
 cvxportfolio/data/symbol_data.py | 8 ++++++++
 cvxportfolio/tests/test_data.py  | 6 +-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 759af7d66..119c1221a 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -664,6 +664,14 @@ def _preload(self, data):
 class YahooFinance(OLHCV):
     """Yahoo Finance symbol data.
 
+    .. versionadded:: 1.2.0
+
+        The data cleaning logic has been significantly improved, see the
+        ``data_cleaning.py`` example to view what's done on any given
+        name (or enable ``'INFO'`` logging messages). It is recommended to
+        delete the ``~/cvxportfolio_data`` folder with data files downloaded
+        by previous Cvxportfolio versions.
+
     :param symbol: The symbol that we downloaded.
     :type symbol: str
     :param storage_backend: The storage backend, implemented ones are
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py
index 3fbf87f93..a9ea2eb6a 100644
--- a/cvxportfolio/tests/test_data.py
+++ b/cvxportfolio/tests/test_data.py
@@ -435,11 +435,7 @@ def test_yahoo_finance_cleaning(self):
         self.assertLess(data['return'].max(), 0.75)
 
         # this stock had some extreme returns but they were legitimate
-        # only available on py>3.10
-        if hasattr(self, 'assertNoLogs'):
-            with self.assertNoLogs(level='WARNING'): # pragma: no cover
-                # pragma: no cover
-                data = YahooFinance('GME', base_location=self.datadir).data
+        data = YahooFinance('GME', base_location=self.datadir).data
 
         self.assertGreater(data['return'].min(), -0.75)
         self.assertGreater(data['return'].max(), 3)

From c9c11ff27833e550b2d5327a24d9c8ebaccdb081 Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Thu, 15 Feb 2024 18:19:57 +0400
Subject: [PATCH 38/38] data cleaning example docs

---
 docs/examples.rst               | 1 +
 docs/examples/data_cleaning.rst | 9 +++++++++
 2 files changed, 10 insertions(+)
 create mode 100644 docs/examples/data_cleaning.rst

diff --git a/docs/examples.rst b/docs/examples.rst
index d43bd17b5..102471231 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -12,6 +12,7 @@ We show some of them, along with their results, in the following pages:
    examples/dow30
    examples/timing
    examples/universes
+   examples/data_cleaning
    examples/etfs
    examples/user_provided_forecasters
    examples/risk_models
diff --git a/docs/examples/data_cleaning.rst b/docs/examples/data_cleaning.rst
new file mode 100644
index 000000000..1128baf63
--- /dev/null
+++ b/docs/examples/data_cleaning.rst
@@ -0,0 +1,9 @@
+Data cleaning
+===================
+
+This example script is
+`available in the repository `_.
+
+.. literalinclude:: ../../examples/data_cleaning.py
+    :language: python
+    :lines: 14-
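[Editor's note: a short usage sketch tying the series together. With the logging demotions applied, the cleaning steps report at ``'INFO'``, so, as the new ``versionadded`` docstring suggests, enabling that level is enough to watch what is done to any given name. This assumes ``YahooFinance`` is importable from the package top level; 'SMT.L' is one of the names with known issues mentioned in the examples.]

.. code:: python

    import logging

    import cvxportfolio as cvx

    logging.basicConfig(level=logging.INFO)

    # the cleaning steps (NaN'd anomalous prices, removed bad adjcloses,
    # filled NaNs, ...) are reported as INFO log lines during the download
    data = cvx.YahooFinance('SMT.L').data
    print(data.tail())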