From d745b00662df9305617bc8f68daceb0a75178bc5 Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Thu, 4 Jan 2024 11:52:38 +0400
Subject: [PATCH 01/38] started ftse100 strategy

---
 cvxportfolio/data.py                 |   4 +-
 examples/strategies/ftse100_daily.py | 114 +++++++++++++++++++++++++++
 examples/universes.py                |  37 +++++++--
 3 files changed, 148 insertions(+), 7 deletions(-)
 create mode 100644 examples/strategies/ftse100_daily.py

diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py
index c19e1f404..e96958bce 100644
--- a/cvxportfolio/data.py
+++ b/cvxportfolio/data.py
@@ -543,9 +543,9 @@ class Fred(SymbolData):
 
     def _internal_download(self, symbol):
         try:
-            return pd.read_csv(
+            return pd.to_numeric(pd.read_csv(
                 self.URL + f'?id={symbol}',
-                index_col=0, parse_dates=[0])[symbol]
+                index_col=0, parse_dates=[0])[symbol], errors='coerce')
         except URLError as exc:
             raise DataError(f"Download of {symbol}"
                 + f" from {self.__class__.__name__} failed."
diff --git a/examples/strategies/ftse100_daily.py b/examples/strategies/ftse100_daily.py
new file mode 100644
index 000000000..0f0b976db
--- /dev/null
+++ b/examples/strategies/ftse100_daily.py
@@ -0,0 +1,114 @@
+# Copyright 2023 Enzo Busseti
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This is a simple example strategy which we run every day.
+
+It is a long-only, unit-leverage allocation on the FTSE 100 universe.
+
+We will see how it performs online.
+
+Run it from the root of the repository, in the development environment, with:
+
+.. code:: bash
+
+    python -m examples.strategies.ftse100_daily
+"""
+
+import cvxportfolio as cvx
+
+from ..universes import FTSE100
+
+HYPERPAR_OPTIMIZE_START = '2012-01-01'
+
+OBJECTIVE = 'sharpe_ratio'
+
+
+def policy(gamma_risk, gamma_trade):
+    """Create a fresh policy object; also return handles to hyper-parameters.
+
+    :param gamma_risk: Risk aversion multiplier.
+    :type gamma_risk: float
+    :param gamma_trade: Transaction cost aversion multiplier.
+    :type gamma_trade: float
+
+    :return: Policy object and dictionary mapping hyper-parameter names (which
+        must match the arguments of this function) to their respective objects.
+    :rtype: tuple
+    """
+    gamma_risk_hp = cvx.Gamma(initial_value=gamma_risk)
+    gamma_trade_hp = cvx.Gamma(initial_value=gamma_trade)
+    return cvx.SinglePeriodOptimization(
+        cvx.ReturnsForecast()
+        - gamma_risk_hp * cvx.FullCovariance()
+        - gamma_trade_hp * cvx.StocksTransactionCost(),
+        [cvx.LongOnly(), cvx.LeverageLimit(1)],
+        benchmark=cvx.MarketBenchmark(),
+    ), {'gamma_risk': gamma_risk_hp, 'gamma_trade': gamma_trade_hp}
+
+
+if __name__ == '__main__':
+
+    RESEARCH = True
+
+    if not RESEARCH:
+        from .strategy_executor import main
+        main(policy=policy, hyperparameter_opt_start=HYPERPAR_OPTIMIZE_START,
+            objective=OBJECTIVE, universe=FTSE100, cash_key='GBPOUND')
+
+    else:
+        import matplotlib.pyplot as plt
+        #INDEX_ETF = 'DIA'
+
+        research_sim = cvx.StockMarketSimulator(FTSE100, cash_key='GBPOUND')
+
+        research_policy, _ = policy(1., 1.)
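+
+        # (Hedged editorial sketch, not in the original patch.) The dict
+        # returned by policy() maps hyper-parameter names to the cvx.Gamma
+        # handles, so after optimize_hyperparameters() below one could read
+        # back the tuned values, e.g.:
+        #
+        #     _, hyperpars = policy(1., 1.)
+        #     print({k: g.current_value for k, g in hyperpars.items()})
+        #
+        # (`current_value` is assumed from cvxportfolio's hyper-parameter
+        # API; verify before relying on it.)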
+ + result_unif = research_sim.backtest( + cvx.Uniform(), start_time=HYPERPAR_OPTIMIZE_START) + print('uniform') + print(result_unif) + + result_market = research_sim.backtest( + cvx.MarketBenchmark(), start_time=HYPERPAR_OPTIMIZE_START) + print('market') + print(result_market) + + exit(0) + + # result_etf = cvx.StockMarketSimulator([INDEX_ETF]).backtest( + # cvx.Uniform(), start_time=HYPERPAR_OPTIMIZE_START) + # print(INDEX_ETF) + # print(result_etf) + + research_sim.optimize_hyperparameters( + research_policy, start_time=HYPERPAR_OPTIMIZE_START, + objective='sharpe_ratio') + + result_opt = research_sim.backtest( + research_policy, start_time=HYPERPAR_OPTIMIZE_START) + print('optimized') + print(result_opt) + + result_unif.plot() + result_opt.plot() + result_market.plot() + #result_etf.plot() + + plt.figure() + result_opt.growth_rates.iloc[-252*4:].cumsum().plot(label='optimized') + result_unif.growth_rates.iloc[-252*4:].cumsum().plot(label='uniform') + result_market.growth_rates.iloc[-252*4:].cumsum().plot(label='market') + #result_etf.growth_rates.iloc[-252*4:].cumsum().plot(label='market etf') + plt.legend() + + plt.show() diff --git a/examples/universes.py b/examples/universes.py index fb2b6348c..4bd34e050 100644 --- a/examples/universes.py +++ b/examples/universes.py @@ -20,7 +20,7 @@ We could also save each universe in a ``json`` file. """ -# This was generated on 2023-12-27 06:55:30.344592+00:00 +# This was generated on 2024-01-04 06:18:49.851642+00:00 SP500 = \ ['A', 'AAL', 'AAPL', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', @@ -89,6 +89,21 @@ 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'V', 'VZ', 'WBA', 'WMT'] +FTSE100 = \ +['AAF.L', 'AAL.L', 'ABF.L', 'ADM.L', 'AHT.L', 'ANTO.L', 'AUTO.L', 'AV.L', + 'AZN.L', 'BA.L', 'BARC.L', 'BATS.L', 'BDEV.L', 'BEZ.L', 'BKG.L', 'BME.L', + 'BNZL.L', 'BP.L', 'BRBY.L', 'BT-A.L', 'CCH.L', 'CNA.L', 'CPG.L', 'CRDA.L', + 'CTEC.L', 'DCC.L', 'DGE.L', 'DPH.L', 'DPLM.L', 'EDV.L', 'ENT.L', 'EXPN.L', + 'FCIT.L', 'FLTR.L', 'FRAS.L', 'FRES.L', 'GLEN.L', 'GSK.L', 'HIK.L', 'HLMA.L', + 'HLN.L', 'HSBA.L', 'HWDN.L', 'IAG.L', 'ICP.L', 'IHG.L', 'III.L', 'IMB.L', + 'IMI.L', 'INF.L', 'ITRK.L', 'JD.L', 'KGF.L', 'LAND.L', 'LGEN.L', 'LLOY.L', + 'LSEG.L', 'MKS.L', 'MNDI.L', 'MNG.L', 'MRO.L', 'NG.L', 'NWG.L', 'NXT.L', + 'OCDO.L', 'PHNX.L', 'PRU.L', 'PSH.L', 'PSON.L', 'REL.L', 'RIO.L', 'RKT.L', + 'RMV.L', 'RR.L', 'RS1.L', 'RTO.L', 'SBRY.L', 'SDR.L', 'SGE.L', 'SGRO.L', + 'SHEL.L', 'SKG.L', 'SMDS.L', 'SMIN.L', 'SMT.L', 'SN.L', 'SPX.L', 'SSE.L', + 'STAN.L', 'STJ.L', 'SVT.L', 'TSCO.L', 'TW.L', 'ULVR.L', 'UTG.L', 'UU.L', + 'VOD.L', 'WEIR.L', 'WPP.L', 'WTB.L'] + if __name__ == '__main__': # import json @@ -113,10 +128,16 @@ 'page': "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average", 'table_number': 0, 'column_number': 1, + }, + 'ftse100': { + 'page': 'https://en.wikipedia.org/wiki/FTSE_100_Index', + 'table_number': -1, + 'column_number':1, + 'suffix':'.L', } } - def get_column_wikipedia_page(page, table_number, column_number): + def get_column_wikipedia_page(page, table_number, column_number, **kwargs): """Get a column as list of strings from a table on wikipedia. This is adapted from: @@ -129,6 +150,8 @@ def get_column_wikipedia_page(page, table_number, column_number): :type table_number: int :param column_number: Which column to extract. :type column_number: int + :param kwargs: Unused arguments. + :type kwargs: dict :returns: Sorted strings of the column. 
         :rtype: list
 
@@ -143,17 +166,21 @@ def get_column_wikipedia_page(page, table_number, column_number):
             column.append(element.strip())
         return sorted(column)
 
-    def adapt_for_yahoo_finance(tickers_list):
+    def adapt_for_yahoo_finance(tickers_list, suffix='', **kwargs):
         """Change tickers to match the spelling of Yahoo Finance.
 
         :param tickers_list: Tickers from Wikipedia.
         :type tickers_list: list
+        :param suffix: Suffix to add to each ticker, default empty string.
+        :type suffix: str
+        :param kwargs: Unused arguments.
+        :type kwargs: dict
 
         :returns: Adapted tickers.
         :rtype: list
         """
-        return [el.replace('.', '-') for el in tickers_list]
+        return [el.replace('.', '-') + suffix for el in tickers_list]
 
     # re-write this file
 
@@ -177,7 +204,7 @@ def adapt_for_yahoo_finance(tickers_list):
 
         for key, value in universes.items():
             tickers = adapt_for_yahoo_finance(
-                get_column_wikipedia_page(**value))
+                get_column_wikipedia_page(**value), **value)
             f.write(f'\n{key.upper()} = \\\n')
             pprint(tickers, compact=True, width=79, stream=f)

From 1ac90d410d808458a7ac0eadc965935b885330ba Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Wed, 17 Jan 2024 21:05:07 +0400
Subject: [PATCH 02/38] data quality check

---
 cvxportfolio/data.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py
index e96958bce..6b5dcfc1e 100644
--- a/cvxportfolio/data.py
+++ b/cvxportfolio/data.py
@@ -498,6 +498,47 @@ def _download(self, symbol, current=None,
             new = self._clean(new)
         return pd.concat([current.iloc[:-overlap], new])
 
+    def _quality_check(self, data):
+        """Analyze quality of the OHLCV-TR data."""
+
+        # zero volume
+        zerovol_idx = data.index[data.volume == 0]
+        if len(zerovol_idx) > 0:
+            logger.warning(
+                '%s("%s") has volume equal to zero for timestamps: %s',
+                self.__class__.__name__, self.symbol, zerovol_idx)
+
+        def print_extreme(logreturns, name, sigmas=50):
+
+            # TODO: choose
+            m, s = logreturns.median(), np.sqrt((logreturns**2).median())
+            normalized = (logreturns - m)/s
+
+            # normalized = logreturns / logreturns.rolling(252).std().shift(1)
+
+            extremereturn_idx = normalized.index[np.abs(normalized) > sigmas]
+            if len(extremereturn_idx) > 0:
+                logger.warning(
+                    '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s',
+                    self.__class__.__name__, self.symbol, name, sigmas,
+                    extremereturn_idx)
+
+        # extreme logreturns
+        logreturns = np.log(1 + data['return']).dropna()
+        print_extreme(logreturns, 'total returns')
+
+        # extreme open2close
+        open2close = (np.log(data['close']) - np.log(data['open'])).dropna()
+        print_extreme(open2close, 'open to close returns')
+
+        # extreme open2high
+        open2high = (np.log(data['high']) - np.log(data['open'])).dropna()
+        print_extreme(open2high, 'open to high returns')
+
+        # extreme open2low
+        open2low = (np.log(data['low']) - np.log(data['open'])).dropna()
+        print_extreme(open2low, 'open to low returns')
+
     def _preload(self, data):
         """Prepare data for use by Cvxportfolio.
 
@@ -505,6 +546,8 @@ def _preload(self, data):
         replace it with `valuevolume` which is an estimate of the (e.g.,
         US dollar) value of the volume exchanged on the day.
""" + + self._quality_check(data) data["valuevolume"] = data["volume"] * data["open"] del data["volume"] From b45b9d7a915edc969a597cc56b8a26c26174928d Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 11:58:46 +0400 Subject: [PATCH 03/38] Split history cvxportfolio/data.py to cvxportfolio/market_data.py --- cvxportfolio/{data.py => market_data.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cvxportfolio/{data.py => market_data.py} (100%) diff --git a/cvxportfolio/data.py b/cvxportfolio/market_data.py similarity index 100% rename from cvxportfolio/data.py rename to cvxportfolio/market_data.py From 7988b64f69b67317d10f22648b860885c3501acb Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 11:58:46 +0400 Subject: [PATCH 04/38] Split history cvxportfolio/data.py to cvxportfolio/market_data.py --- cvxportfolio/data.py => temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cvxportfolio/data.py => temp (100%) diff --git a/cvxportfolio/data.py b/temp similarity index 100% rename from cvxportfolio/data.py rename to temp From 4f4fa684b5fda79eecbc42438075b8cf22837908 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 11:58:46 +0400 Subject: [PATCH 05/38] Split history cvxportfolio/data.py to cvxportfolio/market_data.py --- temp => cvxportfolio/data.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => cvxportfolio/data.py (100%) diff --git a/temp b/cvxportfolio/data.py similarity index 100% rename from temp rename to cvxportfolio/data.py From ade1db5d93c706f2e9f58ba75a4eccab7edd1a9a Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:00:16 +0400 Subject: [PATCH 06/38] moved both files --- cvxportfolio/data/__init__.py | 25 +++++++++++++++++++ cvxportfolio/{ => data}/market_data.py | 0 cvxportfolio/{data.py => data/symbol_data.py} | 0 3 files changed, 25 insertions(+) create mode 100644 cvxportfolio/data/__init__.py rename cvxportfolio/{ => data}/market_data.py (100%) rename cvxportfolio/{data.py => data/symbol_data.py} (100%) diff --git a/cvxportfolio/data/__init__.py b/cvxportfolio/data/__init__.py new file mode 100644 index 000000000..df2459ce4 --- /dev/null +++ b/cvxportfolio/data/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2023 Enzo Busseti +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This module include classes that download, store, and serve market data. +The two main abstractions are :class:`SymbolData` and :class:`MarketData`. +Neither are exposed outside this module. Their derived classes instead are. +If you want to interface cvxportfolio with financial data source other +than the ones we provide, you should derive from either of those two classes. 
+""" + +from .market_data import * +from .symbol_data import * + +__all__ = [ + "YahooFinance", "Fred", "UserProvidedMarketData", "DownloadedMarketData"] \ No newline at end of file diff --git a/cvxportfolio/market_data.py b/cvxportfolio/data/market_data.py similarity index 100% rename from cvxportfolio/market_data.py rename to cvxportfolio/data/market_data.py diff --git a/cvxportfolio/data.py b/cvxportfolio/data/symbol_data.py similarity index 100% rename from cvxportfolio/data.py rename to cvxportfolio/data/symbol_data.py From fb9d0dbefc9d40916ec73fe4adaaf09a6c64aedc Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:15:44 +0400 Subject: [PATCH 07/38] completed split of data.py into data/ , no logic changes used script git-split.sh #!/bin/sh # used https://stackoverflow.com/questions/3887736/keep-git-history-when-splitting-a-file if [[ $# -ne 2 ]] ; then echo "Usage: git-split.sh original copy" exit 0 fi git mv $1 $2 git commit -n -m "Split history $1 to $2" REV=`git rev-parse HEAD` git reset --hard HEAD^ git mv $1 temp git commit -n -m "Split history $1 to $2" git merge $REV git commit -a -n -m "Split history $1 to $2" git mv temp $1 git commit -n -m "Split history $1 to $2" --- cvxportfolio/data/__init__.py | 1 + cvxportfolio/data/market_data.py | 755 +------------------------------ cvxportfolio/data/symbol_data.py | 701 +--------------------------- 3 files changed, 16 insertions(+), 1441 deletions(-) diff --git a/cvxportfolio/data/__init__.py b/cvxportfolio/data/__init__.py index df2459ce4..97aad902d 100644 --- a/cvxportfolio/data/__init__.py +++ b/cvxportfolio/data/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """This module include classes that download, store, and serve market data. + The two main abstractions are :class:`SymbolData` and :class:`MarketData`. Neither are exposed outside this module. Their derived classes instead are. If you want to interface cvxportfolio with financial data source other diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py index d75936d1b..be723a618 100644 --- a/cvxportfolio/data/market_data.py +++ b/cvxportfolio/data/market_data.py @@ -11,766 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""This module include classes that download, store, and serve market data. +"""This module defines the :class:`MarketData` abstraction and derived classes.""" -The two main abstractions are :class:`SymbolData` and :class:`MarketData`. -Neither are exposed outside this module. Their derived classes instead are. - -If you want to interface cvxportfolio with financial data source other -than the ones we provide, you should derive from either of those two classes. 
-""" - -import datetime import logging -import sqlite3 import sys -import warnings from pathlib import Path -from urllib.error import URLError import numpy as np import pandas as pd -import requests -import requests.exceptions - -from .errors import DataError -from .utils import (hash_, periods_per_year_from_datetime_index, - resample_returns) -__all__ = ["YahooFinance", "Fred", - "UserProvidedMarketData", "DownloadedMarketData"] +from ..errors import DataError +from ..utils import (hash_, periods_per_year_from_datetime_index, + resample_returns) +from .symbol_data import * logger = logging.getLogger(__name__) -BASE_LOCATION = Path.home() / "cvxportfolio_data" - -def now_timezoned(): - """Return current timestamp with local timezone. - - :returns: Current timestamp with local timezone. - :rtype: pandas.Timestamp - """ - return pd.Timestamp( - datetime.datetime.now(datetime.timezone.utc).astimezone()) - -class SymbolData: - """Base class for a single symbol time series data. - - The data is either in the form of a Pandas Series or DataFrame - and has datetime index. - - This class needs to be derived. At a minimum, - one should redefine the ``_download`` method, which - implements the downloading of the symbol's time series - from an external source. The method takes the current (already - downloaded and stored) data and is supposed to **only append** to it. - In this way we only store new data and don't modify already downloaded - data. - - Additionally one can redefine the ``_preload`` method, which prepares - data to serve to the user (so the data is stored in a different format - than what the user sees.) We found that this separation can be useful. - - This class interacts with module-level functions named ``_loader_BACKEND`` - and ``_storer_BACKEND``, where ``BACKEND`` is the name of the storage - system used. We define ``pickle``, ``csv``, and ``sqlite`` backends. - These may have limitations. See their docstrings for more information. - - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param base_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. By default it's one day. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded data for the symbol. - """ - - def __init__(self, symbol, - storage_backend='pickle', - base_location=BASE_LOCATION, - grace_period=pd.Timedelta('1d')): - self._symbol = symbol - self._storage_backend = storage_backend - self._base_location = base_location - self.update(grace_period) - self._data = self.load() - - @property - def storage_location(self): - """Storage location. Directory is created if not existent. - - :rtype: pathlib.Path - """ - loc = self._base_location / f"{self.__class__.__name__}" - loc.mkdir(parents=True, exist_ok=True) - return loc - - @property - def symbol(self): - """The symbol whose data this instance contains. - - :rtype: str - """ - return self._symbol - - @property - def data(self): - """Time series data, updated to the most recent observation. 
- - :rtype: pandas.Series or pandas.DataFrame - """ - return self._data - - def _load_raw(self): - """Load raw data from database.""" - # we could implement multiprocess safety here - loader = globals()['_loader_' + self._storage_backend] - try: - logger.info( - f"{self.__class__.__name__} is trying to load {self.symbol}" - + f" with {self._storage_backend} backend" - + f" from {self.storage_location}") - return loader(self.symbol, self.storage_location) - except FileNotFoundError: - return None - - def load(self): - """Load data from database using `self.preload` function to process. - - :returns: Loaded time-series data for the symbol. - :rtype: pandas.Series or pandas.DataFrame - """ - return self._preload(self._load_raw()) - - def _store(self, data): - """Store data in database. - - :param data: Time-series data to store. - :type data: pandas.Series or pandas.DataFrame - """ - # we could implement multiprocess safety here - storer = globals()['_storer_' + self._storage_backend] - logger.info( - f"{self.__class__.__name__} is storing {self.symbol}" - + f" with {self._storage_backend} backend" - + f" in {self.storage_location}") - storer(self.symbol, data, self.storage_location) - - def _print_difference(self, current, new): - """Helper method to print difference if update is not append-only. - - This is temporary and will be re-factored. - """ - print("TEMPORARY: Diff between overlap of downloaded and stored") - print((new - current).dropna(how='all').tail(5)) - - def update(self, grace_period): - """Update current stored data for symbol. - - :param grace_period: If the time between now and the last value stored - is less than this, we don't update the data already stored. - :type grace_period: pandas.Timedelta - """ - current = self._load_raw() - logger.info( - f"Downloading {self.symbol}" - + f" from {self.__class__.__name__}") - updated = self._download( - self.symbol, current, grace_period=grace_period) - - if np.any(updated.iloc[:-1].isnull()): - logger.warning( - " cvxportfolio.%s('%s').data contains NaNs." - + " You may want to inspect it. If you want, you can delete the" - + " data file in %s to force re-download from the start.", - self.__class__.__name__, self.symbol, self.storage_location) - - try: - if current is not None: - if not np.all( - # we use numpy.isclose because returns may be computed - # via logreturns and numerical errors can sift through - np.isclose(updated.loc[current.index[:-1]], - current.iloc[:-1], equal_nan=True, - rtol=1e-08, atol=1e-08)): - logger.error(f"{self.__class__.__name__} update" - + f" of {self.symbol} is not append-only!") - self._print_difference(current, updated) - if hasattr(current, 'columns'): - # the first column is open price - if not current.iloc[-1, 0] == updated.loc[ - current.index[-1]].iloc[0]: - logger.error( - f"{self.__class__.__name__} update " - + f" of {self.symbol} changed last open price!") - self._print_difference(current, updated) - else: - if not current.iloc[-1] == updated.loc[current.index[-1]]: - logger.error( - f"{self.__class__.__name__} update" - + f" of {self.symbol} changed last value!") - self._print_difference(current, updated) - except KeyError: - logger.error("%s update of %s could not be checked for" - + " append-only edits. Was there a DST change?", - self.__class__.__name__, self.symbol) - self._store(updated) - - def _download(self, symbol, current, grace_period, **kwargs): - """Download data from external source given already downloaded data. - - This method must be redefined by derived classes. 
- - :param symbol: The symbol we download. - :type symbol: str - :param current: The data already downloaded. We are supposed to - **only append** to it. If None, no data is present. - :type current: pandas.Series or pandas.DataFrame or None - :rtype: pandas.Series or pandas.DataFrame - """ - raise NotImplementedError #pragma: no cover - - def _preload(self, data): - """Prepare data to serve to the user. - - This method can be redefined by derived classes. - - :param data: The data returned by the storage backend. - :type data: pandas.Series or pandas.DataFrame - :rtype: pandas.Series or pandas.DataFrame - """ - return data - - -# -# Yahoo Finance. -# - -def _timestamp_convert(unix_seconds_ts): - """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" - return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') - - -class YahooFinance(SymbolData): - """Yahoo Finance symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded, and cleaned, data for the symbol. - :type data: pandas.DataFrame - """ - - # is open-high-low-close-volume-(total)return - IS_OHLCVR = True - - @staticmethod - def _clean(data): - """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" - - # print(data) - # print(data.isnull().sum()) - - # nan-out nonpositive prices - data.loc[data["open"] <= 0, 'open'] = np.nan - data.loc[data["close"] <= 0, "close"] = np.nan - data.loc[data["high"] <= 0, "high"] = np.nan - data.loc[data["low"] <= 0, "low"] = np.nan - data.loc[data["adjclose"] <= 0, "adjclose"] = np.nan - - # nan-out negative volumes - data.loc[data["volume"] < 0, 'volume'] = np.nan - - # all infinity values are nans - data.iloc[:, :] = np.nan_to_num( - data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - - # print(data) - # print(data.isnull().sum()) - - # if low is not the lowest, set it to nan - data['low'].loc[ - data['low'] > data[['open', 'high', 'close']].min(1)] = np.nan - - # if high is not the highest, set it to nan - data['high'].loc[ - data['high'] < data[['open', 'high', 'close']].max(1)] = np.nan - - # print(data) - # print(data.isnull().sum()) - - # - # fills - # - - # fill volumes with zeros (safest choice) - data['volume'] = data['volume'].fillna(0.) 
- - # fill close price with open price - data['close'] = data['close'].fillna(data['open']) - - # fill open price with close from day(s) before - # repeat as long as it helps (up to 1 year) - for shifter in range(252): - orig_missing_opens = data['open'].isnull().sum() - data['open'] = data['open'].fillna(data['close'].shift( - shifter+1)) - new_missing_opens = data['open'].isnull().sum() - if orig_missing_opens == new_missing_opens: - break - logger.info( - "Filled missing open prices with close from %s periods before", - shifter+1) - - # fill close price with same day's open - data['close'] = data['close'].fillna(data['open']) - - # fill high price with max - data['high'] = data['high'].fillna(data[['open', 'close']].max(1)) - - # fill low price with max - data['low'] = data['low'].fillna(data[['open', 'close']].min(1)) - - # print(data) - # print(data.isnull().sum()) - - # - # Compute returns - # - - # compute log of ratio between adjclose and close - log_adjustment_ratio = np.log(data['adjclose'] / data['close']) - - # forward fill adjustment ratio - log_adjustment_ratio = log_adjustment_ratio.ffill() - - # non-market log returns (dividends, splits) - non_market_lr = log_adjustment_ratio.diff().shift(-1) - - # full open-to-open returns - open_to_open = np.log(data["open"]).diff().shift(-1) - data['return'] = np.exp(open_to_open + non_market_lr) - 1 - - # print(data) - # print(data.isnull().sum()) - - # intraday_logreturn = np.log(data["close"]) - np.log(data["open"]) - # close_to_close_logreturn = np.log(data["adjclose"]).diff().shift(-1) - # open_to_open_logreturn = ( - # close_to_close_logreturn + intraday_logreturn - - # intraday_logreturn.shift(-1) - # ) - # data["return"] = np.exp(open_to_open_logreturn) - 1 - del data["adjclose"] - - # eliminate last period's intraday data - data.loc[data.index[-1], - ["high", "low", "close", "return", "volume"]] = np.nan - - # print(data) - # print(data.isnull().sum()) - - return data - - @staticmethod - def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): - """Get 1 day OHLC from Yahoo finance. - - Result is timestamped with the open time (time-zoned) of the - instrument. - """ - - base_url = 'https://query2.finance.yahoo.com' - - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)' - ' AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/39.0.2171.95 Safari/537.36'} - - # print(HEADERS) - start = int(pd.Timestamp(start).timestamp()) - end = int(pd.Timestamp(end).timestamp()) - - try: - res = requests.get( - url=f"{base_url}/v8/finance/chart/{ticker}", - params={'interval': '1d', - "period1": start, - "period2": end}, - headers=headers, - timeout=10) # seconds - except requests.ConnectionError as exc: - raise DataError( - f"Download of {ticker} from YahooFinance failed." - + " Are you connected to the Internet?") from exc - - # print(res) - - if res.status_code == 404: - raise DataError( - f'Data for symbol {ticker} is not available.' - + 'Json output:', str(res.json())) - - if res.status_code != 200: - raise DataError(f'Yahoo finance download of {ticker} failed. Json:', - str(res.json())) # pragma: no cover - - data = res.json()['chart']['result'][0] - - try: - index = pd.DatetimeIndex( - [_timestamp_convert(el) for el in data['timestamp']]) - - df_result = pd.DataFrame( - data['indicators']['quote'][0], index=index) - df_result['adjclose'] = data[ - 'indicators']['adjclose'][0]['adjclose'] - except KeyError: - raise DataError(f'Yahoo finance download of {ticker} failed.' 
- + ' Json:', str(res.json())) # pragma: no cover - - # last timestamp is probably broken (not timed to market open) - # we set its time to same as the day before, but this is wrong - # on days of DST switch. It's fine though because that line will be - # overwritten next update - if df_result.index[-1].time() != df_result.index[-2].time(): - tm1 = df_result.index[-2].time() - newlast = df_result.index[-1].replace( - hour=tm1.hour, minute=tm1.minute, second=tm1.second) - df_result.index = pd.DatetimeIndex( - list(df_result.index[:-1]) + [newlast]) - - return df_result[ - ['open', 'low', 'high', 'close', 'adjclose', 'volume']] - - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Download single stock from Yahoo Finance. - - If data was already downloaded we only download - the most recent missing portion. - - Args: - - symbol (str): yahoo name of the instrument - current (pandas.DataFrame or None): current data present locally - overlap (int): how many lines of current data will be overwritten - by newly downloaded data - kwargs (dict): extra arguments passed to yfinance.download - - Returns: - updated (pandas.DataFrame): updated DataFrame for the symbol - """ - if overlap < 2: - raise SyntaxError( - f'{self.__class__.__name__} with overlap smaller than 2' - + ' could have issues with DST.') - if (current is None) or (len(current) < overlap): - updated = self._get_data_yahoo(symbol, **kwargs) - logger.info('Downloading from the start.') - result = self._clean(updated) - # we remove first row if it contains NaNs - if np.any(result.iloc[0].isnull()): - result = result.iloc[1:] - return result - if (now_timezoned() - current.index[-1] - ) < pd.Timedelta(grace_period): - logger.info( - 'Skipping download because stored data is recent enough.') - return current - new = self._get_data_yahoo(symbol, start=current.index[-overlap]) - new = self._clean(new) - return pd.concat([current.iloc[:-overlap], new]) - - def _quality_check(self, data): - """Analyze quality of the OHLCV-TR data.""" - - # zero volume - zerovol_idx = data.index[data.volume == 0] - if len(zerovol_idx) > 0: - logger.warning( - '%s("%s") has volume equal to zero for timestamps: %s', - self.__class__.__name__, self.symbol, zerovol_idx) - - def print_extreme(logreturns, name, sigmas=50): - - # TODO: choose - m, s = logreturns.median(), np.sqrt((logreturns**2).median()) - normalized = (logreturns - m)/s - - # normalized = logreturns / logreturns.rolling(252).std().shift(1) - - extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] - if len(extremereturn_idx) > 0: - logger.warning( - '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', - self.__class__.__name__, self.symbol, name, sigmas, - extremereturn_idx) - - # extreme logreturns - logreturns = np.log(1 + data['return']).dropna() - print_extreme(logreturns, 'total returns') - - # extreme open2close - open2close = np.log(data['close']) - np.log(data['open']).dropna() - print_extreme(open2close, 'open to close returns') - - # extreme open2high - open2high = np.log(data['high']) - np.log(data['open']).dropna() - print_extreme(open2high, 'open to high returns') - - # extreme open2low - open2low = np.log(data['low']) - np.log(data['open']).dropna() - print_extreme(open2low, 'open to low returns') - - def _preload(self, data): - """Prepare data for use by Cvxportfolio. 
- - We drop the `volume` column expressed in number of stocks and - replace it with `valuevolume` which is an estimate of the (e.g., - US dollar) value of the volume exchanged on the day. - """ - - self._quality_check(data) - data["valuevolume"] = data["volume"] * data["open"] - del data["volume"] - - return data - -# -# Fred. -# - -class Fred(SymbolData): - """Fred single-symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. By default it's one day. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded data for the symbol. - """ - - URL = "https://fred.stlouisfed.org/graph/fredgraph.csv" - - # TODO: implement Fred point-in-time - # example: - # https://alfred.stlouisfed.org/graph/alfredgraph.csv?id=CES0500000003&vintage_date=2023-07-06 - # hourly wages time series **as it appeared** on 2023-07-06 - # store using pd.Series() of diff'ed values only. - - def _internal_download(self, symbol): - try: - return pd.to_numeric(pd.read_csv( - self.URL + f'?id={symbol}', - index_col=0, parse_dates=[0])[symbol], errors='coerce') - except URLError as exc: - raise DataError(f"Download of {symbol}" - + f" from {self.__class__.__name__} failed." - + " Are you connected to the Internet?") from exc - - def _download( - self, symbol="DFF", current=None, grace_period='5d', **kwargs): - """Download or update pandas Series from Fred. - - If already downloaded don't change data stored locally and only - add new entries at the end. - - Additionally, we allow for a `grace period`, if the data already - downloaded has a last entry not older than the grace period, we - don't download new data. - """ - if current is None: - return self._internal_download(symbol) - if (pd.Timestamp.today() - current.index[-1] - ) < pd.Timedelta(grace_period): - logger.info( - 'Skipping download because stored data is recent enough.') - return current - - new = self._internal_download(symbol) - new = new.loc[new.index > current.index[-1]] - - if new.empty: - logger.info('New downloaded data is empty!') - return current - - assert new.index[0] > current.index[-1] - return pd.concat([current, new]) - - def _preload(self, data): - """Add UTC timezone.""" - data.index = data.index.tz_localize('UTC') - return data - -# -# Sqlite storage backend. -# - -def _open_sqlite(storage_location): - return sqlite3.connect(storage_location/"db.sqlite") - -def _close_sqlite(connection): - connection.close() - -def _loader_sqlite(symbol, storage_location): - """Load data in sqlite format. - - We separately store dtypes for data consistency and safety. - - .. note:: If your pandas object's index has a name it will be lost, - the index is renamed 'index'. If you pass timestamp data (including - the index) it must have explicit timezone. 
- """ - try: - connection = _open_sqlite(storage_location) - dtypes = pd.read_sql_query( - f"SELECT * FROM {symbol}___dtypes", - connection, index_col="index", - dtype={"index": "str", "0": "str"}) - - parse_dates = 'index' - my_dtypes = dict(dtypes["0"]) - - tmp = pd.read_sql_query( - f"SELECT * FROM {symbol}", connection, - index_col="index", parse_dates=parse_dates, dtype=my_dtypes) - - _close_sqlite(connection) - multiindex = [] - for col in tmp.columns: - if col[:8] == "___level": - multiindex.append(col) - else: - break - if len(multiindex) > 0: - multiindex = [tmp.index.name] + multiindex - tmp = tmp.reset_index().set_index(multiindex) - return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp - except pd.errors.DatabaseError: - return None - -def _storer_sqlite(symbol, data, storage_location): - """Store data in sqlite format. - - We separately store dtypes for data consistency and safety. - - .. note:: If your pandas object's index has a name it will be lost, - the index is renamed 'index'. If you pass timestamp data (including - the index) it must have explicit timezone. - """ - connection = _open_sqlite(storage_location) - exists = pd.read_sql_query( - f"SELECT name FROM sqlite_master WHERE type='table' AND name='{symbol}'", - connection) - - if len(exists): - _ = connection.cursor().execute(f"DROP TABLE '{symbol}'") - _ = connection.cursor().execute(f"DROP TABLE '{symbol}___dtypes'") - connection.commit() - - if hasattr(data.index, "levels"): - data.index = data.index.set_names( - ["index"] + - [f"___level{i}" for i in range(1, len(data.index.levels))] - ) - data = data.reset_index().set_index("index") - else: - data.index.name = "index" - - if data.index[0].tzinfo is None: - warnings.warn('Index has not timezone, setting to UTC') - data.index = data.index.tz_localize('UTC') - - data.to_sql(f"{symbol}", connection) - pd.DataFrame(data).dtypes.astype("string").to_sql( - f"{symbol}___dtypes", connection) - _close_sqlite(connection) - - -# -# Pickle storage backend. -# - -def _loader_pickle(symbol, storage_location): - """Load data in pickle format.""" - return pd.read_pickle(storage_location / f"{symbol}.pickle") - -def _storer_pickle(symbol, data, storage_location): - """Store data in pickle format.""" - data.to_pickle(storage_location / f"{symbol}.pickle") - -# -# Csv storage backend. 
-# - -def _loader_csv(symbol, storage_location): - """Load data in csv format.""" - - index_dtypes = pd.read_csv( - storage_location / f"{symbol}___index_dtypes.csv", - index_col=0)["0"] - - dtypes = pd.read_csv( - storage_location / f"{symbol}___dtypes.csv", index_col=0, - dtype={"index": "str", "0": "str"}) - dtypes = dict(dtypes["0"]) - new_dtypes = {} - parse_dates = [] - for i, level in enumerate(index_dtypes): - if "datetime64[ns" in level: # includes all timezones - parse_dates.append(i) - for i, el in enumerate(dtypes): - if "datetime64[ns" in dtypes[el]: # includes all timezones - parse_dates += [i + len(index_dtypes)] - else: - new_dtypes[el] = dtypes[el] - - tmp = pd.read_csv(storage_location / f"{symbol}.csv", - index_col=list(range(len(index_dtypes))), - parse_dates=parse_dates, dtype=new_dtypes) - - return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp - - -def _storer_csv(symbol, data, storage_location): - """Store data in csv format.""" - pd.DataFrame(data.index.dtypes if hasattr(data.index, 'levels') - else [data.index.dtype]).astype("string").to_csv( - storage_location / f"{symbol}___index_dtypes.csv") - pd.DataFrame(data).dtypes.astype("string").to_csv( - storage_location / f"{symbol}___dtypes.csv") - data.to_csv(storage_location / f"{symbol}.csv") - -# -# Market Data -# +__all__ = ['DownloadedMarketData', 'MarketData', 'UserProvidedMarketData'] class MarketData: """Prepare, hold, and serve market data. diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index d75936d1b..142d5ff41 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -11,19 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""This module include classes that download, store, and serve market data. - -The two main abstractions are :class:`SymbolData` and :class:`MarketData`. -Neither are exposed outside this module. Their derived classes instead are. - -If you want to interface cvxportfolio with financial data source other -than the ones we provide, you should derive from either of those two classes. -""" - +"""This module defines the :class:`SymbolData` abstraction and derived classes.""" import datetime import logging import sqlite3 -import sys import warnings from pathlib import Path from urllib.error import URLError @@ -33,17 +24,17 @@ import requests import requests.exceptions -from .errors import DataError -from .utils import (hash_, periods_per_year_from_datetime_index, - resample_returns) - -__all__ = ["YahooFinance", "Fred", - "UserProvidedMarketData", "DownloadedMarketData"] +from ..errors import DataError logger = logging.getLogger(__name__) BASE_LOCATION = Path.home() / "cvxportfolio_data" +__all__ = [ + '_loader_csv', '_loader_pickle', '_loader_sqlite', + '_storer_csv', '_storer_pickle', '_storer_sqlite', + 'Fred', 'SymbolData', 'YahooFinance', 'BASE_LOCATION'] + def now_timezoned(): """Return current timestamp with local timezone. 
@@ -329,15 +320,14 @@ def _clean(data): # fill open price with close from day(s) before # repeat as long as it helps (up to 1 year) for shifter in range(252): + logger.info( + "Filling opens with close from %s days before", shifter) orig_missing_opens = data['open'].isnull().sum() data['open'] = data['open'].fillna(data['close'].shift( shifter+1)) new_missing_opens = data['open'].isnull().sum() if orig_missing_opens == new_missing_opens: break - logger.info( - "Filled missing open prices with close from %s periods before", - shifter+1) # fill close price with same day's open data['close'] = data['close'].fillna(data['open']) @@ -767,676 +757,3 @@ def _storer_csv(symbol, data, storage_location): pd.DataFrame(data).dtypes.astype("string").to_csv( storage_location / f"{symbol}___dtypes.csv") data.to_csv(storage_location / f"{symbol}.csv") - -# -# Market Data -# - -class MarketData: - """Prepare, hold, and serve market data. - - :method serve: Serve data for policy and simulator at time :math:`t`. - """ - - def serve(self, t): - """Serve data for policy and simulator at time :math:`t`. - - :param t: Trading time. It must be included in the timestamps returned - by :meth:`trading_calendar`. - :type t: pandas.Timestamp - - :returns: past_returns, current_returns, past_volumes, current_volumes, - current_prices - :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame, - pandas.Series, pandas.Series) - """ - raise NotImplementedError # pragma: no cover - - # pylint: disable=redundant-returns-doc - def trading_calendar( - self, start_time=None, end_time=None, include_end=True): - """Get trading calendar between times. - - :param start_time: Initial time of the trading calendar. Always - inclusive if present. If None, use the first available time. - :type start_time: pandas.Timestamp - :param end_time: Final time of the trading calendar. If None, - use the last available time. - :type end_time: pandas.Timestamp - :param include_end: Include end time. - :type include_end: bool - - :returns: Trading calendar. - :rtype: pandas.DatetimeIndex - """ - raise NotImplementedError # pragma: no cover - - @property - def periods_per_year(self): - """Average trading periods per year. - - :rtype: int - """ - raise NotImplementedError # pragma: no cover - - @property - def full_universe(self): # pylint: disable=redundant-returns-doc - """Full universe, which might not be available for trading. - - :returns: Full universe. - :rtype: pandas.Index - """ - raise NotImplementedError # pragma: no cover - - # pylint: disable=unused-argument, redundant-returns-doc - def partial_universe_signature(self, partial_universe): - """Unique signature of this instance with a partial universe. - - A partial universe is a subset of the full universe that is - available at some time for trading. - - This is used in cvxportfolio.cache to sign back-test caches that - are saved on disk. If not redefined it returns None which disables - on-disk caching. - - :param partial_universe: A subset of the full universe. - :type partial_universe: pandas.Index - - :returns: Signature. 
- :rtype: str - """ - return None - -# compiled based on Interactive Brokers benchmark rates choices -# (see https://www.ibkrguides.com/kb/article-2949.htm) -# and their FRED codes -RATES = { - 'USDOLLAR': 'DFF', # Federal funds effective rate - 'EURO': 'ECBESTRVOLWGTTRMDMNRT', # BCE short term rate - 'GBPOUND': 'IUDSOIA', # SONIA - 'JPYEN': 'IRSTCB01JPM156N', # updated monthly - } - -class MarketDataInMemory(MarketData): - """Market data that is stored in memory when initialized.""" - - # this is overwritten in the derived classes' initializers - returns = None - - def __init__( - self, trading_frequency, base_location, cash_key, min_history, - online_usage = False): - """This must be called by the derived classes.""" - if (self.returns.index[-1] - self.returns.index[0]) < min_history: - raise DataError( - "The provided returns have less history " - + f"than the min_history {min_history}") - if trading_frequency: - self._downsample(trading_frequency) - self.trading_frequency = trading_frequency - - self._set_read_only() - self._check_sizes() - self._mask = None - self._masked_returns = None - self._masked_volumes = None - self._masked_prices = None - self.base_location = Path(base_location) - self.cash_key = cash_key - self._min_history_timedelta = min_history - self.online_usage = online_usage - - def _mask_dataframes(self, mask): - """Mask internal dataframes if necessary.""" - if (self._mask is None) or not np.all(self._mask == mask): - logger.info("Masking internal %s dataframes.", - self.__class__.__name__) - colmask = self.returns.columns[mask] - # self._masked_returns = self._df_or_ser_set_read_only( - # pd.DataFrame(self.returns.iloc[:, mask], copy=True)) - self._masked_returns = self._df_or_ser_set_read_only( - pd.DataFrame(self.returns.loc[:, colmask], copy=True)) - # self._masked_returns = self._df_or_ser_set_read_only( - # pd.DataFrame(np.array(self.returns.values[:, mask]), - # index=self.returns.index, columns=colmask)) - if not self.volumes is None: - # self._masked_volumes = self._df_or_ser_set_read_only( - # pd.DataFrame(self.volumes.iloc[:, mask[:-1]], copy=True)) - self._masked_volumes = self._df_or_ser_set_read_only( - pd.DataFrame(self.volumes.loc[:, colmask[:-1]], copy=True)) - # self._masked_volumes = self._df_or_ser_set_read_only( - # pd.DataFrame(np.array(self.volumes.values[:, mask[:-1]]), - # index=self.volumes.index, columns=colmask[:-1])) - if not self.prices is None: - # self._masked_prices = self._df_or_ser_set_read_only( - # pd.DataFrame(self.prices.iloc[:, mask[:-1]], copy=True)) - self._masked_prices = self._df_or_ser_set_read_only( - pd.DataFrame(self.prices.loc[:, colmask[:-1]], copy=True)) - self._mask = mask - - @property - def full_universe(self): - """Full universe, which might not be available for trading. - - :returns: Full universe. - :rtype: pandas.Index - """ - return self.returns.columns - - def serve(self, t): - """Serve data for policy and simulator at time :math:`t`. - - :param t: Time of execution, *e.g.*, stock market open of a given day. 
- :type t: pandas.Timestamp - - :returns: (past_returns, current_returns, past_volumes, - current_volumes, current_prices) - :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame or None, - pandas.Series or None, pandas.Series or None) - """ - - mask = self._universe_mask_at_time(t).values - self._mask_dataframes(mask) - - tidx = self.returns.index.get_loc(t) - past_returns = self._df_or_ser_set_read_only( - pd.DataFrame(self._masked_returns.iloc[:tidx])) - current_returns = self._df_or_ser_set_read_only( - pd.Series(self._masked_returns.iloc[tidx])) - - if not self.volumes is None: - tidx = self.volumes.index.get_loc(t) - past_volumes = self._df_or_ser_set_read_only( - pd.DataFrame(self._masked_volumes.iloc[:tidx])) - current_volumes = self._df_or_ser_set_read_only( - pd.Series(self._masked_volumes.iloc[tidx])) - else: - past_volumes = None - current_volumes = None - - if not self.prices is None: - tidx = self.prices.index.get_loc(t) - current_prices = self._df_or_ser_set_read_only( - pd.Series(self._masked_prices.iloc[tidx])) - else: - current_prices = None - - return (past_returns, current_returns, past_volumes, current_volumes, - current_prices) - - def _add_cash_column(self, cash_key, grace_period): - """Add the cash column to an already formed returns dataframe. - - This assumes that the trading periods are about equally spaced. - If, say, you have trading periods with very different lengths you - should redefine this method **and** replace the :class:`CashReturn` - objective term. - """ - - if not cash_key in RATES: - raise NotImplementedError( - 'Currently the only data pipelines built are for cash_key' - f' in {list(RATES)}') - - if self.returns.index.tz is None: - raise DataError( - 'Your provided dataframes are not timezone aware.' - + " This is not recommended, and doesn't allow to add the cash" - + " returns' column internally." - + " You can fix this by adding a timezone manually " - + "using pandas.DataFrame.tz_localize to the dataframes before" - + " you pass them, or you can provide" - + " the cash returns' column as the last column of the returns" - + " dataframe (so it has one more column than volumes and" - + " prices, if provided), and set the cash_key parameter to" - + " its name.") - - data = Fred( - RATES[cash_key], base_location=self.base_location, - grace_period=grace_period) - - cash_returns_per_period = resample_returns( - data.data/100, periods=self.periods_per_year) - - # we merge instead of assigning column because indexes might - # be misaligned (e.g., with tz-aware timestamps) - cash_returns_per_period.name = self.cash_key - original_returns_index = self.returns.index - tmp = pd.concat([self.returns, cash_returns_per_period], axis=1) - tmp[cash_key] = tmp[cash_key].ffill() - self.returns = tmp.loc[original_returns_index] - - def trading_calendar( - self, start_time=None, end_time=None, include_end=True): - """Get trading calendar from market data. - - :param start_time: Initial time of the trading calendar. Always - inclusive if present. If None, use the first available time. - :type start_time: pandas.Timestamp - :param end_time: Final time of the trading calendar. If None, - use the last available time. - :type end_time: pandas.Timestamp - :param include_end: Include end time. - :type include_end: bool - - :returns: Trading calendar. 
- :rtype: pandas.DatetimeIndex - """ - result = self.returns.index - result = result[result >= self._earliest_backtest_start] - if start_time: - result = result[result >= start_time] - if end_time: - result = result[(result <= end_time)] - if not include_end: - result = result[:-1] - return result - - def _universe_mask_at_time(self, t): - """Return the valid universe mask at time t.""" - past_returns = self.returns.loc[self.returns.index < t] - if self.online_usage: - valid_universe_mask = past_returns.count() >= self.min_history - else: - valid_universe_mask = ((past_returns.count() >= self.min_history) & - (~self.returns.loc[t].isnull())) - if sum(valid_universe_mask) <= 1: - raise DataError( - f'The trading universe at time {t} has size less or equal' - + ' than one, i.e., only the cash account. There are probably ' - + ' issues with missing data in the provided market returns.') - return valid_universe_mask - - @staticmethod - def _df_or_ser_set_read_only(df_or_ser): - """Set numpy array contained in dataframe to read only. - - This is done on data store internally before it is served to the - policy or the simulator to ensure data consistency in case some - element of the pipeline accidentally corrupts the data. - - This is enough to prevent direct assignement to the resulting - dataframe. However it could still be accidentally corrupted by - assigning to columns or indices that are not present in the - original. We avoid that case as well by returning a wrapped - dataframe (which doesn't copy data on creation) in - serve_data_policy and serve_data_simulator. - """ - data = df_or_ser.values - data.flags.writeable = False - if hasattr(df_or_ser, 'columns'): - return pd.DataFrame(data, index=df_or_ser.index, - columns=df_or_ser.columns) - return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name) - - def _set_read_only(self): - """Set internal dataframes to read-only.""" - - self.returns = self._df_or_ser_set_read_only(self.returns) - - if not self.prices is None: - self.prices = self._df_or_ser_set_read_only(self.prices) - - if not self.volumes is None: - self.volumes = self._df_or_ser_set_read_only(self.volumes) - - @property - def _earliest_backtest_start(self): - """Earliest date at which we can start a backtest.""" - return self.returns.iloc[:, :-1].dropna(how='all').index[ - self.min_history] - - sampling_intervals = { - 'weekly': 'W-MON', 'monthly': 'MS', 'quarterly': 'QS', 'annual': 'AS'} - - # @staticmethod - # def _is_first_interval_small(datetimeindex): - # """Check if post-resampling the first interval is small. - # - # We have no way of knowing exactly if the first interval - # needs to be dropped. We drop it if its length is smaller - # than the average of all others, minus 2 standard deviation. 
- # """ - # first_interval = (datetimeindex[1] - datetimeindex[0]) - # all_others = (datetimeindex[2:] - datetimeindex[1:-1]) - # return first_interval < (all_others.mean() - 2 * all_others.std()) - - def _downsample(self, interval): - """_downsample market data.""" - if not interval in self.sampling_intervals: - raise SyntaxError( - 'Unsopported trading interval for down-sampling.') - interval = self.sampling_intervals[interval] - new_returns_index = pd.Series(self.returns.index, self.returns.index - ).resample(interval, closed='left', - label='left').first().values - # print(new_returns_index) - self.returns = np.exp(np.log( - 1+self.returns).resample(interval, closed='left', label='left' - ).sum(min_count=1))-1 - self.returns.index = new_returns_index - - # last row is always unknown - self.returns.iloc[-1] = np.nan - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.returns.index): - # self.returns = self.returns.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.returns.columns[:-1]: - self.returns[col].loc[ - (~(self.returns[col].isnull())).idxmax() - ] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.returns = self.returns.iloc[1:] - - if self.volumes is not None: - new_volumes_index = pd.Series( - self.volumes.index, self.volumes.index - ).resample(interval, closed='left', - label='left').first().values - self.volumes = self.volumes.resample( - interval, closed='left', label='left').sum(min_count=1) - self.volumes.index = new_volumes_index - - # last row is always unknown - self.volumes.iloc[-1] = np.nan - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.volumes.index): - # self.volumes = self.volumes.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.volumes.columns: - self.volumes[col].loc[ - (~(self.volumes[col].isnull())).idxmax() - ] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.volumes = self.volumes.iloc[1:] - - if self.prices is not None: - new_prices_index = pd.Series( - self.prices.index, self.prices.index - ).resample( - interval, closed='left', label='left').first().values - self.prices = self.prices.resample( - interval, closed='left', label='left').first() - self.prices.index = new_prices_index - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.prices.index): - # self.prices = self.prices.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.prices.columns: - self.prices[col].loc[ - (~(self.prices[col].isnull())).idxmax() - ] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.prices = self.prices.iloc[1:] - - def _check_sizes(self): - """Check sizes of user-provided dataframes.""" - - if (not self.volumes is None) and ( - not (self.volumes.shape[1] == self.returns.shape[1] - 1) - or not all(self.volumes.columns == self.returns.columns[:-1])): - raise SyntaxError( - 'Volumes should have same columns as returns, minus cash_key.') - - if (not self.prices is None) and ( - not (self.prices.shape[1] == self.returns.shape[1] - 1) - or not all(self.prices.columns == self.returns.columns[:-1])): - raise SyntaxError( - 'Prices should have same columns as returns, minus cash_key.') - - @property - def periods_per_year(self): - """Average trading periods per year inferred from the data. - - :returns: Average periods per year. 
- :rtype: int - """ - return periods_per_year_from_datetime_index(self.returns.index) - - @property - def min_history(self): - """Min history expressed in periods. - - :returns: How many non-null elements of the past returns for a given - name are required to include it. - :rtype: int - """ - return int(np.round(self.periods_per_year * ( - self._min_history_timedelta / pd.Timedelta('365.24d')))) - - -class UserProvidedMarketData(MarketDataInMemory): - """User-provided market data. - - :param returns: Historical open-to-open returns. The return - at time :math:`t` is :math:`r_t = p_{t+1}/p_t -1` where - :math:`p_t` is the (open) price at time :math:`t`. Must - have datetime index. You can also include cash - returns as its last column, and set ``cash_key`` below to the last - column's name. - :type returns: pandas.DataFrame - :param volumes: Historical market volumes, expressed in units - of value (*e.g.*, US dollars). - :type volumes: pandas.DataFrame or None - :param prices: Historical open prices (*e.g.*, used for rounding - trades in the :class:`MarketSimulator`). - :type prices: pandas.DataFrame or None - :param trading_frequency: Instead of using frequency implied by - the index of the returns, down-sample all dataframes. - We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and - ``'annual'``. By default (None) don't down-sample. - :type trading_frequency: str or None - :param min_history: Minimum amount of time for which the returns - are not ``np.nan`` before each assets enters in a back-test. - :type min_history: pandas.Timedelta - :param base_location: The location of the storage, only used - in case it downloads the cash returns. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param cash_key: Name of the cash account. If not the last column - of the provided returns, it will be downloaded. In that case you should - make sure your provided dataframes have a timezone aware datetime - index. Its returns are the risk-free rate. - :type cash_key: str - :param online_usage: Disable removal of assets that have ``np.nan`` returns - for the given time. Default False. - :type online_usage: bool - """ - - # pylint: disable=too-many-arguments - def __init__(self, returns, volumes=None, prices=None, - copy_dataframes=True, trading_frequency=None, - min_history=pd.Timedelta('365.24d'), - base_location=BASE_LOCATION, - grace_period=pd.Timedelta('1d'), - cash_key='USDOLLAR', - online_usage=False): - - if returns is None: - raise SyntaxError( - "If you don't specify a universe you should pass `returns`.") - - self.base_location = Path(base_location) - self.cash_key = cash_key - - self.returns = pd.DataFrame(returns, copy=copy_dataframes) - self.volumes = volumes if volumes is None else\ - pd.DataFrame(volumes, copy=copy_dataframes) - self.prices = prices if prices is None else\ - pd.DataFrame(prices, copy=copy_dataframes) - - if cash_key != returns.columns[-1]: - self._add_cash_column(cash_key, grace_period=grace_period) - - # this is mandatory - super().__init__( - trading_frequency=trading_frequency, - base_location=base_location, - cash_key=cash_key, - min_history=min_history, - online_usage=online_usage) - - -class DownloadedMarketData(MarketDataInMemory): - """Market data that is downloaded. - - :param universe: List of names as understood by the data source - used, *e.g.*, ``['AAPL', 'GOOG']`` if using the default - Yahoo Finance data source. - :type universe: list - :param datasource: The data source used. 
- :type datasource: str or :class:`SymbolData` class - :param cash_key: Name of the cash account, its rates will be downloaded - and added as last columns of the returns. Its returns are the - risk-free rate. - :type cash_key: str - :param base_location: The location of the storage. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param min_history: Minimum amount of time for which the returns - are not ``np.nan`` before each assets enters in a back-test. - :type min_history: pandas.Timedelta - :param grace_period: If the most recent observation of each symbol's - data is less old than this we do not download new data. - By default it's one day. - :type grace_period: pandas.Timedelta - :param trading_frequency: Instead of using frequency implied by - the index of the returns, down-sample all dataframes. - We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and - ``'annual'``. By default (None) don't down-sample. - :type trading_frequency: str or None - :param online_usage: Disable removal of assets that have ``np.nan`` returns - for the given time. Default False. - :type online_usage: bool - """ - - # pylint: disable=too-many-arguments - def __init__(self, - universe=(), - datasource='YahooFinance', - cash_key='USDOLLAR', - base_location=BASE_LOCATION, - storage_backend='pickle', - min_history=pd.Timedelta('365.24d'), - grace_period=pd.Timedelta('1d'), - trading_frequency=None, - online_usage=False): - """Initializer.""" - - # drop duplicates and ensure ordering - universe = sorted(set(universe)) - - self.base_location = Path(base_location) - self.cash_key = cash_key - if isinstance(datasource, type): - self.datasource = datasource - else: # try to load in current module - self.datasource = globals()[datasource] - self._get_market_data( - universe, grace_period=grace_period, - storage_backend=storage_backend) - self._add_cash_column(self.cash_key, grace_period=grace_period) - self._remove_missing_recent() - - # this is mandatory - super().__init__( - trading_frequency=trading_frequency, - base_location=base_location, - cash_key=cash_key, - min_history=min_history, - online_usage=online_usage) - - def _get_market_data(self, universe, grace_period, storage_backend): - """Download market data.""" - database_accesses = {} - print('Updating data', end='') - sys.stdout.flush() - - for stock in universe: - logger.info( - 'Updating %s with %s.', stock, self.datasource.__name__) - print('.', end='') - sys.stdout.flush() - database_accesses[stock] = self.datasource( - stock, base_location=self.base_location, - grace_period=grace_period, storage_backend=storage_backend) - print() - - if hasattr(self.datasource, 'IS_OHLCVR') and self.datasource.IS_OHLCVR: - self.returns = pd.DataFrame( - {stock: database_accesses[stock].data['return'] - for stock in universe}) - self.volumes = pd.DataFrame( - {stock: database_accesses[stock].data['valuevolume'] - for stock in universe}) - self.prices = pd.DataFrame( - {stock: database_accesses[stock].data['open'] - for stock in universe}) - else: # for now only Fred for indexes, we assume prices! 
- assert isinstance(database_accesses[universe[0]].data, pd.Series) - self.prices = pd.DataFrame( - # open prices - {stock: database_accesses[stock].data for stock in universe}) - self.returns = 1 - self.prices / self.prices.shift(-1) - self.volumes = None - - def _remove_missing_recent(self): - """Clean recent data. - - Yahoo Finance may has issues with most recent data; we remove - recent days if there are NaNs. - """ - - if self.prices.iloc[-5:].isnull().any().any(): - logger.debug( - 'Removing some recent lines because there are missing values.') - drop_at = self.prices.iloc[-5:].isnull().any(axis=1).idxmax() - logger.debug('Dropping at index %s', drop_at) - self.returns = self.returns.loc[self.returns.index < drop_at] - if self.prices is not None: - self.prices = self.prices.loc[self.prices.index < drop_at] - if self.volumes is not None: - self.volumes = self.volumes.loc[self.volumes.index < drop_at] - - # for consistency we must also nan-out the last row - # of returns and volumes - self.returns.iloc[-1] = np.nan - if self.volumes is not None: - self.volumes.iloc[-1] = np.nan - - def partial_universe_signature(self, partial_universe): - """Unique signature of this instance with a partial universe. - - A partial universe is a subset of the full universe that is - available at some time for trading. - - This is used in cvxportfolio.cache to sign back-test caches that - are saved on disk. See its implementation below for details. If - not redefined it returns None which disables on-disk caching. - - :param partial_universe: A subset of the full universe. - :type partial_universe: pandas.Index - - :returns: Signature. - :rtype: str - """ - assert isinstance(partial_universe, pd.Index) - assert np.all(partial_universe.isin(self.full_universe)) - result = f'{self.__class__.__name__}(' - result += f'datasource={self.datasource.__name__}, ' - result += f'partial_universe_hash={hash_(np.array(partial_universe))},' - result += f' trading_frequency={self.trading_frequency})' - return result From 87ba504ac990864e670a7e7db85734806f9b1d8a Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:21:38 +0400 Subject: [PATCH 08/38] Applied changes of commit 564e1fe --- cvxportfolio/data/__init__.py | 2 +- cvxportfolio/data/market_data.py | 3 +- cvxportfolio/data/symbol_data.py | 129 +++++++++++++++++-------------- 3 files changed, 71 insertions(+), 63 deletions(-) diff --git a/cvxportfolio/data/__init__.py b/cvxportfolio/data/__init__.py index 97aad902d..0c2bc403a 100644 --- a/cvxportfolio/data/__init__.py +++ b/cvxportfolio/data/__init__.py @@ -23,4 +23,4 @@ from .symbol_data import * __all__ = [ - "YahooFinance", "Fred", "UserProvidedMarketData", "DownloadedMarketData"] \ No newline at end of file + "YahooFinance", "Fred", "UserProvidedMarketData", "DownloadedMarketData"] diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py index be723a618..d382abdc9 100644 --- a/cvxportfolio/data/market_data.py +++ b/cvxportfolio/data/market_data.py @@ -49,7 +49,6 @@ def serve(self, t): """ raise NotImplementedError # pragma: no cover - # pylint: disable=redundant-returns-doc def trading_calendar( self, start_time=None, end_time=None, include_end=True): """Get trading calendar between times. @@ -77,7 +76,7 @@ def periods_per_year(self): raise NotImplementedError # pragma: no cover @property - def full_universe(self): # pylint: disable=redundant-returns-doc + def full_universe(self): """Full universe, which might not be available for trading. :returns: Full universe. 
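For context while reviewing the refactor that follows, here is a minimal, untested sketch of how these market-data classes are driven downstream. The two symbols and the start date are placeholders; ``serve`` returns the five objects documented in ``MarketData.serve``, and the calendar timestamp is made timezone-aware because the stored returns index is.

.. code:: python

    import pandas as pd
    import cvxportfolio as cvx

    # placeholder two-name universe; default datasource is YahooFinance
    md = cvx.DownloadedMarketData(['AAPL', 'GOOG'])

    # trading calendar from a (timezone-aware) start time onwards
    cal = md.trading_calendar(
        start_time=pd.Timestamp('2023-01-03', tz='UTC'))

    # past/current returns and volumes, and current prices, at cal[0]
    (past_returns, current_returns, past_volumes, current_volumes,
        current_prices) = md.serve(cal[0])
    print(current_prices)

The same calls are what the back-test simulator makes internally at each point of the trading calendar.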
diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 142d5ff41..29ddcb634 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -249,27 +249,17 @@ def _timestamp_convert(unix_seconds_ts):
     """Convert a UNIX timestamp in seconds to a pandas.Timestamp."""
     return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC')
 
+class OHLCV(SymbolData): # pylint: disable=abstract-method
+    """Base class for Open-High-Low-Close-Volume symbol data."""
 
-class YahooFinance(SymbolData):
-    """Yahoo Finance symbol data.
+    # TODO: factor quality check and clean into total-return related and non-
 
-    :param symbol: The symbol that we downloaded.
-    :type symbol: str
-    :param storage_backend: The storage backend, implemented ones are
-        ``'pickle'``, ``'csv'``, and ``'sqlite'``.
-    :type storage_backend: str
-    :param base_storage_location: The location of the storage. We store in a
-        subdirectory named after the class which derives from this.
-    :type base_storage_location: pathlib.Path
-    :param grace_period: If the most recent observation in the data is less
-        old than this we do not download new data.
-    :type grace_period: pandas.Timedelta
+class OHLCVTR(OHLCV): # pylint: disable=abstract-method
+    """Base class for Open-High-Low-Close-Volume-Total Return symbol data."""
 
-    :attribute data: The downloaded, and cleaned, data for the symbol.
-    :type data: pandas.DataFrame
-    """
+    # TODO: consider creating an OHLCVAC (adjusted closes) subclass
 
-    # is open-high-low-close-volume-(total)return
+    # is open-high-low-close-volume-total return
     IS_OHLCVR = True
 
     @staticmethod
@@ -379,6 +369,66 @@ def _clean(data):
 
         return data
 
+    def _quality_check(self, data):
+        """Analyze quality of the OHLCV-TR data."""
+
+        # zero volume
+        zerovol_idx = data.index[data.volume == 0]
+        if len(zerovol_idx) > 0:
+            logger.warning(
+                '%s("%s") has volume equal to zero for timestamps: %s',
+                self.__class__.__name__, self.symbol, zerovol_idx)
+
+        def print_extreme(logreturns, name, sigmas=50):
+
+            # TODO: choose
+            m, s = logreturns.median(), np.sqrt((logreturns**2).median())
+            normalized = (logreturns - m)/s
+
+            # normalized = logreturns / logreturns.rolling(252).std().shift(1)
+
+            extremereturn_idx = normalized.index[np.abs(normalized) > sigmas]
+            if len(extremereturn_idx) > 0:
+                logger.warning(
+                    '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s',
+                    self.__class__.__name__, self.symbol, name, sigmas,
+                    extremereturn_idx)
+
+        # extreme logreturns
+        logreturns = np.log(1 + data['return']).dropna()
+        print_extreme(logreturns, 'total returns')
+
+        # extreme open2close
+        open2close = (np.log(data['close']) - np.log(data['open'])).dropna()
+        print_extreme(open2close, 'open to close returns')
+
+        # extreme open2high
+        open2high = (np.log(data['high']) - np.log(data['open'])).dropna()
+        print_extreme(open2high, 'open to high returns')
+
+        # extreme open2low
+        open2low = (np.log(data['low']) - np.log(data['open'])).dropna()
+        print_extreme(open2low, 'open to low returns')
+
+class YahooFinance(OHLCVTR):
+    """Yahoo Finance symbol data.
+
+    :param symbol: The symbol that we downloaded.
+    :type symbol: str
+    :param storage_backend: The storage backend, implemented ones are
+        ``'pickle'``, ``'csv'``, and ``'sqlite'``.
+    :type storage_backend: str
+    :param base_storage_location: The location of the storage. We store in a
+        subdirectory named after the class which derives from this.
+ :type base_storage_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded, and cleaned, data for the symbol. + :type data: pandas.DataFrame + """ + @staticmethod def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): """Get 1 day OHLC from Yahoo finance. @@ -432,9 +482,9 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): data['indicators']['quote'][0], index=index) df_result['adjclose'] = data[ 'indicators']['adjclose'][0]['adjclose'] - except KeyError: + except KeyError as exc: raise DataError(f'Yahoo finance download of {ticker} failed.' - + ' Json:', str(res.json())) # pragma: no cover + + ' Json:', str(res.json())) from exc # pragma: no cover # last timestamp is probably broken (not timed to market open) # we set its time to same as the day before, but this is wrong @@ -489,47 +539,6 @@ def _download(self, symbol, current=None, new = self._clean(new) return pd.concat([current.iloc[:-overlap], new]) - def _quality_check(self, data): - """Analyze quality of the OHLCV-TR data.""" - - # zero volume - zerovol_idx = data.index[data.volume == 0] - if len(zerovol_idx) > 0: - logger.warning( - '%s("%s") has volume equal to zero for timestamps: %s', - self.__class__.__name__, self.symbol, zerovol_idx) - - def print_extreme(logreturns, name, sigmas=50): - - # TODO: choose - m, s = logreturns.median(), np.sqrt((logreturns**2).median()) - normalized = (logreturns - m)/s - - # normalized = logreturns / logreturns.rolling(252).std().shift(1) - - extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] - if len(extremereturn_idx) > 0: - logger.warning( - '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', - self.__class__.__name__, self.symbol, name, sigmas, - extremereturn_idx) - - # extreme logreturns - logreturns = np.log(1 + data['return']).dropna() - print_extreme(logreturns, 'total returns') - - # extreme open2close - open2close = np.log(data['close']) - np.log(data['open']).dropna() - print_extreme(open2close, 'open to close returns') - - # extreme open2high - open2high = np.log(data['high']) - np.log(data['open']).dropna() - print_extreme(open2high, 'open to high returns') - - # extreme open2low - open2low = np.log(data['low']) - np.log(data['open']).dropna() - print_extreme(open2low, 'open to low returns') - def _preload(self, data): """Prepare data for use by Cvxportfolio. 
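The robust normalization used in ``print_extreme`` above (median for location, root median square for scale) is easy to sanity-check in isolation. A minimal standalone sketch, on synthetic log-returns with one injected data error (all numbers here are made up for illustration):

.. code:: python

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    lrets = pd.Series(
        rng.normal(0., 0.01, 1000),
        index=pd.date_range('2020-01-01', periods=1000, tz='UTC'))
    lrets.iloc[500] = 1.5  # injected error, e.g., an unrecorded split

    # same normalization as in print_extreme
    m, s = lrets.median(), np.sqrt((lrets**2).median())
    normalized = (lrets - m) / s

    # with the default sigmas=50 only the injected error is flagged
    print(lrets.index[np.abs(normalized) > 50])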
From 7d86c9ca4511ade3d5da9e28dafc2e641040b0dc Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:23:57 +0400 Subject: [PATCH 09/38] Applied changes of commit 13f119f --- cvxportfolio/data/symbol_data.py | 40 ++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 29ddcb634..f60a0f131 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -262,9 +262,8 @@ class OHLCVTR(OHLCV): # pylint: disable=abstract-method # is open-high-low-close-volume-total return IS_OHLCVR = True - @staticmethod - def _clean(data): - """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" + def _nan_impossible(self, data): + """Set impossible values to NaN.""" # print(data) # print(data.isnull().sum()) @@ -286,6 +285,8 @@ def _clean(data): # print(data) # print(data.isnull().sum()) + # TODO: these can be made smarter (sometimes the open is clearly wrong) + # if low is not the lowest, set it to nan data['low'].loc[ data['low'] > data[['open', 'high', 'close']].min(1)] = np.nan @@ -297,9 +298,11 @@ def _clean(data): # print(data) # print(data.isnull().sum()) - # - # fills - # + def _fill_easy(self, data): + """Make easy fills.""" + + # print(data) + # print(data.isnull().sum()) # fill volumes with zeros (safest choice) data['volume'] = data['volume'].fillna(0.) @@ -331,9 +334,11 @@ def _clean(data): # print(data) # print(data.isnull().sum()) - # - # Compute returns - # + def _compute_total_returns(self, data): + """Compute total open-to-open returns.""" + + # print(data) + # print(data.isnull().sum()) # compute log of ratio between adjclose and close log_adjustment_ratio = np.log(data['adjclose'] / data['close']) @@ -358,15 +363,26 @@ def _clean(data): # intraday_logreturn.shift(-1) # ) # data["return"] = np.exp(open_to_open_logreturn) - 1 + + # print(data) + # print(data.isnull().sum()) + + def _clean(self, data): + """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" + + self._nan_impossible(data) + + self._fill_easy(data) + + self._compute_total_returns(data) + + # eliminate adjclose column del data["adjclose"] # eliminate last period's intraday data data.loc[data.index[-1], ["high", "low", "close", "return", "volume"]] = np.nan - # print(data) - # print(data.isnull().sum()) - return data def _quality_check(self, data): From 60b1459fdbc69e4113dc0e2db7924511e8e1fa38 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 22 Jan 2024 12:26:15 +0400 Subject: [PATCH 10/38] Applied changes of commit b794c7dc --- cvxportfolio/data/symbol_data.py | 98 ++++++++++++++++++++++++++++---- 1 file changed, 86 insertions(+), 12 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index f60a0f131..0047e0953 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -262,6 +262,54 @@ class OHLCVTR(OHLCV): # pylint: disable=abstract-method # is open-high-low-close-volume-total return IS_OHLCVR = True + # rolstd windows for finding wrong logreturns + _ROLSTD_WINDOWS = [20, 60, 252] + + # threshold for finding wrong logreturns + _WRONG_LOGRET_THRESHOLD = 15 + + def _indexes_extreme_logrets_wrt_rolstddev(self, lrets, window, treshold): + """Get indexes of logreturns that are extreme wrt trailing stddev.""" + trailing_stdev = np.sqrt((lrets**2).rolling(window).median().shift(1)) + bad_indexes = lrets.index[np.abs(lrets / trailing_stdev) > treshold] + return bad_indexes + + def 
_find_wrong_daily_logreturns(self, lrets): + """Find indexes of logreturns that are most probably data errors.""" + bad_indexes = [] + for window in self._ROLSTD_WINDOWS: + bad_indexes.append( + set(self._indexes_extreme_logrets_wrt_rolstddev( + lrets, window=window, treshold=self._WRONG_LOGRET_THRESHOLD))) + bad_indexes.append( + set(self._indexes_extreme_logrets_wrt_rolstddev( + lrets.iloc[::-1], window=window, + treshold=self._WRONG_LOGRET_THRESHOLD))) + bad_indexes = set.intersection(*bad_indexes) + return bad_indexes + + # TODO: plan + # ffill adj closes & compute adj close logreturns + # use code above to get indexes of wrong ones, raise warnings, set to 0 + # + # check close vs adj close, there should be only dividends (with y finance) + # + # throw out opens that are not in [low, high] + # + # apply similar logic (perhaps using total lrets for the stddev) for + # open-close , close-high , close-low, throw out open/low/close not OK + # + # fill + # + # compute open-open total returns, then check with same logic for errors + # + # when doing append, make past data adhere to same format: recompute adj close + # + # could use volumes as well, if there are jumps in price due to + # splits not recorded, then price * volume should be more stable + # + # + def _nan_impossible(self, data): """Set impossible values to NaN.""" @@ -269,18 +317,32 @@ def _nan_impossible(self, data): # print(data.isnull().sum()) # nan-out nonpositive prices - data.loc[data["open"] <= 0, 'open'] = np.nan - data.loc[data["close"] <= 0, "close"] = np.nan - data.loc[data["high"] <= 0, "high"] = np.nan - data.loc[data["low"] <= 0, "low"] = np.nan - data.loc[data["adjclose"] <= 0, "adjclose"] = np.nan + for column in ["open", "close", "high", "low", "adjclose"]: + bad_indexes = data.index[data[column] <= 0] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s") has non-positive %s prices on timestamps: %s,' + + ' setting to nan', + self.__class__.__name__, self.symbol, column, bad_indexes) + data.loc[bad_indexes, column] = np.nan # nan-out negative volumes - data.loc[data["volume"] < 0, 'volume'] = np.nan + bad_indexes = data.index[data["volume"] < 0] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s") has negative volumes on timestamps: %s,' + + ' setting to nan', + self.__class__.__name__, self.symbol, bad_indexes) + data.loc[bad_indexes, "volume"] = np.nan # all infinity values are nans - data.iloc[:, :] = np.nan_to_num( - data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) + if np.isinf(data).sum().sum() > 0: + logger.warning( + '%s("%s") has +/- infinity values, setting those to nan', + self.__class__.__name__, self.symbol) + data.iloc[:, :] = np.nan_to_num( + data.values, copy=True, nan=np.nan, posinf=np.nan, + neginf=np.nan) # print(data) # print(data.isnull().sum()) @@ -288,12 +350,24 @@ def _nan_impossible(self, data): # TODO: these can be made smarter (sometimes the open is clearly wrong) # if low is not the lowest, set it to nan - data['low'].loc[ - data['low'] > data[['open', 'high', 'close']].min(1)] = np.nan + bad_indexes = data.index[ + data['low'] > data[['open', 'high', 'close']].min(1)] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s") low prices are not the lowest on timestamps: %s,' + + ' setting to nan', + self.__class__.__name__, self.symbol, bad_indexes) + data.loc[bad_indexes, "low"] = np.nan # if high is not the highest, set it to nan - data['high'].loc[ - data['high'] < data[['open', 'high', 'close']].max(1)] = np.nan + bad_indexes = data.index[ + data['high'] < 
data[['open', 'high', 'close']].max(1)] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s") high prices are not the highest on timestamps: %s,' + + ' setting to nan', + self.__class__.__name__, self.symbol, bad_indexes) + data.loc[bad_indexes, "high"] = np.nan # print(data) # print(data.isnull().sum()) From 24489eb4447ccd5be8a5ff5f4080c210fc7bf469 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 7 Feb 2024 11:05:25 +0400 Subject: [PATCH 11/38] trying different approach for timestamping last row in yahoofinance --- cvxportfolio/data/symbol_data.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 0047e0953..bca12bd44 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -559,7 +559,8 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): + 'Json output:', str(res.json())) if res.status_code != 200: - raise DataError(f'Yahoo finance download of {ticker} failed. Json:', + raise DataError( + f'Yahoo finance download of {ticker} failed. Json:', str(res.json())) # pragma: no cover data = res.json()['chart']['result'][0] @@ -576,16 +577,25 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): raise DataError(f'Yahoo finance download of {ticker} failed.' + ' Json:', str(res.json())) from exc # pragma: no cover - # last timestamp is probably broken (not timed to market open) - # we set its time to same as the day before, but this is wrong - # on days of DST switch. It's fine though because that line will be - # overwritten next update - if df_result.index[-1].time() != df_result.index[-2].time(): - tm1 = df_result.index[-2].time() - newlast = df_result.index[-1].replace( - hour=tm1.hour, minute=tm1.minute, second=tm1.second) - df_result.index = pd.DatetimeIndex( - list(df_result.index[:-1]) + [newlast]) + # last timestamp could be not timed to market open + this_periods_open_time = _timestamp_convert( + data['meta']['currentTradingPeriod']['regular']['start']) + + if df_result.index[-1] > this_periods_open_time: + index = df_result.index.to_numpy() + index[-1] = this_periods_open_time + df_result.index = pd.DatetimeIndex(index) + + # # last timestamp is probably broken (not timed to market open) + # # we set its time to same as the day before, but this is wrong + # # on days of DST switch. It's fine though because that line will be + # # overwritten next update + # if df_result.index[-1].time() != df_result.index[-2].time(): + # tm1 = df_result.index[-2].time() + # newlast = df_result.index[-1].replace( + # hour=tm1.hour, minute=tm1.minute, second=tm1.second) + # df_result.index = pd.DatetimeIndex( + # list(df_result.index[:-1]) + [newlast]) return df_result[ ['open', 'low', 'high', 'close', 'adjclose', 'volume']] From 66e7b903106e67009d52487784b2220ee9cb5baa Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Fri, 9 Feb 2024 10:38:11 +0400 Subject: [PATCH 12/38] minor --- cvxportfolio/data/symbol_data.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index bca12bd44..a42694cd4 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""This module defines the :class:`SymbolData` abstraction and derived classes.""" +"""This module defines :class:`SymbolData` and derived classes.""" + import datetime import logging import sqlite3 @@ -303,8 +304,8 @@ def _find_wrong_daily_logreturns(self, lrets): # # compute open-open total returns, then check with same logic for errors # - # when doing append, make past data adhere to same format: recompute adj close - # + # when doing append, make past data adhere to same format: recompute adj + # close # could use volumes as well, if there are jumps in price due to # splits not recorded, then price * volume should be more stable # @@ -521,7 +522,20 @@ class YahooFinance(OHLCVTR): @staticmethod def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): - """Get 1 day OHLC from Yahoo finance. + """Get 1-day OHLC-AC-V from Yahoo finance. + + This is roughly equivalent to + + .. code-block:: + + import yfinance as yf + yf.download(ticker) + + But it does no caching of any sort; only a single request call, + error checking (which result in exceptions going all the way to the + user, in the current design), json parsing, and a minimal effort to + restore the last timestamp. All processing and cleaning is done + elsewhere. Result is timestamped with the open time (time-zoned) of the instrument. @@ -597,6 +611,7 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): # df_result.index = pd.DatetimeIndex( # list(df_result.index[:-1]) + [newlast]) + # these are all the columns, we simply re-order them return df_result[ ['open', 'low', 'high', 'close', 'adjclose', 'volume']] From 788e72e2abac42806f715ec18f6c0c734c392767 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Fri, 9 Feb 2024 11:38:12 +0400 Subject: [PATCH 13/38] symbol_data --- cvxportfolio/data/symbol_data.py | 148 +++++++++++++++++++-------- examples/strategies/ftse100_daily.py | 1 + examples/universes.py | 4 +- 3 files changed, 107 insertions(+), 46 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index a42694cd4..ad51db479 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -250,44 +250,105 @@ def _timestamp_convert(unix_seconds_ts): """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') +# Windows for filtering extreme logreturns +_WINDOWS = (10, 20, 50, 100, 200) + +def _median_scale_around(lrets, window): + """Median absolute logreturn in a window around each timestamp.""" + return np.abs(lrets).rolling(window, center=True, min_periods=1).median() + +def _mean_scale_around(lrets, window): + """Root mean squared logreturn in a window around each timestamp.""" + return np.sqrt( + (lrets**2).rolling(window, center=True, min_periods=1).mean()) + +def _unlikeliness_score( + test_logreturns, reference_logreturns, scaler=_median_scale_around, + windows=_WINDOWS): + """Find problematic indexes for test logreturns compared w/ reference.""" + scaled = [ + np.abs(test_logreturns) / scaler(reference_logreturns, window) + for window in windows] + scaled = pd.DataFrame(scaled).T + return scaled.min(axis=1), scaled + + class OHLCV(SymbolData): # pylint: disable=abstract-method - """Base class for Open-High-Low-Close-Volume symbol data.""" + """Base class for Open-High-Low-Close-Volume symbol data. + + This operates on a dataframe with columns + + .. code-block:: + + ['open', 'low', 'high', 'close', 'volume'] + + or + + .. 
code-block:: + + ['open', 'low', 'high', 'close', 'volume', 'return'] + + in which case the ``'return'`` column is not processed. It only matters in + the :meth:`_preload`, method: if open-to-open returns are not present, + we compute them there. Otherwise these may be total returns (including + dividends, ...) and they're dealt with in derived classes. + """ # TODO: factor quality check and clean into total-return related and non- + def _preload(self, data): + """Prepare data for use by Cvxportfolio. + + We drop the `volume` column expressed in number of shares and + replace it with `valuevolume` which is an estimate of the (e.g., + US dollar) value of the volume exchanged on the day. + """ + + # this is not used currently, but if we implement an interface to a + # pure OHLCV data source there is no need to store the open-to-open + # returns, they can be computed here + if not 'return' in data.columns: + data['return'] = data['open'].pct_change().shift(-1) + + self._quality_check(data) + data["valuevolume"] = data["volume"] * data["open"] + del data["volume"] + + return data + class OHLCVTR(OHLCV): # pylint: disable=abstract-method - """Base class for Open-High-Low-Close-Volume-Total Return symbol data.""" + """Open-High-Low-Close-Volume-TotalReturn symbol data.""" # TODO: consider creating a OHLCVAC (adjusted closes) subclass # is open-high-low-close-volume-total return IS_OHLCVR = True - # rolstd windows for finding wrong logreturns - _ROLSTD_WINDOWS = [20, 60, 252] - - # threshold for finding wrong logreturns - _WRONG_LOGRET_THRESHOLD = 15 - - def _indexes_extreme_logrets_wrt_rolstddev(self, lrets, window, treshold): - """Get indexes of logreturns that are extreme wrt trailing stddev.""" - trailing_stdev = np.sqrt((lrets**2).rolling(window).median().shift(1)) - bad_indexes = lrets.index[np.abs(lrets / trailing_stdev) > treshold] - return bad_indexes - - def _find_wrong_daily_logreturns(self, lrets): - """Find indexes of logreturns that are most probably data errors.""" - bad_indexes = [] - for window in self._ROLSTD_WINDOWS: - bad_indexes.append( - set(self._indexes_extreme_logrets_wrt_rolstddev( - lrets, window=window, treshold=self._WRONG_LOGRET_THRESHOLD))) - bad_indexes.append( - set(self._indexes_extreme_logrets_wrt_rolstddev( - lrets.iloc[::-1], window=window, - treshold=self._WRONG_LOGRET_THRESHOLD))) - bad_indexes = set.intersection(*bad_indexes) - return bad_indexes + # # rolstd windows for finding wrong logreturns + # _ROLSTD_WINDOWS = [20, 60, 252] + + # # threshold for finding wrong logreturns + # _WRONG_LOGRET_THRESHOLD = 15 + + # def _indexes_extreme_logrets_wrt_rolstddev(self, lrets, window, treshold): + # """Get indexes of logreturns that are extreme wrt trailing stddev.""" + # trailing_stdev = np.sqrt((lrets**2).rolling(window).median().shift(1)) + # bad_indexes = lrets.index[np.abs(lrets / trailing_stdev) > treshold] + # return bad_indexes + + # def _find_wrong_daily_logreturns(self, lrets): + # """Find indexes of logreturns that are most probably data errors.""" + # bad_indexes = [] + # for window in self._ROLSTD_WINDOWS: + # bad_indexes.append( + # set(self._indexes_extreme_logrets_wrt_rolstddev( + # lrets, window=window, treshold=self._WRONG_LOGRET_THRESHOLD))) + # bad_indexes.append( + # set(self._indexes_extreme_logrets_wrt_rolstddev( + # lrets.iloc[::-1], window=window, + # treshold=self._WRONG_LOGRET_THRESHOLD))) + # bad_indexes = set.intersection(*bad_indexes) + # return bad_indexes # TODO: plan # ffill adj closes & compute adj close logreturns @@ -442,7 
+503,7 @@ def _compute_total_returns(self, data): # print(data) # print(data.isnull().sum()) - def _clean(self, data): + def _process(self, data): """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" self._nan_impossible(data) @@ -501,7 +562,18 @@ def print_extreme(logreturns, name, sigmas=50): open2low = np.log(data['low']) - np.log(data['open']).dropna() print_extreme(open2low, 'open to low returns') -class YahooFinance(OHLCVTR): + +class OHLCVAC(OHLCVTR): + """Open-High-Low-Close-Volume-AdjustedClose data. + + This is modeled after the data returned by Yahoo Finance. It implements + the transformation required to conform to the + Open-High-Low-Close-Volume-TotalReturn model, that is, compute + returns from the adjusted closes, and do some error checks. + """ + + +class YahooFinance(OHLCVAC): """Yahoo Finance symbol data. :param symbol: The symbol that we downloaded. @@ -633,6 +705,7 @@ def _download(self, symbol, current=None, Returns: updated (pandas.DataFrame): updated DataFrame for the symbol """ + # TODO this could be put at a much lower class hierarchy if overlap < 2: raise SyntaxError( f'{self.__class__.__name__} with overlap smaller than 2' @@ -640,7 +713,7 @@ def _download(self, symbol, current=None, if (current is None) or (len(current) < overlap): updated = self._get_data_yahoo(symbol, **kwargs) logger.info('Downloading from the start.') - result = self._clean(updated) + result = self._process(updated) # we remove first row if it contains NaNs if np.any(result.iloc[0].isnull()): result = result.iloc[1:] @@ -651,22 +724,9 @@ def _download(self, symbol, current=None, 'Skipping download because stored data is recent enough.') return current new = self._get_data_yahoo(symbol, start=current.index[-overlap]) - new = self._clean(new) + new = self._process(new) return pd.concat([current.iloc[:-overlap], new]) - def _preload(self, data): - """Prepare data for use by Cvxportfolio. - - We drop the `volume` column expressed in number of stocks and - replace it with `valuevolume` which is an estimate of the (e.g., - US dollar) value of the volume exchanged on the day. - """ - - self._quality_check(data) - data["valuevolume"] = data["volume"] * data["open"] - del data["volume"] - - return data # # Fred. 
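The centered-window scoring introduced by ``_unlikeliness_score`` above can be previewed on its own. A minimal sketch on synthetic data, with the window sizes copied from ``_WINDOWS`` and an injected error; because the per-window scores are combined with a minimum, a point scores high only if it is extreme at every window size:

.. code:: python

    import numpy as np
    import pandas as pd

    def median_scale_around(lrets, window):
        # same scaler as _median_scale_around in the diff above
        return np.abs(lrets).rolling(
            window, center=True, min_periods=1).median()

    rng = np.random.default_rng(1)
    reference = pd.Series(rng.normal(0., 0.01, 500))
    test = reference.copy()
    test.iloc[250] = 0.5  # suspicious log-return

    # one column of scaled scores per window, as in _unlikeliness_score
    scaled = pd.DataFrame([
        np.abs(test) / median_scale_around(reference, w)
        for w in (10, 20, 50, 100, 200)]).T
    score = scaled.min(axis=1)

    print(score.idxmax())  # 250, the injected point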
diff --git a/examples/strategies/ftse100_daily.py b/examples/strategies/ftse100_daily.py
index 0f0b976db..bc99dd472 100644
--- a/examples/strategies/ftse100_daily.py
+++ b/examples/strategies/ftse100_daily.py
@@ -67,6 +67,7 @@ def policy(gamma_risk, gamma_trade):
 
     else:
         import matplotlib.pyplot as plt
+
         #INDEX_ETF = 'DIA'
 
         research_sim = cvx.StockMarketSimulator(FTSE100, cash_key='GBPOUND')
diff --git a/examples/universes.py b/examples/universes.py
index 8fd2ca747..3687a1112 100644
--- a/examples/universes.py
+++ b/examples/universes.py
@@ -132,8 +132,8 @@
     'ftse100': {
         'page': 'https://en.wikipedia.org/wiki/FTSE_100_Index',
         'table_number': -1,
-        'column_number':1,
-        'suffix':'.L',
+        'column_number': 1,
+        'suffix': '.L',
     }
 }
 

From 96b239a329482104b8cb101c61c76478894e85db Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Mon, 12 Feb 2024 11:25:40 +0400
Subject: [PATCH 14/38] refactoring _process of OLHCV

---
 cvxportfolio/data.py             | 1398 ++++++++++++++++++++++++++++++
 cvxportfolio/data/market_data.py |    2 +-
 cvxportfolio/data/symbol_data.py |  325 ++++---
 cvxportfolio/tests/test_data.py  |    2 +-
 4 files changed, 1604 insertions(+), 123 deletions(-)
 create mode 100644 cvxportfolio/data.py

diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py
new file mode 100644
index 000000000..3e2be8232
--- /dev/null
+++ b/cvxportfolio/data.py
@@ -0,0 +1,1398 @@
+# Copyright 2023 Enzo Busseti
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This module includes classes that download, store, and serve market data.
+
+The two main abstractions are :class:`SymbolData` and :class:`MarketData`.
+Neither is exposed outside this module; their derived classes instead are.
+
+If you want to interface cvxportfolio with financial data sources other
+than the ones we provide, you should derive from either of those two classes.
+"""
+
+import datetime
+import logging
+import sqlite3
+import sys
+import warnings
+from pathlib import Path
+from urllib.error import URLError
+
+import numpy as np
+import pandas as pd
+import requests
+import requests.exceptions
+
+from .errors import DataError
+from .utils import (hash_, make_numeric, periods_per_year_from_datetime_index,
+                    resample_returns)
+
+__all__ = ["YahooFinance", "Fred",
+           "UserProvidedMarketData", "DownloadedMarketData"]
+
+logger = logging.getLogger(__name__)
+
+BASE_LOCATION = Path.home() / "cvxportfolio_data"
+
+def now_timezoned():
+    """Return current timestamp with local timezone.
+
+    :returns: Current timestamp with local timezone.
+    :rtype: pandas.Timestamp
+    """
+    return pd.Timestamp(
+        datetime.datetime.now(datetime.timezone.utc).astimezone())
+
+class SymbolData:
+    """Base class for a single symbol's time series data.
+
+    The data is either in the form of a Pandas Series or DataFrame
+    and has a datetime index.
+
+    This class needs to be derived. At a minimum,
+    one should redefine the ``_download`` method, which
+    implements the downloading of the symbol's time series
+    from an external source. 
The method takes the current (already + downloaded and stored) data and is supposed to **only append** to it. + In this way we only store new data and don't modify already downloaded + data. + + Additionally one can redefine the ``_preload`` method, which prepares + data to serve to the user (so the data is stored in a different format + than what the user sees.) We found that this separation can be useful. + + This class interacts with module-level functions named ``_loader_BACKEND`` + and ``_storer_BACKEND``, where ``BACKEND`` is the name of the storage + system used. We define ``pickle``, ``csv``, and ``sqlite`` backends. + These may have limitations. See their docstrings for more information. + + + :param symbol: The symbol that we downloaded. + :type symbol: str + :param storage_backend: The storage backend, implemented ones are + ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. + :type storage_backend: str + :param base_location: The location of the storage. We store in a + subdirectory named after the class which derives from this. By default + it's a directory named ``cvxportfolio_data`` in your home folder. + :type base_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. By default it's one day. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded data for the symbol. + """ + + def __init__(self, symbol, + storage_backend='pickle', + base_location=BASE_LOCATION, + grace_period=pd.Timedelta('1d')): + self._symbol = symbol + self._storage_backend = storage_backend + self._base_location = base_location + self.update(grace_period) + self._data = self.load() + + @property + def storage_location(self): + """Storage location. Directory is created if not existent. + + :rtype: pathlib.Path + """ + loc = self._base_location / f"{self.__class__.__name__}" + loc.mkdir(parents=True, exist_ok=True) + return loc + + @property + def symbol(self): + """The symbol whose data this instance contains. + + :rtype: str + """ + return self._symbol + + @property + def data(self): + """Time series data, updated to the most recent observation. + + :rtype: pandas.Series or pandas.DataFrame + """ + return self._data + + def _load_raw(self): + """Load raw data from database.""" + # we could implement multiprocess safety here + loader = globals()['_loader_' + self._storage_backend] + try: + logger.info( + f"{self.__class__.__name__} is trying to load {self.symbol}" + + f" with {self._storage_backend} backend" + + f" from {self.storage_location}") + return loader(self.symbol, self.storage_location) + except FileNotFoundError: + return None + + def load(self): + """Load data from database using `self.preload` function to process. + + :returns: Loaded time-series data for the symbol. + :rtype: pandas.Series or pandas.DataFrame + """ + return self._preload(self._load_raw()) + + def _store(self, data): + """Store data in database. + + :param data: Time-series data to store. + :type data: pandas.Series or pandas.DataFrame + """ + # we could implement multiprocess safety here + storer = globals()['_storer_' + self._storage_backend] + logger.info( + f"{self.__class__.__name__} is storing {self.symbol}" + + f" with {self._storage_backend} backend" + + f" in {self.storage_location}") + storer(self.symbol, data, self.storage_location) + + def _print_difference(self, current, new): + """Helper method to print difference if update is not append-only. 
+ + This is temporary and will be re-factored. + """ + print("TEMPORARY: Diff between overlap of downloaded and stored") + print((new - current).dropna(how='all').tail(5)) + + def update(self, grace_period): + """Update current stored data for symbol. + + :param grace_period: If the time between now and the last value stored + is less than this, we don't update the data already stored. + :type grace_period: pandas.Timedelta + """ + current = self._load_raw() + logger.info( + f"Downloading {self.symbol}" + + f" from {self.__class__.__name__}") + updated = self._download( + self.symbol, current, grace_period=grace_period) + + if np.any(updated.iloc[:-1].isnull()): + logger.warning( + " cvxportfolio.%s('%s').data contains NaNs." + + " You may want to inspect it. If you want, you can delete the" + + " data file in %s to force re-download from the start.", + self.__class__.__name__, self.symbol, self.storage_location) + + try: + if current is not None: + if not np.all( + # we use numpy.isclose because returns may be computed + # via logreturns and numerical errors can sift through + np.isclose(updated.loc[current.index[:-1]], + current.iloc[:-1], equal_nan=True, + rtol=1e-08, atol=1e-08)): + logger.error(f"{self.__class__.__name__} update" + + f" of {self.symbol} is not append-only!") + self._print_difference(current, updated) + if hasattr(current, 'columns'): + # the first column is open price + if not current.iloc[-1, 0] == updated.loc[ + current.index[-1]].iloc[0]: + logger.error( + f"{self.__class__.__name__} update " + + f" of {self.symbol} changed last open price!") + self._print_difference(current, updated) + else: + if not current.iloc[-1] == updated.loc[current.index[-1]]: + logger.error( + f"{self.__class__.__name__} update" + + f" of {self.symbol} changed last value!") + self._print_difference(current, updated) + except KeyError: + logger.error("%s update of %s could not be checked for" + + " append-only edits. Was there a DST change?", + self.__class__.__name__, self.symbol) + self._store(updated) + + def _download(self, symbol, current, grace_period, **kwargs): + """Download data from external source given already downloaded data. + + This method must be redefined by derived classes. + + :param symbol: The symbol we download. + :type symbol: str + :param current: The data already downloaded. We are supposed to + **only append** to it. If None, no data is present. + :type current: pandas.Series or pandas.DataFrame or None + :rtype: pandas.Series or pandas.DataFrame + """ + raise NotImplementedError #pragma: no cover + + def _preload(self, data): + """Prepare data to serve to the user. + + This method can be redefined by derived classes. + + :param data: The data returned by the storage backend. + :type data: pandas.Series or pandas.DataFrame + :rtype: pandas.Series or pandas.DataFrame + """ + return data + + +# +# Yahoo Finance. +# + +def _timestamp_convert(unix_seconds_ts): + """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" + return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') + + +class YahooFinance(SymbolData): + """Yahoo Finance symbol data. + + :param symbol: The symbol that we downloaded. + :type symbol: str + :param storage_backend: The storage backend, implemented ones are + ``'pickle'``, ``'csv'``, and ``'sqlite'``. + :type storage_backend: str + :param base_storage_location: The location of the storage. We store in a + subdirectory named after the class which derives from this. 
+ :type base_storage_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded, and cleaned, data for the symbol. + :type data: pandas.DataFrame + """ + + # is open-high-low-close-volume-(total)return + IS_OLHCVR = True + + @staticmethod + def _clean(data): + """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" + + # print(data) + # print(data.isnull().sum()) + + # nan-out nonpositive prices + data.loc[data["open"] <= 0, 'open'] = np.nan + data.loc[data["close"] <= 0, "close"] = np.nan + data.loc[data["high"] <= 0, "high"] = np.nan + data.loc[data["low"] <= 0, "low"] = np.nan + data.loc[data["adjclose"] <= 0, "adjclose"] = np.nan + + # nan-out negative volumes + data.loc[data["volume"] < 0, 'volume'] = np.nan + + # all infinity values are nans + data.iloc[:, :] = np.nan_to_num( + data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) + + # print(data) + # print(data.isnull().sum()) + + # if low is not the lowest, set it to nan + data.loc[data['low'] > data[['open', 'high', 'close']].min(1), + 'low'] = np.nan + + # if high is not the highest, set it to nan + data.loc[data['high'] < data[['open', 'high', 'close']].max(1), + 'high'] = np.nan + + # print(data) + # print(data.isnull().sum()) + + # + # fills + # + + # fill volumes with zeros (safest choice) + data['volume'] = data['volume'].fillna(0.) + + # fill close price with open price + data['close'] = data['close'].fillna(data['open']) + + # fill open price with close from day(s) before + # repeat as long as it helps (up to 1 year) + for shifter in range(252): + orig_missing_opens = data['open'].isnull().sum() + data['open'] = data['open'].fillna(data['close'].shift( + shifter+1)) + new_missing_opens = data['open'].isnull().sum() + if orig_missing_opens == new_missing_opens: + break + logger.info( + "Filled missing open prices with close from %s periods before", + shifter+1) + + # fill close price with same day's open + data['close'] = data['close'].fillna(data['open']) + + # fill high price with max + data['high'] = data['high'].fillna(data[['open', 'close']].max(1)) + + # fill low price with max + data['low'] = data['low'].fillna(data[['open', 'close']].min(1)) + + # print(data) + # print(data.isnull().sum()) + + # + # Compute returns + # + + # compute log of ratio between adjclose and close + log_adjustment_ratio = np.log(data['adjclose'] / data['close']) + + # forward fill adjustment ratio + log_adjustment_ratio = log_adjustment_ratio.ffill() + + # non-market log returns (dividends, splits) + non_market_lr = log_adjustment_ratio.diff().shift(-1) + + # full open-to-open returns + open_to_open = np.log(data["open"]).diff().shift(-1) + data['return'] = np.exp(open_to_open + non_market_lr) - 1 + + # print(data) + # print(data.isnull().sum()) + + # intraday_logreturn = np.log(data["close"]) - np.log(data["open"]) + # close_to_close_logreturn = np.log(data["adjclose"]).diff().shift(-1) + # open_to_open_logreturn = ( + # close_to_close_logreturn + intraday_logreturn - + # intraday_logreturn.shift(-1) + # ) + # data["return"] = np.exp(open_to_open_logreturn) - 1 + del data["adjclose"] + + # eliminate last period's intraday data + data.loc[data.index[-1], + ["high", "low", "close", "return", "volume"]] = np.nan + + # print(data) + # print(data.isnull().sum()) + + return data + + @staticmethod + def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): + 
"""Get 1 day OLHC from Yahoo finance. + + Result is timestamped with the open time (time-zoned) of the + instrument. + """ + + base_url = 'https://query2.finance.yahoo.com' + + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)' + ' AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/39.0.2171.95 Safari/537.36'} + + # print(HEADERS) + start = int(pd.Timestamp(start).timestamp()) + end = int(pd.Timestamp(end).timestamp()) + + try: + res = requests.get( + url=f"{base_url}/v8/finance/chart/{ticker}", + params={'interval': '1d', + "period1": start, + "period2": end}, + headers=headers, + timeout=10) # seconds + except requests.ConnectionError as exc: + raise DataError( + f"Download of {ticker} from YahooFinance failed." + + " Are you connected to the Internet?") from exc + + # print(res) + + if res.status_code == 404: + raise DataError( + f'Data for symbol {ticker} is not available.' + + 'Json output:', str(res.json())) + + if res.status_code != 200: + raise DataError(f'Yahoo finance download of {ticker} failed. Json:', + str(res.json())) # pragma: no cover + + data = res.json()['chart']['result'][0] + + try: + index = pd.DatetimeIndex( + [_timestamp_convert(el) for el in data['timestamp']]) + + df_result = pd.DataFrame( + data['indicators']['quote'][0], index=index) + df_result['adjclose'] = data[ + 'indicators']['adjclose'][0]['adjclose'] + except KeyError: + raise DataError(f'Yahoo finance download of {ticker} failed.' + + ' Json:', str(res.json())) # pragma: no cover + + # last timestamp is probably broken (not timed to market open) + # we set its time to same as the day before, but this is wrong + # on days of DST switch. It's fine though because that line will be + # overwritten next update + if df_result.index[-1].time() != df_result.index[-2].time(): + tm1 = df_result.index[-2].time() + newlast = df_result.index[-1].replace( + hour=tm1.hour, minute=tm1.minute, second=tm1.second) + df_result.index = pd.DatetimeIndex( + list(df_result.index[:-1]) + [newlast]) + + return df_result[ + ['open', 'low', 'high', 'close', 'adjclose', 'volume']] + + def _download(self, symbol, current=None, + overlap=5, grace_period='5d', **kwargs): + """Download single stock from Yahoo Finance. + + If data was already downloaded we only download + the most recent missing portion. + + Args: + + symbol (str): yahoo name of the instrument + current (pandas.DataFrame or None): current data present locally + overlap (int): how many lines of current data will be overwritten + by newly downloaded data + kwargs (dict): extra arguments passed to yfinance.download + + Returns: + updated (pandas.DataFrame): updated DataFrame for the symbol + """ + if overlap < 2: + raise SyntaxError( + f'{self.__class__.__name__} with overlap smaller than 2' + + ' could have issues with DST.') + if (current is None) or (len(current) < overlap): + updated = self._get_data_yahoo(symbol, **kwargs) + logger.info('Downloading from the start.') + result = self._clean(updated) + # we remove first row if it contains NaNs + if np.any(result.iloc[0].isnull()): + result = result.iloc[1:] + return result + if (now_timezoned() - current.index[-1] + ) < pd.Timedelta(grace_period): + logger.info( + 'Skipping download because stored data is recent enough.') + return current + new = self._get_data_yahoo(symbol, start=current.index[-overlap]) + new = self._clean(new) + return pd.concat([current.iloc[:-overlap], new]) + + def _preload(self, data): + """Prepare data for use by Cvxportfolio. 
+ + We drop the `volume` column expressed in number of stocks and + replace it with `valuevolume` which is an estimate of the (e.g., + US dollar) value of the volume exchanged on the day. + """ + data["valuevolume"] = data["volume"] * data["open"] + del data["volume"] + + return data + +# +# Fred. +# + +class Fred(SymbolData): + """Fred single-symbol data. + + :param symbol: The symbol that we downloaded. + :type symbol: str + :param storage_backend: The storage backend, implemented ones are + ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. + :type storage_backend: str + :param base_storage_location: The location of the storage. We store in a + subdirectory named after the class which derives from this. By default + it's a directory named ``cvxportfolio_data`` in your home folder. + :type base_storage_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. By default it's one day. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded data for the symbol. + """ + + URL = "https://fred.stlouisfed.org/graph/fredgraph.csv" + + # TODO: implement Fred point-in-time + # example: + # https://alfred.stlouisfed.org/graph/alfredgraph.csv?id=CES0500000003&vintage_date=2023-07-06 + # hourly wages time series **as it appeared** on 2023-07-06 + # store using pd.Series() of diff'ed values only. + + def _internal_download(self, symbol): + try: + return pd.read_csv( + self.URL + f'?id={symbol}', + index_col=0, parse_dates=[0])[symbol] + except URLError as exc: + raise DataError(f"Download of {symbol}" + + f" from {self.__class__.__name__} failed." + + " Are you connected to the Internet?") from exc + + def _download( + self, symbol="DFF", current=None, grace_period='5d', **kwargs): + """Download or update pandas Series from Fred. + + If already downloaded don't change data stored locally and only + add new entries at the end. + + Additionally, we allow for a `grace period`, if the data already + downloaded has a last entry not older than the grace period, we + don't download new data. + """ + if current is None: + return self._internal_download(symbol) + if (pd.Timestamp.today() - current.index[-1] + ) < pd.Timedelta(grace_period): + logger.info( + 'Skipping download because stored data is recent enough.') + return current + + new = self._internal_download(symbol) + new = new.loc[new.index > current.index[-1]] + + if new.empty: + logger.info('New downloaded data is empty!') + return current + + assert new.index[0] > current.index[-1] + return pd.concat([current, new]) + + def _preload(self, data): + """Add UTC timezone.""" + data.index = data.index.tz_localize('UTC') + return data + +# +# Sqlite storage backend. +# + +def _open_sqlite(storage_location): + return sqlite3.connect(storage_location/"db.sqlite") + +def _close_sqlite(connection): + connection.close() + +def _loader_sqlite(symbol, storage_location): + """Load data in sqlite format. + + We separately store dtypes for data consistency and safety. + + .. note:: If your pandas object's index has a name it will be lost, + the index is renamed 'index'. If you pass timestamp data (including + the index) it must have explicit timezone. 
+    """
+    try:
+        connection = _open_sqlite(storage_location)
+        dtypes = pd.read_sql_query(
+            f"SELECT * FROM {symbol}___dtypes",
+            connection, index_col="index",
+            dtype={"index": "str", "0": "str"})
+
+        parse_dates = 'index'
+        my_dtypes = dict(dtypes["0"])
+
+        tmp = pd.read_sql_query(
+            f"SELECT * FROM {symbol}", connection,
+            index_col="index", parse_dates=parse_dates, dtype=my_dtypes)
+
+        _close_sqlite(connection)
+        multiindex = []
+        for col in tmp.columns:
+            if col[:8] == "___level":
+                multiindex.append(col)
+            else:
+                break
+        if len(multiindex) > 0:
+            multiindex = [tmp.index.name] + multiindex
+            tmp = tmp.reset_index().set_index(multiindex)
+        return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp
+    except pd.errors.DatabaseError:
+        return None
+
+def _storer_sqlite(symbol, data, storage_location):
+    """Store data in sqlite format.
+
+    We separately store dtypes for data consistency and safety.
+
+    .. note:: If your pandas object's index has a name it will be lost;
+        the index is renamed 'index'. If you pass timestamp data (including
+        the index) it must have an explicit timezone.
+    """
+    connection = _open_sqlite(storage_location)
+    exists = pd.read_sql_query(
+        f"SELECT name FROM sqlite_master WHERE type='table' AND name='{symbol}'",
+        connection)
+
+    if len(exists):
+        _ = connection.cursor().execute(f"DROP TABLE '{symbol}'")
+        _ = connection.cursor().execute(f"DROP TABLE '{symbol}___dtypes'")
+        connection.commit()
+
+    if hasattr(data.index, "levels"):
+        data.index = data.index.set_names(
+            ["index"] +
+            [f"___level{i}" for i in range(1, len(data.index.levels))]
+        )
+        data = data.reset_index().set_index("index")
+    else:
+        data.index.name = "index"
+
+    if data.index[0].tzinfo is None:
+        warnings.warn('Index has no timezone, setting to UTC')
+        data.index = data.index.tz_localize('UTC')
+
+    data.to_sql(f"{symbol}", connection)
+    pd.DataFrame(data).dtypes.astype("string").to_sql(
+        f"{symbol}___dtypes", connection)
+    _close_sqlite(connection)
+
+
+#
+# Pickle storage backend.
+#
+
+def _loader_pickle(symbol, storage_location):
+    """Load data in pickle format."""
+    return pd.read_pickle(storage_location / f"{symbol}.pickle")
+
+def _storer_pickle(symbol, data, storage_location):
+    """Store data in pickle format."""
+    data.to_pickle(storage_location / f"{symbol}.pickle")
+
+#
+# Csv storage backend.
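+#
+
+# A round-trip sketch of the files-plus-dtypes idea used by these backends
+# (the helper name below is hypothetical, for illustration only; it is not
+# part of the module):
+
+def _example_csv_roundtrip(): # pragma: no cover
+    """Store and re-load a tz-aware series with the csv backend (sketch)."""
+    import tempfile
+    storage_location = Path(tempfile.mkdtemp())
+    ser = pd.Series(
+        [1., 2.],
+        pd.DatetimeIndex(['2023-01-01', '2023-01-02'], tz='UTC'))
+    _storer_csv('TEST', ser, storage_location)
+    # three files are written: TEST.csv, TEST___dtypes.csv,
+    # and TEST___index_dtypes.csv
+    return _loader_csv('TEST', storage_location)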
+# + +def _loader_csv(symbol, storage_location): + """Load data in csv format.""" + + index_dtypes = pd.read_csv( + storage_location / f"{symbol}___index_dtypes.csv", + index_col=0)["0"] + + dtypes = pd.read_csv( + storage_location / f"{symbol}___dtypes.csv", index_col=0, + dtype={"index": "str", "0": "str"}) + dtypes = dict(dtypes["0"]) + new_dtypes = {} + parse_dates = [] + for i, level in enumerate(index_dtypes): + if "datetime64[ns" in level: # includes all timezones + parse_dates.append(i) + for i, el in enumerate(dtypes): + if "datetime64[ns" in dtypes[el]: # includes all timezones + parse_dates += [i + len(index_dtypes)] + else: + new_dtypes[el] = dtypes[el] + + tmp = pd.read_csv(storage_location / f"{symbol}.csv", + index_col=list(range(len(index_dtypes))), + parse_dates=parse_dates, dtype=new_dtypes) + + return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp + + +def _storer_csv(symbol, data, storage_location): + """Store data in csv format.""" + pd.DataFrame(data.index.dtypes if hasattr(data.index, 'levels') + else [data.index.dtype]).astype("string").to_csv( + storage_location / f"{symbol}___index_dtypes.csv") + pd.DataFrame(data).dtypes.astype("string").to_csv( + storage_location / f"{symbol}___dtypes.csv") + data.to_csv(storage_location / f"{symbol}.csv") + +# +# Market Data +# + +class MarketData: + """Prepare, hold, and serve market data. + + :method serve: Serve data for policy and simulator at time :math:`t`. + """ + + def serve(self, t): + """Serve data for policy and simulator at time :math:`t`. + + :param t: Trading time. It must be included in the timestamps returned + by :meth:`trading_calendar`. + :type t: pandas.Timestamp + + :returns: past_returns, current_returns, past_volumes, current_volumes, + current_prices + :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame, + pandas.Series, pandas.Series) + """ + raise NotImplementedError # pragma: no cover + + # pylint: disable=redundant-returns-doc + def trading_calendar( + self, start_time=None, end_time=None, include_end=True): + """Get trading calendar between times. + + :param start_time: Initial time of the trading calendar. Always + inclusive if present. If None, use the first available time. + :type start_time: pandas.Timestamp + :param end_time: Final time of the trading calendar. If None, + use the last available time. + :type end_time: pandas.Timestamp + :param include_end: Include end time. + :type include_end: bool + + :returns: Trading calendar. + :rtype: pandas.DatetimeIndex + """ + raise NotImplementedError # pragma: no cover + + @property + def periods_per_year(self): + """Average trading periods per year. + + :rtype: int + """ + raise NotImplementedError # pragma: no cover + + @property + def full_universe(self): # pylint: disable=redundant-returns-doc + """Full universe, which might not be available for trading. + + :returns: Full universe. + :rtype: pandas.Index + """ + raise NotImplementedError # pragma: no cover + + # pylint: disable=unused-argument, redundant-returns-doc + def partial_universe_signature(self, partial_universe): + """Unique signature of this instance with a partial universe. + + A partial universe is a subset of the full universe that is + available at some time for trading. + + This is used in cvxportfolio.cache to sign back-test caches that + are saved on disk. If not redefined it returns None which disables + on-disk caching. + + :param partial_universe: A subset of the full universe. + :type partial_universe: pandas.Index + + :returns: Signature. 
+        :rtype: str
+        """
+        return None
+
+# compiled based on Interactive Brokers benchmark rates choices
+# (see https://www.ibkrguides.com/kb/article-2949.htm)
+# and their FRED codes
+RATES = {
+    'USDOLLAR': 'DFF', # Federal funds effective rate
+    'EURO': 'ECBESTRVOLWGTTRMDMNRT', # ECB euro short-term rate
+    'GBPOUND': 'IUDSOIA', # SONIA
+    'JPYEN': 'IRSTCB01JPM156N', # updated monthly
+    }
+
+class MarketDataInMemory(MarketData):
+    """Market data that is stored in memory when initialized."""
+
+    # this is overwritten in the derived classes' initializers
+    returns = None
+
+    def __init__(
+            self, trading_frequency, base_location, cash_key, min_history,
+            online_usage=False):
+        """This must be called by the derived classes."""
+        if (self.returns.index[-1] - self.returns.index[0]) < min_history:
+            raise DataError(
+                "The provided returns have less history "
+                + f"than the min_history {min_history}")
+        if trading_frequency:
+            self._downsample(trading_frequency)
+        self.trading_frequency = trading_frequency
+
+        self._set_read_only()
+        self._check_sizes()
+        self._mask = None
+        self._masked_returns = None
+        self._masked_volumes = None
+        self._masked_prices = None
+        self.base_location = Path(base_location)
+        self.cash_key = cash_key
+        self._min_history_timedelta = min_history
+        self.online_usage = online_usage
+
+    def _mask_dataframes(self, mask):
+        """Mask internal dataframes if necessary."""
+        if (self._mask is None) or not np.all(self._mask == mask):
+            logger.info("Masking internal %s dataframes.",
+                        self.__class__.__name__)
+            colmask = self.returns.columns[mask]
+            # self._masked_returns = self._df_or_ser_set_read_only(
+            #     pd.DataFrame(self.returns.iloc[:, mask], copy=True))
+            self._masked_returns = self._df_or_ser_set_read_only(
+                pd.DataFrame(self.returns.loc[:, colmask], copy=True))
+            # self._masked_returns = self._df_or_ser_set_read_only(
+            #     pd.DataFrame(np.array(self.returns.values[:, mask]),
+            #     index=self.returns.index, columns=colmask))
+            if not self.volumes is None:
+                # self._masked_volumes = self._df_or_ser_set_read_only(
+                #     pd.DataFrame(self.volumes.iloc[:, mask[:-1]], copy=True))
+                self._masked_volumes = self._df_or_ser_set_read_only(
+                    pd.DataFrame(self.volumes.loc[:, colmask[:-1]], copy=True))
+                # self._masked_volumes = self._df_or_ser_set_read_only(
+                #     pd.DataFrame(np.array(self.volumes.values[:, mask[:-1]]),
+                #     index=self.volumes.index, columns=colmask[:-1]))
+            if not self.prices is None:
+                # self._masked_prices = self._df_or_ser_set_read_only(
+                #     pd.DataFrame(self.prices.iloc[:, mask[:-1]], copy=True))
+                self._masked_prices = self._df_or_ser_set_read_only(
+                    pd.DataFrame(self.prices.loc[:, colmask[:-1]], copy=True))
+            self._mask = mask
+
+    @property
+    def full_universe(self):
+        """Full universe, which might not be available for trading.
+
+        :returns: Full universe.
+        :rtype: pandas.Index
+        """
+        return self.returns.columns
+
+    def serve(self, t):
+        """Serve data for policy and simulator at time :math:`t`.
+
+        :param t: Time of execution, *e.g.*, stock market open of a given day.
+        :type t: pandas.Timestamp
+
+        :returns: (past_returns, current_returns, past_volumes,
+            current_volumes, current_prices)
+        :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame or None,
+            pandas.Series or None, pandas.Series or None)
+        """
+
+        mask = self._universe_mask_at_time(t).values
+        self._mask_dataframes(mask)
+
+        tidx = self.returns.index.get_loc(t)
+        past_returns = self._df_or_ser_set_read_only(
+            pd.DataFrame(self._masked_returns.iloc[:tidx]))
+        current_returns = self._df_or_ser_set_read_only(
+            pd.Series(self._masked_returns.iloc[tidx]))
+
+        if not self.volumes is None:
+            tidx = self.volumes.index.get_loc(t)
+            past_volumes = self._df_or_ser_set_read_only(
+                pd.DataFrame(self._masked_volumes.iloc[:tidx]))
+            current_volumes = self._df_or_ser_set_read_only(
+                pd.Series(self._masked_volumes.iloc[tidx]))
+        else:
+            past_volumes = None
+            current_volumes = None
+
+        if not self.prices is None:
+            tidx = self.prices.index.get_loc(t)
+            current_prices = self._df_or_ser_set_read_only(
+                pd.Series(self._masked_prices.iloc[tidx]))
+        else:
+            current_prices = None
+
+        return (past_returns, current_returns, past_volumes, current_volumes,
+            current_prices)
+
+    def _add_cash_column(self, cash_key, grace_period):
+        """Add the cash column to an already formed returns dataframe.
+
+        This assumes that the trading periods are about equally spaced.
+        If, say, you have trading periods with very different lengths you
+        should redefine this method **and** replace the :class:`CashReturn`
+        objective term.
+        """
+
+        if not cash_key in RATES:
+            raise NotImplementedError(
+                'Currently the only data pipelines built are for cash_key'
+                f' in {list(RATES)}')
+
+        if self.returns.index.tz is None:
+            raise DataError(
+                'Your provided dataframes are not timezone aware.'
+                + " This is not recommended, and doesn't allow adding the"
+                + " cash returns' column internally."
+                + " You can fix this by adding a timezone manually "
+                + "using pandas.DataFrame.tz_localize to the dataframes before"
+                + " you pass them, or you can provide"
+                + " the cash returns' column as the last column of the returns"
+                + " dataframe (so it has one more column than volumes and"
+                + " prices, if provided), and set the cash_key parameter to"
+                + " its name.")
+
+        data = Fred(
+            RATES[cash_key], base_location=self.base_location,
+            grace_period=grace_period)
+
+        cash_returns_per_period = resample_returns(
+            data.data/100, periods=self.periods_per_year)
+
+        # we merge instead of assigning column because indexes might
+        # be misaligned (e.g., with tz-aware timestamps)
+        cash_returns_per_period.name = self.cash_key
+        original_returns_index = self.returns.index
+        tmp = pd.concat(
+            [self.returns, cash_returns_per_period], sort=True, axis=1)
+        tmp[cash_key] = tmp[cash_key].ffill()
+        self.returns = tmp.loc[original_returns_index]
+
+    def trading_calendar(
+            self, start_time=None, end_time=None, include_end=True):
+        """Get trading calendar from market data.
+
+        :param start_time: Initial time of the trading calendar. Always
+            inclusive if present. If None, use the first available time.
+        :type start_time: pandas.Timestamp
+        :param end_time: Final time of the trading calendar. If None,
+            use the last available time.
+        :type end_time: pandas.Timestamp
+        :param include_end: Include end time.
+        :type include_end: bool
+
+        :returns: Trading calendar.
+        :rtype: pandas.DatetimeIndex
+        """
+        result = self.returns.index
+        result = result[result >= self._earliest_backtest_start]
+        if start_time:
+            result = result[result >= start_time]
+        if end_time:
+            result = result[(result <= end_time)]
+        if not include_end:
+            result = result[:-1]
+        return result
+
+    def _universe_mask_at_time(self, t):
+        """Return the valid universe mask at time t."""
+        past_returns = self.returns.loc[self.returns.index < t]
+        if self.online_usage:
+            valid_universe_mask = past_returns.count() >= self.min_history
+        else:
+            valid_universe_mask = ((past_returns.count() >= self.min_history) &
+                (~self.returns.loc[t].isnull()))
+        if sum(valid_universe_mask) <= 1:
+            raise DataError(
+                f'The trading universe at time {t} has size less than or'
+                + ' equal to one, i.e., only the cash account. There are'
+                + ' probably issues with missing data in the provided market'
+                + ' returns.')
+        return valid_universe_mask
+
+    @staticmethod
+    def _df_or_ser_set_read_only(df_or_ser):
+        """Set numpy array contained in dataframe to read only.
+
+        This is done on data stored internally before it is served to the
+        policy or the simulator to ensure data consistency in case some
+        element of the pipeline accidentally corrupts the data.
+
+        This is enough to prevent direct assignment to the resulting
+        dataframe. However, it could still be accidentally corrupted by
+        assigning to columns or indices that are not present in the
+        original. We avoid that case as well by returning a wrapped
+        dataframe (which doesn't copy data on creation) in
+        serve_data_policy and serve_data_simulator.
+        """
+        data = df_or_ser.values
+        data.flags.writeable = False
+        if hasattr(df_or_ser, 'columns'):
+            return pd.DataFrame(data, index=df_or_ser.index,
+                                columns=df_or_ser.columns)
+        return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name)
+
+    def _set_read_only(self):
+        """Set internal dataframes to read-only."""
+
+        self.returns = self._df_or_ser_set_read_only(self.returns)
+
+        if not self.prices is None:
+            self.prices = self._df_or_ser_set_read_only(self.prices)
+
+        if not self.volumes is None:
+            self.volumes = self._df_or_ser_set_read_only(self.volumes)
+
+    @property
+    def _earliest_backtest_start(self):
+        """Earliest date at which we can start a backtest."""
+        return self.returns.iloc[:, :-1].dropna(how='all').index[
+            self.min_history]
+
+    sampling_intervals = {
+        'weekly': 'W-MON', 'monthly': 'MS', 'quarterly': 'QS', 'annual': 'AS'}
+
+    # @staticmethod
+    # def _is_first_interval_small(datetimeindex):
+    #     """Check if post-resampling the first interval is small.
+    #
+    #     We have no way of knowing exactly if the first interval
+    #     needs to be dropped. We drop it if its length is smaller
+    #     than the average of all others, minus 2 standard deviations.
+    #     """
+    #     first_interval = (datetimeindex[1] - datetimeindex[0])
+    #     all_others = (datetimeindex[2:] - datetimeindex[1:-1])
+    #     return first_interval < (all_others.mean() - 2 * all_others.std())
+
+    def _downsample(self, interval):
+        """Downsample market data."""
+        if not interval in self.sampling_intervals:
+            raise SyntaxError(
+                'Unsupported trading interval for down-sampling.')
+        interval = self.sampling_intervals[interval]
+        new_returns_index = pd.Series(self.returns.index, self.returns.index
+            ).resample(interval, closed='left',
+                       label='left').first().values
+        # print(new_returns_index)
+        self.returns = np.exp(np.log(
+            1+self.returns).resample(interval, closed='left', label='left'
+            ).sum(min_count=1))-1
+        self.returns.index = new_returns_index
+
+        # last row is always unknown
+        self.returns.iloc[-1] = np.nan
+
+        # # we drop the first row if its interval is small
+        # if self._is_first_interval_small(self.returns.index):
+        #     self.returns = self.returns.iloc[1:]
+
+        # we nan-out the first non-nan element of every col
+        for col in self.returns.columns[:-1]:
+            self.returns.loc[
+                (~(self.returns[col].isnull())).idxmax(), col] = np.nan
+
+        # and we drop the first row, which is mostly NaNs anyway
+        self.returns = self.returns.iloc[1:]
+
+        if self.volumes is not None:
+            new_volumes_index = pd.Series(
+                self.volumes.index, self.volumes.index
+                ).resample(interval, closed='left',
+                           label='left').first().values
+            self.volumes = self.volumes.resample(
+                interval, closed='left', label='left').sum(min_count=1)
+            self.volumes.index = new_volumes_index
+
+            # last row is always unknown
+            self.volumes.iloc[-1] = np.nan
+
+            # # we drop the first row if its interval is small
+            # if self._is_first_interval_small(self.volumes.index):
+            #     self.volumes = self.volumes.iloc[1:]
+
+            # we nan-out the first non-nan element of every col
+            for col in self.volumes.columns:
+                self.volumes.loc[
+                    (~(self.volumes[col].isnull())).idxmax(), col] = np.nan
+
+            # and we drop the first row, which is mostly NaNs anyway
+            self.volumes = self.volumes.iloc[1:]
+
+        if self.prices is not None:
+            new_prices_index = pd.Series(
+                self.prices.index, self.prices.index
+                ).resample(
+                    interval, closed='left', label='left').first().values
+            self.prices = self.prices.resample(
+                interval, closed='left', label='left').first()
+            self.prices.index = new_prices_index
+
+            # # we drop the first row if its interval is small
+            # if self._is_first_interval_small(self.prices.index):
+            #     self.prices = self.prices.iloc[1:]
+
+            # we nan-out the first non-nan element of every col
+            for col in self.prices.columns:
+                self.prices.loc[
+                    (~(self.prices[col].isnull())).idxmax(), col] = np.nan
+
+            # and we drop the first row, which is mostly NaNs anyway
+            self.prices = self.prices.iloc[1:]
+
+    def _check_sizes(self):
+        """Check sizes of user-provided dataframes."""
+
+        if (not self.volumes is None) and (
+            not (self.volumes.shape[1] == self.returns.shape[1] - 1)
+                or not all(self.volumes.columns == self.returns.columns[:-1])):
+            raise SyntaxError(
+                'Volumes should have same columns as returns, minus cash_key.')
+
+        if (not self.prices is None) and (
+            not (self.prices.shape[1] == self.returns.shape[1] - 1)
+                or not all(self.prices.columns == self.returns.columns[:-1])):
+            raise SyntaxError(
+                'Prices should have same columns as returns, minus cash_key.')
+
+    @property
+    def periods_per_year(self):
+        """Average trading periods per year inferred from the data.
+
+        :returns: Average periods per year.
+        :rtype: int
+        """
+        return periods_per_year_from_datetime_index(self.returns.index)
+
+    @property
+    def min_history(self):
+        """Min history expressed in periods.
+
+        :returns: How many non-null elements of the past returns for a given
+            name are required to include it.
+        :rtype: int
+        """
+        return int(np.round(self.periods_per_year * (
+            self._min_history_timedelta / pd.Timedelta('365.24d'))))
+
+
+class UserProvidedMarketData(MarketDataInMemory):
+    """User-provided market data.
+
+    :param returns: Historical open-to-open returns. The return
+        at time :math:`t` is :math:`r_t = p_{t+1}/p_t - 1` where
+        :math:`p_t` is the (open) price at time :math:`t`. Must
+        have datetime index. You can also include cash
+        returns as its last column, and set ``cash_key`` below to the last
+        column's name.
+    :type returns: pandas.DataFrame
+    :param volumes: Historical market volumes, expressed in units
+        of value (*e.g.*, US dollars).
+    :type volumes: pandas.DataFrame or None
+    :param prices: Historical open prices (*e.g.*, used for rounding
+        trades in the :class:`MarketSimulator`).
+    :type prices: pandas.DataFrame or None
+    :param trading_frequency: Instead of using frequency implied by
+        the index of the returns, down-sample all dataframes.
+        We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and
+        ``'annual'``. By default (None) don't down-sample.
+    :type trading_frequency: str or None
+    :param min_history: Minimum amount of time for which the returns
+        are not ``np.nan`` before each asset enters a back-test.
+    :type min_history: pandas.Timedelta
+    :param base_location: The location of the storage, only used
+        in case it downloads the cash returns. By default
+        it's a directory named ``cvxportfolio_data`` in your home folder.
+    :type base_location: pathlib.Path
+    :param cash_key: Name of the cash account. If not the last column
+        of the provided returns, it will be downloaded. In that case you should
+        make sure your provided dataframes have a timezone aware datetime
+        index. Its returns are the risk-free rate.
+    :type cash_key: str
+    :param online_usage: Disable removal of assets that have ``np.nan`` returns
+        for the given time. Default False.
+    :type online_usage: bool
+    """
+
+    # pylint: disable=too-many-arguments
+    def __init__(self, returns, volumes=None, prices=None,
+                 copy_dataframes=True, trading_frequency=None,
+                 min_history=pd.Timedelta('365.24d'),
+                 base_location=BASE_LOCATION,
+                 grace_period=pd.Timedelta('1d'),
+                 cash_key='USDOLLAR',
+                 online_usage=False):
+
+        if returns is None:
+            raise SyntaxError(
+                "If you don't specify a universe you should pass `returns`.")
+
+        self.base_location = Path(base_location)
+        self.cash_key = cash_key
+
+        self.returns = pd.DataFrame(
+            make_numeric(returns), copy=copy_dataframes)
+        self.volumes = volumes if volumes is None else\
+            pd.DataFrame(make_numeric(volumes), copy=copy_dataframes)
+        self.prices = prices if prices is None else\
+            pd.DataFrame(make_numeric(prices), copy=copy_dataframes)
+
+        if cash_key != returns.columns[-1]:
+            self._add_cash_column(cash_key, grace_period=grace_period)
+
+        # this is mandatory
+        super().__init__(
+            trading_frequency=trading_frequency,
+            base_location=base_location,
+            cash_key=cash_key,
+            min_history=min_history,
+            online_usage=online_usage)
+
+
+class DownloadedMarketData(MarketDataInMemory):
+    """Market data that is downloaded.
+
+    :param universe: List of names as understood by the data source
+        used, *e.g.*, ``['AAPL', 'GOOG']`` if using the default
+        Yahoo Finance data source.
+    :type universe: list
+    :param datasource: The data source used.
+    :type datasource: str or :class:`SymbolData` class
+    :param cash_key: Name of the cash account, its rates will be downloaded
+        and added as the last column of the returns. Its returns are the
+        risk-free rate.
+    :type cash_key: str
+    :param base_location: The location of the storage. By default
+        it's a directory named ``cvxportfolio_data`` in your home folder.
+    :type base_location: pathlib.Path
+    :param storage_backend: The storage backend, implemented ones are
+        ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``.
+    :type storage_backend: str
+    :param min_history: Minimum amount of time for which the returns
+        are not ``np.nan`` before each asset enters a back-test.
+    :type min_history: pandas.Timedelta
+    :param grace_period: If the most recent observation of each symbol's
+        data is less old than this we do not download new data.
+        By default it's one day.
+    :type grace_period: pandas.Timedelta
+    :param trading_frequency: Instead of using frequency implied by
+        the index of the returns, down-sample all dataframes.
+        We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and
+        ``'annual'``. By default (None) don't down-sample.
+    :type trading_frequency: str or None
+    :param online_usage: Disable removal of assets that have ``np.nan`` returns
+        for the given time. Default False.
+    :type online_usage: bool
+    """
+
+    # pylint: disable=too-many-arguments
+    def __init__(self,
+                 universe=(),
+                 datasource='YahooFinance',
+                 cash_key='USDOLLAR',
+                 base_location=BASE_LOCATION,
+                 storage_backend='pickle',
+                 min_history=pd.Timedelta('365.24d'),
+                 grace_period=pd.Timedelta('1d'),
+                 trading_frequency=None,
+                 online_usage=False):
+        """Initializer."""
+
+        # drop duplicates and ensure ordering
+        universe = sorted(set(universe))
+
+        self.base_location = Path(base_location)
+        self.cash_key = cash_key
+        if isinstance(datasource, type):
+            self.datasource = datasource
+        else: # try to load in current module
+            self.datasource = globals()[datasource]
+        self._get_market_data(
+            universe, grace_period=grace_period,
+            storage_backend=storage_backend)
+        self._add_cash_column(self.cash_key, grace_period=grace_period)
+        self._remove_missing_recent()
+
+        # this is mandatory
+        super().__init__(
+            trading_frequency=trading_frequency,
+            base_location=base_location,
+            cash_key=cash_key,
+            min_history=min_history,
+            online_usage=online_usage)
+
+    def _get_market_data(self, universe, grace_period, storage_backend):
+        """Download market data."""
+        database_accesses = {}
+        print('Updating data', end='')
+        sys.stdout.flush()
+
+        for stock in universe:
+            logger.info(
+                'Updating %s with %s.', stock, self.datasource.__name__)
+            print('.', end='')
+            sys.stdout.flush()
+            database_accesses[stock] = self.datasource(
+                stock, base_location=self.base_location,
+                grace_period=grace_period, storage_backend=storage_backend)
+        print()
+
+        if hasattr(self.datasource, 'IS_OLHCVR') and self.datasource.IS_OLHCVR:
+            self.returns = pd.DataFrame(
+                {stock: database_accesses[stock].data['return']
+                    for stock in universe})
+            self.volumes = pd.DataFrame(
+                {stock: database_accesses[stock].data['valuevolume']
+                    for stock in universe})
+            self.prices = pd.DataFrame(
+                {stock: database_accesses[stock].data['open']
+                    for stock in universe})
+        else: # for now only Fred for indexes, we assume prices!
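+            # e.g., with the Fred data source a series like 'SP500' (the
+            # index level) provides prices only; returns are derived below
+            # and volumes are unavailable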
+            assert isinstance(database_accesses[universe[0]].data, pd.Series)
+            self.prices = pd.DataFrame(
+                # open prices
+                {stock: database_accesses[stock].data for stock in universe})
+            self.returns = 1 - self.prices / self.prices.shift(-1)
+            self.volumes = None
+
+    def _remove_missing_recent(self):
+        """Clean recent data.
+
+        Yahoo Finance may have issues with most recent data; we remove
+        recent days if there are NaNs.
+        """
+
+        if self.prices.iloc[-5:].isnull().any().any():
+            logger.debug(
+                'Removing some recent lines because there are missing values.')
+            drop_at = self.prices.iloc[-5:].isnull().any(axis=1).idxmax()
+            logger.debug('Dropping at index %s', drop_at)
+            self.returns = self.returns.loc[self.returns.index < drop_at]
+            if self.prices is not None:
+                self.prices = self.prices.loc[self.prices.index < drop_at]
+            if self.volumes is not None:
+                self.volumes = self.volumes.loc[self.volumes.index < drop_at]
+
+        # for consistency we must also nan-out the last row
+        # of returns and volumes
+        self.returns.iloc[-1] = np.nan
+        if self.volumes is not None:
+            self.volumes.iloc[-1] = np.nan
+
+    def partial_universe_signature(self, partial_universe):
+        """Unique signature of this instance with a partial universe.
+
+        A partial universe is a subset of the full universe that is
+        available at some time for trading.
+
+        This is used in cvxportfolio.cache to sign back-test caches that
+        are saved on disk. See its implementation below for details. If
+        not redefined it returns None which disables on-disk caching.
+
+        :param partial_universe: A subset of the full universe.
+        :type partial_universe: pandas.Index
+
+        :returns: Signature.
+        :rtype: str
+        """
+        assert isinstance(partial_universe, pd.Index)
+        assert np.all(partial_universe.isin(self.full_universe))
+        result = f'{self.__class__.__name__}('
+        result += f'datasource={self.datasource.__name__}, '
+        result += f'partial_universe_hash={hash_(np.array(partial_universe))},'
+        result += f' trading_frequency={self.trading_frequency})'
+        return result
diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py
index ecaef70c9..1502d4dd5 100644
--- a/cvxportfolio/data/market_data.py
+++ b/cvxportfolio/data/market_data.py
@@ -631,7 +631,7 @@ def _get_market_data(self, universe, grace_period, storage_backend):
             grace_period=grace_period, storage_backend=storage_backend)
         print()
 
-        if hasattr(self.datasource, 'IS_OHLCVR') and self.datasource.IS_OHLCVR:
+        if hasattr(self.datasource, 'IS_OLHCVR') and self.datasource.IS_OLHCVR:
             self.returns = pd.DataFrame(
                 {stock: database_accesses[stock].data['return']
                     for stock in universe})
diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index ad51db479..b85b56cff 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -273,8 +273,8 @@ def _unlikeliness_score(
     return scaled.min(axis=1), scaled
 
 
-class OHLCV(SymbolData): # pylint: disable=abstract-method
-    """Base class for Open-High-Low-Close-Volume symbol data.
+class OLHCV(SymbolData): # pylint: disable=abstract-method
+    """Base class for Open-Low-High-Close-Volume symbol data.
 
     This operates on a dataframe with columns
 
@@ -294,6 +294,159 @@ class OHLCV(SymbolData): # pylint: disable=abstract-method
     dividends, ...) and they're dealt with in derived classes.
     """
 
+    def _process(self, new_data, saved_data=None):
+        """Base method for processing (cleaning) data.
+
+        It operates on the ``new_data`` dataframe, which is the newly
+        downloaded data. The ``saved_data`` dataframe is provided as well
+        (None if there is none). It has the same columns, older timestamps
+        with possible overlap with new_data at its end, and is **read only**:
+        it is used as reference to help with the cleaning, it has already
+        been cleaned.
+
+        The method is composed of the following steps, split between child
+        classes at the appropriate hierarchy level.
+
+        #. :meth:`_nan_impossible`: Nan-out impossible values in ``new_data``.
+        #. :meth:`_specific_process`: Do processing specific to the class,
+           before the following step (*e.g.*, because we might want unlikely
+           values to still be there).
+        #. :meth:`_nan_unlikely`: Nan-out values that are (highly) unlikely,
+           with threshold-based testing.
+        #. :meth:`_fill`: Fill nans.
+        #. :meth:`_post_process`: Do final processing specific to the class.
+
+        With this factoring we should have the flexibility to handle various
+        data sources, by choosing at each level whether each method calls
+        the parent's before or after its own processing.
+        """
+
+        self._nan_impossible(new_data, saved_data=saved_data)
+        self._specific_process(new_data, saved_data=saved_data)
+        self._nan_unlikely(new_data, saved_data=saved_data)
+        self._fill(new_data, saved_data=saved_data)
+        self._post_process(new_data, saved_data=saved_data)
+
+        return new_data
+
+    def _specific_process(self, new_data, saved_data=None):
+        """Specific process, do nothing."""
+        # return new_data
+
+    def _post_process(self, new_data, saved_data=None):
+        """Post process, do nothing."""
+        # return new_data
+
+    def _nan_unlikely(self, new_data, saved_data=None):
+        """Nan-out unlikely values."""
+        # return new_data
+
+    def _fill(self, new_data, saved_data=None):
+        """Make easy fills."""
+
+        # TODO: simplify
+
+        # print(data)
+        # print(data.isnull().sum())
+
+        # fill volumes with zeros (safest choice)
+        new_data['volume'] = new_data['volume'].fillna(0.)
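+
+        # a sketch of the fills below: with closes [10., NaN, 11.] and all
+        # opens missing, the first pass fills the day-2 open from the day-1
+        # close (10.), the second pass fills the day-3 open from the close
+        # two days before (also 10.), and the loop stops as soon as a pass
+        # fills nothing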
+
+        # fill close price with open price
+        new_data['close'] = new_data['close'].fillna(new_data['open'])
+
+        # fill open price with close from day(s) before
+        # repeat as long as it helps (up to 1 year)
+        for shifter in range(252):
+            logger.info(
+                "Filling opens with close from %s days before", shifter)
+            orig_missing_opens = new_data['open'].isnull().sum()
+            new_data['open'] = new_data['open'].fillna(new_data['close'].shift(
+                shifter+1))
+            new_missing_opens = new_data['open'].isnull().sum()
+            if orig_missing_opens == new_missing_opens:
+                break
+
+        # fill close price with same day's open
+        new_data['close'] = new_data['close'].fillna(new_data['open'])
+
+        # fill high price with max of open and close
+        new_data['high'] = new_data['high'].fillna(new_data[['open', 'close']].max(1))
+
+        # fill low price with min of open and close
+        new_data['low'] = new_data['low'].fillna(new_data[['open', 'close']].min(1))
+
+        # print(data)
+        # print(data.isnull().sum())
+
+    def _nan_nonpositive_prices(self, data, prices_name):
+        """Set non-positive prices (chosen column) to NaN, in-place."""
+
+        bad_indexes = data.index[data[prices_name] <= 0]
+        if len(bad_indexes) > 0:
+            logger.warning(
+                '%s("%s") has non-positive %s prices on timestamps: %s,'
+                + ' setting to nan',
+                self.__class__.__name__, self.symbol, prices_name, bad_indexes)
+            data.loc[bad_indexes, prices_name] = np.nan
+
+    def _nan_negative_volumes(self, data):
+        """Set negative volumes to NaN, in-place."""
+
+        bad_indexes = data.index[data["volume"] < 0]
+        if len(bad_indexes) > 0:
+            logger.warning(
+                '%s("%s") has negative volumes on timestamps: %s,'
+                + ' setting to nan',
+                self.__class__.__name__, self.symbol, bad_indexes)
+            data.loc[bad_indexes, "volume"] = np.nan
+
+    def _set_infty_to_nan(self, data):
+        """Set all +/- infty elements of data to NaN, in-place."""
+
+        if np.isinf(data).sum().sum() > 0:
+            logger.warning(
+                '%s("%s") has +/- infinity values, setting those to nan',
+                self.__class__.__name__, self.symbol)
+            data.iloc[:, :] = np.nan_to_num(
+                data.values, copy=True, nan=np.nan, posinf=np.nan,
+                neginf=np.nan)
+
+    def _nan_impossible(self, new_data, saved_data=None):
+        """Set some impossible values of new_data to NaN, in-place."""
+
+        # nan-out nonpositive prices
+        for column in ["open", "close", "high", "low"]:
+            self._nan_nonpositive_prices(new_data, column)
+
+        # nan-out negative volumes
+        self._nan_negative_volumes(new_data)
+
+        # all infinity values are nans
+        self._set_infty_to_nan(new_data)
+
+        # TODO: these can be made smarter (sometimes the open is clearly wrong)
+
+        # if low is not the lowest, set it to nan
+        bad_indexes = new_data.index[
+            new_data['low'] > new_data[['open', 'high', 'close']].min(1)]
+        if len(bad_indexes) > 0:
+            logger.warning(
+                '%s("%s") low prices are not the lowest on timestamps: %s,'
+                + ' setting to nan',
+                self.__class__.__name__, self.symbol, bad_indexes)
+            new_data.loc[bad_indexes, "low"] = np.nan
+
+        # if high is not the highest, set it to nan
+        bad_indexes = new_data.index[
+            new_data['high'] < new_data[['open', 'high', 'close']].max(1)]
+        if len(bad_indexes) > 0:
+            logger.warning(
+                '%s("%s") high prices are not the highest on timestamps: %s,'
+                + ' setting to nan',
+                self.__class__.__name__, self.symbol, bad_indexes)
+            new_data.loc[bad_indexes, "high"] = np.nan
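+
+    # worked example of the two checks above: a row with open=10.0,
+    # low=9.8, high=9.5, close=10.2 has low > min(open, high, close) = 9.5,
+    # so the low is NaN'd, and high < max(open, high, close) = 10.2, so the
+    # high is NaN'd; both are then repaired by _fill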
the open-to-open + # pure OLHCV data source there is no need to store the open-to-open # returns, they can be computed here if not 'return' in data.columns: data['return'] = data['open'].pct_change().shift(-1) @@ -316,13 +469,12 @@ def _preload(self, data): return data -class OHLCVTR(OHLCV): # pylint: disable=abstract-method - """Open-High-Low-Close-Volume-TotalReturn symbol data.""" - - # TODO: consider creating a OHLCVAC (adjusted closes) subclass +class OLHCVTR(OLHCV): # pylint: disable=abstract-method + """Open-Low-High-Close-Volume-TotalReturn symbol data.""" + # TODO: this becomes a isinstance(OLHC) in the caller # is open-high-low-close-volume-total return - IS_OHLCVR = True + IS_OLHCVR = True # # rolstd windows for finding wrong logreturns # _ROLSTD_WINDOWS = [20, 60, 252] @@ -372,104 +524,6 @@ class OHLCVTR(OHLCV): # pylint: disable=abstract-method # # - def _nan_impossible(self, data): - """Set impossible values to NaN.""" - - # print(data) - # print(data.isnull().sum()) - - # nan-out nonpositive prices - for column in ["open", "close", "high", "low", "adjclose"]: - bad_indexes = data.index[data[column] <= 0] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") has non-positive %s prices on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, column, bad_indexes) - data.loc[bad_indexes, column] = np.nan - - # nan-out negative volumes - bad_indexes = data.index[data["volume"] < 0] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") has negative volumes on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - data.loc[bad_indexes, "volume"] = np.nan - - # all infinity values are nans - if np.isinf(data).sum().sum() > 0: - logger.warning( - '%s("%s") has +/- infinity values, setting those to nan', - self.__class__.__name__, self.symbol) - data.iloc[:, :] = np.nan_to_num( - data.values, copy=True, nan=np.nan, posinf=np.nan, - neginf=np.nan) - - # print(data) - # print(data.isnull().sum()) - - # TODO: these can be made smarter (sometimes the open is clearly wrong) - - # if low is not the lowest, set it to nan - bad_indexes = data.index[ - data['low'] > data[['open', 'high', 'close']].min(1)] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") low prices are not the lowest on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - data.loc[bad_indexes, "low"] = np.nan - - # if high is not the highest, set it to nan - bad_indexes = data.index[ - data['high'] < data[['open', 'high', 'close']].max(1)] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") high prices are not the highest on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - data.loc[bad_indexes, "high"] = np.nan - - # print(data) - # print(data.isnull().sum()) - - def _fill_easy(self, data): - """Make easy fills.""" - - # print(data) - # print(data.isnull().sum()) - - # fill volumes with zeros (safest choice) - data['volume'] = data['volume'].fillna(0.) 
- - # fill close price with open price - data['close'] = data['close'].fillna(data['open']) - - # fill open price with close from day(s) before - # repeat as long as it helps (up to 1 year) - for shifter in range(252): - logger.info( - "Filling opens with close from %s days before", shifter) - orig_missing_opens = data['open'].isnull().sum() - data['open'] = data['open'].fillna(data['close'].shift( - shifter+1)) - new_missing_opens = data['open'].isnull().sum() - if orig_missing_opens == new_missing_opens: - break - - # fill close price with same day's open - data['close'] = data['close'].fillna(data['open']) - - # fill high price with max - data['high'] = data['high'].fillna(data[['open', 'close']].max(1)) - - # fill low price with max - data['low'] = data['low'].fillna(data[['open', 'close']].min(1)) - - # print(data) - # print(data.isnull().sum()) - def _compute_total_returns(self, data): """Compute total open-to-open returns.""" @@ -503,26 +557,38 @@ def _compute_total_returns(self, data): # print(data) # print(data.isnull().sum()) - def _process(self, data): - """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" - - self._nan_impossible(data) + def _post_process(self, new_data, saved_data=None): + """Temporary.""" - self._fill_easy(data) - - self._compute_total_returns(data) + self._compute_total_returns(new_data) # eliminate adjclose column - del data["adjclose"] + del new_data["adjclose"] # eliminate last period's intraday data - data.loc[data.index[-1], + new_data.loc[new_data.index[-1], ["high", "low", "close", "return", "volume"]] = np.nan - return data + # def _process(self, data): + # """Clean Yahoo Finance open-low-high-close-volume-adjclose data.""" + + # self._nan_impossible(data) + + # self._fill(data) + + # self._compute_total_returns(data) + + # # eliminate adjclose column + # del data["adjclose"] + + # # eliminate last period's intraday data + # data.loc[data.index[-1], + # ["high", "low", "close", "return", "volume"]] = np.nan + + # return data def _quality_check(self, data): - """Analyze quality of the OHLCV-TR data.""" + """Analyze quality of the OLHCV-TR data.""" # zero volume zerovol_idx = data.index[data.volume == 0] @@ -563,7 +629,7 @@ def print_extreme(logreturns, name, sigmas=50): print_extreme(open2low, 'open to low returns') -class OHLCVAC(OHLCVTR): +class OLHCVAC(OLHCVTR): """Open-High-Low-Close-Volume-AdjustedClose data. This is modeled after the data returned by Yahoo Finance. It implements @@ -572,8 +638,25 @@ class OHLCVAC(OHLCVTR): returns from the adjusted closes, and do some error checks. """ + def _nan_impossible(self, new_data, saved_data=None): + """Set impossible values to NaN.""" + + # call the OLHCV method + super()._nan_impossible(new_data) + + # also do it on adjclose + self._nan_nonpositive_prices(new_data, "adjclose") + + # def _process(self, data): + # """Obtain total returns and call parent's method.""" + + # # data['total_return'] = data['adjclose'].ffill().pct_change() + + # # Then continue with OLHCVTR processing + # return super()._process(data) + -class YahooFinance(OHLCVAC): +class YahooFinance(OLHCVAC): """Yahoo Finance symbol data. :param symbol: The symbol that we downloaded. @@ -594,7 +677,7 @@ class YahooFinance(OHLCVAC): @staticmethod def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): - """Get 1-day OHLC-AC-V from Yahoo finance. + """Get 1-day OLHC-AC-V from Yahoo finance. 
This is roughly equivalent to @@ -705,7 +788,7 @@ def _download(self, symbol, current=None, Returns: updated (pandas.DataFrame): updated DataFrame for the symbol """ - # TODO this could be put at a much lower class hierarchy + # TODO this could be put at a lower class hierarchy if overlap < 2: raise SyntaxError( f'{self.__class__.__name__} with overlap smaller than 2' diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index bb23f2b50..a1d4d048b 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -144,7 +144,7 @@ def test_yahoo_finance(self): base_location=self.datadir) def test_yahoo_finance_removefirstline(self): - """Test that the first line of OHLCV is removed if there are NaNs.""" + """Test that the first line of OLHCV is removed if there are NaNs.""" # this symbol was found to have NaNs in the first line _ = YahooFinance( From fddac3e534ea28aa9b6706216b639a48a740ccde Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 12 Feb 2024 12:07:42 +0400 Subject: [PATCH 15/38] refactoring --- cvxportfolio/data/symbol_data.py | 32 ++++++++++++++++++++++++++------ cvxportfolio/tests/test_data.py | 2 +- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index b85b56cff..8e020986f 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -300,7 +300,7 @@ def _process(self, new_data, saved_data=None): It operates on the ``new_data`` dataframe, which is the newly downloaded data. The ``saved_data`` dataframe is provided as well (None if there is none). It has the same columns, older timestamps - with possible overlap with new_data at its end, and is **read only**: + (possibly overlapping with new_data at the end), and is **read only**: it is used as reference to help with the cleaning, it has already been cleaned. @@ -562,6 +562,21 @@ def _post_process(self, new_data, saved_data=None): self._compute_total_returns(new_data) + # close2close_total = np.log(1 + new_data['total_return']) + # open2close = np.log(new_data['close']) - np.log(new_data['open']) + # open2open_total = close2close_total - open2close + open2close.shift(1) + # alt = (np.exp(open2open_total) - 1).shift(-1) + + close_div_open = new_data['close'] / new_data['open'] + alt = ((1 + new_data['total_return']) / close_div_open) * close_div_open.shift(1) - 1 + alt = alt.shift(-1) + + # import code; code.interact(local=locals()) + + assert np.allclose(new_data['return'].dropna(), alt.dropna()) + + new_data['return'] = alt + # eliminate adjclose column del new_data["adjclose"] @@ -647,13 +662,18 @@ def _nan_impossible(self, new_data, saved_data=None): # also do it on adjclose self._nan_nonpositive_prices(new_data, "adjclose") - # def _process(self, data): - # """Obtain total returns and call parent's method.""" + def _specific_process(self, new_data, saved_data=None): + """Specific process, compute total returns.""" - # # data['total_return'] = data['adjclose'].ffill().pct_change() + # Close-to-close total return, so we can delegate to parent class. + # Note that this uses different time alignment than Cvxportfolio, + # Here today's return uses yesterday close and today close, while + # today's returns in Cvxportfolio use today open and tomorrow open. + # However this is the format more common among data vendors. 
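+        # sketch: an adjclose series [100., 102., 101.] gives total_return
+        # [NaN, 0.02, -0.0098...], each entry using the previous and
+        # current (adjusted) closes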
+ new_data['total_return'] = new_data['adjclose'].ffill().pct_change() - # # Then continue with OLHCVTR processing - # return super()._process(data) + # We don't need this any more. + # del new_data['adjclose'] class YahooFinance(OLHCVAC): diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index a1d4d048b..bb23f2b50 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -144,7 +144,7 @@ def test_yahoo_finance(self): base_location=self.datadir) def test_yahoo_finance_removefirstline(self): - """Test that the first line of OLHCV is removed if there are NaNs.""" + """Test that the first line of OHLCV is removed if there are NaNs.""" # this symbol was found to have NaNs in the first line _ = YahooFinance( From a5988541f08fdc2fd73026522764a230fc2eab84 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 12 Feb 2024 16:28:16 +0400 Subject: [PATCH 16/38] refactoring, test cvxportfolio/tests/test_data.py TestData.test_yfinance_download became fragile, need to understand why --- cvxportfolio/data/market_data.py | 3 +- cvxportfolio/data/symbol_data.py | 48 ++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py index 1502d4dd5..6ba1c406e 100644 --- a/cvxportfolio/data/market_data.py +++ b/cvxportfolio/data/market_data.py @@ -24,6 +24,7 @@ from ..utils import (hash_, make_numeric, periods_per_year_from_datetime_index, resample_returns) from .symbol_data import * +from .symbol_data import OLHCV logger = logging.getLogger(__name__) @@ -631,7 +632,7 @@ def _get_market_data(self, universe, grace_period, storage_backend): grace_period=grace_period, storage_backend=storage_backend) print() - if hasattr(self.datasource, 'IS_OLHCVR') and self.datasource.IS_OLHCVR: + if issubclass(self.datasource, OLHCV): self.returns = pd.DataFrame( {stock: database_accesses[stock].data['return'] for stock in universe}) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 8e020986f..440410522 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -379,6 +379,9 @@ def _fill(self, new_data, saved_data=None): # print(data) # print(data.isnull().sum()) + def _nan_indexes(self, data, bad_indexes, columns, message): + pass + def _nan_nonpositive_prices(self, data, prices_name): """Set non-positive prices (chosen column) to NaN, in-place.""" @@ -460,8 +463,8 @@ def _preload(self, data): # this is not used currently, but if we implement an interface to a # pure OLHCV data source there is no need to store the open-to-open # returns, they can be computed here - if not 'return' in data.columns: - data['return'] = data['open'].pct_change().shift(-1) + #if not 'return' in data.columns: + # data['return'] = data['open'].pct_change().shift(-1) self._quality_check(data) data["valuevolume"] = data["volume"] * data["open"] @@ -469,12 +472,22 @@ def _preload(self, data): return data -class OLHCVTR(OLHCV): # pylint: disable=abstract-method - """Open-Low-High-Close-Volume-TotalReturn symbol data.""" +class OLHCVAC(OLHCV): + """Open-High-Low-Close-Volume-AdjustedClose data. + + This is modeled after the data returned by Yahoo Finance. + """ +# It implements +# the transformation required to conform to the +# Open-High-Low-Close-Volume-TotalReturn model, that is, compute +# returns from the adjusted closes, and do some error checks. 
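+
+    # For intuition: Yahoo Finance's adjclose folds dividends and splits
+    # into the closing price, so adjclose.pct_change() is a close-to-close
+    # *total* return, while close.pct_change() is the price-only return.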
+# """ +# class OLHCVTR(OLHCV): # pylint: disable=abstract-method +# """Open-Low-High-Close-Volume-TotalReturn symbol data.""" # TODO: this becomes a isinstance(OLHC) in the caller # is open-high-low-close-volume-total return - IS_OLHCVR = True + # IS_OLHCVR = True # # rolstd windows for finding wrong logreturns # _ROLSTD_WINDOWS = [20, 60, 252] @@ -567,15 +580,18 @@ def _post_process(self, new_data, saved_data=None): # open2open_total = close2close_total - open2close + open2close.shift(1) # alt = (np.exp(open2open_total) - 1).shift(-1) - close_div_open = new_data['close'] / new_data['open'] - alt = ((1 + new_data['total_return']) / close_div_open) * close_div_open.shift(1) - 1 - alt = alt.shift(-1) + # close_div_open = new_data['close'] / new_data['open'] + # open_to_open_total = ( + # (1 + new_data['total_return']) / close_div_open + # ) * close_div_open.shift(1) - 1 # import code; code.interact(local=locals()) - assert np.allclose(new_data['return'].dropna(), alt.dropna()) + # assert np.allclose(new_data['return'].dropna(), open_to_open_total.shift(-1).dropna()) + + # new_data['return'] = open_to_open_total.shift(-1) - new_data['return'] = alt + # del new_data['total_return'] # eliminate adjclose column del new_data["adjclose"] @@ -643,16 +659,6 @@ def print_extreme(logreturns, name, sigmas=50): open2low = np.log(data['low']) - np.log(data['open']).dropna() print_extreme(open2low, 'open to low returns') - -class OLHCVAC(OLHCVTR): - """Open-High-Low-Close-Volume-AdjustedClose data. - - This is modeled after the data returned by Yahoo Finance. It implements - the transformation required to conform to the - Open-High-Low-Close-Volume-TotalReturn model, that is, compute - returns from the adjusted closes, and do some error checks. - """ - def _nan_impossible(self, new_data, saved_data=None): """Set impossible values to NaN.""" @@ -670,7 +676,7 @@ def _specific_process(self, new_data, saved_data=None): # Here today's return uses yesterday close and today close, while # today's returns in Cvxportfolio use today open and tomorrow open. # However this is the format more common among data vendors. - new_data['total_return'] = new_data['adjclose'].ffill().pct_change() + # new_data['total_return'] = new_data['adjclose'].ffill().pct_change() # We don't need this any more. 
# del new_data['adjclose'] From af76cb997f81ddf8e260737a73588b02e626f3e5 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Mon, 12 Feb 2024 23:08:15 +0400 Subject: [PATCH 17/38] more, cleaning needed --- cvxportfolio/data/symbol_data.py | 184 +++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 57 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 440410522..516d95de8 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -322,20 +322,20 @@ def _process(self, new_data, saved_data=None): """ self._nan_impossible(new_data, saved_data=saved_data) - self._specific_process(new_data, saved_data=saved_data) + # self._specific_process(new_data, saved_data=saved_data) self._nan_unlikely(new_data, saved_data=saved_data) self._fill(new_data, saved_data=saved_data) - self._post_process(new_data, saved_data=saved_data) + # self._post_process(new_data, saved_data=saved_data) return new_data - def _specific_process(self, new_data, saved_data=None): - """Specific process, do nothing.""" - # return new_data + # def _specific_process(self, new_data, saved_data=None): + # """Specific process, do nothing.""" + # # return new_data - def _post_process(self, new_data, saved_data=None): - """Post process, do nothing.""" - # return new_data + # def _post_process(self, new_data, saved_data=None): + # """Post process, do nothing.""" + # # return new_data def _nan_unlikely(self, new_data, saved_data=None): """Nan-out unlikely values.""" @@ -379,30 +379,86 @@ def _fill(self, new_data, saved_data=None): # print(data) # print(data.isnull().sum()) - def _nan_indexes(self, data, bad_indexes, columns, message): - pass - - def _nan_nonpositive_prices(self, data, prices_name): - """Set non-positive prices (chosen column) to NaN, in-place.""" + def _nan_values(self, data, condition, columns_to_nan, message): + """Set to NaN in-place on indexing condition chosen columns.""" - bad_indexes = data.index[data[prices_name] <= 0] + bad_indexes = data.index[condition] if len(bad_indexes) > 0: logger.warning( - '%s("%s") has non-positive %s prices on timestamps: %s,' + '%s("%s") has %s on timestamps: %s,' + ' setting to nan', - self.__class__.__name__, self.symbol, prices_name, bad_indexes) - data.loc[bad_indexes, prices_name] = np.nan + self.__class__.__name__, self.symbol, message, bad_indexes) + data.loc[bad_indexes, columns_to_nan] = np.nan + + def _nan_nonpositive_prices(self, data, prices_name): + """Set non-positive prices (chosen price name) to NaN, in-place.""" + self._nan_values( + data=data, condition = data[prices_name] <= 0, + columns_to_nan = prices_name, + message = f'non-positive {prices_name} prices') def _nan_negative_volumes(self, data): """Set negative volumes to NaN, in-place.""" - - bad_indexes = data.index[data["volume"] < 0] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") has negative volumes on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - data.loc[bad_indexes, "volume"] = np.nan + self._nan_values( + data=data, condition = data["volume"] < 0, + columns_to_nan = "volume", message = 'negative volumes') + + def _nan_open_lower_low(self, data): + """Set open price to NaN if lower than low, in-place.""" + self._nan_values( + data=data, condition = data['open'] < data['low'], + columns_to_nan = "open", + message = 'open price lower than low price') + + def _nan_open_higher_high(self, data): + """Set open price to NaN if higher than high, in-place.""" + 
self._nan_values( + data=data, condition = data['open'] > data['high'], + columns_to_nan = "open", + message = 'open price higher than high price') + + def _nan_incompatible_low_high(self, data): + """Set low and high to NaN if low is higher, in-place.""" + self._nan_values( + data=data, condition = data['low'] > data['high'], + columns_to_nan = ["low", "high"], + message = 'low price higher than high price') + + def _nan_high_lower_close(self, data): + """Set high price to NaN if lower than close, in-place.""" + self._nan_values( + data=data, condition = data['high'] < data['close'], + columns_to_nan = "high", + message = 'high price lower than close price') + + def _nan_low_higher_close(self, data): + """Set low price to NaN if higher than close, in-place.""" + self._nan_values( + data=data, condition = data['low'] > data['close'], + columns_to_nan = "low", + message = 'low price higher than close price') + + # def _nan_nonpositive_prices(self, data, prices_name): + # """Set non-positive prices (chosen column) to NaN, in-place.""" + + # bad_indexes = data.index[data[prices_name] <= 0] + # if len(bad_indexes) > 0: + # logger.warning( + # '%s("%s") has non-positive %s prices on timestamps: %s,' + # + ' setting to nan', + # self.__class__.__name__, self.symbol, prices_name, bad_indexes) + # data.loc[bad_indexes, prices_name] = np.nan + + # def _nan_negative_volumes(self, data): + # """Set negative volumes to NaN, in-place.""" + + # bad_indexes = data.index[data["volume"] < 0] + # if len(bad_indexes) > 0: + # logger.warning( + # '%s("%s") has negative volumes on timestamps: %s,' + # + ' setting to nan', + # self.__class__.__name__, self.symbol, bad_indexes) + # data.loc[bad_indexes, "volume"] = np.nan def _set_infty_to_nan(self, data): """Set all +/- infty elements of data to NaN, in-place.""" @@ -428,27 +484,34 @@ def _nan_impossible(self, new_data, saved_data=None): # all infinity values are nans self._set_infty_to_nan(new_data) - # TODO: these can be made smarter (sometimes the open is clearly wrong) + # more + self._nan_open_lower_low(new_data) + self._nan_open_higher_high(new_data) + self._nan_incompatible_low_high(new_data) + self._nan_high_lower_close(new_data) + self._nan_low_higher_close(new_data) - # if low is not the lowest, set it to nan - bad_indexes = new_data.index[ - new_data['low'] > new_data[['open', 'high', 'close']].min(1)] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") low prices are not the lowest on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - new_data.loc[bad_indexes, "low"] = np.nan + # TODO: these can be made smarter (sometimes the open is clearly wrong) - # if high is not the highest, set it to nan - bad_indexes = new_data.index[ - new_data['high'] < new_data[['open', 'high', 'close']].max(1)] - if len(bad_indexes) > 0: - logger.warning( - '%s("%s") high prices are not the highest on timestamps: %s,' - + ' setting to nan', - self.__class__.__name__, self.symbol, bad_indexes) - new_data.loc[bad_indexes, "high"] = np.nan + # # if low is not the lowest, set it to nan + # bad_indexes = new_data.index[ + # new_data['low'] > new_data[['open', 'high', 'close']].min(1)] + # if len(bad_indexes) > 0: + # logger.warning( + # '%s("%s") low prices are not the lowest on timestamps: %s,' + # + ' setting to nan', + # self.__class__.__name__, self.symbol, bad_indexes) + # new_data.loc[bad_indexes, "low"] = np.nan + + # # if high is not the highest, set it to nan + # bad_indexes = new_data.index[ + # new_data['high'] < 
new_data[['open', 'high', 'close']].max(1)] + # if len(bad_indexes) > 0: + # logger.warning( + # '%s("%s") high prices are not the highest on timestamps: %s,' + # + ' setting to nan', + # self.__class__.__name__, self.symbol, bad_indexes) + # new_data.loc[bad_indexes, "high"] = np.nan # TODO: factor quality check and clean into total-return related and non- @@ -463,8 +526,8 @@ def _preload(self, data): # this is not used currently, but if we implement an interface to a # pure OLHCV data source there is no need to store the open-to-open # returns, they can be computed here - #if not 'return' in data.columns: - # data['return'] = data['open'].pct_change().shift(-1) + if not 'return' in data.columns: + data['return'] = data['open'].pct_change().shift(-1) self._quality_check(data) data["valuevolume"] = data["volume"] * data["open"] @@ -552,6 +615,10 @@ def _compute_total_returns(self, data): # non-market log returns (dividends, splits) non_market_lr = log_adjustment_ratio.diff().shift(-1) + # dividend_return = (data['adjclose'] / data['close']).pct_change().shift(-1) + + # import code; code.interact(local=locals()) + # full open-to-open returns open_to_open = np.log(data["open"]).diff().shift(-1) data['return'] = np.exp(open_to_open + non_market_lr) - 1 @@ -570,9 +637,10 @@ def _compute_total_returns(self, data): # print(data) # print(data.isnull().sum()) - def _post_process(self, new_data, saved_data=None): + def _process(self, new_data, saved_data=None): """Temporary.""" + super()._process(new_data, saved_data=saved_data) self._compute_total_returns(new_data) # close2close_total = np.log(1 + new_data['total_return']) @@ -600,6 +668,8 @@ def _post_process(self, new_data, saved_data=None): new_data.loc[new_data.index[-1], ["high", "low", "close", "return", "volume"]] = np.nan + return new_data + # def _process(self, data): # """Clean Yahoo Finance open-low-high-close-volume-adjclose data.""" @@ -668,18 +738,18 @@ def _nan_impossible(self, new_data, saved_data=None): # also do it on adjclose self._nan_nonpositive_prices(new_data, "adjclose") - def _specific_process(self, new_data, saved_data=None): - """Specific process, compute total returns.""" + # def _specific_process(self, new_data, saved_data=None): + # """Specific process, compute total returns.""" - # Close-to-close total return, so we can delegate to parent class. - # Note that this uses different time alignment than Cvxportfolio, - # Here today's return uses yesterday close and today close, while - # today's returns in Cvxportfolio use today open and tomorrow open. - # However this is the format more common among data vendors. - # new_data['total_return'] = new_data['adjclose'].ffill().pct_change() + # # Close-to-close total return, so we can delegate to parent class. + # # Note that this uses different time alignment than Cvxportfolio, + # # Here today's return uses yesterday close and today close, while + # # today's returns in Cvxportfolio use today open and tomorrow open. + # # However this is the format more common among data vendors. + # # new_data['total_return'] = new_data['adjclose'].ffill().pct_change() - # We don't need this any more. - # del new_data['adjclose'] + # # We don't need this any more. 
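The net effect of the hunks above: several near-identical validation blocks collapse into one generic ``_nan_values`` helper plus a thin wrapper per impossible OLHCV condition (non-positive prices, negative volumes, open outside the low/high band, and so on). A minimal standalone sketch of the same pattern, with toy values that are not from the library:

.. code:: python

    import numpy as np
    import pandas as pd

    def nan_values(data, condition, columns_to_nan, message):
        """Set chosen columns to NaN, in-place, where condition holds."""
        bad_indexes = data.index[condition]
        if len(bad_indexes) > 0:
            print(f'found {message} on timestamps {list(bad_indexes)}')
            data.loc[bad_indexes, columns_to_nan] = np.nan

    # toy frame with one impossible row: open above high
    data = pd.DataFrame({
        'open': [10., 11., 25.], 'low': [9., 10., 11.],
        'high': [11., 12., 13.], 'close': [10.5, 11.5, 12.]})

    nan_values(data, data['open'] > data['high'], 'open',
               'open price higher than high price')
    assert np.isnan(data.loc[2, 'open'])

Each wrapper then reduces to a one-line condition, so the logging format and the nan-ing logic live in a single place.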
+ # # del new_data['adjclose'] class YahooFinance(OLHCVAC): From f085125a758685ab76198738ae99d493eb941c00 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 10:53:17 +0400 Subject: [PATCH 18/38] some cleaning, adding read_only --- cvxportfolio/data.py | 1398 ------------------------------ cvxportfolio/data/market_data.py | 56 +- cvxportfolio/data/symbol_data.py | 86 +- cvxportfolio/tests/test_utils.py | 2 +- cvxportfolio/utils.py | 30 + 5 files changed, 51 insertions(+), 1521 deletions(-) delete mode 100644 cvxportfolio/data.py diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py deleted file mode 100644 index 3e2be8232..000000000 --- a/cvxportfolio/data.py +++ /dev/null @@ -1,1398 +0,0 @@ -# Copyright 2023 Enzo Busseti -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This module include classes that download, store, and serve market data. - -The two main abstractions are :class:`SymbolData` and :class:`MarketData`. -Neither are exposed outside this module. Their derived classes instead are. - -If you want to interface cvxportfolio with financial data source other -than the ones we provide, you should derive from either of those two classes. -""" - -import datetime -import logging -import sqlite3 -import sys -import warnings -from pathlib import Path -from urllib.error import URLError - -import numpy as np -import pandas as pd -import requests -import requests.exceptions - -from .errors import DataError -from .utils import (hash_, make_numeric, periods_per_year_from_datetime_index, - resample_returns) - -__all__ = ["YahooFinance", "Fred", - "UserProvidedMarketData", "DownloadedMarketData"] - -logger = logging.getLogger(__name__) - -BASE_LOCATION = Path.home() / "cvxportfolio_data" - -def now_timezoned(): - """Return current timestamp with local timezone. - - :returns: Current timestamp with local timezone. - :rtype: pandas.Timestamp - """ - return pd.Timestamp( - datetime.datetime.now(datetime.timezone.utc).astimezone()) - -class SymbolData: - """Base class for a single symbol time series data. - - The data is either in the form of a Pandas Series or DataFrame - and has datetime index. - - This class needs to be derived. At a minimum, - one should redefine the ``_download`` method, which - implements the downloading of the symbol's time series - from an external source. The method takes the current (already - downloaded and stored) data and is supposed to **only append** to it. - In this way we only store new data and don't modify already downloaded - data. - - Additionally one can redefine the ``_preload`` method, which prepares - data to serve to the user (so the data is stored in a different format - than what the user sees.) We found that this separation can be useful. - - This class interacts with module-level functions named ``_loader_BACKEND`` - and ``_storer_BACKEND``, where ``BACKEND`` is the name of the storage - system used. We define ``pickle``, ``csv``, and ``sqlite`` backends. - These may have limitations. See their docstrings for more information. 
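The docstring above spells out the backend plug-in convention: storage is a pair of module-level functions resolved by name. Under that convention, a hypothetical new backend (parquet here, purely illustrative, not shipped by the library) would need only the two functions below, mirroring the single-column squeeze of the csv loader:

.. code:: python

    import pandas as pd

    # Hypothetical parquet backend; SymbolData resolves backends by
    # name, as in globals()['_loader_parquet'].
    def _loader_parquet(symbol, storage_location):
        """Load one symbol's data stored in parquet format."""
        tmp = pd.read_parquet(storage_location / f"{symbol}.parquet")
        # return a Series if the stored object had a single column
        return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp

    def _storer_parquet(symbol, data, storage_location):
        """Store one symbol's data in parquet format."""
        pd.DataFrame(data).to_parquet(
            storage_location / f"{symbol}.parquet")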
- - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param base_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. By default it's one day. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded data for the symbol. - """ - - def __init__(self, symbol, - storage_backend='pickle', - base_location=BASE_LOCATION, - grace_period=pd.Timedelta('1d')): - self._symbol = symbol - self._storage_backend = storage_backend - self._base_location = base_location - self.update(grace_period) - self._data = self.load() - - @property - def storage_location(self): - """Storage location. Directory is created if not existent. - - :rtype: pathlib.Path - """ - loc = self._base_location / f"{self.__class__.__name__}" - loc.mkdir(parents=True, exist_ok=True) - return loc - - @property - def symbol(self): - """The symbol whose data this instance contains. - - :rtype: str - """ - return self._symbol - - @property - def data(self): - """Time series data, updated to the most recent observation. - - :rtype: pandas.Series or pandas.DataFrame - """ - return self._data - - def _load_raw(self): - """Load raw data from database.""" - # we could implement multiprocess safety here - loader = globals()['_loader_' + self._storage_backend] - try: - logger.info( - f"{self.__class__.__name__} is trying to load {self.symbol}" - + f" with {self._storage_backend} backend" - + f" from {self.storage_location}") - return loader(self.symbol, self.storage_location) - except FileNotFoundError: - return None - - def load(self): - """Load data from database using `self.preload` function to process. - - :returns: Loaded time-series data for the symbol. - :rtype: pandas.Series or pandas.DataFrame - """ - return self._preload(self._load_raw()) - - def _store(self, data): - """Store data in database. - - :param data: Time-series data to store. - :type data: pandas.Series or pandas.DataFrame - """ - # we could implement multiprocess safety here - storer = globals()['_storer_' + self._storage_backend] - logger.info( - f"{self.__class__.__name__} is storing {self.symbol}" - + f" with {self._storage_backend} backend" - + f" in {self.storage_location}") - storer(self.symbol, data, self.storage_location) - - def _print_difference(self, current, new): - """Helper method to print difference if update is not append-only. - - This is temporary and will be re-factored. - """ - print("TEMPORARY: Diff between overlap of downloaded and stored") - print((new - current).dropna(how='all').tail(5)) - - def update(self, grace_period): - """Update current stored data for symbol. - - :param grace_period: If the time between now and the last value stored - is less than this, we don't update the data already stored. - :type grace_period: pandas.Timedelta - """ - current = self._load_raw() - logger.info( - f"Downloading {self.symbol}" - + f" from {self.__class__.__name__}") - updated = self._download( - self.symbol, current, grace_period=grace_period) - - if np.any(updated.iloc[:-1].isnull()): - logger.warning( - " cvxportfolio.%s('%s').data contains NaNs." 
- + " You may want to inspect it. If you want, you can delete the" - + " data file in %s to force re-download from the start.", - self.__class__.__name__, self.symbol, self.storage_location) - - try: - if current is not None: - if not np.all( - # we use numpy.isclose because returns may be computed - # via logreturns and numerical errors can sift through - np.isclose(updated.loc[current.index[:-1]], - current.iloc[:-1], equal_nan=True, - rtol=1e-08, atol=1e-08)): - logger.error(f"{self.__class__.__name__} update" - + f" of {self.symbol} is not append-only!") - self._print_difference(current, updated) - if hasattr(current, 'columns'): - # the first column is open price - if not current.iloc[-1, 0] == updated.loc[ - current.index[-1]].iloc[0]: - logger.error( - f"{self.__class__.__name__} update " - + f" of {self.symbol} changed last open price!") - self._print_difference(current, updated) - else: - if not current.iloc[-1] == updated.loc[current.index[-1]]: - logger.error( - f"{self.__class__.__name__} update" - + f" of {self.symbol} changed last value!") - self._print_difference(current, updated) - except KeyError: - logger.error("%s update of %s could not be checked for" - + " append-only edits. Was there a DST change?", - self.__class__.__name__, self.symbol) - self._store(updated) - - def _download(self, symbol, current, grace_period, **kwargs): - """Download data from external source given already downloaded data. - - This method must be redefined by derived classes. - - :param symbol: The symbol we download. - :type symbol: str - :param current: The data already downloaded. We are supposed to - **only append** to it. If None, no data is present. - :type current: pandas.Series or pandas.DataFrame or None - :rtype: pandas.Series or pandas.DataFrame - """ - raise NotImplementedError #pragma: no cover - - def _preload(self, data): - """Prepare data to serve to the user. - - This method can be redefined by derived classes. - - :param data: The data returned by the storage backend. - :type data: pandas.Series or pandas.DataFrame - :rtype: pandas.Series or pandas.DataFrame - """ - return data - - -# -# Yahoo Finance. -# - -def _timestamp_convert(unix_seconds_ts): - """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" - return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') - - -class YahooFinance(SymbolData): - """Yahoo Finance symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded, and cleaned, data for the symbol. 
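The update logic above checks that each refresh is append-only: the overlap between stored and freshly downloaded data must agree up to numerical noise, since returns may be recomputed through log returns. A condensed sketch of that overlap check, on made-up series:

.. code:: python

    import numpy as np
    import pandas as pd

    stored = pd.Series([1.0, 2.0, 3.0])
    # a re-download reproduces the overlap (up to noise), appends a row
    downloaded = pd.Series([1.0, 2.0 + 1e-12, 3.0, 4.0])

    append_only = np.all(np.isclose(
        downloaded.loc[stored.index[:-1]], stored.iloc[:-1],
        equal_nan=True, rtol=1e-08, atol=1e-08))
    print('append-only update:', append_only)

The last stored row is deliberately excluded from the comparison, since the most recent observation (an open trading day) is expected to change.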
- :type data: pandas.DataFrame - """ - - # is open-high-low-close-volume-(total)return - IS_OLHCVR = True - - @staticmethod - def _clean(data): - """Clean Yahoo Finance open-close-high-low-volume-adjclose data.""" - - # print(data) - # print(data.isnull().sum()) - - # nan-out nonpositive prices - data.loc[data["open"] <= 0, 'open'] = np.nan - data.loc[data["close"] <= 0, "close"] = np.nan - data.loc[data["high"] <= 0, "high"] = np.nan - data.loc[data["low"] <= 0, "low"] = np.nan - data.loc[data["adjclose"] <= 0, "adjclose"] = np.nan - - # nan-out negative volumes - data.loc[data["volume"] < 0, 'volume'] = np.nan - - # all infinity values are nans - data.iloc[:, :] = np.nan_to_num( - data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - - # print(data) - # print(data.isnull().sum()) - - # if low is not the lowest, set it to nan - data.loc[data['low'] > data[['open', 'high', 'close']].min(1), - 'low'] = np.nan - - # if high is not the highest, set it to nan - data.loc[data['high'] < data[['open', 'high', 'close']].max(1), - 'high'] = np.nan - - # print(data) - # print(data.isnull().sum()) - - # - # fills - # - - # fill volumes with zeros (safest choice) - data['volume'] = data['volume'].fillna(0.) - - # fill close price with open price - data['close'] = data['close'].fillna(data['open']) - - # fill open price with close from day(s) before - # repeat as long as it helps (up to 1 year) - for shifter in range(252): - orig_missing_opens = data['open'].isnull().sum() - data['open'] = data['open'].fillna(data['close'].shift( - shifter+1)) - new_missing_opens = data['open'].isnull().sum() - if orig_missing_opens == new_missing_opens: - break - logger.info( - "Filled missing open prices with close from %s periods before", - shifter+1) - - # fill close price with same day's open - data['close'] = data['close'].fillna(data['open']) - - # fill high price with max - data['high'] = data['high'].fillna(data[['open', 'close']].max(1)) - - # fill low price with max - data['low'] = data['low'].fillna(data[['open', 'close']].min(1)) - - # print(data) - # print(data.isnull().sum()) - - # - # Compute returns - # - - # compute log of ratio between adjclose and close - log_adjustment_ratio = np.log(data['adjclose'] / data['close']) - - # forward fill adjustment ratio - log_adjustment_ratio = log_adjustment_ratio.ffill() - - # non-market log returns (dividends, splits) - non_market_lr = log_adjustment_ratio.diff().shift(-1) - - # full open-to-open returns - open_to_open = np.log(data["open"]).diff().shift(-1) - data['return'] = np.exp(open_to_open + non_market_lr) - 1 - - # print(data) - # print(data.isnull().sum()) - - # intraday_logreturn = np.log(data["close"]) - np.log(data["open"]) - # close_to_close_logreturn = np.log(data["adjclose"]).diff().shift(-1) - # open_to_open_logreturn = ( - # close_to_close_logreturn + intraday_logreturn - - # intraday_logreturn.shift(-1) - # ) - # data["return"] = np.exp(open_to_open_logreturn) - 1 - del data["adjclose"] - - # eliminate last period's intraday data - data.loc[data.index[-1], - ["high", "low", "close", "return", "volume"]] = np.nan - - # print(data) - # print(data.isnull().sum()) - - return data - - @staticmethod - def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): - """Get 1 day OLHC from Yahoo finance. - - Result is timestamped with the open time (time-zoned) of the - instrument. 
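The return computation in ``_clean`` above is the subtle part: dividends and splits live entirely in the close-to-adjclose ratio, and get folded into the market open-to-open returns in log space. A compact sketch with made-up prices (the gap between close and adjclose on day 0 plays the role of a distribution between day 0 and day 1):

.. code:: python

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({
        'open': [10.0, 10.2, 10.4],
        'close': [10.1, 10.3, 10.5],
        'adjclose': [9.9, 10.3, 10.5]})

    # non-market log returns (dividends, splits)
    log_adjustment_ratio = np.log(data['adjclose'] / data['close']).ffill()
    non_market_lr = log_adjustment_ratio.diff().shift(-1)

    # market open-to-open log returns, aligned so that the return at
    # time t spans open(t) to open(t+1)
    open_to_open = np.log(data['open']).diff().shift(-1)

    data['return'] = np.exp(open_to_open + non_market_lr) - 1
    print(data['return'])  # last entry is NaN by construction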
- """ - - base_url = 'https://query2.finance.yahoo.com' - - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)' - ' AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/39.0.2171.95 Safari/537.36'} - - # print(HEADERS) - start = int(pd.Timestamp(start).timestamp()) - end = int(pd.Timestamp(end).timestamp()) - - try: - res = requests.get( - url=f"{base_url}/v8/finance/chart/{ticker}", - params={'interval': '1d', - "period1": start, - "period2": end}, - headers=headers, - timeout=10) # seconds - except requests.ConnectionError as exc: - raise DataError( - f"Download of {ticker} from YahooFinance failed." - + " Are you connected to the Internet?") from exc - - # print(res) - - if res.status_code == 404: - raise DataError( - f'Data for symbol {ticker} is not available.' - + 'Json output:', str(res.json())) - - if res.status_code != 200: - raise DataError(f'Yahoo finance download of {ticker} failed. Json:', - str(res.json())) # pragma: no cover - - data = res.json()['chart']['result'][0] - - try: - index = pd.DatetimeIndex( - [_timestamp_convert(el) for el in data['timestamp']]) - - df_result = pd.DataFrame( - data['indicators']['quote'][0], index=index) - df_result['adjclose'] = data[ - 'indicators']['adjclose'][0]['adjclose'] - except KeyError: - raise DataError(f'Yahoo finance download of {ticker} failed.' - + ' Json:', str(res.json())) # pragma: no cover - - # last timestamp is probably broken (not timed to market open) - # we set its time to same as the day before, but this is wrong - # on days of DST switch. It's fine though because that line will be - # overwritten next update - if df_result.index[-1].time() != df_result.index[-2].time(): - tm1 = df_result.index[-2].time() - newlast = df_result.index[-1].replace( - hour=tm1.hour, minute=tm1.minute, second=tm1.second) - df_result.index = pd.DatetimeIndex( - list(df_result.index[:-1]) + [newlast]) - - return df_result[ - ['open', 'low', 'high', 'close', 'adjclose', 'volume']] - - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Download single stock from Yahoo Finance. - - If data was already downloaded we only download - the most recent missing portion. - - Args: - - symbol (str): yahoo name of the instrument - current (pandas.DataFrame or None): current data present locally - overlap (int): how many lines of current data will be overwritten - by newly downloaded data - kwargs (dict): extra arguments passed to yfinance.download - - Returns: - updated (pandas.DataFrame): updated DataFrame for the symbol - """ - if overlap < 2: - raise SyntaxError( - f'{self.__class__.__name__} with overlap smaller than 2' - + ' could have issues with DST.') - if (current is None) or (len(current) < overlap): - updated = self._get_data_yahoo(symbol, **kwargs) - logger.info('Downloading from the start.') - result = self._clean(updated) - # we remove first row if it contains NaNs - if np.any(result.iloc[0].isnull()): - result = result.iloc[1:] - return result - if (now_timezoned() - current.index[-1] - ) < pd.Timedelta(grace_period): - logger.info( - 'Skipping download because stored data is recent enough.') - return current - new = self._get_data_yahoo(symbol, start=current.index[-overlap]) - new = self._clean(new) - return pd.concat([current.iloc[:-overlap], new]) - - def _preload(self, data): - """Prepare data for use by Cvxportfolio. 
- - We drop the `volume` column expressed in number of stocks and - replace it with `valuevolume` which is an estimate of the (e.g., - US dollar) value of the volume exchanged on the day. - """ - data["valuevolume"] = data["volume"] * data["open"] - del data["volume"] - - return data - -# -# Fred. -# - -class Fred(SymbolData): - """Fred single-symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. We store in a - subdirectory named after the class which derives from this. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. By default it's one day. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded data for the symbol. - """ - - URL = "https://fred.stlouisfed.org/graph/fredgraph.csv" - - # TODO: implement Fred point-in-time - # example: - # https://alfred.stlouisfed.org/graph/alfredgraph.csv?id=CES0500000003&vintage_date=2023-07-06 - # hourly wages time series **as it appeared** on 2023-07-06 - # store using pd.Series() of diff'ed values only. - - def _internal_download(self, symbol): - try: - return pd.read_csv( - self.URL + f'?id={symbol}', - index_col=0, parse_dates=[0])[symbol] - except URLError as exc: - raise DataError(f"Download of {symbol}" - + f" from {self.__class__.__name__} failed." - + " Are you connected to the Internet?") from exc - - def _download( - self, symbol="DFF", current=None, grace_period='5d', **kwargs): - """Download or update pandas Series from Fred. - - If already downloaded don't change data stored locally and only - add new entries at the end. - - Additionally, we allow for a `grace period`, if the data already - downloaded has a last entry not older than the grace period, we - don't download new data. - """ - if current is None: - return self._internal_download(symbol) - if (pd.Timestamp.today() - current.index[-1] - ) < pd.Timedelta(grace_period): - logger.info( - 'Skipping download because stored data is recent enough.') - return current - - new = self._internal_download(symbol) - new = new.loc[new.index > current.index[-1]] - - if new.empty: - logger.info('New downloaded data is empty!') - return current - - assert new.index[0] > current.index[-1] - return pd.concat([current, new]) - - def _preload(self, data): - """Add UTC timezone.""" - data.index = data.index.tz_localize('UTC') - return data - -# -# Sqlite storage backend. -# - -def _open_sqlite(storage_location): - return sqlite3.connect(storage_location/"db.sqlite") - -def _close_sqlite(connection): - connection.close() - -def _loader_sqlite(symbol, storage_location): - """Load data in sqlite format. - - We separately store dtypes for data consistency and safety. - - .. note:: If your pandas object's index has a name it will be lost, - the index is renamed 'index'. If you pass timestamp data (including - the index) it must have explicit timezone. 
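The Fred updater above encodes a simple contract: within the grace period, serve what is stored; otherwise download, keep only strictly newer rows, and append. A condensed sketch, assuming a tz-naive index as in the raw Fred csv, with ``download`` a stand-in for the actual fetch:

.. code:: python

    import pandas as pd

    def update_series(current, download, grace_period=pd.Timedelta('5d')):
        """Append-only update sketch with a grace period."""
        if current is None:
            return download()
        if pd.Timestamp.today() - current.index[-1] < grace_period:
            return current  # stored data is recent enough, skip download
        new = download()
        new = new.loc[new.index > current.index[-1]]  # strictly newer rows
        return current if new.empty else pd.concat([current, new])

The same grace-period short-circuit appears in the Yahoo Finance updater above, which additionally re-downloads a small overlap window.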
- """ - try: - connection = _open_sqlite(storage_location) - dtypes = pd.read_sql_query( - f"SELECT * FROM {symbol}___dtypes", - connection, index_col="index", - dtype={"index": "str", "0": "str"}) - - parse_dates = 'index' - my_dtypes = dict(dtypes["0"]) - - tmp = pd.read_sql_query( - f"SELECT * FROM {symbol}", connection, - index_col="index", parse_dates=parse_dates, dtype=my_dtypes) - - _close_sqlite(connection) - multiindex = [] - for col in tmp.columns: - if col[:8] == "___level": - multiindex.append(col) - else: - break - if len(multiindex) > 0: - multiindex = [tmp.index.name] + multiindex - tmp = tmp.reset_index().set_index(multiindex) - return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp - except pd.errors.DatabaseError: - return None - -def _storer_sqlite(symbol, data, storage_location): - """Store data in sqlite format. - - We separately store dtypes for data consistency and safety. - - .. note:: If your pandas object's index has a name it will be lost, - the index is renamed 'index'. If you pass timestamp data (including - the index) it must have explicit timezone. - """ - connection = _open_sqlite(storage_location) - exists = pd.read_sql_query( - f"SELECT name FROM sqlite_master WHERE type='table' AND name='{symbol}'", - connection) - - if len(exists): - _ = connection.cursor().execute(f"DROP TABLE '{symbol}'") - _ = connection.cursor().execute(f"DROP TABLE '{symbol}___dtypes'") - connection.commit() - - if hasattr(data.index, "levels"): - data.index = data.index.set_names( - ["index"] + - [f"___level{i}" for i in range(1, len(data.index.levels))] - ) - data = data.reset_index().set_index("index") - else: - data.index.name = "index" - - if data.index[0].tzinfo is None: - warnings.warn('Index has not timezone, setting to UTC') - data.index = data.index.tz_localize('UTC') - - data.to_sql(f"{symbol}", connection) - pd.DataFrame(data).dtypes.astype("string").to_sql( - f"{symbol}___dtypes", connection) - _close_sqlite(connection) - - -# -# Pickle storage backend. -# - -def _loader_pickle(symbol, storage_location): - """Load data in pickle format.""" - return pd.read_pickle(storage_location / f"{symbol}.pickle") - -def _storer_pickle(symbol, data, storage_location): - """Store data in pickle format.""" - data.to_pickle(storage_location / f"{symbol}.pickle") - -# -# Csv storage backend. 
-# - -def _loader_csv(symbol, storage_location): - """Load data in csv format.""" - - index_dtypes = pd.read_csv( - storage_location / f"{symbol}___index_dtypes.csv", - index_col=0)["0"] - - dtypes = pd.read_csv( - storage_location / f"{symbol}___dtypes.csv", index_col=0, - dtype={"index": "str", "0": "str"}) - dtypes = dict(dtypes["0"]) - new_dtypes = {} - parse_dates = [] - for i, level in enumerate(index_dtypes): - if "datetime64[ns" in level: # includes all timezones - parse_dates.append(i) - for i, el in enumerate(dtypes): - if "datetime64[ns" in dtypes[el]: # includes all timezones - parse_dates += [i + len(index_dtypes)] - else: - new_dtypes[el] = dtypes[el] - - tmp = pd.read_csv(storage_location / f"{symbol}.csv", - index_col=list(range(len(index_dtypes))), - parse_dates=parse_dates, dtype=new_dtypes) - - return tmp.iloc[:, 0] if tmp.shape[1] == 1 else tmp - - -def _storer_csv(symbol, data, storage_location): - """Store data in csv format.""" - pd.DataFrame(data.index.dtypes if hasattr(data.index, 'levels') - else [data.index.dtype]).astype("string").to_csv( - storage_location / f"{symbol}___index_dtypes.csv") - pd.DataFrame(data).dtypes.astype("string").to_csv( - storage_location / f"{symbol}___dtypes.csv") - data.to_csv(storage_location / f"{symbol}.csv") - -# -# Market Data -# - -class MarketData: - """Prepare, hold, and serve market data. - - :method serve: Serve data for policy and simulator at time :math:`t`. - """ - - def serve(self, t): - """Serve data for policy and simulator at time :math:`t`. - - :param t: Trading time. It must be included in the timestamps returned - by :meth:`trading_calendar`. - :type t: pandas.Timestamp - - :returns: past_returns, current_returns, past_volumes, current_volumes, - current_prices - :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame, - pandas.Series, pandas.Series) - """ - raise NotImplementedError # pragma: no cover - - # pylint: disable=redundant-returns-doc - def trading_calendar( - self, start_time=None, end_time=None, include_end=True): - """Get trading calendar between times. - - :param start_time: Initial time of the trading calendar. Always - inclusive if present. If None, use the first available time. - :type start_time: pandas.Timestamp - :param end_time: Final time of the trading calendar. If None, - use the last available time. - :type end_time: pandas.Timestamp - :param include_end: Include end time. - :type include_end: bool - - :returns: Trading calendar. - :rtype: pandas.DatetimeIndex - """ - raise NotImplementedError # pragma: no cover - - @property - def periods_per_year(self): - """Average trading periods per year. - - :rtype: int - """ - raise NotImplementedError # pragma: no cover - - @property - def full_universe(self): # pylint: disable=redundant-returns-doc - """Full universe, which might not be available for trading. - - :returns: Full universe. - :rtype: pandas.Index - """ - raise NotImplementedError # pragma: no cover - - # pylint: disable=unused-argument, redundant-returns-doc - def partial_universe_signature(self, partial_universe): - """Unique signature of this instance with a partial universe. - - A partial universe is a subset of the full universe that is - available at some time for trading. - - This is used in cvxportfolio.cache to sign back-test caches that - are saved on disk. If not redefined it returns None which disables - on-disk caching. - - :param partial_universe: A subset of the full universe. - :type partial_universe: pandas.Index - - :returns: Signature. 
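The csv backend above round-trips dtypes by writing sidecar files, one for the index levels and one for the columns, so that datetimes and numerics re-parse faithfully. A self-contained sketch of the same idea with a single datetime index level (file names are illustrative; the sketch writes to the working directory):

.. code:: python

    import pandas as pd

    df = pd.DataFrame({'price': [1.5, 2.5]},
        index=pd.DatetimeIndex(
            ['2024-01-01', '2024-01-02'], name='index'))

    # store the data and a sidecar of column dtypes
    df.dtypes.astype('string').to_csv('TEST___dtypes.csv')
    df.to_csv('TEST.csv')

    # reload: re-parse the datetime index, apply the stored dtypes
    dtypes = dict(pd.read_csv('TEST___dtypes.csv', index_col=0)['0'])
    reloaded = pd.read_csv(
        'TEST.csv', index_col=0, parse_dates=[0], dtype=dtypes)
    assert (reloaded.dtypes == df.dtypes).all()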
- :rtype: str - """ - return None - -# compiled based on Interactive Brokers benchmark rates choices -# (see https://www.ibkrguides.com/kb/article-2949.htm) -# and their FRED codes -RATES = { - 'USDOLLAR': 'DFF', # Federal funds effective rate - 'EURO': 'ECBESTRVOLWGTTRMDMNRT', # BCE short term rate - 'GBPOUND': 'IUDSOIA', # SONIA - 'JPYEN': 'IRSTCB01JPM156N', # updated monthly - } - -class MarketDataInMemory(MarketData): - """Market data that is stored in memory when initialized.""" - - # this is overwritten in the derived classes' initializers - returns = None - - def __init__( - self, trading_frequency, base_location, cash_key, min_history, - online_usage = False): - """This must be called by the derived classes.""" - if (self.returns.index[-1] - self.returns.index[0]) < min_history: - raise DataError( - "The provided returns have less history " - + f"than the min_history {min_history}") - if trading_frequency: - self._downsample(trading_frequency) - self.trading_frequency = trading_frequency - - self._set_read_only() - self._check_sizes() - self._mask = None - self._masked_returns = None - self._masked_volumes = None - self._masked_prices = None - self.base_location = Path(base_location) - self.cash_key = cash_key - self._min_history_timedelta = min_history - self.online_usage = online_usage - - def _mask_dataframes(self, mask): - """Mask internal dataframes if necessary.""" - if (self._mask is None) or not np.all(self._mask == mask): - logger.info("Masking internal %s dataframes.", - self.__class__.__name__) - colmask = self.returns.columns[mask] - # self._masked_returns = self._df_or_ser_set_read_only( - # pd.DataFrame(self.returns.iloc[:, mask], copy=True)) - self._masked_returns = self._df_or_ser_set_read_only( - pd.DataFrame(self.returns.loc[:, colmask], copy=True)) - # self._masked_returns = self._df_or_ser_set_read_only( - # pd.DataFrame(np.array(self.returns.values[:, mask]), - # index=self.returns.index, columns=colmask)) - if not self.volumes is None: - # self._masked_volumes = self._df_or_ser_set_read_only( - # pd.DataFrame(self.volumes.iloc[:, mask[:-1]], copy=True)) - self._masked_volumes = self._df_or_ser_set_read_only( - pd.DataFrame(self.volumes.loc[:, colmask[:-1]], copy=True)) - # self._masked_volumes = self._df_or_ser_set_read_only( - # pd.DataFrame(np.array(self.volumes.values[:, mask[:-1]]), - # index=self.volumes.index, columns=colmask[:-1])) - if not self.prices is None: - # self._masked_prices = self._df_or_ser_set_read_only( - # pd.DataFrame(self.prices.iloc[:, mask[:-1]], copy=True)) - self._masked_prices = self._df_or_ser_set_read_only( - pd.DataFrame(self.prices.loc[:, colmask[:-1]], copy=True)) - self._mask = mask - - @property - def full_universe(self): - """Full universe, which might not be available for trading. - - :returns: Full universe. - :rtype: pandas.Index - """ - return self.returns.columns - - def serve(self, t): - """Serve data for policy and simulator at time :math:`t`. - - :param t: Time of execution, *e.g.*, stock market open of a given day. 
- :type t: pandas.Timestamp - - :returns: (past_returns, current_returns, past_volumes, - current_volumes, current_prices) - :rtype: (pandas.DataFrame, pandas.Series, pandas.DataFrame or None, - pandas.Series or None, pandas.Series or None) - """ - - mask = self._universe_mask_at_time(t).values - self._mask_dataframes(mask) - - tidx = self.returns.index.get_loc(t) - past_returns = self._df_or_ser_set_read_only( - pd.DataFrame(self._masked_returns.iloc[:tidx])) - current_returns = self._df_or_ser_set_read_only( - pd.Series(self._masked_returns.iloc[tidx])) - - if not self.volumes is None: - tidx = self.volumes.index.get_loc(t) - past_volumes = self._df_or_ser_set_read_only( - pd.DataFrame(self._masked_volumes.iloc[:tidx])) - current_volumes = self._df_or_ser_set_read_only( - pd.Series(self._masked_volumes.iloc[tidx])) - else: - past_volumes = None - current_volumes = None - - if not self.prices is None: - tidx = self.prices.index.get_loc(t) - current_prices = self._df_or_ser_set_read_only( - pd.Series(self._masked_prices.iloc[tidx])) - else: - current_prices = None - - return (past_returns, current_returns, past_volumes, current_volumes, - current_prices) - - def _add_cash_column(self, cash_key, grace_period): - """Add the cash column to an already formed returns dataframe. - - This assumes that the trading periods are about equally spaced. - If, say, you have trading periods with very different lengths you - should redefine this method **and** replace the :class:`CashReturn` - objective term. - """ - - if not cash_key in RATES: - raise NotImplementedError( - 'Currently the only data pipelines built are for cash_key' - f' in {list(RATES)}') - - if self.returns.index.tz is None: - raise DataError( - 'Your provided dataframes are not timezone aware.' - + " This is not recommended, and doesn't allow to add the cash" - + " returns' column internally." - + " You can fix this by adding a timezone manually " - + "using pandas.DataFrame.tz_localize to the dataframes before" - + " you pass them, or you can provide" - + " the cash returns' column as the last column of the returns" - + " dataframe (so it has one more column than volumes and" - + " prices, if provided), and set the cash_key parameter to" - + " its name.") - - data = Fred( - RATES[cash_key], base_location=self.base_location, - grace_period=grace_period) - - cash_returns_per_period = resample_returns( - data.data/100, periods=self.periods_per_year) - - # we merge instead of assigning column because indexes might - # be misaligned (e.g., with tz-aware timestamps) - cash_returns_per_period.name = self.cash_key - original_returns_index = self.returns.index - tmp = pd.concat( - [self.returns, cash_returns_per_period], sort=True, axis=1) - tmp[cash_key] = tmp[cash_key].ffill() - self.returns = tmp.loc[original_returns_index] - - def trading_calendar( - self, start_time=None, end_time=None, include_end=True): - """Get trading calendar from market data. - - :param start_time: Initial time of the trading calendar. Always - inclusive if present. If None, use the first available time. - :type start_time: pandas.Timestamp - :param end_time: Final time of the trading calendar. If None, - use the last available time. - :type end_time: pandas.Timestamp - :param include_end: Include end time. - :type include_end: bool - - :returns: Trading calendar. 
- :rtype: pandas.DatetimeIndex - """ - result = self.returns.index - result = result[result >= self._earliest_backtest_start] - if start_time: - result = result[result >= start_time] - if end_time: - result = result[(result <= end_time)] - if not include_end: - result = result[:-1] - return result - - def _universe_mask_at_time(self, t): - """Return the valid universe mask at time t.""" - past_returns = self.returns.loc[self.returns.index < t] - if self.online_usage: - valid_universe_mask = past_returns.count() >= self.min_history - else: - valid_universe_mask = ((past_returns.count() >= self.min_history) & - (~self.returns.loc[t].isnull())) - if sum(valid_universe_mask) <= 1: - raise DataError( - f'The trading universe at time {t} has size less or equal' - + ' than one, i.e., only the cash account. There are probably ' - + ' issues with missing data in the provided market returns.') - return valid_universe_mask - - @staticmethod - def _df_or_ser_set_read_only(df_or_ser): - """Set numpy array contained in dataframe to read only. - - This is done on data store internally before it is served to the - policy or the simulator to ensure data consistency in case some - element of the pipeline accidentally corrupts the data. - - This is enough to prevent direct assignement to the resulting - dataframe. However it could still be accidentally corrupted by - assigning to columns or indices that are not present in the - original. We avoid that case as well by returning a wrapped - dataframe (which doesn't copy data on creation) in - serve_data_policy and serve_data_simulator. - """ - data = df_or_ser.values - data.flags.writeable = False - if hasattr(df_or_ser, 'columns'): - return pd.DataFrame(data, index=df_or_ser.index, - columns=df_or_ser.columns) - return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name) - - def _set_read_only(self): - """Set internal dataframes to read-only.""" - - self.returns = self._df_or_ser_set_read_only(self.returns) - - if not self.prices is None: - self.prices = self._df_or_ser_set_read_only(self.prices) - - if not self.volumes is None: - self.volumes = self._df_or_ser_set_read_only(self.volumes) - - @property - def _earliest_backtest_start(self): - """Earliest date at which we can start a backtest.""" - return self.returns.iloc[:, :-1].dropna(how='all').index[ - self.min_history] - - sampling_intervals = { - 'weekly': 'W-MON', 'monthly': 'MS', 'quarterly': 'QS', 'annual': 'AS'} - - # @staticmethod - # def _is_first_interval_small(datetimeindex): - # """Check if post-resampling the first interval is small. - # - # We have no way of knowing exactly if the first interval - # needs to be dropped. We drop it if its length is smaller - # than the average of all others, minus 2 standard deviation. 
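The universe mask above is compact but central to back-tests: a name trades at time t only with at least ``min_history`` non-null past returns and, unless ``online_usage`` is set, a non-null return at t itself. A toy illustration with made-up returns:

.. code:: python

    import numpy as np
    import pandas as pd

    returns = pd.DataFrame({
        'AAA': [0.01, 0.02, 0.01, 0.00],
        'BBB': [np.nan, np.nan, 0.01, 0.02],
        'cash': [0.0001] * 4},
        index=pd.date_range('2024-01-01', periods=4, tz='UTC'))

    t = returns.index[2]
    min_history = 2
    past_returns = returns.loc[returns.index < t]
    valid = ((past_returns.count() >= min_history)
             & (~returns.loc[t].isnull()))
    print(valid)  # AAA and cash qualify; BBB lacks history at t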
- # """ - # first_interval = (datetimeindex[1] - datetimeindex[0]) - # all_others = (datetimeindex[2:] - datetimeindex[1:-1]) - # return first_interval < (all_others.mean() - 2 * all_others.std()) - - def _downsample(self, interval): - """_downsample market data.""" - if not interval in self.sampling_intervals: - raise SyntaxError( - 'Unsopported trading interval for down-sampling.') - interval = self.sampling_intervals[interval] - new_returns_index = pd.Series(self.returns.index, self.returns.index - ).resample(interval, closed='left', - label='left').first().values - # print(new_returns_index) - self.returns = np.exp(np.log( - 1+self.returns).resample(interval, closed='left', label='left' - ).sum(min_count=1))-1 - self.returns.index = new_returns_index - - # last row is always unknown - self.returns.iloc[-1] = np.nan - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.returns.index): - # self.returns = self.returns.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.returns.columns[:-1]: - self.returns.loc[ - (~(self.returns[col].isnull())).idxmax(), col] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.returns = self.returns.iloc[1:] - - if self.volumes is not None: - new_volumes_index = pd.Series( - self.volumes.index, self.volumes.index - ).resample(interval, closed='left', - label='left').first().values - self.volumes = self.volumes.resample( - interval, closed='left', label='left').sum(min_count=1) - self.volumes.index = new_volumes_index - - # last row is always unknown - self.volumes.iloc[-1] = np.nan - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.volumes.index): - # self.volumes = self.volumes.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.volumes.columns: - self.volumes.loc[ - (~(self.volumes[col].isnull())).idxmax(), col] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.volumes = self.volumes.iloc[1:] - - if self.prices is not None: - new_prices_index = pd.Series( - self.prices.index, self.prices.index - ).resample( - interval, closed='left', label='left').first().values - self.prices = self.prices.resample( - interval, closed='left', label='left').first() - self.prices.index = new_prices_index - - # # we drop the first row if its interval is small - # if self._is_first_interval_small(self.prices.index): - # self.prices = self.prices.iloc[1:] - - # we nan-out the first non-nan element of every col - for col in self.prices.columns: - self.prices.loc[ - (~(self.prices[col].isnull())).idxmax(), col] = np.nan - - # and we drop the first row, which is mostly NaNs anyway - self.prices = self.prices.iloc[1:] - - def _check_sizes(self): - """Check sizes of user-provided dataframes.""" - - if (not self.volumes is None) and ( - not (self.volumes.shape[1] == self.returns.shape[1] - 1) - or not all(self.volumes.columns == self.returns.columns[:-1])): - raise SyntaxError( - 'Volumes should have same columns as returns, minus cash_key.') - - if (not self.prices is None) and ( - not (self.prices.shape[1] == self.returns.shape[1] - 1) - or not all(self.prices.columns == self.returns.columns[:-1])): - raise SyntaxError( - 'Prices should have same columns as returns, minus cash_key.') - - @property - def periods_per_year(self): - """Average trading periods per year inferred from the data. - - :returns: Average periods per year. 
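Down-sampling returns, as in ``_downsample`` above, compounds them in log space: the coarse-period return is exp(sum of log(1+r)) - 1, and ``min_count=1`` keeps an all-NaN period as NaN instead of silently becoming zero. Roughly:

.. code:: python

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2024-01-01', periods=10, freq='B')
    daily = pd.Series(0.01, index=idx)  # ten business days of +1%

    weekly = np.exp(np.log(1 + daily).resample(
        'W-MON', closed='left', label='left').sum(min_count=1)) - 1
    print(weekly)  # a full five-day week compounds to about 5.101%

Volumes are instead summed directly and prices take the first observation of each period, matching their semantics.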
- :rtype: int - """ - return periods_per_year_from_datetime_index(self.returns.index) - - @property - def min_history(self): - """Min history expressed in periods. - - :returns: How many non-null elements of the past returns for a given - name are required to include it. - :rtype: int - """ - return int(np.round(self.periods_per_year * ( - self._min_history_timedelta / pd.Timedelta('365.24d')))) - - -class UserProvidedMarketData(MarketDataInMemory): - """User-provided market data. - - :param returns: Historical open-to-open returns. The return - at time :math:`t` is :math:`r_t = p_{t+1}/p_t -1` where - :math:`p_t` is the (open) price at time :math:`t`. Must - have datetime index. You can also include cash - returns as its last column, and set ``cash_key`` below to the last - column's name. - :type returns: pandas.DataFrame - :param volumes: Historical market volumes, expressed in units - of value (*e.g.*, US dollars). - :type volumes: pandas.DataFrame or None - :param prices: Historical open prices (*e.g.*, used for rounding - trades in the :class:`MarketSimulator`). - :type prices: pandas.DataFrame or None - :param trading_frequency: Instead of using frequency implied by - the index of the returns, down-sample all dataframes. - We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and - ``'annual'``. By default (None) don't down-sample. - :type trading_frequency: str or None - :param min_history: Minimum amount of time for which the returns - are not ``np.nan`` before each assets enters in a back-test. - :type min_history: pandas.Timedelta - :param base_location: The location of the storage, only used - in case it downloads the cash returns. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param cash_key: Name of the cash account. If not the last column - of the provided returns, it will be downloaded. In that case you should - make sure your provided dataframes have a timezone aware datetime - index. Its returns are the risk-free rate. - :type cash_key: str - :param online_usage: Disable removal of assets that have ``np.nan`` returns - for the given time. Default False. - :type online_usage: bool - """ - - # pylint: disable=too-many-arguments - def __init__(self, returns, volumes=None, prices=None, - copy_dataframes=True, trading_frequency=None, - min_history=pd.Timedelta('365.24d'), - base_location=BASE_LOCATION, - grace_period=pd.Timedelta('1d'), - cash_key='USDOLLAR', - online_usage=False): - - if returns is None: - raise SyntaxError( - "If you don't specify a universe you should pass `returns`.") - - self.base_location = Path(base_location) - self.cash_key = cash_key - - self.returns = pd.DataFrame( - make_numeric(returns), copy=copy_dataframes) - self.volumes = volumes if volumes is None else\ - pd.DataFrame(make_numeric(volumes), copy=copy_dataframes) - self.prices = prices if prices is None else\ - pd.DataFrame(make_numeric(prices), copy=copy_dataframes) - - if cash_key != returns.columns[-1]: - self._add_cash_column(cash_key, grace_period=grace_period) - - # this is mandatory - super().__init__( - trading_frequency=trading_frequency, - base_location=base_location, - cash_key=cash_key, - min_history=min_history, - online_usage=online_usage) - - -class DownloadedMarketData(MarketDataInMemory): - """Market data that is downloaded. - - :param universe: List of names as understood by the data source - used, *e.g.*, ``['AAPL', 'GOOG']`` if using the default - Yahoo Finance data source. 
- :type universe: list - :param datasource: The data source used. - :type datasource: str or :class:`SymbolData` class - :param cash_key: Name of the cash account, its rates will be downloaded - and added as last columns of the returns. Its returns are the - risk-free rate. - :type cash_key: str - :param base_location: The location of the storage. By default - it's a directory named ``cvxportfolio_data`` in your home folder. - :type base_location: pathlib.Path - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. By default ``'pickle'``. - :type storage_backend: str - :param min_history: Minimum amount of time for which the returns - are not ``np.nan`` before each assets enters in a back-test. - :type min_history: pandas.Timedelta - :param grace_period: If the most recent observation of each symbol's - data is less old than this we do not download new data. - By default it's one day. - :type grace_period: pandas.Timedelta - :param trading_frequency: Instead of using frequency implied by - the index of the returns, down-sample all dataframes. - We implement ``'weekly'``, ``'monthly'``, ``'quarterly'`` and - ``'annual'``. By default (None) don't down-sample. - :type trading_frequency: str or None - :param online_usage: Disable removal of assets that have ``np.nan`` returns - for the given time. Default False. - :type online_usage: bool - """ - - # pylint: disable=too-many-arguments - def __init__(self, - universe=(), - datasource='YahooFinance', - cash_key='USDOLLAR', - base_location=BASE_LOCATION, - storage_backend='pickle', - min_history=pd.Timedelta('365.24d'), - grace_period=pd.Timedelta('1d'), - trading_frequency=None, - online_usage=False): - """Initializer.""" - - # drop duplicates and ensure ordering - universe = sorted(set(universe)) - - self.base_location = Path(base_location) - self.cash_key = cash_key - if isinstance(datasource, type): - self.datasource = datasource - else: # try to load in current module - self.datasource = globals()[datasource] - self._get_market_data( - universe, grace_period=grace_period, - storage_backend=storage_backend) - self._add_cash_column(self.cash_key, grace_period=grace_period) - self._remove_missing_recent() - - # this is mandatory - super().__init__( - trading_frequency=trading_frequency, - base_location=base_location, - cash_key=cash_key, - min_history=min_history, - online_usage=online_usage) - - def _get_market_data(self, universe, grace_period, storage_backend): - """Download market data.""" - database_accesses = {} - print('Updating data', end='') - sys.stdout.flush() - - for stock in universe: - logger.info( - 'Updating %s with %s.', stock, self.datasource.__name__) - print('.', end='') - sys.stdout.flush() - database_accesses[stock] = self.datasource( - stock, base_location=self.base_location, - grace_period=grace_period, storage_backend=storage_backend) - print() - - if hasattr(self.datasource, 'IS_OLHCVR') and self.datasource.IS_OLHCVR: - self.returns = pd.DataFrame( - {stock: database_accesses[stock].data['return'] - for stock in universe}) - self.volumes = pd.DataFrame( - {stock: database_accesses[stock].data['valuevolume'] - for stock in universe}) - self.prices = pd.DataFrame( - {stock: database_accesses[stock].data['open'] - for stock in universe}) - else: # for now only Fred for indexes, we assume prices! 
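After all symbols are updated, ``_get_market_data`` assembles the per-symbol frames column-wise into the returns, volumes, and prices panels. A miniature version of that assembly with two fake symbols:

.. code:: python

    import pandas as pd

    idx = pd.to_datetime(['2024-01-02', '2024-01-03'])
    per_symbol = {  # stand-ins for SymbolData(...).data
        'AAA': pd.DataFrame({'return': [0.01, 0.02],
            'valuevolume': [1e6, 2e6], 'open': [10.0, 10.1]}, index=idx),
        'BBB': pd.DataFrame({'return': [0.00, -0.01],
            'valuevolume': [5e5, 7e5], 'open': [20.0, 20.0]}, index=idx)}

    returns = pd.DataFrame(
        {s: d['return'] for s, d in per_symbol.items()})
    volumes = pd.DataFrame(
        {s: d['valuevolume'] for s, d in per_symbol.items()})
    prices = pd.DataFrame(
        {s: d['open'] for s, d in per_symbol.items()})

Index alignment happens inside the ``pd.DataFrame`` constructor, so symbols with different trading calendars get NaNs rather than shifted rows.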
- assert isinstance(database_accesses[universe[0]].data, pd.Series) - self.prices = pd.DataFrame( - # open prices - {stock: database_accesses[stock].data for stock in universe}) - self.returns = 1 - self.prices / self.prices.shift(-1) - self.volumes = None - - def _remove_missing_recent(self): - """Clean recent data. - - Yahoo Finance may has issues with most recent data; we remove - recent days if there are NaNs. - """ - - if self.prices.iloc[-5:].isnull().any().any(): - logger.debug( - 'Removing some recent lines because there are missing values.') - drop_at = self.prices.iloc[-5:].isnull().any(axis=1).idxmax() - logger.debug('Dropping at index %s', drop_at) - self.returns = self.returns.loc[self.returns.index < drop_at] - if self.prices is not None: - self.prices = self.prices.loc[self.prices.index < drop_at] - if self.volumes is not None: - self.volumes = self.volumes.loc[self.volumes.index < drop_at] - - # for consistency we must also nan-out the last row - # of returns and volumes - self.returns.iloc[-1] = np.nan - if self.volumes is not None: - self.volumes.iloc[-1] = np.nan - - def partial_universe_signature(self, partial_universe): - """Unique signature of this instance with a partial universe. - - A partial universe is a subset of the full universe that is - available at some time for trading. - - This is used in cvxportfolio.cache to sign back-test caches that - are saved on disk. See its implementation below for details. If - not redefined it returns None which disables on-disk caching. - - :param partial_universe: A subset of the full universe. - :type partial_universe: pandas.Index - - :returns: Signature. - :rtype: str - """ - assert isinstance(partial_universe, pd.Index) - assert np.all(partial_universe.isin(self.full_universe)) - result = f'{self.__class__.__name__}(' - result += f'datasource={self.datasource.__name__}, ' - result += f'partial_universe_hash={hash_(np.array(partial_universe))},' - result += f' trading_frequency={self.trading_frequency})' - return result diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py index 6ba1c406e..e69530b5b 100644 --- a/cvxportfolio/data/market_data.py +++ b/cvxportfolio/data/market_data.py @@ -22,7 +22,7 @@ from ..errors import DataError from ..utils import (hash_, make_numeric, periods_per_year_from_datetime_index, - resample_returns) + resample_returns, set_pd_read_only) from .symbol_data import * from .symbol_data import OLHCV @@ -150,25 +150,25 @@ def _mask_dataframes(self, mask): logger.info("Masking internal %s dataframes.", self.__class__.__name__) colmask = self.returns.columns[mask] - # self._masked_returns = self._df_or_ser_set_read_only( + # self._masked_returns = set_pd_read_only( # pd.DataFrame(self.returns.iloc[:, mask], copy=True)) - self._masked_returns = self._df_or_ser_set_read_only( + self._masked_returns = set_pd_read_only( pd.DataFrame(self.returns.loc[:, colmask], copy=True)) - # self._masked_returns = self._df_or_ser_set_read_only( + # self._masked_returns = set_pd_read_only( # pd.DataFrame(np.array(self.returns.values[:, mask]), # index=self.returns.index, columns=colmask)) if not self.volumes is None: - # self._masked_volumes = self._df_or_ser_set_read_only( + # self._masked_volumes = set_pd_read_only( # pd.DataFrame(self.volumes.iloc[:, mask[:-1]], copy=True)) - self._masked_volumes = self._df_or_ser_set_read_only( + self._masked_volumes = set_pd_read_only( pd.DataFrame(self.volumes.loc[:, colmask[:-1]], copy=True)) - # self._masked_volumes = self._df_or_ser_set_read_only( + # 
self._masked_volumes = set_pd_read_only( # pd.DataFrame(np.array(self.volumes.values[:, mask[:-1]]), # index=self.volumes.index, columns=colmask[:-1])) if not self.prices is None: - # self._masked_prices = self._df_or_ser_set_read_only( + # self._masked_prices = set_pd_read_only( # pd.DataFrame(self.prices.iloc[:, mask[:-1]], copy=True)) - self._masked_prices = self._df_or_ser_set_read_only( + self._masked_prices = set_pd_read_only( pd.DataFrame(self.prices.loc[:, colmask[:-1]], copy=True)) self._mask = mask @@ -197,16 +197,16 @@ def serve(self, t): self._mask_dataframes(mask) tidx = self.returns.index.get_loc(t) - past_returns = self._df_or_ser_set_read_only( + past_returns = set_pd_read_only( pd.DataFrame(self._masked_returns.iloc[:tidx])) - current_returns = self._df_or_ser_set_read_only( + current_returns = set_pd_read_only( pd.Series(self._masked_returns.iloc[tidx])) if not self.volumes is None: tidx = self.volumes.index.get_loc(t) - past_volumes = self._df_or_ser_set_read_only( + past_volumes = set_pd_read_only( pd.DataFrame(self._masked_volumes.iloc[:tidx])) - current_volumes = self._df_or_ser_set_read_only( + current_volumes = set_pd_read_only( pd.Series(self._masked_volumes.iloc[tidx])) else: past_volumes = None @@ -214,7 +214,7 @@ def serve(self, t): if not self.prices is None: tidx = self.prices.index.get_loc(t) - current_prices = self._df_or_ser_set_read_only( + current_prices = set_pd_read_only( pd.Series(self._masked_prices.iloc[tidx])) else: current_prices = None @@ -306,38 +306,16 @@ def _universe_mask_at_time(self, t): + ' issues with missing data in the provided market returns.') return valid_universe_mask - @staticmethod - def _df_or_ser_set_read_only(df_or_ser): - """Set numpy array contained in dataframe to read only. - - This is done on data store internally before it is served to the - policy or the simulator to ensure data consistency in case some - element of the pipeline accidentally corrupts the data. - - This is enough to prevent direct assignement to the resulting - dataframe. However it could still be accidentally corrupted by - assigning to columns or indices that are not present in the - original. We avoid that case as well by returning a wrapped - dataframe (which doesn't copy data on creation) in - serve_data_policy and serve_data_simulator. - """ - data = df_or_ser.values - data.flags.writeable = False - if hasattr(df_or_ser, 'columns'): - return pd.DataFrame(data, index=df_or_ser.index, - columns=df_or_ser.columns) - return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name) - def _set_read_only(self): """Set internal dataframes to read-only.""" - self.returns = self._df_or_ser_set_read_only(self.returns) + self.returns = set_pd_read_only(self.returns) if not self.prices is None: - self.prices = self._df_or_ser_set_read_only(self.prices) + self.prices = set_pd_read_only(self.prices) if not self.volumes is None: - self.volumes = self._df_or_ser_set_read_only(self.volumes) + self.volumes = set_pd_read_only(self.volumes) @property def _earliest_backtest_start(self): diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 516d95de8..0419c645f 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -26,6 +26,7 @@ import requests.exceptions from ..errors import DataError +from ..utils import set_pd_read_only logger = logging.getLogger(__name__) @@ -140,7 +141,7 @@ def load(self): :returns: Loaded time-series data for the symbol. 
:rtype: pandas.Series or pandas.DataFrame """ - return self._preload(self._load_raw()) + return set_pd_read_only(self._preload(self._load_raw())) def _store(self, data): """Store data in database. @@ -329,14 +330,6 @@ def _process(self, new_data, saved_data=None): return new_data - # def _specific_process(self, new_data, saved_data=None): - # """Specific process, do nothing.""" - # # return new_data - - # def _post_process(self, new_data, saved_data=None): - # """Post process, do nothing.""" - # # return new_data - def _nan_unlikely(self, new_data, saved_data=None): """Nan-out unlikely values.""" # return new_data @@ -380,7 +373,7 @@ def _fill(self, new_data, saved_data=None): # print(data.isnull().sum()) def _nan_values(self, data, condition, columns_to_nan, message): - """Set to NaN in-place on indexing condition chosen columns.""" + """Set to NaN in-place for indexing condition and chosen columns.""" bad_indexes = data.index[condition] if len(bad_indexes) > 0: @@ -438,28 +431,6 @@ def _nan_low_higher_close(self, data): columns_to_nan = "low", message = 'low price higher than close price') - # def _nan_nonpositive_prices(self, data, prices_name): - # """Set non-positive prices (chosen column) to NaN, in-place.""" - - # bad_indexes = data.index[data[prices_name] <= 0] - # if len(bad_indexes) > 0: - # logger.warning( - # '%s("%s") has non-positive %s prices on timestamps: %s,' - # + ' setting to nan', - # self.__class__.__name__, self.symbol, prices_name, bad_indexes) - # data.loc[bad_indexes, prices_name] = np.nan - - # def _nan_negative_volumes(self, data): - # """Set negative volumes to NaN, in-place.""" - - # bad_indexes = data.index[data["volume"] < 0] - # if len(bad_indexes) > 0: - # logger.warning( - # '%s("%s") has negative volumes on timestamps: %s,' - # + ' setting to nan', - # self.__class__.__name__, self.symbol, bad_indexes) - # data.loc[bad_indexes, "volume"] = np.nan - def _set_infty_to_nan(self, data): """Set all +/- infty elements of data to NaN, in-place.""" @@ -491,28 +462,6 @@ def _nan_impossible(self, new_data, saved_data=None): self._nan_high_lower_close(new_data) self._nan_low_higher_close(new_data) - # TODO: these can be made smarter (sometimes the open is clearly wrong) - - # # if low is not the lowest, set it to nan - # bad_indexes = new_data.index[ - # new_data['low'] > new_data[['open', 'high', 'close']].min(1)] - # if len(bad_indexes) > 0: - # logger.warning( - # '%s("%s") low prices are not the lowest on timestamps: %s,' - # + ' setting to nan', - # self.__class__.__name__, self.symbol, bad_indexes) - # new_data.loc[bad_indexes, "low"] = np.nan - - # # if high is not the highest, set it to nan - # bad_indexes = new_data.index[ - # new_data['high'] < new_data[['open', 'high', 'close']].max(1)] - # if len(bad_indexes) > 0: - # logger.warning( - # '%s("%s") high prices are not the highest on timestamps: %s,' - # + ' setting to nan', - # self.__class__.__name__, self.symbol, bad_indexes) - # new_data.loc[bad_indexes, "high"] = np.nan - # TODO: factor quality check and clean into total-return related and non- def _preload(self, data): @@ -540,17 +489,6 @@ class OLHCVAC(OLHCV): This is modeled after the data returned by Yahoo Finance. """ -# It implements -# the transformation required to conform to the -# Open-High-Low-Close-Volume-TotalReturn model, that is, compute -# returns from the adjusted closes, and do some error checks. 
-# """
-# class OLHCVTR(OLHCV): # pylint: disable=abstract-method
-#     """Open-Low-High-Close-Volume-TotalReturn symbol data."""
-
-    # TODO: this becomes a isinstance(OLHC) in the caller
-    # is open-high-low-close-volume-total return
-    # IS_OLHCVR = True
-
     # # rolstd windows for finding wrong logreturns
     # _ROLSTD_WINDOWS = [20, 60, 252]
@@ -670,24 +608,6 @@ def _process(self, new_data, saved_data=None):
 
         return new_data
 
-    # def _process(self, data):
-    #     """Clean Yahoo Finance open-low-high-close-volume-adjclose data."""
-
-    #     self._nan_impossible(data)
-
-    #     self._fill(data)
-
-    #     self._compute_total_returns(data)
-
-    #     # eliminate adjclose column
-    #     del data["adjclose"]
-
-    #     # eliminate last period's intraday data
-    #     data.loc[data.index[-1],
-    #         ["high", "low", "close", "return", "volume"]] = np.nan
-
-    #     return data
-
     def _quality_check(self, data):
         """Analyze quality of the OLHCV-TR data."""
 
diff --git a/cvxportfolio/tests/test_utils.py b/cvxportfolio/tests/test_utils.py
index efdb7ba01..f4d3d3f4c 100644
--- a/cvxportfolio/tests/test_utils.py
+++ b/cvxportfolio/tests/test_utils.py
@@ -80,7 +80,7 @@ def test_make_numeric(self):
             np.array(['1', 2], dtype=object),
             pd.Series([1, '2', 3], dtype=object),
             pd.DataFrame([[1, '2.', 3], [4, '5.', 6]], dtype=object)]:
-            make_numeric(data)
+            self.assertTrue(np.all(data.astype(float) == make_numeric(data)))
 
         for data in [
             np.array(['1a', 2], dtype=object),
diff --git a/cvxportfolio/utils.py b/cvxportfolio/utils.py
index 0f5d3a25d..1454966fb 100644
--- a/cvxportfolio/utils.py
+++ b/cvxportfolio/utils.py
@@ -29,6 +29,35 @@
     'average_periods_per_year']
 
 
+def set_pd_read_only(df_or_ser):
+    """Set numpy array contained in dataframe or series to read only.
+
+    This is done on data stored internally before it is served to the
+    policy or the simulator to ensure data consistency in case some
+    element of the pipeline accidentally corrupts the data.
+
+    This is enough to prevent direct assignment to the resulting
+    dataframe. However it could still be accidentally corrupted by
+    assigning to columns or indices that are not present in the
+    original. We avoid that case as well by returning a wrapped
+    dataframe (which doesn't copy data on creation) in
+    serve_data_policy and serve_data_simulator.
+
+    :param df_or_ser: Series or Dataframe, only numeric (better if
+        homogeneous) dtype.
+    :type df_or_ser: pd.Series or pd.DataFrame
+
+    :returns: Pandas object set to read only.
+    :rtype: pd.Series or pd.DataFrame
+    """
+    data = df_or_ser.values
+    data.flags.writeable = False
+    if hasattr(df_or_ser, 'columns'):
+        return pd.DataFrame(data, index=df_or_ser.index,
+            columns=df_or_ser.columns)
+    return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name)
+
+
 def average_periods_per_year(num_periods, first_time, last_time):
     """Average periods per year of a datetime index (unpacked), rounded to int.
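The ``set_pd_read_only`` helper moved into utils.py above freezes the numpy
buffer backing a series or dataframe, then wraps it in a fresh pandas object
without copying. A minimal standalone sketch of the protection this buys (the
example frame and the try/except are illustrative only; with pandas'
copy-on-write mode enabled the assignment may copy instead of raising):

.. code:: python

    import numpy as np
    import pandas as pd

    # standalone copy of the set_pd_read_only added above
    def set_pd_read_only(df_or_ser):
        data = df_or_ser.values
        data.flags.writeable = False  # freeze the shared numpy buffer
        if hasattr(df_or_ser, 'columns'):
            return pd.DataFrame(data, index=df_or_ser.index,
                columns=df_or_ser.columns)
        return pd.Series(data, index=df_or_ser.index, name=df_or_ser.name)

    frozen = set_pd_read_only(
        pd.DataFrame(np.zeros((3, 2)), columns=['a', 'b']))
    try:
        frozen.iloc[0, 0] = 1.  # write lands on the read-only buffer
    except ValueError as error:
        print(error)  # "assignment destination is read-only"

Because the wrapper shares the frozen buffer instead of copying it, serving
read-only views of returns, volumes, and prices at every backtest timestamp
stays cheap.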
From 4072d546a1616c05b5d181e287a85bb629596031 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 11:40:23 +0400 Subject: [PATCH 19/38] testing --- cvxportfolio/data/symbol_data.py | 18 +- cvxportfolio/tests/test_data.py | 304 ++++++++++++++++++------------- 2 files changed, 191 insertions(+), 131 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 0419c645f..f94d49409 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -410,12 +410,12 @@ def _nan_open_higher_high(self, data): columns_to_nan = "open", message = 'open price higher than high price') - def _nan_incompatible_low_high(self, data): - """Set low and high to NaN if low is higher, in-place.""" - self._nan_values( - data=data, condition = data['low'] > data['high'], - columns_to_nan = ["low", "high"], - message = 'low price higher than high price') + # def _nan_incompatible_low_high(self, data): + # """Set low and high to NaN if low is higher, in-place.""" + # self._nan_values( + # data=data, condition = data['low'] > data['high'], + # columns_to_nan = ["low", "high"], + # message = 'low price higher than high price') def _nan_high_lower_close(self, data): """Set high price to NaN if lower than close, in-place.""" @@ -456,11 +456,11 @@ def _nan_impossible(self, new_data, saved_data=None): self._set_infty_to_nan(new_data) # more - self._nan_open_lower_low(new_data) - self._nan_open_higher_high(new_data) - self._nan_incompatible_low_high(new_data) self._nan_high_lower_close(new_data) self._nan_low_higher_close(new_data) + self._nan_open_lower_low(new_data) + self._nan_open_higher_high(new_data) + # self._nan_incompatible_low_high(new_data) # TODO: factor quality check and clean into total-return related and non- diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index bb23f2b50..e5d920e98 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -327,6 +327,188 @@ def _base_test_multiindex(self, loader, storer): self.assertTrue(all(data.index.dtypes == data1.index.dtypes)) self.assertTrue(all(data.dtypes == data1.dtypes)) + def test_download_errors(self): + """Test single-symbol download error.""" + + storer = YahooFinance( + 'AAPL', grace_period=self.data_grace_period, + base_location=self.datadir) + with self.assertRaises(SyntaxError): + # pylint: disable=protected-access + storer._download('AAPL', overlap=1) + + class YahooFinanceErroneous(YahooFinance): + """Modified YF that nans last open price.""" + def _download(self, symbol, current=None, + overlap=5, grace_period='5d', **kwargs): + """Modified download method.""" + res = super()._download(symbol, current, + grace_period=grace_period) + res.iloc[-1, 0 ] = np.nan + return res + + _ = YahooFinanceErroneous('AMZN', base_location=self.datadir) + with self.assertLogs(level='ERROR') as _: + _ = YahooFinanceErroneous( + 'AMZN', base_location=self.datadir) + + class YahooFinanceErroneous2(YahooFinance): + """Modified YF that nans some line.""" + def _download(self, symbol, current=None, + overlap=5, grace_period='5d', **kwargs): + """Modified download method.""" + res = super()._download(symbol, current, + grace_period=grace_period) + res.iloc[-20] = np.nan + return res + with self.assertLogs(level='WARNING') as _: + _ = YahooFinanceErroneous2('GOOGL', + base_location=self.datadir) + with self.assertLogs(level='WARNING') as _: + _ = YahooFinanceErroneous2( + 'GOOGL', base_location=self.datadir) + + class FredErroneous(Fred): + 
"""Modified FRED SymbolData that gives a NaN in the last entry.""" + + def _download(self, symbol, current, grace_period): + """Modified download method.""" + res = super()._download(symbol, current, + grace_period=grace_period) + res.iloc[-1] = np.nan + return res + + _ = FredErroneous('DFF', base_location=self.datadir) + with self.assertLogs(level='ERROR') as _: + _ = FredErroneous( + 'DFF', base_location=self.datadir) + + class YahooFinanceErroneous3(YahooFinance): + """Modified YF that is not append-only.""" + counter = 0 + def _download(self, symbol, current=None, + overlap=5, grace_period='5d', **kwargs): + """Modified download method.""" + res = super()._download(symbol, current, + grace_period=grace_period) + if self.counter > 0: + res.iloc[-2] = 0. + self.counter += 1 + return res + storer = YahooFinanceErroneous3('GOOGL', base_location=self.datadir) + with self.assertLogs(level='ERROR') as _: + storer.update(pd.Timedelta('0d')) + + def test_no_internet(self): + """Test errors thrown when not connected to the internet.""" + + with NoInternet(): + with self.assertRaises(DataError): + cvx.YahooFinance('BABA', base_location=self.datadir) + + with NoInternet(): + with self.assertRaises(DataError): + cvx.Fred('CES0500000003', base_location=self.datadir) + + def test_yahoo_finance_errors(self): + """Test errors with Yahoo Finance.""" + + with self.assertRaises(DataError): + YahooFinance("DOESNTEXIST", base_location=self.datadir) + + def test_yahoo_finance_cleaning(self): + """Test our logic to clean Yahoo Finance data.""" + + # this stock was found to have NaN issues + data = YahooFinance("ENI.MI", base_location=self.datadir).data + self.assertTrue((data.valuevolume == 0).sum() > 0) + self.assertTrue(data.iloc[:-1].isnull().sum().sum() == 0) + + def test_yahoo_finance_cleaning_granular(self): + """Test each step of cleaning.""" + + # pylint: disable=protected-access + raw_data = YahooFinance._get_data_yahoo('ZM') + print(raw_data) + empty_instance = YahooFinance.__new__(YahooFinance) + empty_instance._symbol = 'ZM' # because the warnings use the symbol + + def _test_warning(data_transformation, part_of_message): + """Test that warning is raised w/ message containing some word.""" + data = pd.DataFrame(raw_data, copy=True) + exec(data_transformation) # pylint: disable=exec-used + with self.assertLogs(level='WARNING') as _: + _cleaned = empty_instance._process(data, None) + self.assertTrue(part_of_message in _.output[0]) + # check all NaNs have been filled + self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0) + + # infty + _test_warning( + 'data.iloc[2,2] = np.inf', + 'infinity') + + # non-pos price + _test_warning( + 'data.iloc[2,0] = -1', + 'non-positive open') + _test_warning( + 'data.iloc[2,0] = 0', + 'non-positive open') + _test_warning( + 'data.iloc[4,2] = 0', + 'non-positive high') + + # neg volume + _test_warning( + 'data.iloc[2,-1] = -1', + 'negative volumes') + + # open lower low + _test_warning( + 'data.iloc[1,0] = data.iloc[1,1]*.9', + 'open price lower than low price') + + # open higher high + _test_warning( + 'data.iloc[1,0] = data.iloc[1,2]*1.1', + 'open price higher than high price') + + # low higher close + _test_warning( + 'data.iloc[3,1] = data.iloc[3].close * 1.1', + 'low price higher than close price') + + # high lower close + _test_warning( + 'data.iloc[3,2] = data.iloc[3].close * .9', + 'high price lower than close price') + + # def test_yahoo_finance_wrong_last_time(self): + # """Test that we correct last time if intraday.""" + # + # class 
YahooFinanceErroneous4(YahooFinance): + # """Modified YF that sets last time wrong.""" + # counter = 0 + # + # @staticmethod + # def _get_data_yahoo( + # ticker, start='1900-01-01', end='2100-01-01'): + # """Modified download method.""" + # res = YahooFinance._get_data_yahoo( + # ticker, start=start, end=end) + # if self.counter > 0: + # res.index = list(res.index)[:-1] + [ + # res.index[-1] - pd.Timedelta('3h')] + # self.counter += 1 + # print(res) + # return res + # + # storer = YahooFinanceErroneous4('GOOGL', base_location=self.datadir) + # print(storer.data) + # #storer.update(pd.Timedelta('0d')) + # #print(storer.data) + class TestMarketData(CvxportfolioTest): """Test MarketData methods and interface.""" @@ -536,128 +718,6 @@ def test_signature(self): print(md.partial_universe_signature(md.full_universe)) - def test_download_errors(self): - """Test single-symbol download error.""" - - storer = YahooFinance( - 'AAPL', grace_period=self.data_grace_period, - base_location=self.datadir) - with self.assertRaises(SyntaxError): - # pylint: disable=protected-access - storer._download('AAPL', overlap=1) - - class YahooFinanceErroneous(YahooFinance): - """Modified YF that nans last open price.""" - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Modified download method.""" - res = super()._download(symbol, current, - grace_period=grace_period) - res.iloc[-1, 0 ] = np.nan - return res - - _ = YahooFinanceErroneous('AMZN', base_location=self.datadir) - with self.assertLogs(level='ERROR') as _: - _ = YahooFinanceErroneous( - 'AMZN', base_location=self.datadir) - - class YahooFinanceErroneous2(YahooFinance): - """Modified YF that nans some line.""" - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Modified download method.""" - res = super()._download(symbol, current, - grace_period=grace_period) - res.iloc[-20] = np.nan - return res - with self.assertLogs(level='WARNING') as _: - _ = YahooFinanceErroneous2('GOOGL', - base_location=self.datadir) - with self.assertLogs(level='WARNING') as _: - _ = YahooFinanceErroneous2( - 'GOOGL', base_location=self.datadir) - - class FredErroneous(Fred): - """Modified FRED SymbolData that gives a NaN in the last entry.""" - - def _download(self, symbol, current, grace_period): - """Modified download method.""" - res = super()._download(symbol, current, - grace_period=grace_period) - res.iloc[-1] = np.nan - return res - - _ = FredErroneous('DFF', base_location=self.datadir) - with self.assertLogs(level='ERROR') as _: - _ = FredErroneous( - 'DFF', base_location=self.datadir) - - class YahooFinanceErroneous3(YahooFinance): - """Modified YF that is not append-only.""" - counter = 0 - def _download(self, symbol, current=None, - overlap=5, grace_period='5d', **kwargs): - """Modified download method.""" - res = super()._download(symbol, current, - grace_period=grace_period) - if self.counter > 0: - res.iloc[-2] = 0. 
- self.counter += 1 - return res - storer = YahooFinanceErroneous3('GOOGL', base_location=self.datadir) - with self.assertLogs(level='ERROR') as _: - storer.update(pd.Timedelta('0d')) - - def test_no_internet(self): - """Test errors thrown when not connected to the internet.""" - - with NoInternet(): - with self.assertRaises(DataError): - cvx.YahooFinance('BABA', base_location=self.datadir) - - with NoInternet(): - with self.assertRaises(DataError): - cvx.Fred('CES0500000003', base_location=self.datadir) - - def test_yahoo_finance_errors(self): - """Test errors with Yahoo Finance.""" - - with self.assertRaises(DataError): - YahooFinance("DOESNTEXIST", base_location=self.datadir) - - def test_yahoo_finance_cleaning(self): - """Test our logic to clean Yahoo Finance data.""" - - # this stock was found to have NaN issues - data = YahooFinance("ENI.MI", base_location=self.datadir).data - self.assertTrue((data.valuevolume == 0).sum() > 0) - self.assertTrue(data.iloc[:-1].isnull().sum().sum() == 0) - - # def test_yahoo_finance_wrong_last_time(self): - # """Test that we correct last time if intraday.""" - # - # class YahooFinanceErroneous4(YahooFinance): - # """Modified YF that sets last time wrong.""" - # counter = 0 - # - # @staticmethod - # def _get_data_yahoo( - # ticker, start='1900-01-01', end='2100-01-01'): - # """Modified download method.""" - # res = YahooFinance._get_data_yahoo( - # ticker, start=start, end=end) - # if self.counter > 0: - # res.index = list(res.index)[:-1] + [ - # res.index[-1] - pd.Timedelta('3h')] - # self.counter += 1 - # print(res) - # return res - # - # storer = YahooFinanceErroneous4('GOOGL', base_location=self.datadir) - # print(storer.data) - # #storer.update(pd.Timedelta('0d')) - # #print(storer.data) - if __name__ == '__main__': unittest.main(warnings='error') # pragma: no cover From aea2f7647b2f0552223a8f638ef91666e6961ce6 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 12:51:20 +0400 Subject: [PATCH 20/38] basic anomalous cleaning --- cvxportfolio/data/symbol_data.py | 50 ++++++++++++++++++++++++++--- cvxportfolio/tests/test_data.py | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 5 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index f94d49409..6297cb399 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -252,7 +252,6 @@ def _timestamp_convert(unix_seconds_ts): return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') # Windows for filtering extreme logreturns -_WINDOWS = (10, 20, 50, 100, 200) def _median_scale_around(lrets, window): """Median absolute logreturn in a window around each timestamp.""" @@ -264,14 +263,13 @@ def _mean_scale_around(lrets, window): (lrets**2).rolling(window, center=True, min_periods=1).mean()) def _unlikeliness_score( - test_logreturns, reference_logreturns, scaler=_median_scale_around, - windows=_WINDOWS): + test_logreturns, reference_logreturns, scaler, windows): """Find problematic indexes for test logreturns compared w/ reference.""" scaled = [ np.abs(test_logreturns) / scaler(reference_logreturns, window) for window in windows] scaled = pd.DataFrame(scaled).T - return scaled.min(axis=1), scaled + return scaled.min(axis=1) class OLHCV(SymbolData): # pylint: disable=abstract-method @@ -295,6 +293,16 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method dividends, ...) and they're dealt with in derived classes. 
""" + FILTERING_WINDOWS = (10, 20, 50, 100, 200) + + # remove open prices when open to close abs logreturn is larger than + # this time the median absolute ones in FILTERING_WINDOWS around it + THRESHOLD_OPEN_TO_CLOSE = 15 + + # remove low/high prices when low/high to close abs logreturn larger than + # this time the median absolute ones in FILTERING_WINDOWS around it + THRESHOLD_LOWHIGH_TO_CLOSE = 20 + def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. @@ -330,9 +338,33 @@ def _process(self, new_data, saved_data=None): return new_data + def _nan_anomalous_prices(self, data, price_name, threshold): + """Set to NaN given price name on its anomalous logrets to close.""" + lr_to_close = np.log(data['close']) - np.log(data[price_name]) + # with this we skip over exact zeros (which come from some upstream + # cleaning) and would throw the median off + lr_to_close.loc[lr_to_close == 0] = np.nan + score = _unlikeliness_score( + lr_to_close, lr_to_close, scaler=_median_scale_around, + windows=self.FILTERING_WINDOWS) + self._nan_values( + data, condition = score > threshold, + columns_to_nan=price_name, message=f'anomalous {price_name} price') + def _nan_unlikely(self, new_data, saved_data=None): """Nan-out unlikely values.""" - # return new_data + + # NaN anomalous open prices + self._nan_anomalous_prices( + new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE) + + # NaN anomalous high prices + self._nan_anomalous_prices( + new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) + + # NaN anomalous low prices + self._nan_anomalous_prices( + new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) def _fill(self, new_data, saved_data=None): """Make easy fills.""" @@ -460,6 +492,14 @@ def _nan_impossible(self, new_data, saved_data=None): self._nan_low_higher_close(new_data) self._nan_open_lower_low(new_data) self._nan_open_higher_high(new_data) + + assert np.all( + new_data['low'].fillna(0.) 
<= new_data[ + ['open', 'high', 'close']].min(1)) + assert np.all( + new_data['high'].fillna(np.inf) >= new_data[ + ['open', 'low', 'close']].max(1)) + # self._nan_incompatible_low_high(new_data) # TODO: factor quality check and clean into total-return related and non- diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index e5d920e98..4561fa94b 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -424,6 +424,11 @@ def test_yahoo_finance_cleaning(self): self.assertTrue((data.valuevolume == 0).sum() > 0) self.assertTrue(data.iloc[:-1].isnull().sum().sum() == 0) + # this stock was found to have phony open/low/high prices + data = YahooFinance('NWG.L', base_location=self.datadir).data + self.assertGreater(data['return'].min(), -0.75) + self.assertLess(data['return'].max(), 0.75) + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" @@ -484,6 +489,56 @@ def _test_warning(data_transformation, part_of_message): 'data.iloc[3,2] = data.iloc[3].close * .9', 'high price lower than close price') + # extreme low price + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .01', + 'anomalous low price') + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .02', + 'anomalous low price') + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .05', + 'anomalous low price') + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .1', + 'anomalous low price') + _test_warning( + 'data.iloc[3,1] = data.iloc[3,1] * .2', + 'anomalous low price') + _test_warning( # changed dtindex until found one that works + 'data.iloc[20,1] = data.iloc[20,1] * .5', + 'anomalous low price') + + # extreme high price + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 100', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 50', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 20', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 10', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 5', + 'anomalous high price') + _test_warning( + 'data.iloc[3,2] = data.iloc[3,2] * 2', + 'anomalous high price') + + # extreme open price + _test_warning( + 'data.iloc[3,0] = data.iloc[3,0] * 1.75;' + + 'data.iloc[3,2] = data.iloc[3,0]', + 'anomalous open price') + _test_warning( + 'data.iloc[20,0] = data.iloc[20,0] * 0.5;' + + 'data.iloc[20,1] = data.iloc[20,0]', + 'anomalous open price') + # def test_yahoo_finance_wrong_last_time(self): # """Test that we correct last time if intraday.""" # From b61eb91e3e38928e4e7366b8a202f87ff5c0d148 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 14:41:38 +0400 Subject: [PATCH 21/38] basic pipeline, needs improvement --- cvxportfolio/data/symbol_data.py | 255 ++++++++++++++++++++++--------- cvxportfolio/tests/test_data.py | 7 +- 2 files changed, 186 insertions(+), 76 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 6297cb399..b47aabe29 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -330,14 +330,119 @@ def _process(self, new_data, saved_data=None): the parent's before or after its own processing. 
""" - self._nan_impossible(new_data, saved_data=saved_data) - # self._specific_process(new_data, saved_data=saved_data) - self._nan_unlikely(new_data, saved_data=saved_data) - self._fill(new_data, saved_data=saved_data) - # self._post_process(new_data, saved_data=saved_data) + + ## Preliminaries + ## Eliminate non-positive prices, infinity values. + + # NaN nonpositive prices + for column in ["open", "close", "high", "low"]: + self._nan_nonpositive_prices(new_data, column) + + # all infinity values to NaN + self._set_infty_to_nan(new_data) + + + ## Close price. + ## We believe them (for now). We forward fill them if unavailable. + + # forward-fill close + self._fillna_and_message( + new_data, 'close', 'last available', filler='ffill') + + + ## Volumes. + ## We set negative to NaN, and fill with zeros. + + # NaN negative volumes + self._nan_negative_volumes(new_data) + + # fill with zeros + self._fillna_and_message( + new_data, 'volume', 'zeros', filler='fillna', filler_arg=0.) + + + ## Open price. + ## We remove if lower than low, higher than high, or open to close + ## logreturn is anomalous. Then we fill with close from day before. + + # NaN open if lower than low + self._nan_open_lower_low(new_data) + + # NaN open if higher than high + self._nan_open_higher_high(new_data) + + # NaN anomalous open prices + self._nan_anomalous_prices( + new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE) + + # fill open with close from day before + self._fillna_and_message( + new_data, 'open', 'close from period before', filler='fillna', + filler_arg=new_data['close'].shift(1)) + + + ## Low price. + ## We remove if higher than close or anomalous low to close logreturn. + ## We fill them with min of open and close. + + # NaN low if higher than close + self._nan_low_higher_close(new_data) + + # NaN low if higher than open (cleaned) + self._nan_low_higher_open(new_data) + + # NaN anomalous low prices + self._nan_anomalous_prices( + new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) + + # fill low with min of open and close + self._fillna_and_message( + new_data, 'low', 'min of open and close', filler='fillna', + filler_arg=new_data[['open', 'close']].min(axis=1)) + + + ## High price. + ## We remove if lower than close or anomalous low to close logreturn. + ## We fill them with max of open and close. + + # NaN high if lower than close + self._nan_high_lower_close(new_data) + + # NaN high if lower than open (cleaned) + self._nan_high_lower_open(new_data) + + # NaN anomalous high prices + self._nan_anomalous_prices( + new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) + + # fill high with max of open and close + self._fillna_and_message( + new_data, 'high', 'max of open and close', filler='fillna', + filler_arg=new_data[['open', 'close']].max(axis=1)) + + + ## Some asserts + assert new_data.iloc[1:].isnull().sum().sum() == 0 + assert np.all( + new_data['low'].fillna(0.) 
<= new_data[ + ['open', 'high', 'close']].min(1)) + assert np.all( + new_data['high'].fillna(np.inf) >= new_data[ + ['open', 'low', 'close']].max(1)) return new_data + def _fillna_and_message( + self, data, col_name, message, filler='fillna', filler_arg=None): + """Fill NaNs in column with chosen method and arg.""" + bad_indexes = data.index[data[col_name].isnull()] + if len(bad_indexes) > 0: + logger.warning( + '%s("%s").data["%s"] has NaNs on timestamps: %s,' + + ' filling them with %s.', self.__class__.__name__, + self.symbol, col_name, bad_indexes, message) + data[col_name] = getattr(data[col_name], filler)(filler_arg) + def _nan_anomalous_prices(self, data, price_name, threshold): """Set to NaN given price name on its anomalous logrets to close.""" lr_to_close = np.log(data['close']) - np.log(data[price_name]) @@ -351,58 +456,44 @@ def _nan_anomalous_prices(self, data, price_name, threshold): data, condition = score > threshold, columns_to_nan=price_name, message=f'anomalous {price_name} price') - def _nan_unlikely(self, new_data, saved_data=None): - """Nan-out unlikely values.""" - - # NaN anomalous open prices - self._nan_anomalous_prices( - new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE) - - # NaN anomalous high prices - self._nan_anomalous_prices( - new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) - - # NaN anomalous low prices - self._nan_anomalous_prices( - new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE) - def _fill(self, new_data, saved_data=None): - """Make easy fills.""" + # def _fill(self, new_data, saved_data=None): + # """Make easy fills.""" - # TODO: simplify + # # TODO: simplify - # print(data) - # print(data.isnull().sum()) + # # print(data) + # # print(data.isnull().sum()) - # fill volumes with zeros (safest choice) - new_data['volume'] = new_data['volume'].fillna(0.) + # # fill volumes with zeros (safest choice) + # new_data['volume'] = new_data['volume'].fillna(0.) 
- # fill close price with open price - new_data['close'] = new_data['close'].fillna(new_data['open']) + # # fill close price with open price + # new_data['close'] = new_data['close'].fillna(new_data['open']) - # fill open price with close from day(s) before - # repeat as long as it helps (up to 1 year) - for shifter in range(252): - logger.info( - "Filling opens with close from %s days before", shifter) - orig_missing_opens = new_data['open'].isnull().sum() - new_data['open'] = new_data['open'].fillna(new_data['close'].shift( - shifter+1)) - new_missing_opens = new_data['open'].isnull().sum() - if orig_missing_opens == new_missing_opens: - break + # # fill open price with close from day(s) before + # # repeat as long as it helps (up to 1 year) + # for shifter in range(252): + # logger.info( + # "Filling opens with close from %s days before", shifter) + # orig_missing_opens = new_data['open'].isnull().sum() + # new_data['open'] = new_data['open'].fillna(new_data['close'].shift( + # shifter+1)) + # new_missing_opens = new_data['open'].isnull().sum() + # if orig_missing_opens == new_missing_opens: + # break - # fill close price with same day's open - new_data['close'] = new_data['close'].fillna(new_data['open']) + # # fill close price with same day's open + # new_data['close'] = new_data['close'].fillna(new_data['open']) - # fill high price with max - new_data['high'] = new_data['high'].fillna(new_data[['open', 'close']].max(1)) + # # fill high price with max + # new_data['high'] = new_data['high'].fillna(new_data[['open', 'close']].max(1)) - # fill low price with max - new_data['low'] = new_data['low'].fillna(new_data[['open', 'close']].min(1)) + # # fill low price with max + # new_data['low'] = new_data['low'].fillna(new_data[['open', 'close']].min(1)) - # print(data) - # print(data.isnull().sum()) + # # print(data) + # # print(data.isnull().sum()) def _nan_values(self, data, condition, columns_to_nan, message): """Set to NaN in-place for indexing condition and chosen columns.""" @@ -456,6 +547,13 @@ def _nan_high_lower_close(self, data): columns_to_nan = "high", message = 'high price lower than close price') + def _nan_high_lower_open(self, data): + """Set high price to NaN if lower than open, in-place.""" + self._nan_values( + data=data, condition = data['high'] < data['open'], + columns_to_nan = "high", + message = 'high price lower than open price') + def _nan_low_higher_close(self, data): """Set low price to NaN if higher than close, in-place.""" self._nan_values( @@ -463,6 +561,13 @@ def _nan_low_higher_close(self, data): columns_to_nan = "low", message = 'low price higher than close price') + def _nan_low_higher_open(self, data): + """Set low price to NaN if higher than open, in-place.""" + self._nan_values( + data=data, condition = data['low'] > data['open'], + columns_to_nan = "low", + message = 'low price higher than open price') + def _set_infty_to_nan(self, data): """Set all +/- infty elements of data to NaN, in-place.""" @@ -474,33 +579,33 @@ def _set_infty_to_nan(self, data): data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - def _nan_impossible(self, new_data, saved_data=None): - """Set some impossible values of new_data to NaN, in-place.""" + # def _nan_impossible(self, new_data, saved_data=None): + # """Set some impossible values of new_data to NaN, in-place.""" - # nan-out nonpositive prices - for column in ["open", "close", "high", "low"]: - self._nan_nonpositive_prices(new_data, column) + # # nan-out nonpositive prices + # for column in ["open", "close", 
"high", "low"]: + # self._nan_nonpositive_prices(new_data, column) - # nan-out negative volumes - self._nan_negative_volumes(new_data) + # # nan-out negative volumes + # self._nan_negative_volumes(new_data) - # all infinity values are nans - self._set_infty_to_nan(new_data) + # # all infinity values are nans + # self._set_infty_to_nan(new_data) - # more - self._nan_high_lower_close(new_data) - self._nan_low_higher_close(new_data) - self._nan_open_lower_low(new_data) - self._nan_open_higher_high(new_data) + # # more + # self._nan_high_lower_close(new_data) + # self._nan_low_higher_close(new_data) + # self._nan_open_lower_low(new_data) + # self._nan_open_higher_high(new_data) - assert np.all( - new_data['low'].fillna(0.) <= new_data[ - ['open', 'high', 'close']].min(1)) - assert np.all( - new_data['high'].fillna(np.inf) >= new_data[ - ['open', 'low', 'close']].max(1)) + # assert np.all( + # new_data['low'].fillna(0.) <= new_data[ + # ['open', 'high', 'close']].min(1)) + # assert np.all( + # new_data['high'].fillna(np.inf) >= new_data[ + # ['open', 'low', 'close']].max(1)) - # self._nan_incompatible_low_high(new_data) + # # self._nan_incompatible_low_high(new_data) # TODO: factor quality check and clean into total-return related and non- @@ -618,6 +723,8 @@ def _compute_total_returns(self, data): def _process(self, new_data, saved_data=None): """Temporary.""" + self._nan_nonpositive_prices(new_data, "adjclose") + super()._process(new_data, saved_data=saved_data) self._compute_total_returns(new_data) @@ -689,14 +796,14 @@ def print_extreme(logreturns, name, sigmas=50): open2low = np.log(data['low']) - np.log(data['open']).dropna() print_extreme(open2low, 'open to low returns') - def _nan_impossible(self, new_data, saved_data=None): - """Set impossible values to NaN.""" + # def _nan_impossible(self, new_data, saved_data=None): + # """Set impossible values to NaN.""" - # call the OLHCV method - super()._nan_impossible(new_data) + # # call the OLHCV method + # super()._nan_impossible(new_data) - # also do it on adjclose - self._nan_nonpositive_prices(new_data, "adjclose") + # # also do it on adjclose + # self._nan_nonpositive_prices(new_data, "adjclose") # def _specific_process(self, new_data, saved_data=None): # """Specific process, compute total returns.""" diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 4561fa94b..f0f2519a6 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -485,8 +485,11 @@ def _test_warning(data_transformation, part_of_message): 'low price higher than close price') # high lower close - _test_warning( - 'data.iloc[3,2] = data.iloc[3].close * .9', + _test_warning( # had to fix it otherwise open cleaner kicks in + 'close = data.iloc[3].close;' + 'data.iloc[3,0] = close * .95;' # open + 'data.iloc[3,1] = close * .95;' # low + 'data.iloc[3,2] = close * .975', # high 'high price lower than close price') # extreme low price From f4f1f3ee91204ddcf0a54358c9f4a371c335de5f Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 15:35:34 +0400 Subject: [PATCH 22/38] better --- cvxportfolio/data/symbol_data.py | 228 ++++++++++++------------------- cvxportfolio/tests/test_data.py | 6 + 2 files changed, 94 insertions(+), 140 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index b47aabe29..01bdebe89 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -303,6 +303,10 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method # 
this time the median absolute ones in FILTERING_WINDOWS around it THRESHOLD_LOWHIGH_TO_CLOSE = 20 + # log warning on _preload for abs logreturns (of 4 types) larger than this + # time the root squared mean absolute ones in FILTERING_WINDOWS around it + THRESHOLD_WARN_EXTREME_LOGRETS = 5 + def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. @@ -312,28 +316,11 @@ def _process(self, new_data, saved_data=None): (possibly overlapping with new_data at the end), and is **read only**: it is used as reference to help with the cleaning, it has already been cleaned. - - The method is composed of the following steps, split between child - classes at the appropriate hierarchy level. - - #. :meth:`_nan_impossible`: Nan-out impossible values in ``new_data``. - #. :meth:`_specific_process`: Do processing specific to the class, - before the following step (*e.g.,*, because we might want unlikely - values to still be there). - #. :meth:`_nan_unlikely`: Nan-out values that are (highly) unlikely, - with threshold-based testing. - #. :meth:`_fill`: Fill nans. - #. :meth:`_post_process`: Do final processing specific to the class. - - With this factoring we should have the flexibility to handle various - data sources, by choosing at each level if each method calls - the parent's before or after its own processing. """ - ## Preliminaries ## Eliminate non-positive prices, infinity values. - + # NaN nonpositive prices for column in ["open", "close", "high", "low"]: self._nan_nonpositive_prices(new_data, column) @@ -341,7 +328,6 @@ def _process(self, new_data, saved_data=None): # all infinity values to NaN self._set_infty_to_nan(new_data) - ## Close price. ## We believe them (for now). We forward fill them if unavailable. @@ -349,7 +335,6 @@ def _process(self, new_data, saved_data=None): self._fillna_and_message( new_data, 'close', 'last available', filler='ffill') - ## Volumes. ## We set negative to NaN, and fill with zeros. @@ -360,7 +345,6 @@ def _process(self, new_data, saved_data=None): self._fillna_and_message( new_data, 'volume', 'zeros', filler='fillna', filler_arg=0.) - ## Open price. ## We remove if lower than low, higher than high, or open to close ## logreturn is anomalous. Then we fill with close from day before. @@ -380,7 +364,6 @@ def _process(self, new_data, saved_data=None): new_data, 'open', 'close from period before', filler='fillna', filler_arg=new_data['close'].shift(1)) - ## Low price. ## We remove if higher than close or anomalous low to close logreturn. ## We fill them with min of open and close. @@ -400,7 +383,6 @@ def _process(self, new_data, saved_data=None): new_data, 'low', 'min of open and close', filler='fillna', filler_arg=new_data[['open', 'close']].min(axis=1)) - ## High price. ## We remove if lower than close or anomalous low to close logreturn. ## We fill them with max of open and close. 
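Condensed into one place, the fill policy this pipeline applies after the
NaN-ing steps is roughly the following sketch (same column names, but not the
actual class method, which also logs a warning for every fill it makes):

.. code:: python

    import pandas as pd

    def fill_olhcv(data: pd.DataFrame) -> pd.DataFrame:
        """Sketch of the fill order: close, volume, open, then low/high."""
        data = data.copy()
        # closes are trusted: carry the last available one forward
        data['close'] = data['close'].ffill()
        # zero is the safest guess for a missing volume
        data['volume'] = data['volume'].fillna(0.)
        # missing opens become the previous period's (already filled) close
        data['open'] = data['open'].fillna(data['close'].shift(1))
        # missing lows and highs become the tightest values consistent
        # with the already filled open and close
        data['low'] = data['low'].fillna(
            data[['open', 'close']].min(axis=1))
        data['high'] = data['high'].fillna(
            data[['open', 'close']].max(axis=1))
        return data

The ordering matters: close is filled first because the open, low, and high
fills all reference it, and open before low and high for the same reason.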
@@ -420,7 +402,6 @@ def _process(self, new_data, saved_data=None): new_data, 'high', 'max of open and close', filler='fillna', filler_arg=new_data[['open', 'close']].max(axis=1)) - ## Some asserts assert new_data.iloc[1:].isnull().sum().sum() == 0 assert np.all( @@ -456,45 +437,6 @@ def _nan_anomalous_prices(self, data, price_name, threshold): data, condition = score > threshold, columns_to_nan=price_name, message=f'anomalous {price_name} price') - - # def _fill(self, new_data, saved_data=None): - # """Make easy fills.""" - - # # TODO: simplify - - # # print(data) - # # print(data.isnull().sum()) - - # # fill volumes with zeros (safest choice) - # new_data['volume'] = new_data['volume'].fillna(0.) - - # # fill close price with open price - # new_data['close'] = new_data['close'].fillna(new_data['open']) - - # # fill open price with close from day(s) before - # # repeat as long as it helps (up to 1 year) - # for shifter in range(252): - # logger.info( - # "Filling opens with close from %s days before", shifter) - # orig_missing_opens = new_data['open'].isnull().sum() - # new_data['open'] = new_data['open'].fillna(new_data['close'].shift( - # shifter+1)) - # new_missing_opens = new_data['open'].isnull().sum() - # if orig_missing_opens == new_missing_opens: - # break - - # # fill close price with same day's open - # new_data['close'] = new_data['close'].fillna(new_data['open']) - - # # fill high price with max - # new_data['high'] = new_data['high'].fillna(new_data[['open', 'close']].max(1)) - - # # fill low price with max - # new_data['low'] = new_data['low'].fillna(new_data[['open', 'close']].min(1)) - - # # print(data) - # # print(data.isnull().sum()) - def _nan_values(self, data, condition, columns_to_nan, message): """Set to NaN in-place for indexing condition and chosen columns.""" @@ -579,35 +521,49 @@ def _set_infty_to_nan(self, data): data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - # def _nan_impossible(self, new_data, saved_data=None): - # """Set some impossible values of new_data to NaN, in-place.""" - - # # nan-out nonpositive prices - # for column in ["open", "close", "high", "low"]: - # self._nan_nonpositive_prices(new_data, column) + def _warn_on_extreme_logreturns(self, logreturns, threshold, what): + """Log warning if logreturns are extreme.""" + # with this we skip over exact zeros (which we assume come from some + # cleaning) and would bias the mean down + logreturns.loc[logreturns == 0] = np.nan + score = _unlikeliness_score( + logreturns, logreturns, scaler=_mean_scale_around, + windows=self.FILTERING_WINDOWS) + dubious_indexes = logreturns.index[score > threshold] + if len(dubious_indexes) > 0: + logger.warning( + '%s("%s") has dubious %s for timestamps: %s', + self.__class__.__name__, self.symbol, what, dubious_indexes) - # # nan-out negative volumes - # self._nan_negative_volumes(new_data) + def _quality_check(self, data): + """Log issues with the quality of data given to the user.""" - # # all infinity values are nans - # self._set_infty_to_nan(new_data) + # zero volume + zerovol_idx = data.index[data.volume == 0] + if len(zerovol_idx) > 0: + logger.info( + '%s("%s") has volume equal to zero for timestamps: %s', + self.__class__.__name__, self.symbol, zerovol_idx) - # # more - # self._nan_high_lower_close(new_data) - # self._nan_low_higher_close(new_data) - # self._nan_open_lower_low(new_data) - # self._nan_open_higher_high(new_data) + # warn on extreme logreturns + self._warn_on_extreme_logreturns( + np.log(1 + data['return']), 
self.THRESHOLD_WARN_EXTREME_LOGRETS, + 'total open-to-open returns') - # assert np.all( - # new_data['low'].fillna(0.) <= new_data[ - # ['open', 'high', 'close']].min(1)) - # assert np.all( - # new_data['high'].fillna(np.inf) >= new_data[ - # ['open', 'low', 'close']].max(1)) + # extreme open2close + self._warn_on_extreme_logreturns( + np.log(data['close']) - np.log(data['open']), + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to close returns') - # # self._nan_incompatible_low_high(new_data) + # extreme open2high + self._warn_on_extreme_logreturns( + np.log(data['high']) - np.log(data['open']), + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to high returns') - # TODO: factor quality check and clean into total-return related and non- + # extreme open2low + self._warn_on_extreme_logreturns( + np.log(data['low']) - np.log(data['open']), + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to low returns') def _preload(self, data): """Prepare data for use by Cvxportfolio. @@ -723,9 +679,23 @@ def _compute_total_returns(self, data): def _process(self, new_data, saved_data=None): """Temporary.""" + ## Here we only deal with the adjusted close prices. + ## All other operations are in the _process method of OLHCV + + # all infinity values to NaN (repeat, but for adjclose) + self._set_infty_to_nan(new_data) + + # NaN non-positive adj close self._nan_nonpositive_prices(new_data, "adjclose") + # forward-fill adj close + self._fillna_and_message( + new_data, 'adjclose', 'last available', filler='ffill') + + ## OLHCV._process treats all other than adjclose super()._process(new_data, saved_data=saved_data) + + # Compute total open-to-open returns self._compute_total_returns(new_data) # close2close_total = np.log(1 + new_data['total_return']) @@ -750,74 +720,52 @@ def _process(self, new_data, saved_data=None): del new_data["adjclose"] # eliminate last period's intraday data + # TODO this operation needs to be moved in preload new_data.loc[new_data.index[-1], ["high", "low", "close", "return", "volume"]] = np.nan return new_data - def _quality_check(self, data): - """Analyze quality of the OLHCV-TR data.""" + # def _quality_check(self, data): + # """Analyze quality of the OLHCV-TR data.""" - # zero volume - zerovol_idx = data.index[data.volume == 0] - if len(zerovol_idx) > 0: - logger.warning( - '%s("%s") has volume equal to zero for timestamps: %s', - self.__class__.__name__, self.symbol, zerovol_idx) - - def print_extreme(logreturns, name, sigmas=50): - - # TODO: choose - m, s = logreturns.median(), np.sqrt((logreturns**2).median()) - normalized = (logreturns - m)/s - - # normalized = logreturns / logreturns.rolling(252).std().shift(1) + # # zero volume + # zerovol_idx = data.index[data.volume == 0] + # if len(zerovol_idx) > 0: + # logger.warning( + # '%s("%s") has volume equal to zero for timestamps: %s', + # self.__class__.__name__, self.symbol, zerovol_idx) - extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] - if len(extremereturn_idx) > 0: - logger.warning( - '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', - self.__class__.__name__, self.symbol, name, sigmas, - extremereturn_idx) - - # extreme logreturns - logreturns = np.log(1 + data['return']).dropna() - print_extreme(logreturns, 'total returns') - - # extreme open2close - open2close = np.log(data['close']) - np.log(data['open']).dropna() - print_extreme(open2close, 'open to close returns') - - # extreme open2high - open2high = np.log(data['high']) - np.log(data['open']).dropna() - print_extreme(open2high, 'open to high returns') - 
- # extreme open2low - open2low = np.log(data['low']) - np.log(data['open']).dropna() - print_extreme(open2low, 'open to low returns') + # def print_extreme(logreturns, name, sigmas=50): - # def _nan_impossible(self, new_data, saved_data=None): - # """Set impossible values to NaN.""" + # # TODO: choose + # m, s = logreturns.median(), np.sqrt((logreturns**2).median()) + # normalized = (logreturns - m)/s - # # call the OLHCV method - # super()._nan_impossible(new_data) + # # normalized = logreturns / logreturns.rolling(252).std().shift(1) - # # also do it on adjclose - # self._nan_nonpositive_prices(new_data, "adjclose") + # extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] + # if len(extremereturn_idx) > 0: + # logger.warning( + # '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', + # self.__class__.__name__, self.symbol, name, sigmas, + # extremereturn_idx) - # def _specific_process(self, new_data, saved_data=None): - # """Specific process, compute total returns.""" + # # extreme logreturns + # logreturns = np.log(1 + data['return']).dropna() + # print_extreme(logreturns, 'total returns') - # # Close-to-close total return, so we can delegate to parent class. - # # Note that this uses different time alignment than Cvxportfolio, - # # Here today's return uses yesterday close and today close, while - # # today's returns in Cvxportfolio use today open and tomorrow open. - # # However this is the format more common among data vendors. - # # new_data['total_return'] = new_data['adjclose'].ffill().pct_change() + # # extreme open2close + # open2close = np.log(data['close']) - np.log(data['open']).dropna() + # print_extreme(open2close, 'open to close returns') - # # We don't need this any more. - # # del new_data['adjclose'] + # # extreme open2high + # open2high = np.log(data['high']) - np.log(data['open']).dropna() + # print_extreme(open2high, 'open to high returns') + # # extreme open2low + # open2low = np.log(data['low']) - np.log(data['open']).dropna() + # print_extreme(open2low, 'open to low returns') class YahooFinance(OLHCVAC): """Yahoo Finance symbol data. 
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index f0f2519a6..3c5f3490f 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -429,6 +429,12 @@ def test_yahoo_finance_cleaning(self): self.assertGreater(data['return'].min(), -0.75) self.assertLess(data['return'].max(), 0.75) + # this stock had some extreme returns but they were legitimate + with self.assertNoLogs(level='WARNING'): + data = YahooFinance('GME', base_location=self.datadir).data + self.assertGreater(data['return'].min(), -0.75) + self.assertGreater(data['return'].max(), 3) + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" From 39e8939e3cd1930e3789c4527e2935cbac2e0542 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 16:28:00 +0400 Subject: [PATCH 23/38] testing --- cvxportfolio/data/symbol_data.py | 243 +++++++++---------------------- cvxportfolio/tests/test_data.py | 52 +++++++ 2 files changed, 119 insertions(+), 176 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 01bdebe89..b0875223c 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -229,7 +229,7 @@ def _download(self, symbol, current, grace_period, **kwargs): :type current: pandas.Series or pandas.DataFrame or None :rtype: pandas.Series or pandas.DataFrame """ - raise NotImplementedError #pragma: no cover + raise NotImplementedError # pragma: no cover def _preload(self, data): """Prepare data to serve to the user. @@ -240,7 +240,7 @@ def _preload(self, data): :type data: pandas.Series or pandas.DataFrame :rtype: pandas.Series or pandas.DataFrame """ - return data + return data # pragma: no cover # @@ -251,7 +251,7 @@ def _timestamp_convert(unix_seconds_ts): """Convert a UNIX timestamp in seconds to a pandas.Timestamp.""" return pd.Timestamp(unix_seconds_ts*1E9, tz='UTC') -# Windows for filtering extreme logreturns +# Anomalous, extreme, dubious logreturns filtering. def _median_scale_around(lrets, window): """Median absolute logreturn in a window around each timestamp.""" @@ -289,7 +289,7 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method in which case the ``'return'`` column is not processed. It only matters in the :meth:`_preload`, method: if open-to-open returns are not present, - we compute them there. Otherwise these may be total returns (including + we compute them there. Otherwise these may be total returns (which include dividends, ...) and they're dealt with in derived classes. """ @@ -304,8 +304,8 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method THRESHOLD_LOWHIGH_TO_CLOSE = 20 # log warning on _preload for abs logreturns (of 4 types) larger than this - # time the root squared mean absolute ones in FILTERING_WINDOWS around it - THRESHOLD_WARN_EXTREME_LOGRETS = 5 + # time the median absolute ones in FILTERING_WINDOWS around it + THRESHOLD_WARN_EXTREME_LOGRETS = 17.5 def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. 
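These thresholds act on the windowed unlikeliness score defined earlier in
this module. A self-contained sketch, on synthetic data, of how the score
isolates a single corrupted value while tolerating a genuinely volatile
stretch (``median_scale_around`` is a standalone copy of the module's
``_median_scale_around``):

.. code:: python

    import numpy as np
    import pandas as pd

    def median_scale_around(lrets, window):
        """Median absolute logreturn in a window around each timestamp."""
        return np.abs(lrets).rolling(
            window, center=True, min_periods=1).median()

    rng = np.random.default_rng(0)
    lrets = pd.Series(rng.normal(scale=0.01, size=500))
    lrets.iloc[250] = 0.5  # a data error, far above the local scale

    windows = (10, 20, 50, 100, 200)
    score = pd.DataFrame([
        np.abs(lrets) / median_scale_around(lrets, w)
        for w in windows]).T.min(axis=1)

    # taking the min across windows means a point is flagged only if it
    # is extreme at every scale, so an isolated error scores high while
    # values inside a volatile period (whose windowed medians are also
    # large) do not
    print(score.idxmax(), score.max() > 17.5)  # 250 True

This robustness is also why the GME test above expects no warnings: its large
returns sit in stretches where the surrounding median absolute logreturn is
itself large.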
@@ -524,10 +524,10 @@ def _set_infty_to_nan(self, data): def _warn_on_extreme_logreturns(self, logreturns, threshold, what): """Log warning if logreturns are extreme.""" # with this we skip over exact zeros (which we assume come from some - # cleaning) and would bias the mean down + # cleaning) and would bias the median down logreturns.loc[logreturns == 0] = np.nan score = _unlikeliness_score( - logreturns, logreturns, scaler=_mean_scale_around, + logreturns, logreturns, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) dubious_indexes = logreturns.index[score > threshold] if len(dubious_indexes) > 0: @@ -585,102 +585,56 @@ def _preload(self, data): return data -class OLHCVAC(OLHCV): - """Open-High-Low-Close-Volume-AdjustedClose data. +# TODO: plan +# ffill adj closes & compute adj close logreturns +# use code above to get indexes of wrong ones, raise warnings, set to 0 +# +# check close vs adj close, there should be only dividends (with y finance) +# +# throw out opens that are not in [low, high] +# +# apply similar logic (perhaps using total lrets for the stddev) for +# open-close , close-high , close-low, throw out open/low/close not OK +# +# fill +# +# compute open-open total returns, then check with same logic for errors +# +# when doing append, make past data adhere to same format: recompute adj +# close +# could use volumes as well, if there are jumps in price due to +# splits not recorded, then price * volume should be more stable +# +# - This is modeled after the data returned by Yahoo Finance. - """ - # # rolstd windows for finding wrong logreturns - # _ROLSTD_WINDOWS = [20, 60, 252] - - # # threshold for finding wrong logreturns - # _WRONG_LOGRET_THRESHOLD = 15 - - # def _indexes_extreme_logrets_wrt_rolstddev(self, lrets, window, treshold): - # """Get indexes of logreturns that are extreme wrt trailing stddev.""" - # trailing_stdev = np.sqrt((lrets**2).rolling(window).median().shift(1)) - # bad_indexes = lrets.index[np.abs(lrets / trailing_stdev) > treshold] - # return bad_indexes - - # def _find_wrong_daily_logreturns(self, lrets): - # """Find indexes of logreturns that are most probably data errors.""" - # bad_indexes = [] - # for window in self._ROLSTD_WINDOWS: - # bad_indexes.append( - # set(self._indexes_extreme_logrets_wrt_rolstddev( - # lrets, window=window, treshold=self._WRONG_LOGRET_THRESHOLD))) - # bad_indexes.append( - # set(self._indexes_extreme_logrets_wrt_rolstddev( - # lrets.iloc[::-1], window=window, - # treshold=self._WRONG_LOGRET_THRESHOLD))) - # bad_indexes = set.intersection(*bad_indexes) - # return bad_indexes - - # TODO: plan - # ffill adj closes & compute adj close logreturns - # use code above to get indexes of wrong ones, raise warnings, set to 0 - # - # check close vs adj close, there should be only dividends (with y finance) - # - # throw out opens that are not in [low, high] - # - # apply similar logic (perhaps using total lrets for the stddev) for - # open-close , close-high , close-low, throw out open/low/close not OK - # - # fill - # - # compute open-open total returns, then check with same logic for errors - # - # when doing append, make past data adhere to same format: recompute adj - # close - # could use volumes as well, if there are jumps in price due to - # splits not recorded, then price * volume should be more stable - # - # - - def _compute_total_returns(self, data): - """Compute total open-to-open returns.""" - - # print(data) - # print(data.isnull().sum()) - - # compute log of ratio between adjclose and close - 
log_adjustment_ratio = np.log(data['adjclose'] / data['close']) - - # forward fill adjustment ratio - log_adjustment_ratio = log_adjustment_ratio.ffill() - - # non-market log returns (dividends, splits) - non_market_lr = log_adjustment_ratio.diff().shift(-1) - - # dividend_return = (data['adjclose'] / data['close']).pct_change().shift(-1) - - # import code; code.interact(local=locals()) - - # full open-to-open returns - open_to_open = np.log(data["open"]).diff().shift(-1) - data['return'] = np.exp(open_to_open + non_market_lr) - 1 - - # print(data) - # print(data.isnull().sum()) - - # intraday_logreturn = np.log(data["close"]) - np.log(data["open"]) - # close_to_close_logreturn = np.log(data["adjclose"]).diff().shift(-1) - # open_to_open_logreturn = ( - # close_to_close_logreturn + intraday_logreturn - - # intraday_logreturn.shift(-1) - # ) - # data["return"] = np.exp(open_to_open_logreturn) - 1 - - # print(data) - # print(data.isnull().sum()) +class YahooFinance(OLHCV): + """Yahoo Finance symbol data. + + :param symbol: The symbol that we downloaded. + :type symbol: str + :param storage_backend: The storage backend, implemented ones are + ``'pickle'``, ``'csv'``, and ``'sqlite'``. + :type storage_backend: str + :param base_storage_location: The location of the storage. We store in a + subdirectory named after the class which derives from this. + :type base_storage_location: pathlib.Path + :param grace_period: If the most recent observation in the data is less + old than this we do not download new data. + :type grace_period: pandas.Timedelta + + :attribute data: The downloaded, and cleaned, data for the symbol. + :type data: pandas.DataFrame + """ def _process(self, new_data, saved_data=None): - """Temporary.""" + """Process Yahoo Finance specific data, call parent's. - ## Here we only deal with the adjusted close prices. - ## All other operations are in the _process method of OLHCV + Here we deal with the adjclose column, call OLHCV._process method, and + compute total open-to-open returns. + """ + + ## Treat adjclose. We believe them (unless impossible). 
# all infinity values to NaN (repeat, but for adjclose) self._set_infty_to_nan(new_data) @@ -692,29 +646,26 @@ def _process(self, new_data, saved_data=None): self._fillna_and_message( new_data, 'adjclose', 'last available', filler='ffill') - ## OLHCV._process treats all other than adjclose + ## OLHCV._process treats all columns other than adjclose super()._process(new_data, saved_data=saved_data) - # Compute total open-to-open returns - self._compute_total_returns(new_data) - - # close2close_total = np.log(1 + new_data['total_return']) - # open2close = np.log(new_data['close']) - np.log(new_data['open']) - # open2open_total = close2close_total - open2close + open2close.shift(1) - # alt = (np.exp(open2open_total) - 1).shift(-1) + ## Compute total open-to-open returns - # close_div_open = new_data['close'] / new_data['open'] - # open_to_open_total = ( - # (1 + new_data['total_return']) / close_div_open - # ) * close_div_open.shift(1) - 1 + # intraday logreturn + intraday_logreturn = np.log( + new_data["close"]) - np.log(new_data["open"]) - # import code; code.interact(local=locals()) + # close to close total logreturn + close_to_close_total_logreturn = np.log( + new_data["adjclose"]).diff().shift(-1) - # assert np.allclose(new_data['return'].dropna(), open_to_open_total.shift(-1).dropna()) + # open to open total logreturn + open_to_open_total_logreturn = \ + close_to_close_total_logreturn + intraday_logreturn \ + - intraday_logreturn.shift(-1) - # new_data['return'] = open_to_open_total.shift(-1) - - # del new_data['total_return'] + # open to open total return + new_data['return'] = np.exp(open_to_open_total_logreturn) - 1 # eliminate adjclose column del new_data["adjclose"] @@ -726,66 +677,6 @@ def _process(self, new_data, saved_data=None): return new_data - # def _quality_check(self, data): - # """Analyze quality of the OLHCV-TR data.""" - - # # zero volume - # zerovol_idx = data.index[data.volume == 0] - # if len(zerovol_idx) > 0: - # logger.warning( - # '%s("%s") has volume equal to zero for timestamps: %s', - # self.__class__.__name__, self.symbol, zerovol_idx) - - # def print_extreme(logreturns, name, sigmas=50): - - # # TODO: choose - # m, s = logreturns.median(), np.sqrt((logreturns**2).median()) - # normalized = (logreturns - m)/s - - # # normalized = logreturns / logreturns.rolling(252).std().shift(1) - - # extremereturn_idx = normalized.index[np.abs(normalized) > sigmas] - # if len(extremereturn_idx) > 0: - # logger.warning( - # '%s("%s") has extreme %s (~%s sigmas) for timestamps: %s', - # self.__class__.__name__, self.symbol, name, sigmas, - # extremereturn_idx) - - # # extreme logreturns - # logreturns = np.log(1 + data['return']).dropna() - # print_extreme(logreturns, 'total returns') - - # # extreme open2close - # open2close = np.log(data['close']) - np.log(data['open']).dropna() - # print_extreme(open2close, 'open to close returns') - - # # extreme open2high - # open2high = np.log(data['high']) - np.log(data['open']).dropna() - # print_extreme(open2high, 'open to high returns') - - # # extreme open2low - # open2low = np.log(data['low']) - np.log(data['open']).dropna() - # print_extreme(open2low, 'open to low returns') - -class YahooFinance(OLHCVAC): - """Yahoo Finance symbol data. - - :param symbol: The symbol that we downloaded. - :type symbol: str - :param storage_backend: The storage backend, implemented ones are - ``'pickle'``, ``'csv'``, and ``'sqlite'``. - :type storage_backend: str - :param base_storage_location: The location of the storage. 
We store in a - subdirectory named after the class which derives from this. - :type base_storage_location: pathlib.Path - :param grace_period: If the most recent observation in the data is less - old than this we do not download new data. - :type grace_period: pandas.Timedelta - - :attribute data: The downloaded, and cleaned, data for the symbol. - :type data: pandas.DataFrame - """ - @staticmethod def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): """Get 1-day OLHC-AC-V from Yahoo finance. diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 3c5f3490f..04d0dda36 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -435,6 +435,58 @@ def test_yahoo_finance_cleaning(self): self.assertGreater(data['return'].min(), -0.75) self.assertGreater(data['return'].max(), 3) + def test_yahoo_finance_preload_warnings(self): + """Test warnings on _preload if data has issues.""" + + # pylint: disable=protected-access + + raw_data = YahooFinance._get_data_yahoo('ZM') + empty_instance = YahooFinance.__new__(YahooFinance) + empty_instance._symbol = 'ZM' # because the warnings use the symbol + cleaned = empty_instance._process(raw_data, None) + + def _test_warning(data_transformation, part_of_message): + """Test that warning is raised w/ message containing some word.""" + data = pd.DataFrame(cleaned, copy=True) + exec(data_transformation) # pylint: disable=exec-used + # print(data) + with self.assertLogs(level='WARNING') as _: + empty_instance._preload(data) + # print(_) + self.assertTrue(part_of_message in _.output[0]) + + # columns are: open low high close volume return + + # high unexpected return + _test_warning( + 'data.iloc[300,-1] = 1', + 'dubious total open-to-open returns') + + # low unexpected return + _test_warning( + 'data.iloc[300,-1] = -0.5', + 'dubious total open-to-open returns') + + # low unexpected open + _test_warning( + 'data.iloc[300,0] = data.iloc[300,0]*0.5', + 'dubious open to close returns') + + # high unexpected open + _test_warning( + 'data.iloc[300,0] = data.iloc[300,0]*2', + 'dubious open to close returns') + + # low unexpected low + _test_warning( + 'data.iloc[300,1] = data.iloc[300,1]*0.5', + 'dubious open to low returns') + + # high unexpected high + _test_warning( + 'data.iloc[300,2] = data.iloc[300,2]*2', + 'dubious open to high returns') + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" From 0488b7ee580e1df8b58328f64f515478ace5da58 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 17:00:18 +0400 Subject: [PATCH 24/38] minor --- cvxportfolio/data/symbol_data.py | 28 +++++++++++++++++++--------- cvxportfolio/tests/test_data.py | 3 ++- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index b0875223c..0c6106eeb 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -168,6 +168,14 @@ def _print_difference(self, current, new): def update(self, grace_period): """Update current stored data for symbol. + Checks (which raise warnings): + + #. Elements of data are NaN (skipping last row) + #. Update is not append-only. For dataframes check all elements other + than last row of the data which was there before, and for that last + row, only the open price. For Series that doesn't matter, check that + last element is the same. 
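+
+        Failed checks only produce log messages, they do not block the
+        update: the newly downloaded data is stored in any case.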
+
         :param grace_period: If the time between now and the last value stored
             is less than this, we don't update the data already stored.
         :type grace_period: pandas.Timedelta
@@ -192,8 +200,7 @@ def update(self, grace_period):
                     # we use numpy.isclose because returns may be computed
                     # via logreturns and numerical errors can sift through
                     np.isclose(updated.loc[current.index[:-1]],
-                        current.iloc[:-1], equal_nan=True,
-                        rtol=1e-08, atol=1e-08)):
+                        current.iloc[:-1], equal_nan=True)):
                 logger.error(f"{self.__class__.__name__} update"
                     + f" of {self.symbol} is not append-only!")
                 self._print_difference(current, updated)
@@ -211,7 +218,7 @@ def update(self, grace_period):
                         f"{self.__class__.__name__} update"
                         + f" of {self.symbol} changed last value!")
                     self._print_difference(current, updated)
-            except KeyError:
+            except KeyError: # this should have become superfluous
                 logger.error("%s update of %s could not be checked for"
                     + " append-only edits. Was there a DST change?",
                     self.__class__.__name__, self.symbol)
@@ -577,9 +584,16 @@ def _preload(self, data):
         # pure OLHCV data source there is no need to store the open-to-open
         # returns, they can be computed here
         if not 'return' in data.columns:
-            data['return'] = data['open'].pct_change().shift(-1)
+            data['return'] = data[
+                'open'].pct_change().shift(-1) # pragma: no cover

         self._quality_check(data)
+
+        # NaN intraday data
+        data.loc[data.index[-1],
+            ["high", "low", "close", "return", "volume"]] = np.nan
+
+        # compute volume in cash units
         data["valuevolume"] = data["volume"] * data["open"]
         del data["volume"]

@@ -670,11 +684,6 @@ def _process(self, new_data, saved_data=None):
         # eliminate adjclose column
         del new_data["adjclose"]

-        # eliminate last period's intraday data
-        # TODO this operation needs to be moved in preload
-        new_data.loc[new_data.index[-1],
-            ["high", "low", "close", "return", "volume"]] = np.nan
-
         return new_data

     @staticmethod
@@ -752,6 +761,7 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'):
         this_periods_open_time = _timestamp_convert(
             data['meta']['currentTradingPeriod']['regular']['start'])

+        # this should be enough, but be careful
         if df_result.index[-1] > this_periods_open_time:
             index = df_result.index.to_numpy()
             index[-1] = this_periods_open_time
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py
index 04d0dda36..957fc72af 100644
--- a/cvxportfolio/tests/test_data.py
+++ b/cvxportfolio/tests/test_data.py
@@ -81,8 +81,8 @@ def test_yfinance_download(self):
             data.loc["2023-04-10 13:30:00+00:00", "return"],
             data.loc["2023-04-11 13:30:00+00:00", "open"] /
             data.loc["2023-04-10 13:30:00+00:00", "open"] - 1,
+            rtol=1e-04, atol=1e-07,
             ))
-        self.assertTrue(np.isnan(data.iloc[-1]["close"]))

     def test_fred(self):
         """Test basic Fred usage."""
@@ -124,6 +124,7 @@ def test_yahoo_finance(self):
             data.loc["2023-04-05 13:30:00+00:00", "return"],
             data.loc["2023-04-06 13:30:00+00:00", "open"] /
             data.loc["2023-04-05 13:30:00+00:00", "open"] - 1,
+            rtol=1e-04, atol=1e-07,
             ))

         store.update(grace_period=pd.Timedelta('1d'))

From 40ff3b5b885ca124155a05d235b5c010f1661b3c Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Tue, 13 Feb 2024 17:32:19 +0400
Subject: [PATCH 25/38] mostly done

---
 cvxportfolio/data/market_data.py |  4 +--
 cvxportfolio/data/symbol_data.py | 53 +++++++++++++++++++++++---------
 cvxportfolio/tests/test_data.py  | 46 ++++++++++++++++++++++++++-
 3 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/cvxportfolio/data/market_data.py b/cvxportfolio/data/market_data.py
index e69530b5b..245873948
100644
--- a/cvxportfolio/data/market_data.py
+++ b/cvxportfolio/data/market_data.py
@@ -636,10 +636,10 @@ def _remove_missing_recent(self):
         """
         if self.prices.iloc[-5:].isnull().any().any():
-            logger.debug(
+            logger.warning(
                 'Removing some recent lines because there are missing values.')
             drop_at = self.prices.iloc[-5:].isnull().any(axis=1).idxmax()
-            logger.debug('Dropping at index %s', drop_at)
+            logger.warning('Dropping at index %s', drop_at)
             self.returns = self.returns.loc[self.returns.index < drop_at]
             if self.prices is not None:
                 self.prices = self.prices.loc[self.prices.index < drop_at]
diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 0c6106eeb..9d56bb858 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -218,10 +218,11 @@ def update(self, grace_period):
                         f"{self.__class__.__name__} update"
                         + f" of {self.symbol} changed last value!")
                     self._print_difference(current, updated)
-            except KeyError: # this should have become superfluous
+            # this should have become superfluous
+            except KeyError: # pragma: no cover
                 logger.error("%s update of %s could not be checked for"
                     + " append-only edits. Was there a DST change?",
-                    self.__class__.__name__, self.symbol)
+                    self.__class__.__name__, self.symbol) # pragma: no cover

         self._store(updated)

     def _download(self, symbol, current, grace_period, **kwargs):
@@ -264,10 +265,10 @@ def _median_scale_around(lrets, window):
     """Median absolute logreturn in a window around each timestamp."""
     return np.abs(lrets).rolling(window, center=True, min_periods=1).median()

-def _mean_scale_around(lrets, window):
-    """Root mean squared logreturn in a window around each timestamp."""
-    return np.sqrt(
-        (lrets**2).rolling(window, center=True, min_periods=1).mean())
+# def _mean_scale_around(lrets, window):
+#     """Root mean squared logreturn in a window around each timestamp."""
+#     return np.sqrt(
+#         (lrets**2).rolling(window, center=True, min_periods=1).mean())

 def _unlikeliness_score(
         test_logreturns, reference_logreturns, scaler, windows):
@@ -364,7 +365,8 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous open prices
         self._nan_anomalous_prices(
-            new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE)
+            new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE,
+            saved_data=saved_data)

         # fill open with close from day before
         self._fillna_and_message(
@@ -383,7 +385,8 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous low prices
         self._nan_anomalous_prices(
-            new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE)
+            new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE,
+            saved_data=saved_data)

         # fill low with min of open and close
         self._fillna_and_message(
@@ -402,7 +405,8 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous high prices
         self._nan_anomalous_prices(
-            new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE)
+            new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE,
+            saved_data=saved_data)

         # fill high with max of open and close
         self._fillna_and_message(
@@ -431,17 +435,36 @@ def _fillna_and_message(
             self.symbol, col_name, bad_indexes, message)
         data[col_name] = getattr(data[col_name], filler)(filler_arg)

-    def _nan_anomalous_prices(self, data, price_name, threshold):
+    def _nan_anomalous_prices(
+            self, new_data, price_name, threshold, saved_data=None):
         """Set to NaN given price name on its anomalous logrets to close."""
-        lr_to_close = np.log(data['close']) - np.log(data[price_name])
+        new_lr_to_close =\
np.log(new_data['close']) - np.log(new_data[price_name]) + + # if there is saved data, we use it to compute the logrets + # also on the past, but we only NaN (if necessary) elements of + # new data, so the scores computed on the past are not used + if saved_data is None: + all_lr_to_close = new_lr_to_close + else: + old_lr_to_close =\ + np.log(saved_data['close']) - np.log(saved_data[price_name]) + all_lr_to_close = pd.concat( + [old_lr_to_close.loc[ + old_lr_to_close.index < new_lr_to_close.index[0]], + new_lr_to_close]) + # drop old data which we don't need + all_lr_to_close = all_lr_to_close.iloc[ + -len(new_data) - max(self.FILTERING_WINDOWS):] + # with this we skip over exact zeros (which come from some upstream # cleaning) and would throw the median off - lr_to_close.loc[lr_to_close == 0] = np.nan + all_lr_to_close.loc[all_lr_to_close == 0] = np.nan score = _unlikeliness_score( - lr_to_close, lr_to_close, scaler=_median_scale_around, + all_lr_to_close, all_lr_to_close, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) self._nan_values( - data, condition = score > threshold, + new_data, condition = score.loc[new_data.index] > threshold, columns_to_nan=price_name, message=f'anomalous {price_name} price') def _nan_values(self, data, condition, columns_to_nan, message): @@ -753,7 +776,7 @@ def _get_data_yahoo(ticker, start='1900-01-01', end='2100-01-01'): data['indicators']['quote'][0], index=index) df_result['adjclose'] = data[ 'indicators']['adjclose'][0]['adjclose'] - except KeyError as exc: + except KeyError as exc: # pragma: no cover raise DataError(f'Yahoo finance download of {ticker} failed.' + ' Json:', str(res.json())) from exc # pragma: no cover diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 957fc72af..2b891ea7e 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -156,7 +156,8 @@ def test_yahoo_finance_removefirstline(self): and sys.version_info.minor < 11, "Issues with timezoned timestamps.") def test_sqlite3_store_series(self): """Test storing and retrieving of a Series with datetime index.""" - self._base_test_series(_loader_sqlite, _storer_sqlite) + with self.assertWarns(UserWarning): + self._base_test_series(_loader_sqlite, _storer_sqlite) @unittest.skipIf(sys.version_info.major == 3 and sys.version_info.minor < 11, "Issues with timezoned timestamps.") @@ -196,6 +197,9 @@ def _base_test_series(self, loader, storer): """Test storing and retrieving of a Series with datetime index.""" for data in [ + pd.Series( + 0.0, pd.date_range("2020-01-01", "2020-01-10"), + name="test0"), pd.Series( 0.0, pd.date_range("2020-01-01", "2020-01-10", tz='UTC-05:00'), name="test1"), @@ -507,6 +511,17 @@ def _test_warning(data_transformation, part_of_message): # check all NaNs have been filled self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0) + def _test_warning_update(data_transformation, part_of_message): + """Test that warning is raised w/ message containing some word.""" + new_data = pd.DataFrame(raw_data.iloc[-20:], copy=True) + saved_data = pd.DataFrame(raw_data.iloc[:-15], copy=True) + exec(data_transformation) # pylint: disable=exec-used + with self.assertLogs(level='WARNING') as _: + _cleaned = empty_instance._process(new_data, saved_data) + self.assertTrue(part_of_message in _.output[0]) + # check all NaNs have been filled + self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0) + # infty _test_warning( 'data.iloc[2,2] = np.inf', @@ -601,6 +616,16 @@ def 
_test_warning(data_transformation, part_of_message): + 'data.iloc[20,1] = data.iloc[20,0]', 'anomalous open price') + # extreme open update + _test_warning_update( + 'new_data.iloc[-1,0] = new_data.iloc[-1,0] * 1.75;' + + 'new_data.iloc[-1,2] = new_data.iloc[-1,0]', + 'anomalous open price') + _test_warning_update( + 'new_data.iloc[-1,0] = new_data.iloc[-1,0] * 0.5;' + + 'new_data.iloc[-1,1] = new_data.iloc[-1,0]', + 'anomalous open price') + # def test_yahoo_finance_wrong_last_time(self): # """Test that we correct last time if intraday.""" # @@ -789,6 +814,25 @@ def test_user_provided_market_data(self): prices=self.prices, cash_key='cash', min_history=pd.Timedelta('0d')) + with self.assertRaises(NotImplementedError): + UserProvidedMarketData(returns=self.returns, volumes=used_volumes, + prices=self.prices, cash_key='NOTSUPPORTED', + min_history=pd.Timedelta('0d')) + + with self.assertRaises(ValueError): + UserProvidedMarketData(returns=self.returns, volumes=used_volumes, + prices=self.prices, cash_key='USDOLLAR', + min_history=pd.Timedelta('0d')) + + md = UserProvidedMarketData( + returns=self.returns, volumes=self.volumes, + prices=self.prices, cash_key='cash', + min_history=pd.Timedelta('60d')) + + # try to serve when there's not enough min_history + with self.assertRaises(ValueError): + md.serve(t=self.returns.index[20]) + def test_market_data_full(self): """Test serve method of DownloadedMarketData.""" From c55ab7cf2736b638591643fc93f22e0d492fca6b Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 17:44:54 +0400 Subject: [PATCH 26/38] typo --- cvxportfolio/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cvxportfolio/utils.py b/cvxportfolio/utils.py index 1454966fb..662ba336a 100644 --- a/cvxportfolio/utils.py +++ b/cvxportfolio/utils.py @@ -29,7 +29,6 @@ 'average_periods_per_year'] -@staticmethod def set_pd_read_only(df_or_ser): """Set numpy array contained in dataframe or series to read only. 
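The one-line fix above removes a stray ``@staticmethod`` decorator from a
module-level function. A minimal sketch of why that is a bug (not from the
codebase; on Python older than 3.10, ``staticmethod`` objects are not
directly callable):

.. code:: python

    @staticmethod  # wrong: only meaningful inside a class body
    def double(x):
        return 2 * x

    # On Python < 3.10 this raises:
    # TypeError: 'staticmethod' object is not callable
    double(21)

    class Doubler:
        @staticmethod  # correct use
        def double(x):
            return 2 * x

    Doubler.double(21)  # returns 42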
From d75b0ac53803b67f55a9341a84a077be3850cf31 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 21:31:15 +0400 Subject: [PATCH 27/38] assertNoLogs not available on py < 3.10 --- cvxportfolio/tests/test_data.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 2b891ea7e..1cde51676 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -435,7 +435,11 @@ def test_yahoo_finance_cleaning(self): self.assertLess(data['return'].max(), 0.75) # this stock had some extreme returns but they were legitimate - with self.assertNoLogs(level='WARNING'): + # only available on py<3.10 + if hasattr(self, 'assertNoLogs'): + with self.assertNoLogs(level='WARNING'): + data = YahooFinance('GME', base_location=self.datadir).data + else: data = YahooFinance('GME', base_location=self.datadir).data self.assertGreater(data['return'].min(), -0.75) self.assertGreater(data['return'].max(), 3) From 0aa09be09e29ec89d9f17c799f5253bb29c57ff2 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Tue, 13 Feb 2024 22:10:21 +0400 Subject: [PATCH 28/38] preload warning on RMS logreturn not abs mean --- cvxportfolio/data/symbol_data.py | 23 ++++++++++++++--------- cvxportfolio/tests/test_data.py | 6 ++++-- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 9d56bb858..6fb7443c4 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -265,10 +265,14 @@ def _median_scale_around(lrets, window): """Median absolute logreturn in a window around each timestamp.""" return np.abs(lrets).rolling(window, center=True, min_periods=1).median() -# def _mean_scale_around(lrets, window): -# """Root mean squared logreturn in a window around each timestamp.""" -# return np.sqrt( -# (lrets**2).rolling(window, center=True, min_periods=1).mean()) +def _mean_scale_around(lrets, window): + """Root mean squared logreturn in a window around each timestamp. + + We need a few operations because we skip the observation itself + """ + sum = (lrets**2).rolling(window, center=True, min_periods=2).sum() + count = lrets.rolling(window, center=True, min_periods=2).count() + return np.sqrt((sum - lrets**2) / (count - 1)) def _unlikeliness_score( test_logreturns, reference_logreturns, scaler, windows): @@ -308,12 +312,13 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method THRESHOLD_OPEN_TO_CLOSE = 15 # remove low/high prices when low/high to close abs logreturn larger than - # this time the median absolute ones in FILTERING_WINDOWS around it + # this time the median absolute ones in FILTERING_WINDOWS centered on it THRESHOLD_LOWHIGH_TO_CLOSE = 20 # log warning on _preload for abs logreturns (of 4 types) larger than this - # time the median absolute ones in FILTERING_WINDOWS around it - THRESHOLD_WARN_EXTREME_LOGRETS = 17.5 + # time the root mean square in FILTERING_WINDOWS centered on it, without + # the given observation itself + THRESHOLD_WARN_EXTREME_LOGRETS = 10 def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. 
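The new ``_mean_scale_around`` above excludes each observation from its own
scale estimate, using the identity that the leave-one-out root mean square
equals the square root of (sum of squares minus own square) over (count
minus one). A standalone check with toy data (not from the tests):

.. code:: python

    import numpy as np
    import pandas as pd

    lrets = pd.Series([0.01, -0.02, 0.5, 0.01, -0.01])
    window = 5

    total = (lrets**2).rolling(window, center=True, min_periods=2).sum()
    count = lrets.rolling(window, center=True, min_periods=2).count()

    # subtracting each point's own square removes it from its scale
    # estimate, so the 0.5 outlier cannot mask itself
    loo_rms = np.sqrt((total - lrets**2) / (count - 1))
    print(loo_rms)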
@@ -554,10 +559,10 @@ def _set_infty_to_nan(self, data): def _warn_on_extreme_logreturns(self, logreturns, threshold, what): """Log warning if logreturns are extreme.""" # with this we skip over exact zeros (which we assume come from some - # cleaning) and would bias the median down + # cleaning) and would bias the mean down logreturns.loc[logreturns == 0] = np.nan score = _unlikeliness_score( - logreturns, logreturns, scaler=_median_scale_around, + logreturns, logreturns, scaler=_mean_scale_around, windows=self.FILTERING_WINDOWS) dubious_indexes = logreturns.index[score > threshold] if len(dubious_indexes) > 0: diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 1cde51676..17a7a1316 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -435,11 +435,13 @@ def test_yahoo_finance_cleaning(self): self.assertLess(data['return'].max(), 0.75) # this stock had some extreme returns but they were legitimate - # only available on py<3.10 + # only available on py>3.10 if hasattr(self, 'assertNoLogs'): - with self.assertNoLogs(level='WARNING'): + with self.assertNoLogs(level='WARNING'): # pragma: no cover + # pragma: no cover data = YahooFinance('GME', base_location=self.datadir).data else: + # pragma: no cover data = YahooFinance('GME', base_location=self.datadir).data self.assertGreater(data['return'].min(), -0.75) self.assertGreater(data['return'].max(), 3) From e81932abe24dcf22da83038a4e3cce53bf2a2034 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 12:20:30 +0400 Subject: [PATCH 29/38] historical data cleaning --- cvxportfolio/data/symbol_data.py | 41 +++++++++++++++++++++++----- cvxportfolio/tests/test_data.py | 11 ++++++++ examples/strategies/ftse100_daily.py | 5 +++- examples/universes.py | 24 ++++++++-------- 4 files changed, 61 insertions(+), 20 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 6fb7443c4..afc3fed92 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -316,9 +316,8 @@ class OLHCV(SymbolData): # pylint: disable=abstract-method THRESHOLD_LOWHIGH_TO_CLOSE = 20 # log warning on _preload for abs logreturns (of 4 types) larger than this - # time the root mean square in FILTERING_WINDOWS centered on it, without - # the given observation itself - THRESHOLD_WARN_EXTREME_LOGRETS = 10 + # this time the median absolute ones in FILTERING_WINDOWS centered on it + THRESHOLD_WARN_EXTREME_LOGRETS = 50 def _process(self, new_data, saved_data=None): """Base method for processing (cleaning) data. 
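For reference, the score these thresholds are compared against is, at each
timestamp, the minimum across window lengths of the absolute logreturn
divided by a local scale; a condensed, self-contained sketch of that logic
(the window lengths here are illustrative, not the library's
``FILTERING_WINDOWS``):

.. code:: python

    import numpy as np
    import pandas as pd

    def unlikeliness_score(lrets, windows=(20, 60, 252)):
        """Min over windows of |logreturn| / median |logreturn| around it."""
        scaled = [
            np.abs(lrets)
            / np.abs(lrets).rolling(w, center=True, min_periods=1).median()
            for w in windows]
        # the minimum is conservative: a point is flagged only if it
        # looks extreme at every window length
        return pd.DataFrame(scaled).T.min(axis=1)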
@@ -438,7 +437,10 @@ def _fillna_and_message( '%s("%s").data["%s"] has NaNs on timestamps: %s,' + ' filling them with %s.', self.__class__.__name__, self.symbol, col_name, bad_indexes, message) - data[col_name] = getattr(data[col_name], filler)(filler_arg) + if filler == 'ffill': + data[col_name] = data[col_name].ffill() + else: + data[col_name] = getattr(data[col_name], filler)(filler_arg) def _nan_anomalous_prices( self, new_data, price_name, threshold, saved_data=None): @@ -559,10 +561,10 @@ def _set_infty_to_nan(self, data): def _warn_on_extreme_logreturns(self, logreturns, threshold, what): """Log warning if logreturns are extreme.""" # with this we skip over exact zeros (which we assume come from some - # cleaning) and would bias the mean down + # cleaning) and would bias the scale down logreturns.loc[logreturns == 0] = np.nan score = _unlikeliness_score( - logreturns, logreturns, scaler=_mean_scale_around, + logreturns, logreturns, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) dubious_indexes = logreturns.index[score > threshold] if len(dubious_indexes) > 0: @@ -669,6 +671,28 @@ class YahooFinance(OLHCV): :type data: pandas.DataFrame """ + # Maximum number of contiguous days on which an adjclose price can be + # invalid (e.g., negative); if any such period is found, all data before + # and including it is removed + MAX_CONTIGUOUS_MISSING_ADJCLOSES = 20 + + def _throw_out_all_data_before_many_bad_adjcloses(self, new_data): + """Throw out all data before many NaN on adjclose column.""" + invalid_indexes = new_data.index[ + new_data.adjclose.isnull().rolling( + self.MAX_CONTIGUOUS_MISSING_ADJCLOSES + ).sum() == self.MAX_CONTIGUOUS_MISSING_ADJCLOSES] + if len(invalid_indexes) > 0: + last_invalid_index = invalid_indexes[-1] + logger.warning( + '%s("%s").data has invalid adjclose prices for more than' + + ' %s contiguous days until %s; removing all data until then', + self.__class__.__name__, self.symbol, + self.MAX_CONTIGUOUS_MISSING_ADJCLOSES, last_invalid_index) + new_data = pd.DataFrame( + new_data.loc[new_data.index > last_invalid_index], copy=True) + return new_data + def _process(self, new_data, saved_data=None): """Process Yahoo Finance specific data, call parent's. 
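The rolling-sum test in ``_throw_out_all_data_before_many_bad_adjcloses``
above is a compact way to locate runs of invalid observations; a toy
illustration (made-up data and run length):

.. code:: python

    import numpy as np
    import pandas as pd

    n = 3  # stand-in for MAX_CONTIGUOUS_MISSING_ADJCLOSES
    adjclose = pd.Series([1., np.nan, np.nan, np.nan, 2., np.nan, 3.])

    # True exactly at positions where the last n values are all NaN
    run_ends = adjclose.isnull().rolling(n).sum() == n
    print(adjclose.index[run_ends])  # [3]: the run of three NaNs ends there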
@@ -684,12 +708,15 @@ def _process(self, new_data, saved_data=None): # NaN non-positive adj close self._nan_nonpositive_prices(new_data, "adjclose") + # Throw out data before many NaN on adjclose + new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data) + # forward-fill adj close self._fillna_and_message( new_data, 'adjclose', 'last available', filler='ffill') ## OLHCV._process treats all columns other than adjclose - super()._process(new_data, saved_data=saved_data) + new_data = super()._process(new_data, saved_data=saved_data) ## Compute total open-to-open returns diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 17a7a1316..b5fd8c08d 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -498,6 +498,17 @@ def _test_warning(data_transformation, part_of_message): 'data.iloc[300,2] = data.iloc[300,2]*2', 'dubious open to high returns') + def test_yahoo_finance_remove_on_many_bad_adjcloses(self): + """Test remove old data when many adjcloses are invalid.""" + + # this stock was found to have bad (negative) adjcloses for many + # months at its start + # with self.assertLogs(level='WARNING') as _: + with self.assertLogs(level='WARNING') as _: + YahooFinance('BATS.L', base_location=self.datadir) + self.assertTrue(np.any( + 'contiguous' in el.output for el in _)) + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" diff --git a/examples/strategies/ftse100_daily.py b/examples/strategies/ftse100_daily.py index bc99dd472..07336929c 100644 --- a/examples/strategies/ftse100_daily.py +++ b/examples/strategies/ftse100_daily.py @@ -67,10 +67,13 @@ def policy(gamma_risk, gamma_trade): else: import matplotlib.pyplot as plt + import pandas as pd #INDEX_ETF = 'DIA' - research_sim = cvx.StockMarketSimulator(FTSE100, cash_key='GBPOUND') + md = cvx.DownloadedMarketData( + FTSE100, cash_key='GBPOUND', grace_period=pd.Timedelta('5d')) + research_sim = cvx.StockMarketSimulator(market_data = md) research_policy, _ = policy(1., 1.) diff --git a/examples/universes.py b/examples/universes.py index 3687a1112..139c1b65e 100644 --- a/examples/universes.py +++ b/examples/universes.py @@ -20,7 +20,7 @@ We could also save each universe in a ``json`` file. 
""" -# This was generated on 2024-01-04 06:18:49.851642+00:00 +# This was generated on 2024-02-14 07:15:36.308012+00:00 SP500 = \ ['A', 'AAL', 'AAPL', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', @@ -31,11 +31,11 @@ 'BALL', 'BAX', 'BBWI', 'BBY', 'BDX', 'BEN', 'BF-B', 'BG', 'BIIB', 'BIO', 'BK', 'BKNG', 'BKR', 'BLDR', 'BLK', 'BMY', 'BR', 'BRK-B', 'BRO', 'BSX', 'BWA', 'BX', 'BXP', 'C', 'CAG', 'CAH', 'CARR', 'CAT', 'CB', 'CBOE', 'CBRE', 'CCI', 'CCL', - 'DAY', 'CDNS', 'CDW', 'CE', 'CEG', 'CF', 'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', - 'CINF', 'CL', 'CLX', 'CMA', 'CMCSA', 'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP', - 'COF', 'COO', 'COP', 'COR', 'COST', 'CPB', 'CPRT', 'CPT', 'CRL', 'CRM', - 'CSCO', 'CSGP', 'CSX', 'CTAS', 'CTLT', 'CTRA', 'CTSH', 'CTVA', 'CVS', 'CVX', - 'CZR', 'D', 'DAL', 'DD', 'DE', 'DFS', 'DG', 'DGX', 'DHI', 'DHR', 'DIS', 'DLR', + 'CDNS', 'CDW', 'CE', 'CEG', 'CF', 'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', 'CINF', + 'CL', 'CLX', 'CMA', 'CMCSA', 'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP', 'COF', + 'COO', 'COP', 'COR', 'COST', 'CPB', 'CPRT', 'CPT', 'CRL', 'CRM', 'CSCO', + 'CSGP', 'CSX', 'CTAS', 'CTLT', 'CTRA', 'CTSH', 'CTVA', 'CVS', 'CVX', 'CZR', + 'D', 'DAL', 'DAY', 'DD', 'DE', 'DFS', 'DG', 'DGX', 'DHI', 'DHR', 'DIS', 'DLR', 'DLTR', 'DOV', 'DOW', 'DPZ', 'DRI', 'DTE', 'DUK', 'DVA', 'DVN', 'DXCM', 'EA', 'EBAY', 'ECL', 'ED', 'EFX', 'EG', 'EIX', 'EL', 'ELV', 'EMN', 'EMR', 'ENPH', 'EOG', 'EPAM', 'EQIX', 'EQR', 'EQT', 'ES', 'ESS', 'ETN', 'ETR', 'ETSY', @@ -93,12 +93,12 @@ ['AAF.L', 'AAL.L', 'ABF.L', 'ADM.L', 'AHT.L', 'ANTO.L', 'AUTO.L', 'AV.L', 'AZN.L', 'BA.L', 'BARC.L', 'BATS.L', 'BDEV.L', 'BEZ.L', 'BKG.L', 'BME.L', 'BNZL.L', 'BP.L', 'BRBY.L', 'BT-A.L', 'CCH.L', 'CNA.L', 'CPG.L', 'CRDA.L', - 'CTEC.L', 'DCC.L', 'DGE.L', 'DPH.L', 'DPLM.L', 'EDV.L', 'ENT.L', 'EXPN.L', - 'FCIT.L', 'FLTR.L', 'FRAS.L', 'FRES.L', 'GLEN.L', 'GSK.L', 'HIK.L', 'HLMA.L', - 'HLN.L', 'HSBA.L', 'HWDN.L', 'IAG.L', 'ICP.L', 'IHG.L', 'III.L', 'IMB.L', - 'IMI.L', 'INF.L', 'ITRK.L', 'JD.L', 'KGF.L', 'LAND.L', 'LGEN.L', 'LLOY.L', - 'LSEG.L', 'MKS.L', 'MNDI.L', 'MNG.L', 'MRO.L', 'NG.L', 'NWG.L', 'NXT.L', - 'OCDO.L', 'PHNX.L', 'PRU.L', 'PSH.L', 'PSON.L', 'REL.L', 'RIO.L', 'RKT.L', + 'CTEC.L', 'DCC.L', 'DGE.L', 'DPLM.L', 'EDV.L', 'ENT.L', 'EXPN.L', 'FCIT.L', + 'FLTR.L', 'FRAS.L', 'FRES.L', 'GLEN.L', 'GSK.L', 'HIK.L', 'HLMA.L', 'HLN.L', + 'HSBA.L', 'HWDN.L', 'IAG.L', 'ICP.L', 'IHG.L', 'III.L', 'IMB.L', 'IMI.L', + 'INF.L', 'ITRK.L', 'JD.L', 'KGF.L', 'LAND.L', 'LGEN.L', 'LLOY.L', 'LSEG.L', + 'MKS.L', 'MNDI.L', 'MNG.L', 'MRO.L', 'NG.L', 'NWG.L', 'NXT.L', 'OCDO.L', + 'PHNX.L', 'PRU.L', 'PSH.L', 'PSN.L', 'PSON.L', 'REL.L', 'RIO.L', 'RKT.L', 'RMV.L', 'RR.L', 'RS1.L', 'RTO.L', 'SBRY.L', 'SDR.L', 'SGE.L', 'SGRO.L', 'SHEL.L', 'SKG.L', 'SMDS.L', 'SMIN.L', 'SMT.L', 'SN.L', 'SPX.L', 'SSE.L', 'STAN.L', 'STJ.L', 'SVT.L', 'TSCO.L', 'TW.L', 'ULVR.L', 'UTG.L', 'UU.L', From db7477c6fc044755c37ae4fdfe2f23f174119d8b Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 13:01:35 +0400 Subject: [PATCH 30/38] removing phony adjcloses and data around them --- cvxportfolio/data/symbol_data.py | 33 +++++++++++++++++++++++++++++++- cvxportfolio/tests/test_data.py | 10 +++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index afc3fed92..10adbf2e2 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -676,6 +676,11 @@ class YahooFinance(OLHCV): # and including it is removed MAX_CONTIGUOUS_MISSING_ADJCLOSES = 20 + # remove all 
data (also one day before and after) when logrets implied by
+    # adjcloses are anomalous; abs value larger than median abs value times this
+    # in many windows around it
+    THRESHOLD_BAD_ADJCLOSE = 20
+
     def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
         """Throw out all data before many NaN on adjclose column."""
         invalid_indexes = new_data.index[
@@ -693,6 +698,29 @@ def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
                 new_data.loc[new_data.index > last_invalid_index], copy=True)
         return new_data

+    def _remove_data_on_bad_adjcloses(self, new_data):
+        """Remove adjcloses if implied logreturns are highly anomalous."""
+        while True:
+            logrets = np.log(new_data.adjclose.ffill()).diff()
+            score = _unlikeliness_score(
+                logrets, logrets, scaler=_median_scale_around,
+                windows=self.FILTERING_WINDOWS)
+
+            # we eliminate data 1 day before and after any anomalous event
+            # could be made less aggressive, but better to be safe
+            bad_indexes = logrets.index[
+                (score > self.THRESHOLD_BAD_ADJCLOSE)
+                | (score > self.THRESHOLD_BAD_ADJCLOSE).shift(-1)]
+
+            if len(bad_indexes) == 0:
+                break
+            new_data.loc[bad_indexes] = np.nan
+            logger.warning(
+                '%s("%s").data has anomalous adjclose prices on timestamps'
+                + ' (including one day before and after) %s; removing all'
+                + ' data (not just adjcloses) on those timestamps.',
+                self.__class__.__name__, self.symbol, bad_indexes)
+
     def _process(self, new_data, saved_data=None):
         """Process Yahoo Finance specific data, call parent's.

@@ -736,9 +756,12 @@ def _process(self, new_data, saved_data=None):
         # NaN non-positive adj close
         self._nan_nonpositive_prices(new_data, "adjclose")

-        # Throw out data before many NaN on adjclose
+        # Throw out all data before many NaN on adjclose
         new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data)

+        # Remove all data when highly anomalous adjclose prices are detected
+        self._remove_data_on_bad_adjcloses(new_data)
+
         # forward-fill adj close
         self._fillna_and_message(
             new_data, 'adjclose', 'last available', filler='ffill')
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py
index b5fd8c08d..59ced7de0 100644
--- a/cvxportfolio/tests/test_data.py
+++ b/cvxportfolio/tests/test_data.py
@@ -503,12 +503,20 @@ def test_yahoo_finance_remove_on_many_bad_adjcloses(self):

         # this stock was found to have bad (negative) adjcloses for many
         # months at its start
-        # with self.assertLogs(level='WARNING') as _:
         with self.assertLogs(level='WARNING') as _:
             YahooFinance('BATS.L', base_location=self.datadir)
         self.assertTrue(np.any(
             'contiguous' in el.output for el in _))

+    def test_adjcloses_logrets_removal(self):
+        """Test method to remove adjcloses when their logrets are anomalous."""
+
+        # this stock was found to have phony adjcloses
+        with self.assertLogs(level='WARNING') as _:
+            YahooFinance('BA.L', base_location=self.datadir)
+        self.assertTrue(np.any(
+            'anomalous adjclose prices' in el.output for el in _))
+
     def test_yahoo_finance_cleaning_granular(self):
         """Test each step of cleaning."""

From cde79b55e89d5363b72ded5cfb7adc294454301f Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Wed, 14 Feb 2024 17:15:57 +0400
Subject: [PATCH 31/38] improving cleaning of bad adjcloses, more analysis needed

---
 cvxportfolio/data/symbol_data.py | 41 +++++++++++++++++++++++-----
 cvxportfolio/tests/test_data.py  | 48 ++++++++++++++++++++++----------
 2 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 10adbf2e2..2ae19e6f0
100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -281,7 +281,7 @@ def _unlikeliness_score( np.abs(test_logreturns) / scaler(reference_logreturns, window) for window in windows] scaled = pd.DataFrame(scaled).T - return scaled.min(axis=1) + return scaled.min(axis=1, skipna=True) class OLHCV(SymbolData): # pylint: disable=abstract-method @@ -558,7 +558,8 @@ def _set_infty_to_nan(self, data): data.values, copy=True, nan=np.nan, posinf=np.nan, neginf=np.nan) - def _warn_on_extreme_logreturns(self, logreturns, threshold, what): + def _warn_on_extreme_logreturns( + self, logreturns, threshold, what, level='warning'): """Log warning if logreturns are extreme.""" # with this we skip over exact zeros (which we assume come from some # cleaning) and would bias the scale down @@ -568,7 +569,7 @@ def _warn_on_extreme_logreturns(self, logreturns, threshold, what): windows=self.FILTERING_WINDOWS) dubious_indexes = logreturns.index[score > threshold] if len(dubious_indexes) > 0: - logger.warning( + getattr(logger, level)( '%s("%s") has dubious %s for timestamps: %s', self.__class__.__name__, self.symbol, what, dubious_indexes) @@ -590,17 +591,20 @@ def _quality_check(self, data): # extreme open2close self._warn_on_extreme_logreturns( np.log(data['close']) - np.log(data['open']), - self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to close returns') + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to close returns', + level='info') # extreme open2high self._warn_on_extreme_logreturns( np.log(data['high']) - np.log(data['open']), - self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to high returns') + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to high returns', + level='info') # extreme open2low self._warn_on_extreme_logreturns( np.log(data['low']) - np.log(data['open']), - self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to low returns') + self.THRESHOLD_WARN_EXTREME_LOGRETS, 'open to low returns', + level='info') def _preload(self, data): """Prepare data for use by Cvxportfolio. @@ -674,12 +678,15 @@ class YahooFinance(OLHCV): # Maximum number of contiguous days on which an adjclose price can be # invalid (e.g., negative); if any such period is found, all data before # and including it is removed - MAX_CONTIGUOUS_MISSING_ADJCLOSES = 20 + MAX_CONTIGUOUS_MISSING_ADJCLOSES = 10 # remove all data (also one day before and after) when logrets implied by - # adjcloses are anomalous; abs value larger than median abs value time this - # in many windows around it - THRESHOLD_BAD_ADJCLOSE = 20 + # adjcloses are anomalous: abs value larger than median abs value time this + # in many windows around it. + # this is redone iteratively up to the MAX_CONTIGUOUS_MISSING_ADJCLOSES, + # so unless the bad adjcloses are only for few days all data up to the + # anomalous event will be deleted + THRESHOLD_BAD_ADJCLOSE = 100 def _throw_out_all_data_before_many_bad_adjcloses(self, new_data): """Throw out all data before many NaN on adjclose column.""" @@ -700,8 +707,13 @@ def _throw_out_all_data_before_many_bad_adjcloses(self, new_data): def _remove_data_on_bad_adjcloses(self, new_data): """Remove adjcloses if implied logreturns are highly anomalous.""" - while True: + # worst case (if it goes to end of for loop) + # we throw out all data before the event + for _ in range(self.MAX_CONTIGUOUS_MISSING_ADJCLOSES + 1): logrets = np.log(new_data.adjclose.ffill()).diff() + # with this we skip over exact zeros (which we assume come from + # some cleaning) and would bias the scale down + # logrets.loc[logrets == 0.] 
= np.nan score = _unlikeliness_score( logrets, logrets, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) @@ -710,7 +722,9 @@ def _remove_data_on_bad_adjcloses(self, new_data): # could be made less aggressive, but better to be safe bad_indexes = logrets.index[ (score > self.THRESHOLD_BAD_ADJCLOSE) - | (score > self.THRESHOLD_BAD_ADJCLOSE).shift(-1)] + | (score > self.THRESHOLD_BAD_ADJCLOSE).shift(-1) + # | score.isnull() # TODO: this is not good, necessary for SMT.L + ] if len(bad_indexes) == 0: break @@ -742,6 +756,9 @@ def _process(self, new_data, saved_data=None): # Remove all data when highly anomalous adjclose prices are detected self._remove_data_on_bad_adjcloses(new_data) + # Repeat throw out all data before many NaN on adjclose + new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data) + # forward-fill adj close self._fillna_and_message( new_data, 'adjclose', 'last available', filler='ffill') diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 59ced7de0..0618f73ed 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -456,12 +456,13 @@ def test_yahoo_finance_preload_warnings(self): empty_instance._symbol = 'ZM' # because the warnings use the symbol cleaned = empty_instance._process(raw_data, None) - def _test_warning(data_transformation, part_of_message): + def _test_warning( + data_transformation, part_of_message, level='WARNING'): """Test that warning is raised w/ message containing some word.""" data = pd.DataFrame(cleaned, copy=True) exec(data_transformation) # pylint: disable=exec-used # print(data) - with self.assertLogs(level='WARNING') as _: + with self.assertLogs(level=level) as _: empty_instance._preload(data) # print(_) self.assertTrue(part_of_message in _.output[0]) @@ -470,33 +471,37 @@ def _test_warning(data_transformation, part_of_message): # high unexpected return _test_warning( - 'data.iloc[300,-1] = 1', + 'data.iloc[300,-1] = 4', 'dubious total open-to-open returns') # low unexpected return _test_warning( - 'data.iloc[300,-1] = -0.5', + 'data.iloc[300,-1] = -0.9', 'dubious total open-to-open returns') # low unexpected open _test_warning( - 'data.iloc[300,0] = data.iloc[300,0]*0.5', - 'dubious open to close returns') + 'data.iloc[300,0] = data.iloc[300,0]*0.1', + 'dubious open to close returns', + level='INFO') # high unexpected open _test_warning( - 'data.iloc[300,0] = data.iloc[300,0]*2', - 'dubious open to close returns') + 'data.iloc[300,0] = data.iloc[300,0]*5', + 'dubious open to close returns', + level='INFO') # low unexpected low _test_warning( - 'data.iloc[300,1] = data.iloc[300,1]*0.5', - 'dubious open to low returns') + 'data.iloc[300,1] = data.iloc[300,1]*0.1', + 'dubious open to low returns', + level='INFO') # high unexpected high _test_warning( - 'data.iloc[300,2] = data.iloc[300,2]*2', - 'dubious open to high returns') + 'data.iloc[300,2] = data.iloc[300,2]*5', + 'dubious open to high returns', + level='INFO') def test_yahoo_finance_remove_on_many_bad_adjcloses(self): """Test remove old data when many adjcloses are invalid.""" @@ -508,14 +513,29 @@ def test_yahoo_finance_remove_on_many_bad_adjcloses(self): self.assertTrue(np.any( 'contiguous' in el.output for el in _)) + with self.assertNoLogs(level='WARNING'): + YahooFinance('BATS.L', base_location=self.datadir) + def test_adjcloses_logrets_removal(self): """Test method to remove adjcloses when its logrets are anomalous.""" + # this stock had anomalous price changes in the 70s + with 
self.assertLogs(level='WARNING') as _: + d = YahooFinance("SMT.L", base_location=self.datadir).data + self.assertTrue(np.any([ + 'anomalous adjclose prices' in el for el in _.output])) + self.assertTrue(d['return'].max() < 2) + # this stock was found to have phony adjcloses with self.assertLogs(level='WARNING') as _: YahooFinance('BA.L', base_location=self.datadir) - self.assertTrue(np.any( - 'anomalous adjclose prices' in el.output for el in _)) + self.assertTrue(np.any([ + 'anomalous adjclose prices' in el for el in _.output])) + + if hasattr(self, 'assertNoLogs'): + with self.assertNoLogs(level='WARNING'): + YahooFinance('BA.L', base_location=self.datadir) + def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" From 26f03cdc427cea31faa9bfc17ba823495030987c Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 19:16:27 +0400 Subject: [PATCH 32/38] added more adjclose filtering --- cvxportfolio/data/symbol_data.py | 30 ++++++++++++++++++++++-------- cvxportfolio/tests/test_data.py | 1 - 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index 2ae19e6f0..d8baf9500 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -678,7 +678,7 @@ class YahooFinance(OLHCV): # Maximum number of contiguous days on which an adjclose price can be # invalid (e.g., negative); if any such period is found, all data before # and including it is removed - MAX_CONTIGUOUS_MISSING_ADJCLOSES = 10 + MAX_CONTIGUOUS_MISSING_ADJCLOSES = 20 # remove all data (also one day before and after) when logrets implied by # adjcloses are anomalous: abs value larger than median abs value time this @@ -686,7 +686,16 @@ class YahooFinance(OLHCV): # this is redone iteratively up to the MAX_CONTIGUOUS_MISSING_ADJCLOSES, # so unless the bad adjcloses are only for few days all data up to the # anomalous event will be deleted - THRESHOLD_BAD_ADJCLOSE = 100 + THRESHOLD_BAD_ADJCLOSE = 50 + + # assume any adjclose-to-adjclose log10-return larger than this in absolute + # value (1. is 10x) is false and eliminate both adjcloses around it + # this only applies before ASSUME_FALSE_BEFORE + THRESHOLD_FALSE_LOG10RETS = .5 + + # assume logreturns larger in abs value than threshold above are false + # ONLY before this date, otherwise don't filter them + ASSUME_FALSE_BEFORE = pd.Timestamp('2000-01-01', tz='UTC') def _throw_out_all_data_before_many_bad_adjcloses(self, new_data): """Throw out all data before many NaN on adjclose column.""" @@ -710,21 +719,26 @@ def _remove_data_on_bad_adjcloses(self, new_data): # worst case (if it goes to end of for loop) # we throw out all data before the event for _ in range(self.MAX_CONTIGUOUS_MISSING_ADJCLOSES + 1): - logrets = np.log(new_data.adjclose.ffill()).diff() + logrets = np.log10(new_data.adjclose.ffill()).diff() + # with this we skip over exact zeros (which we assume come from # some cleaning) and would bias the scale down - # logrets.loc[logrets == 0.] = np.nan + logrets.loc[logrets == 0.] 
= np.nan + score = _unlikeliness_score( logrets, logrets, scaler=_median_scale_around, windows=self.FILTERING_WINDOWS) + bad_score = score > self.THRESHOLD_BAD_ADJCLOSE + + too_large_logreturns = np.abs( + logrets) > self.THRESHOLD_FALSE_LOG10RETS + too_large_logreturns &= logrets.index < self.ASSUME_FALSE_BEFORE # we eliminate data 1 day before and after any anomalous event # could be made less aggressive, but better to be safe bad_indexes = logrets.index[ - (score > self.THRESHOLD_BAD_ADJCLOSE) - | (score > self.THRESHOLD_BAD_ADJCLOSE).shift(-1) - # | score.isnull() # TODO: this is not good, necessary for SMT.L - ] + bad_score | bad_score.shift(-1) | too_large_logreturns + | too_large_logreturns.shift(-1)] if len(bad_indexes) == 0: break diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 0618f73ed..2026a61d5 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -536,7 +536,6 @@ def test_adjcloses_logrets_removal(self): with self.assertNoLogs(level='WARNING'): YahooFinance('BA.L', base_location=self.datadir) - def test_yahoo_finance_cleaning_granular(self): """Test each step of cleaning.""" From 7ccdd542530b689b42b088d1dff6373612fc3b3d Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 19:17:36 +0400 Subject: [PATCH 33/38] typo --- cvxportfolio/tests/test_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py index 2026a61d5..1fc2c9106 100644 --- a/cvxportfolio/tests/test_data.py +++ b/cvxportfolio/tests/test_data.py @@ -513,8 +513,9 @@ def test_yahoo_finance_remove_on_many_bad_adjcloses(self): self.assertTrue(np.any( 'contiguous' in el.output for el in _)) - with self.assertNoLogs(level='WARNING'): - YahooFinance('BATS.L', base_location=self.datadir) + if hasattr(self, 'assertNoLogs'): + with self.assertNoLogs(level='WARNING'): + YahooFinance('BATS.L', base_location=self.datadir) def test_adjcloses_logrets_removal(self): """Test method to remove adjcloses when its logrets are anomalous.""" From 5900968591cbc6342322904c66d56e06c0dcd13a Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Wed, 14 Feb 2024 22:02:39 +0400 Subject: [PATCH 34/38] tested on current example universes; names that get historical data trimmed down are HUBB, JCI, NVR, and seem reasonable --- cvxportfolio/data/symbol_data.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py index d8baf9500..3d2456b81 100644 --- a/cvxportfolio/data/symbol_data.py +++ b/cvxportfolio/data/symbol_data.py @@ -777,6 +777,11 @@ def _process(self, new_data, saved_data=None): self._fillna_and_message( new_data, 'adjclose', 'last available', filler='ffill') + # eliminate (initial) rows where adjclose is NaN + nan_adjcloses = new_data.adjclose.isnull() + if np.any(nan_adjcloses): + new_data = pd.DataFrame(new_data.loc[~nan_adjcloses], copy=True) + ## OLHCV._process treats all columns other than adjclose new_data = super()._process(new_data, saved_data=saved_data) From 572b4df3503a45dbfbfab245f76e9f905f951a91 Mon Sep 17 00:00:00 2001 From: Enzo Busseti Date: Thu, 15 Feb 2024 13:26:02 +0400 Subject: [PATCH 35/38] data cleaning example --- examples/data_cleaning.py | 111 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 examples/data_cleaning.py diff --git a/examples/data_cleaning.py b/examples/data_cleaning.py new file mode 100644 index 000000000..08a503c77 --- 
/dev/null +++ b/examples/data_cleaning.py @@ -0,0 +1,111 @@ +# Copyright 2023 Enzo Busseti +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This script is used to show the data cleaning applied to Yahoo Finance data. + +It is not really an example, and some of the methods shown here are not public, +so not covered by the semantic versioning agreeement (they could change +without notice). +""" + +import logging +import shutil +import tempfile +from pathlib import Path +from time import sleep + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +import cvxportfolio as cvx + +# If you change this to logging.INFO you get more logging output from the +# cleaning procedure +logging.getLogger().setLevel(logging.WARNING) + +# Here put any number of stocks for which you wish to analyze the cleaning +TEST_UNIVERSE = ['AAPL', 'GOOG', 'TSLA'] + +# Some names with known issues: +# TEST_UNIVERSE = ['SMT.L','NVR', 'HUBB', 'NWG.L'] + +ALL_DROPPED_ROWS_PCT = pd.Series(dtype=float) +ALL_MIN_LR = pd.Series(dtype=float) +ALL_MAX_LR = pd.Series(dtype=float) + +PLOT = True +SLEEP = 1 + +for stock in TEST_UNIVERSE: + sleep(SLEEP) + print(f'\n\t{stock}:') + + # This method is not public: + raw_yfinance = cvx.YahooFinance._get_data_yahoo(stock) + print(f'{stock}: YAHOO FINANCE RAW') + print(raw_yfinance) + + tmpdir = Path(tempfile.mkdtemp()) + cvx_cleaned = cvx.YahooFinance(stock, base_location=tmpdir).data + shutil.rmtree(tmpdir) + print(f'{stock}: CVXPORTFOLIO CLEANED') + print(cvx_cleaned) + + yf_log10r = np.log10(raw_yfinance.adjclose).diff().shift(-1) + cvx_log10r = np.log10(1 + cvx_cleaned['return']) + + if PLOT: + fig, axes = plt.subplots( + 3, figsize=(10/1.62, 10), layout='constrained') + + raw_yfinance.iloc[:, :5].plot(ax=axes[0]) + axes[0].set_yscale('log') + axes[0].set_title(f'{stock}: RAW YAHOO FINANCE') + + cvx_cleaned.iloc[:, :4].plot(ax=axes[1]) + axes[1].set_title(f'{stock}: CVXPORTFOLIO CLEANED DATA') + axes[1].set_yscale('log') + + (yf_log10r.cumsum() - yf_log10r.sum()).plot( + label='Yahoo Finance total close-to-close', ax=axes[2]) + (cvx_log10r.cumsum() - cvx_log10r.sum()).plot( + label='Cvxportfolio total open-to-open', ax=axes[2]) + axes[2].set_title(f'{stock}: CUMULATIVE LOG10 RETURNS (SCALED)') + axes[2].legend() + + plt.show() + + assert cvx_cleaned.index[-1] == raw_yfinance.index[-1] + + print() + dropped_rows = len(raw_yfinance) - len(cvx_cleaned) + dropped_rows_pct = dropped_rows / len(raw_yfinance) + ALL_DROPPED_ROWS_PCT.loc[stock] = dropped_rows_pct*100 + print(f'Cvxportfolio dropped {int(dropped_rows_pct*100)}% of rows') + + ALL_MIN_LR.loc[stock] = np.log(1+cvx_cleaned['return']).min() + ALL_MAX_LR.loc[stock] = np.log(1+cvx_cleaned['return']).max() + + print('Max Cvxportfolio logreturn:', ALL_MAX_LR.loc[stock]) + print('Min Cvxportfolio logreturn:', ALL_MIN_LR.loc[stock] ) + print('How many zero volumes:', (cvx_cleaned['valuevolume'] == 0.).mean()) + +print('\nCvxportfolio dropped rows %:') +print(ALL_DROPPED_ROWS_PCT.sort_values().tail()) + 
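+# The summaries below single out the names whose cleaned history lost the
+# most rows, or still contains the most extreme logreturns; those are the
+# ones worth inspecting manually.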
+print('\nCvxportfolio min logreturns:')
+print(ALL_MIN_LR.sort_values().head())
+
+print('\nCvxportfolio max logreturns:')
+print(ALL_MAX_LR.sort_values().tail())

From 4a207c47869aed21d14fc375637b805fd8cf5ad6 Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Thu, 15 Feb 2024 17:21:17 +0400
Subject: [PATCH 36/38] adjusted log level of cleaning to info

---
 cvxportfolio/data/symbol_data.py | 94 ++++++++++++++++++--------------
 cvxportfolio/tests/test_data.py  | 87 +++++++++++++++--------------
 examples/data_cleaning.py        |  6 +-
 3 files changed, 104 insertions(+), 83 deletions(-)

diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 3d2456b81..759af7d66 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -265,14 +265,14 @@ def _median_scale_around(lrets, window):
     """Median absolute logreturn in a window around each timestamp."""
     return np.abs(lrets).rolling(window, center=True, min_periods=1).median()

-def _mean_scale_around(lrets, window):
-    """Root mean squared logreturn in a window around each timestamp.
+# def _mean_scale_around(lrets, window):
+#     """Root mean squared logreturn in a window around each timestamp.

-    We need a few operations because we skip the observation itself
-    """
-    sum = (lrets**2).rolling(window, center=True, min_periods=2).sum()
-    count = lrets.rolling(window, center=True, min_periods=2).count()
-    return np.sqrt((sum - lrets**2) / (count - 1))
+#     We need a few operations because we skip the observation itself
+#     """
+#     sum = (lrets**2).rolling(window, center=True, min_periods=2).sum()
+#     count = lrets.rolling(window, center=True, min_periods=2).count()
+#     return np.sqrt((sum - lrets**2) / (count - 1))

 def _unlikeliness_score(
         test_logreturns, reference_logreturns, scaler, windows):
@@ -338,7 +338,7 @@ def _process(self, new_data, saved_data=None):
             self._nan_nonpositive_prices(new_data, column)

         # all infinity values to NaN
-        self._set_infty_to_nan(new_data)
+        self._set_infty_to_nan(new_data, level='info')

         ## Close price.
         ## We believe them (for now). We forward fill them if unavailable.
@@ -370,12 +370,12 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous open prices
         self._nan_anomalous_prices(
             new_data, 'open', threshold=self.THRESHOLD_OPEN_TO_CLOSE,
-            saved_data=saved_data)
+            saved_data=saved_data, level='info')

         # fill open with close from day before
         self._fillna_and_message(
             new_data, 'open', 'close from period before', filler='fillna',
-            filler_arg=new_data['close'].shift(1))
+            filler_arg=new_data['close'].shift(1), level='info')

         ## Low price.
         ## We remove if higher than close or anomalous low to close logreturn.
@@ -390,12 +390,12 @@ def _process(self, new_data, saved_data=None):

         # NaN anomalous low prices
         self._nan_anomalous_prices(
             new_data, 'low', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE,
-            saved_data=saved_data)
+            saved_data=saved_data, level='info')

         # fill low with min of open and close
         self._fillna_and_message(
             new_data, 'low', 'min of open and close', filler='fillna',
-            filler_arg=new_data[['open', 'close']].min(axis=1))
+            filler_arg=new_data[['open', 'close']].min(axis=1), level='info')

         ## High price.
         ## We remove if lower than close or anomalous low to close logreturn.
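This patch threads a ``level`` argument through the cleaning methods and
dispatches on it with ``getattr(logger, level)``; a minimal sketch of that
pattern (demo names, not the library's API):

.. code:: python

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('cleaning-demo')

    def report(message, *args, level='warning'):
        # getattr(logger, 'info') is logger.info, etc., so each call
        # site chooses how loud its message is
        getattr(logger, level)(message, *args)

    report('filled %d missing opens', 3, level='info')
    report('dropped %d rows', 2)  # defaults to warning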
@@ -410,12 +410,12 @@ def _process(self, new_data, saved_data=None): # NaN anomalous high prices self._nan_anomalous_prices( new_data, 'high', threshold=self.THRESHOLD_LOWHIGH_TO_CLOSE, - saved_data=saved_data) + saved_data=saved_data, level='info') # fill high with max of open and close self._fillna_and_message( new_data, 'high', 'max of open and close', filler='fillna', - filler_arg=new_data[['open', 'close']].max(axis=1)) + filler_arg=new_data[['open', 'close']].max(axis=1), level='info') ## Some asserts assert new_data.iloc[1:].isnull().sum().sum() == 0 @@ -429,11 +429,12 @@ def _process(self, new_data, saved_data=None): return new_data def _fillna_and_message( - self, data, col_name, message, filler='fillna', filler_arg=None): + self, data, col_name, message, filler='fillna', filler_arg=None, + level='warning'): """Fill NaNs in column with chosen method and arg.""" bad_indexes = data.index[data[col_name].isnull()] if len(bad_indexes) > 0: - logger.warning( + getattr(logger, level)( '%s("%s").data["%s"] has NaNs on timestamps: %s,' + ' filling them with %s.', self.__class__.__name__, self.symbol, col_name, bad_indexes, message) @@ -443,7 +444,8 @@ def _fillna_and_message( data[col_name] = getattr(data[col_name], filler)(filler_arg) def _nan_anomalous_prices( - self, new_data, price_name, threshold, saved_data=None): + self, new_data, price_name, threshold, saved_data=None, + level='warning'): """Set to NaN given price name on its anomalous logrets to close.""" new_lr_to_close =\ np.log(new_data['close']) - np.log(new_data[price_name]) @@ -472,14 +474,16 @@ def _nan_anomalous_prices( windows=self.FILTERING_WINDOWS) self._nan_values( new_data, condition = score.loc[new_data.index] > threshold, - columns_to_nan=price_name, message=f'anomalous {price_name} price') + columns_to_nan=price_name, message=f'anomalous {price_name} price', + level=level) - def _nan_values(self, data, condition, columns_to_nan, message): + def _nan_values( + self, data, condition, columns_to_nan, message, level='warning'): """Set to NaN in-place for indexing condition and chosen columns.""" bad_indexes = data.index[condition] if len(bad_indexes) > 0: - logger.warning( + getattr(logger, level)( '%s("%s") has %s on timestamps: %s,' + ' setting to nan', self.__class__.__name__, self.symbol, message, bad_indexes) @@ -490,27 +494,28 @@ def _nan_nonpositive_prices(self, data, prices_name): self._nan_values( data=data, condition = data[prices_name] <= 0, columns_to_nan = prices_name, - message = f'non-positive {prices_name} prices') + message = f'non-positive {prices_name} prices', level='info') def _nan_negative_volumes(self, data): """Set negative volumes to NaN, in-place.""" self._nan_values( data=data, condition = data["volume"] < 0, - columns_to_nan = "volume", message = 'negative volumes') + columns_to_nan = "volume", message = 'negative volumes', + level='info') def _nan_open_lower_low(self, data): """Set open price to NaN if lower than low, in-place.""" self._nan_values( data=data, condition = data['open'] < data['low'], columns_to_nan = "open", - message = 'open price lower than low price') + message = 'open price lower than low price', level='info') def _nan_open_higher_high(self, data): """Set open price to NaN if higher than high, in-place.""" self._nan_values( data=data, condition = data['open'] > data['high'], columns_to_nan = "open", - message = 'open price higher than high price') + message = 'open price higher than high price', level='info') # def _nan_incompatible_low_high(self, data): # """Set low and 
@@ -524,34 +529,34 @@ def _nan_high_lower_close(self, data):
         self._nan_values(
             data=data, condition = data['high'] < data['close'],
             columns_to_nan = "high",
-            message = 'high price lower than close price')
+            message = 'high price lower than close price', level='info')
 
     def _nan_high_lower_open(self, data):
         """Set high price to NaN if lower than open, in-place."""
         self._nan_values(
             data=data, condition = data['high'] < data['open'],
             columns_to_nan = "high",
-            message = 'high price lower than open price')
+            message = 'high price lower than open price', level='info')
 
     def _nan_low_higher_close(self, data):
         """Set low price to NaN if higher than close, in-place."""
         self._nan_values(
             data=data, condition = data['low'] > data['close'],
             columns_to_nan = "low",
-            message = 'low price higher than close price')
+            message = 'low price higher than close price', level='info')
 
     def _nan_low_higher_open(self, data):
         """Set low price to NaN if higher than open, in-place."""
         self._nan_values(
             data=data, condition = data['low'] > data['open'],
             columns_to_nan = "low",
-            message = 'low price higher than open price')
+            message = 'low price higher than open price', level='info')
 
-    def _set_infty_to_nan(self, data):
+    def _set_infty_to_nan(self, data, level='warning'):
         """Set all +/- infty elements of data to NaN, in-place."""
         if np.isinf(data).sum().sum() > 0:
-            logger.warning(
+            getattr(logger, level)(
                 '%s("%s") has +/- infinity values, setting those to nan',
                 self.__class__.__name__, self.symbol)
             data.iloc[:, :] = np.nan_to_num(
@@ -586,7 +591,7 @@ def _quality_check(self, data):
         # warn on extreme logreturns
         self._warn_on_extreme_logreturns(
             np.log(1 + data['return']), self.THRESHOLD_WARN_EXTREME_LOGRETS,
-            'total open-to-open returns')
+            'total open-to-open returns', level='warning')
 
         # extreme open2close
         self._warn_on_extreme_logreturns(
@@ -697,7 +702,8 @@ class YahooFinance(OLHCV):
     # ONLY before this date, otherwise don't filter them
     ASSUME_FALSE_BEFORE = pd.Timestamp('2000-01-01', tz='UTC')
 
-    def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
+    def _throw_out_all_data_before_many_bad_adjcloses(
+            self, new_data, level='warning'):
         """Throw out all data before many NaN on adjclose column."""
         invalid_indexes = new_data.index[
             new_data.adjclose.isnull().rolling(
@@ -705,7 +711,7 @@ def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
             ).sum() == self.MAX_CONTIGUOUS_MISSING_ADJCLOSES]
         if len(invalid_indexes) > 0:
             last_invalid_index = invalid_indexes[-1]
-            logger.warning(
+            getattr(logger, level)(
                 '%s("%s").data has invalid adjclose prices for more than'
                 + ' %s contiguous days until %s; removing all data until then',
                 self.__class__.__name__, self.symbol,
@@ -714,7 +720,7 @@ def _throw_out_all_data_before_many_bad_adjcloses(self, new_data):
                 new_data.loc[new_data.index > last_invalid_index], copy=True)
         return new_data
 
-    def _remove_data_on_bad_adjcloses(self, new_data):
+    def _remove_data_on_bad_adjcloses(self, new_data, level='warning'):
         """Remove adjcloses if implied logreturns are highly anomalous."""
         # worst case (if it goes to end of for loop)
         # we throw out all data before the event
@@ -743,7 +749,7 @@ def _remove_data_on_bad_adjcloses(self, new_data):
             if len(bad_indexes) == 0:
                 break
             new_data.loc[bad_indexes] = np.nan
-            logger.warning(
+            getattr(logger, level)(
                 '%s("%s").data has anomalous adjclose prices on timestamps'
                 + '(including one day before and after) %s; removing all'
                 + 'data (not just adjcloses) on those timestamps.',
@@ -759,27 +765,35 @@ def _process(self, new_data, saved_data=None):
         ## Treat adjclose. We believe them (unless impossible).
 
         # all infinity values to NaN (repeat, but for adjclose)
-        self._set_infty_to_nan(new_data)
+        self._set_infty_to_nan(new_data, level='info')
 
         # NaN non-positive adj close
         self._nan_nonpositive_prices(new_data, "adjclose")
 
         # Throw out all data before many NaN on adjclose
-        new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data)
+        new_data = self._throw_out_all_data_before_many_bad_adjcloses(
+            new_data, level='info')
 
         # Remove all data when highly anomalous adjclose prices are detected
-        self._remove_data_on_bad_adjcloses(new_data)
+        self._remove_data_on_bad_adjcloses(new_data, level='info')
 
         # Repeat throw out all data before many NaN on adjclose
-        new_data = self._throw_out_all_data_before_many_bad_adjcloses(new_data)
+        new_data = self._throw_out_all_data_before_many_bad_adjcloses(
+            new_data, level='info')
 
         # forward-fill adj close
         self._fillna_and_message(
-            new_data, 'adjclose', 'last available', filler='ffill')
+            new_data, 'adjclose', 'last available', filler='ffill',
+            level='info')
 
         # eliminate (initial) rows where adjclose is NaN
         nan_adjcloses = new_data.adjclose.isnull()
         if np.any(nan_adjcloses):
+            logger.info(
+                '%s("%s") is eliminating data on %s because the adjclose '
+                + 'price is missing.',
+                self.__class__.__name__, self.symbol,
+                new_data.index[nan_adjcloses])
             new_data = pd.DataFrame(new_data.loc[~nan_adjcloses], copy=True)
 
         ## OLHCV._process treats all columns other than adjclose
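[Editor's note: the hunks above all funnel through the same idiom, worth spelling out. A new ``level`` keyword selects the bound logger method at run time via ``getattr(logger, level)``, so routine cleaning messages can be demoted from ``'warning'`` to ``'info'`` without duplicating any call sites. A minimal standalone sketch of that pattern follows; the logger name and function are illustrative, not the library's code.]

.. code:: python

    import logging

    logger = logging.getLogger('cleaning_sketch')  # hypothetical name

    def _report_bad_rows(symbol, bad_indexes, level='warning'):
        """Log rows flagged by a cleaning step, at a configurable level."""
        if len(bad_indexes) > 0:
            # getattr(logger, 'info') is logger.info, and so on; an
            # invalid level name fails loudly with AttributeError
            getattr(logger, level)(
                '%s has bad values on timestamps: %s', symbol, bad_indexes)

    if __name__ == '__main__':
        logging.basicConfig(level=logging.INFO)
        _report_bad_rows('AAPL', ['2024-01-02'], level='info')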
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py
index 1fc2c9106..3fbf87f93 100644
--- a/cvxportfolio/tests/test_data.py
+++ b/cvxportfolio/tests/test_data.py
@@ -440,9 +440,6 @@ def test_yahoo_finance_cleaning(self):
             with self.assertNoLogs(level='WARNING'): # pragma: no cover
                 # pragma: no cover
                 data = YahooFinance('GME', base_location=self.datadir).data
-        else:
-            # pragma: no cover
-            data = YahooFinance('GME', base_location=self.datadir).data
 
         self.assertGreater(data['return'].min(), -0.75)
         self.assertGreater(data['return'].max(), 3)
@@ -521,87 +518,97 @@ def test_adjcloses_logrets_removal(self):
         """Test method to remove adjcloses when its logrets are anomalous."""
 
         # this stock had anomalous price changes in the 70s
-        with self.assertLogs(level='WARNING') as _:
+        with self.assertLogs(level='INFO') as _:
             d = YahooFinance("SMT.L", base_location=self.datadir).data
         self.assertTrue(np.any([
             'anomalous adjclose prices' in el for el in _.output]))
         self.assertTrue(d['return'].max() < 2)
 
         # this stock was found to have phony adjcloses
-        with self.assertLogs(level='WARNING') as _:
+        with self.assertLogs(level='INFO') as _:
             YahooFinance('BA.L', base_location=self.datadir)
         self.assertTrue(np.any([
             'anomalous adjclose prices' in el for el in _.output]))
 
-        if hasattr(self, 'assertNoLogs'):
-            with self.assertNoLogs(level='WARNING'):
-                YahooFinance('BA.L', base_location=self.datadir)
+        with self.assertLogs(level='INFO') as _:
+            YahooFinance('BA.L', base_location=self.datadir)
+        self.assertFalse(np.any([
+            'anomalous adjclose prices' in el for el in _.output]))
 
     def test_yahoo_finance_cleaning_granular(self):
         """Test each step of cleaning."""
         # pylint: disable=protected-access
         raw_data = YahooFinance._get_data_yahoo('ZM')
-        print(raw_data)
+        # print(raw_data)
         empty_instance = YahooFinance.__new__(YahooFinance)
         empty_instance._symbol = 'ZM' # because the warnings use the symbol
 
-        def _test_warning(data_transformation, part_of_message):
+        def _test_warning(
+                data_transformation, part_of_message, level='WARNING'):
             """Test that warning is raised w/ message containing some word."""
             data = pd.DataFrame(raw_data, copy=True)
             exec(data_transformation) # pylint: disable=exec-used
-            with self.assertLogs(level='WARNING') as _:
+            with self.assertLogs(level=level) as _:
                 _cleaned = empty_instance._process(data, None)
-            self.assertTrue(part_of_message in _.output[0])
+            self.assertTrue(
+                np.any([part_of_message in el for el in _.output]))
 
             # check all NaNs have been filled
             self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0)
 
-        def _test_warning_update(data_transformation, part_of_message):
+        def _test_warning_update(
+                data_transformation, part_of_message, level='WARNING'):
             """Test that warning is raised w/ message containing some word."""
             new_data = pd.DataFrame(raw_data.iloc[-20:], copy=True)
             saved_data = pd.DataFrame(raw_data.iloc[:-15], copy=True)
             exec(data_transformation) # pylint: disable=exec-used
-            with self.assertLogs(level='WARNING') as _:
+            with self.assertLogs(level=level) as _:
                 _cleaned = empty_instance._process(new_data, saved_data)
-            self.assertTrue(part_of_message in _.output[0])
+            self.assertTrue(
+                np.any([part_of_message in el for el in _.output]))
 
             # check all NaNs have been filled
             self.assertTrue(_cleaned.iloc[:-1].isnull().sum().sum() == 0)
 
+        # missing initial adjclose
+        _test_warning(
+            'data.iloc[0,-2] = np.nan',
+            'adjclose price is missing', level='INFO')
+
         # infty
         _test_warning(
             'data.iloc[2,2] = np.inf',
-            'infinity')
+            'infinity', level='INFO')
 
         # non-pos price
         _test_warning(
             'data.iloc[2,0] = -1',
-            'non-positive open')
+            'non-positive open', level='INFO')
         _test_warning(
             'data.iloc[2,0] = 0',
-            'non-positive open')
+            'non-positive open', level='INFO')
         _test_warning(
             'data.iloc[4,2] = 0',
-            'non-positive high')
+            'non-positive high', level='INFO')
 
         # neg volume
         _test_warning(
             'data.iloc[2,-1] = -1',
-            'negative volumes')
+            'negative volumes', level='INFO')
 
         # open lower low
         _test_warning(
             'data.iloc[1,0] = data.iloc[1,1]*.9',
-            'open price lower than low price')
+            'open price lower than low price', level='INFO')
 
         # open higher high
         _test_warning(
             'data.iloc[1,0] = data.iloc[1,2]*1.1',
-            'open price higher than high price')
+            'open price higher than high price', level='INFO')
 
         # low higher close
         _test_warning(
             'data.iloc[3,1] = data.iloc[3].close * 1.1',
-            'low price higher than close price')
+            'low price higher than close price', level='INFO')
 
         # high lower close
         _test_warning( # had to fix it otherwise open cleaner kicks in
@@ -609,67 +616,67 @@ def _test_warning_update(data_transformation, part_of_message):
             'data.iloc[3,0] = close * .95;' # open
             'data.iloc[3,1] = close * .95;' # low
             'data.iloc[3,2] = close * .975', # high
-            'high price lower than close price')
+            'high price lower than close price', level='INFO')
 
         # extreme low price
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .01',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .02',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .05',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .1',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning(
             'data.iloc[3,1] = data.iloc[3,1] * .2',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
         _test_warning( # changed dtindex until found one that works
             'data.iloc[20,1] = data.iloc[20,1] * .5',
-            'anomalous low price')
+            'anomalous low price', level='INFO')
 
         # extreme high price
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 100',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 50',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 20',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 10',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 5',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
         _test_warning(
             'data.iloc[3,2] = data.iloc[3,2] * 2',
-            'anomalous high price')
+            'anomalous high price', level='INFO')
 
         # extreme open price
         _test_warning(
             'data.iloc[3,0] = data.iloc[3,0] * 1.75;'
             + 'data.iloc[3,2] = data.iloc[3,0]',
-            'anomalous open price')
+            'anomalous open price', level='INFO')
         _test_warning(
             'data.iloc[20,0] = data.iloc[20,0] * 0.5;'
             + 'data.iloc[20,1] = data.iloc[20,0]',
-            'anomalous open price')
+            'anomalous open price', level='INFO')
 
         # extreme open update
         _test_warning_update(
             'new_data.iloc[-1,0] = new_data.iloc[-1,0] * 1.75;'
             + 'new_data.iloc[-1,2] = new_data.iloc[-1,0]',
-            'anomalous open price')
+            'anomalous open price', level='INFO')
         _test_warning_update(
             'new_data.iloc[-1,0] = new_data.iloc[-1,0] * 0.5;'
             + 'new_data.iloc[-1,1] = new_data.iloc[-1,0]',
-            'anomalous open price')
+            'anomalous open price', level='INFO')
 
     # def test_yahoo_finance_wrong_last_time(self):
     #     """Test that we correct last time if intraday."""
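[Editor's note: the rewritten test helpers rely on two ``unittest`` behaviors worth spelling out. ``assertLogs`` captures every record at or above the given level, and the captured messages land in ``output`` in arrival order, so scanning the whole list (``np.any([...])``) is more robust than asserting on ``output[0]`` alone. A self-contained sketch of the same pattern, under hypothetical names, not the suite's code.]

.. code:: python

    import logging
    import unittest

    logger = logging.getLogger('cleaning_sketch')

    def clean(value):
        """Toy cleaning step which logs at INFO when it intervenes."""
        if value < 0:
            logger.info('negative value %s, setting to zero', value)
            return 0
        return value

    class TestCleanLogging(unittest.TestCase):

        def test_logs_negative(self):
            with self.assertLogs(level='INFO') as logs:
                self.assertEqual(clean(-1), 0)
            # the relevant record need not be the first one captured
            self.assertTrue(
                any('negative value' in el for el in logs.output))

    if __name__ == '__main__':
        unittest.main()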
diff --git a/examples/data_cleaning.py b/examples/data_cleaning.py
index 08a503c77..23e67d000 100644
--- a/examples/data_cleaning.py
+++ b/examples/data_cleaning.py
@@ -38,7 +38,7 @@
 TEST_UNIVERSE = ['AAPL', 'GOOG', 'TSLA']
 
 # Some names with known issues:
-# TEST_UNIVERSE = ['SMT.L','NVR', 'HUBB', 'NWG.L']
+# TEST_UNIVERSE = ['SMT.L', 'NVR', 'HUBB', 'NWG.L']
 
 ALL_DROPPED_ROWS_PCT = pd.Series(dtype=float)
 ALL_MIN_LR = pd.Series(dtype=float)
@@ -104,8 +104,8 @@
 print('\nCvxportfolio dropped rows %:')
 print(ALL_DROPPED_ROWS_PCT.sort_values().tail())
 
-print('\nnCvxportfolio min logreturns:')
+print('\nCvxportfolio min logreturns:')
 print(ALL_MIN_LR.sort_values().head())
 
-print('\nnCvxportfolio max logreturns:')
+print('\nCvxportfolio max logreturns:')
 print(ALL_MAX_LR.sort_values().tail())
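[Editor's note: for context on the two print-header typos fixed above, ``examples/data_cleaning.py`` accumulates per-symbol statistics in ``pandas`` Series and prints the extremes at the end. A rough sketch of that reporting pattern; the loop body here uses synthetic returns instead of the script's downloaded data, so only the accumulate-and-print structure is faithful.]

.. code:: python

    import numpy as np
    import pandas as pd

    TEST_UNIVERSE = ['AAPL', 'GOOG', 'TSLA']
    ALL_MIN_LR = pd.Series(dtype=float)
    ALL_MAX_LR = pd.Series(dtype=float)

    for symbol in TEST_UNIVERSE:
        # stand-in for the cleaned returns of each downloaded symbol
        returns = pd.Series(
            np.random.default_rng(hash(symbol) % 2**32).normal(0., .02, 250))
        logreturns = np.log(1 + returns)
        ALL_MIN_LR[symbol] = logreturns.min()
        ALL_MAX_LR[symbol] = logreturns.max()

    print('\nCvxportfolio min logreturns:')
    print(ALL_MIN_LR.sort_values().head())

    print('\nCvxportfolio max logreturns:')
    print(ALL_MAX_LR.sort_values().tail())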
From b9d5d015babb597683124394cf9e625d86d5f670 Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Thu, 15 Feb 2024 17:57:14 +0400
Subject: [PATCH 37/38] testcase typo

---
 cvxportfolio/data/symbol_data.py | 8 ++++++++
 cvxportfolio/tests/test_data.py  | 6 +-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cvxportfolio/data/symbol_data.py b/cvxportfolio/data/symbol_data.py
index 759af7d66..119c1221a 100644
--- a/cvxportfolio/data/symbol_data.py
+++ b/cvxportfolio/data/symbol_data.py
@@ -664,6 +664,14 @@ def _preload(self, data):
 class YahooFinance(OLHCV):
     """Yahoo Finance symbol data.
 
+    .. versionadded:: 1.2.0
+
+        The data cleaning logic has been significantly improved, see the
+        ``data_cleaning.py`` example to view what's done on any given
+        name (or enable ``'INFO'`` logging messages). It is recommended to
+        delete the ``~/cvxportfolio_data`` folder with data files downloaded
+        by previous Cvxportfolio versions.
+
     :param symbol: The symbol that we downloaded.
     :type symbol: str
     :param storage_backend: The storage backend, implemented ones are
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py
index 3fbf87f93..a9ea2eb6a 100644
--- a/cvxportfolio/tests/test_data.py
+++ b/cvxportfolio/tests/test_data.py
@@ -435,11 +435,7 @@ def test_yahoo_finance_cleaning(self):
         self.assertLess(data['return'].max(), 0.75)
 
         # this stock had some extreme returns but they were legitimate
-        # only available on py>3.10
-        if hasattr(self, 'assertNoLogs'):
-            with self.assertNoLogs(level='WARNING'): # pragma: no cover
-                # pragma: no cover
-                data = YahooFinance('GME', base_location=self.datadir).data
+        data = YahooFinance('GME', base_location=self.datadir).data
 
         self.assertGreater(data['return'].min(), -0.75)
         self.assertGreater(data['return'].max(), 3)

From c9c11ff27833e550b2d5327a24d9c8ebaccdb081 Mon Sep 17 00:00:00 2001
From: Enzo Busseti
Date: Thu, 15 Feb 2024 18:19:57 +0400
Subject: [PATCH 38/38] data cleaning example docs

---
 docs/examples.rst               | 1 +
 docs/examples/data_cleaning.rst | 9 +++++++++
 2 files changed, 10 insertions(+)
 create mode 100644 docs/examples/data_cleaning.rst

diff --git a/docs/examples.rst b/docs/examples.rst
index d43bd17b5..102471231 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -12,6 +12,7 @@ We show some of them, along with their results, in the following pages:
    examples/dow30
    examples/timing
    examples/universes
+   examples/data_cleaning
    examples/etfs
    examples/user_provided_forecasters
    examples/risk_models
diff --git a/docs/examples/data_cleaning.rst b/docs/examples/data_cleaning.rst
new file mode 100644
index 000000000..1128baf63
--- /dev/null
+++ b/docs/examples/data_cleaning.rst
@@ -0,0 +1,9 @@
+Data cleaning
+===================
+
+This example script is
+`available in the repository `_.
+
+.. literalinclude:: ../../examples/data_cleaning.py
+    :language: python
+    :lines: 14-
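[Editor's note: a short usage sketch tying the series together. With the logging demotions applied, the cleaning steps report at ``'INFO'``, so, as the new ``versionadded`` docstring suggests, enabling that level is enough to watch what is done to any given name. This assumes ``YahooFinance`` is importable from the package top level; 'SMT.L' is one of the names with known issues mentioned in the examples.]

.. code:: python

    import logging

    import cvxportfolio as cvx

    logging.basicConfig(level=logging.INFO)

    # the cleaning steps (NaN'd anomalous prices, removed bad adjcloses,
    # filled NaNs, ...) are reported as INFO log lines during the download
    data = cvx.YahooFinance('SMT.L').data
    print(data.tail())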