Skip to content

Commit

Permalink
more, cleaning needed
Browse files Browse the repository at this point in the history
  • Loading branch information
enzbus committed Feb 12, 2024
1 parent a598854 commit af76cb9
Showing 1 changed file with 127 additions and 57 deletions.
184 changes: 127 additions & 57 deletions cvxportfolio/data/symbol_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,20 +322,20 @@ def _process(self, new_data, saved_data=None):
"""

self._nan_impossible(new_data, saved_data=saved_data)
self._specific_process(new_data, saved_data=saved_data)
# self._specific_process(new_data, saved_data=saved_data)
self._nan_unlikely(new_data, saved_data=saved_data)
self._fill(new_data, saved_data=saved_data)
self._post_process(new_data, saved_data=saved_data)
# self._post_process(new_data, saved_data=saved_data)

return new_data

def _specific_process(self, new_data, saved_data=None):
"""Specific process, do nothing."""
# return new_data
# def _specific_process(self, new_data, saved_data=None):
# """Specific process, do nothing."""
# # return new_data

def _post_process(self, new_data, saved_data=None):
"""Post process, do nothing."""
# return new_data
# def _post_process(self, new_data, saved_data=None):
# """Post process, do nothing."""
# # return new_data

def _nan_unlikely(self, new_data, saved_data=None):
"""Nan-out unlikely values."""
Expand Down Expand Up @@ -379,30 +379,86 @@ def _fill(self, new_data, saved_data=None):
# print(data)
# print(data.isnull().sum())

def _nan_indexes(self, data, bad_indexes, columns, message):
pass

def _nan_nonpositive_prices(self, data, prices_name):
"""Set non-positive prices (chosen column) to NaN, in-place."""
def _nan_values(self, data, condition, columns_to_nan, message):
"""Set to NaN in-place on indexing condition chosen columns."""

bad_indexes = data.index[data[prices_name] <= 0]
bad_indexes = data.index[condition]
if len(bad_indexes) > 0:
logger.warning(
'%s("%s") has non-positive %s prices on timestamps: %s,'
'%s("%s") has %s on timestamps: %s,'
+ ' setting to nan',
self.__class__.__name__, self.symbol, prices_name, bad_indexes)
data.loc[bad_indexes, prices_name] = np.nan
self.__class__.__name__, self.symbol, message, bad_indexes)
data.loc[bad_indexes, columns_to_nan] = np.nan

def _nan_nonpositive_prices(self, data, prices_name):
"""Set non-positive prices (chosen price name) to NaN, in-place."""
self._nan_values(
data=data, condition = data[prices_name] <= 0,
columns_to_nan = prices_name,
message = f'non-positive {prices_name} prices')

def _nan_negative_volumes(self, data):
"""Set negative volumes to NaN, in-place."""

bad_indexes = data.index[data["volume"] < 0]
if len(bad_indexes) > 0:
logger.warning(
'%s("%s") has negative volumes on timestamps: %s,'
+ ' setting to nan',
self.__class__.__name__, self.symbol, bad_indexes)
data.loc[bad_indexes, "volume"] = np.nan
self._nan_values(
data=data, condition = data["volume"] < 0,
columns_to_nan = "volume", message = 'negative volumes')

def _nan_open_lower_low(self, data):
"""Set open price to NaN if lower than low, in-place."""
self._nan_values(
data=data, condition = data['open'] < data['low'],
columns_to_nan = "open",
message = 'open price lower than low price')

def _nan_open_higher_high(self, data):
"""Set open price to NaN if higher than high, in-place."""
self._nan_values(
data=data, condition = data['open'] > data['high'],
columns_to_nan = "open",
message = 'open price higher than high price')

def _nan_incompatible_low_high(self, data):
"""Set low and high to NaN if low is higher, in-place."""
self._nan_values(
data=data, condition = data['low'] > data['high'],
columns_to_nan = ["low", "high"],
message = 'low price higher than high price')

def _nan_high_lower_close(self, data):
"""Set high price to NaN if lower than close, in-place."""
self._nan_values(
data=data, condition = data['high'] < data['close'],
columns_to_nan = "high",
message = 'high price lower than close price')

def _nan_low_higher_close(self, data):
"""Set low price to NaN if higher than close, in-place."""
self._nan_values(
data=data, condition = data['low'] > data['close'],
columns_to_nan = "low",
message = 'low price higher than close price')

# def _nan_nonpositive_prices(self, data, prices_name):
# """Set non-positive prices (chosen column) to NaN, in-place."""

# bad_indexes = data.index[data[prices_name] <= 0]
# if len(bad_indexes) > 0:
# logger.warning(
# '%s("%s") has non-positive %s prices on timestamps: %s,'
# + ' setting to nan',
# self.__class__.__name__, self.symbol, prices_name, bad_indexes)
# data.loc[bad_indexes, prices_name] = np.nan

# def _nan_negative_volumes(self, data):
# """Set negative volumes to NaN, in-place."""

# bad_indexes = data.index[data["volume"] < 0]
# if len(bad_indexes) > 0:
# logger.warning(
# '%s("%s") has negative volumes on timestamps: %s,'
# + ' setting to nan',
# self.__class__.__name__, self.symbol, bad_indexes)
# data.loc[bad_indexes, "volume"] = np.nan

def _set_infty_to_nan(self, data):
"""Set all +/- infty elements of data to NaN, in-place."""
Expand All @@ -428,27 +484,34 @@ def _nan_impossible(self, new_data, saved_data=None):
# all infinity values are nans
self._set_infty_to_nan(new_data)

# TODO: these can be made smarter (sometimes the open is clearly wrong)
# more
self._nan_open_lower_low(new_data)
self._nan_open_higher_high(new_data)
self._nan_incompatible_low_high(new_data)
self._nan_high_lower_close(new_data)
self._nan_low_higher_close(new_data)

# if low is not the lowest, set it to nan
bad_indexes = new_data.index[
new_data['low'] > new_data[['open', 'high', 'close']].min(1)]
if len(bad_indexes) > 0:
logger.warning(
'%s("%s") low prices are not the lowest on timestamps: %s,'
+ ' setting to nan',
self.__class__.__name__, self.symbol, bad_indexes)
new_data.loc[bad_indexes, "low"] = np.nan
# TODO: these can be made smarter (sometimes the open is clearly wrong)

# if high is not the highest, set it to nan
bad_indexes = new_data.index[
new_data['high'] < new_data[['open', 'high', 'close']].max(1)]
if len(bad_indexes) > 0:
logger.warning(
'%s("%s") high prices are not the highest on timestamps: %s,'
+ ' setting to nan',
self.__class__.__name__, self.symbol, bad_indexes)
new_data.loc[bad_indexes, "high"] = np.nan
# # if low is not the lowest, set it to nan
# bad_indexes = new_data.index[
# new_data['low'] > new_data[['open', 'high', 'close']].min(1)]
# if len(bad_indexes) > 0:
# logger.warning(
# '%s("%s") low prices are not the lowest on timestamps: %s,'
# + ' setting to nan',
# self.__class__.__name__, self.symbol, bad_indexes)
# new_data.loc[bad_indexes, "low"] = np.nan

# # if high is not the highest, set it to nan
# bad_indexes = new_data.index[
# new_data['high'] < new_data[['open', 'high', 'close']].max(1)]
# if len(bad_indexes) > 0:
# logger.warning(
# '%s("%s") high prices are not the highest on timestamps: %s,'
# + ' setting to nan',
# self.__class__.__name__, self.symbol, bad_indexes)
# new_data.loc[bad_indexes, "high"] = np.nan

# TODO: factor quality check and clean into total-return related and non-

Expand All @@ -463,8 +526,8 @@ def _preload(self, data):
# this is not used currently, but if we implement an interface to a
# pure OLHCV data source there is no need to store the open-to-open
# returns, they can be computed here
#if not 'return' in data.columns:
# data['return'] = data['open'].pct_change().shift(-1)
if not 'return' in data.columns:
data['return'] = data['open'].pct_change().shift(-1)

self._quality_check(data)
data["valuevolume"] = data["volume"] * data["open"]
Expand Down Expand Up @@ -552,6 +615,10 @@ def _compute_total_returns(self, data):
# non-market log returns (dividends, splits)
non_market_lr = log_adjustment_ratio.diff().shift(-1)

# dividend_return = (data['adjclose'] / data['close']).pct_change().shift(-1)

# import code; code.interact(local=locals())

# full open-to-open returns
open_to_open = np.log(data["open"]).diff().shift(-1)
data['return'] = np.exp(open_to_open + non_market_lr) - 1
Expand All @@ -570,9 +637,10 @@ def _compute_total_returns(self, data):
# print(data)
# print(data.isnull().sum())

def _post_process(self, new_data, saved_data=None):
def _process(self, new_data, saved_data=None):
"""Temporary."""

super()._process(new_data, saved_data=saved_data)
self._compute_total_returns(new_data)

# close2close_total = np.log(1 + new_data['total_return'])
Expand Down Expand Up @@ -600,6 +668,8 @@ def _post_process(self, new_data, saved_data=None):
new_data.loc[new_data.index[-1],
["high", "low", "close", "return", "volume"]] = np.nan

return new_data

# def _process(self, data):
# """Clean Yahoo Finance open-low-high-close-volume-adjclose data."""

Expand Down Expand Up @@ -668,18 +738,18 @@ def _nan_impossible(self, new_data, saved_data=None):
# also do it on adjclose
self._nan_nonpositive_prices(new_data, "adjclose")

def _specific_process(self, new_data, saved_data=None):
"""Specific process, compute total returns."""
# def _specific_process(self, new_data, saved_data=None):
# """Specific process, compute total returns."""

# Close-to-close total return, so we can delegate to parent class.
# Note that this uses different time alignment than Cvxportfolio,
# Here today's return uses yesterday close and today close, while
# today's returns in Cvxportfolio use today open and tomorrow open.
# However this is the format more common among data vendors.
# new_data['total_return'] = new_data['adjclose'].ffill().pct_change()
# # Close-to-close total return, so we can delegate to parent class.
# # Note that this uses different time alignment than Cvxportfolio,
# # Here today's return uses yesterday close and today close, while
# # today's returns in Cvxportfolio use today open and tomorrow open.
# # However this is the format more common among data vendors.
# # new_data['total_return'] = new_data['adjclose'].ffill().pct_change()

# We don't need this any more.
# del new_data['adjclose']
# # We don't need this any more.
# # del new_data['adjclose']


class YahooFinance(OLHCVAC):
Expand Down

0 comments on commit af76cb9

Please sign in to comment.