hudson-and-thames · franz101 · Feb 17, 2021 · Feb 17, 2021 · Feb 17, 2021 · Feb 17, 2021
diff --git a/.gitignore b/.gitignore
@@ -5,7 +5,8 @@ __pycache__/
 
 # C extensions
 *.so
-
+.idea
+.empty
 # Distribution / packaging
 .Python
 build/
@@ -127,3 +128,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+.vscode/
diff --git a/FranzKrekeler/README.md b/FranzKrekeler/README.md
@@ -0,0 +1,13 @@
+vinecopulaslab is a basic implementation of the partner selection step of the paper [Statistical Arbitrage with Vine Copulas (Stübinger, Mangold, Krauss, 2016)](https://www.econstor.eu/bitstream/10419/147450/1/870932616.pdf).
+
+Its purpose is to find correlated partner stocks for statistical arbitrage.
+
+Installation is currently only possible via `git clone`.
+
+Afterwards you can open the tutorial notebook (`submission_and_tutorial.ipynb`).
+
+The main requirements are numpy, pandas, scipy and statsmodels.
+This work was done as part of the application for the March apprenticeship at Hudson & Thames.
+
+
+![uml diagram](UML_data.png)
diff --git a/FranzKrekeler/UML_data.png b/FranzKrekeler/UML_data.png
diff --git a/FranzKrekeler/submission_and_tutorial.ipynb b/FranzKrekeler/submission_and_tutorial.ipynb
diff --git a/FranzKrekeler/tests/unit/__init__.py b/FranzKrekeler/tests/unit/__init__.py
diff --git a/FranzKrekeler/tests/unit/datafetchertest.py b/FranzKrekeler/tests/unit/datafetchertest.py
@@ -0,0 +1,6 @@
+from unittest import TestCase
+
+
+class TestDownload(TestCase):  # placeholder suite: nothing is asserted yet
+    def test_download_sample(self):
+        pass  # TODO: replace with a real check of the download code
diff --git a/FranzKrekeler/tests/unit/geometrictest.py b/FranzKrekeler/tests/unit/geometrictest.py
@@ -0,0 +1,18 @@
+import sys 
+sys.path.append("./../../")
+import unittest
+from unittest import TestCase
+import numpy as np
+from vinecopulaslab.partnerselection.geometric import GeometricSelection
+
+class TestDownload(TestCase):  # NOTE(review): the name looks copied from the download tests; this suite covers GeometricSelection
+    def test_distance_to_diagonal(self):
+        # Smoke test: the single point (0, 0, 0) is expected to be at distance 0
+        # from `line`. TODO: add tests with points that are away from the line.
+        line = np.array([1, 1, 1])
+        pts = np.array([[0, 0, 0]])
+        self.assertEqual(GeometricSelection.distance_to_line(line, pts), 0, "Should be 0") 
+
+
+if __name__ == '__main__':
+    unittest.main()  # lets the file be run directly as a script
diff --git a/FranzKrekeler/tmp/sp500_2017-01-01_2018-01-01.p b/FranzKrekeler/tmp/sp500_2017-01-01_2018-01-01.p
diff --git a/FranzKrekeler/vinecopulaslab/__init__.py b/FranzKrekeler/vinecopulaslab/__init__.py
@@ -0,0 +1,3 @@
+from vinecopulaslab.partnerselection import TraditionalSelection, ExtendedSelection, GeometricSelection, \
+    ExtremalSelection
+from vinecopulaslab.universe.universe import UniverseDownloader
diff --git a/FranzKrekeler/vinecopulaslab/partnerselection/__init__.py b/FranzKrekeler/vinecopulaslab/partnerselection/__init__.py
@@ -0,0 +1,8 @@
+"""Classes from the partner selection module."""
+# Import from the submodules: the package cannot import these names from itself
+# while it is still initialising. NOTE(review): confirm the `traditional` module name.
+from vinecopulaslab.partnerselection.traditional import TraditionalSelection
+from vinecopulaslab.partnerselection.extended import ExtendedSelection
+from vinecopulaslab.partnerselection.geometric import GeometricSelection
+from vinecopulaslab.partnerselection.extremal import ExtremalSelection
+# TODO: Add PartnerSelection wrapper class
diff --git a/FranzKrekeler/vinecopulaslab/partnerselection/base.py b/FranzKrekeler/vinecopulaslab/partnerselection/base.py
@@ -0,0 +1,82 @@
+# Author: Franz Krekeler 2021
+from typing import List
+import itertools
+import numpy as np
+import pandas as pd
+
+
+class SelectionBase(object):
+    """Base class for the partner selection approaches. Subclasses supply
+    `_partner_selection_approach` and assign `corr_returns_top_n` before `_find_partners` runs."""
+
+    def __init__(self):
+        """Initialise without a top-n correlation table (`_find_partners`
+        asserts that one has been assigned)."""
+        self.corr_returns_top_n = None
+
+    @staticmethod
+    def calculate_returns(prices: pd.DataFrame) -> pd.DataFrame:
+        """Calculate percentage based returns between consecutive rows
+        :param: prices (pd.DataFrame): The columns must be the closing prices of the stocks
+        :return: returns (pd.DataFrame): rows in which every column is NaN are dropped
+        """
+        return prices.pct_change(fill_method='ffill').dropna(how='all')
+
+    @staticmethod
+    def _ranked_correlation(returns: pd.DataFrame) -> pd.DataFrame:
+        """Given a df of returns calculate its Spearman correlation matrix
+        :param: returns (pd.DataFrame): The input needs to be in percentage based returns
+        :return: returns_correlation (pd.DataFrame)
+        """
+        return returns.corr("spearman")
+
+    @staticmethod
+    def _rankings_pct(returns: pd.DataFrame) -> pd.DataFrame:
+        """Rank the values of every column and express the ranks as percentiles
+        :param: returns (pd.DataFrame)
+        :return: returns_ranked_percentile  (pd.DataFrame)
+        """
+        return returns.rank(pct=True)
+
+    @staticmethod
+    def _top_n_correlations(corr_returns: pd.DataFrame, top_n: int = 50) -> pd.DataFrame:
+        """For every stock keep its top n correlated partners (default 50)
+        :param: corr_returns (pd.DataFrame): correlation matrix
+        :param: top_n (int): number of partners kept per target stock
+        :return: corr_returns_top_n (pd.DataFrame): long table with the columns TARGET_STOCK, STOCK_PAIR, CORRELATION
+        """
+        # Entries that are not below 1 (the self correlations) become NaN here and are dropped below
+        corr_returns_unstacked = corr_returns[corr_returns < 1].unstack().sort_values(ascending=False)
+        corr_returns_unstacked = corr_returns_unstacked.reset_index().dropna()
+        corr_returns_unstacked.columns = "TARGET_STOCK", "STOCK_PAIR", "CORRELATION"
+        corr_returns_top_n = corr_returns_unstacked.groupby("TARGET_STOCK").head(top_n)
+        return corr_returns_top_n.sort_values(['TARGET_STOCK', 'CORRELATION'])
+
+    @staticmethod
+    def _prepare_combinations_of_partners(stock_selection: List[str]) -> np.ndarray:
+        """Build the index quadruples made of the target stock and three of its potential partners
+        :param: stock_selection (List[str]): the target stock has to be the first element of the list
+        :return: (np.ndarray) positions into stock_selection, one row per quadruple, column 0 is always 0
+        (the target stock). Shape (19600, 4) for 50 partners, (0, 4) for fewer than 3 partners
+        """
+        # Only the number of stocks matters: the quadruples hold positions, not stock names
+        num_of_stocks = len(stock_selection)
+        # Positions 1..num_of_stocks-1 are the partners, position 0 (the target) is prepended to every row
+        partner_stocks_idx = np.arange(1, num_of_stocks)
+        partner_stocks_idx_combs = itertools.combinations(partner_stocks_idx, 3)
+        return np.array([(0,) + comb for comb in partner_stocks_idx_combs], dtype=int).reshape(-1, 4)
+
+    def _find_partners(self, target_stocks: List[str] = None):
+        """
+        Helper function that applies the approach of the subclass to each target stock.
+        :param: target_stocks (List[str]): optional subset of target stocks to analyze (default: all stocks)
+        :return: the result of `_partner_selection_approach` for every TARGET_STOCK group
+        """
+        assert self.corr_returns_top_n is not None, "corr_returns_top_n has not been computed yet"
+        corr_returns_top_n = self.corr_returns_top_n.copy()
+        if target_stocks is not None and len(target_stocks):
+            sublist = corr_returns_top_n.TARGET_STOCK.isin(target_stocks)
+            corr_returns_top_n = corr_returns_top_n[sublist]
+        target_stocks_partners_quadruples = corr_returns_top_n.groupby('TARGET_STOCK').apply(
+            self._partner_selection_approach)
+        return target_stocks_partners_quadruples
diff --git a/FranzKrekeler/vinecopulaslab/partnerselection/extended.py b/FranzKrekeler/vinecopulaslab/partnerselection/extended.py
@@ -0,0 +1,81 @@
+from typing import List
+import numpy as np
+import pandas as pd
+import scipy.special
+from statsmodels.distributions.empirical_distribution import ECDF
+from vinecopulaslab.partnerselection.base import SelectionBase
+
+
+class ExtendedSelection(SelectionBase):
+    """
+    This class implements the extended approach for partner selection, mentioned in section 3.1
+    of the paper "Statistical arbitrage with vine copulas"
+    https://www.econstor.eu/bitstream/10419/147450/1/870932616.pdf
+    It scores quadruples with a multivariate extension of the Spearman correlation
+    """
+
+    def __init__(self):
+        """Initialization
+        """
+        super().__init__()
+        self.corr_returns_top_n = None
+
+    def _partner_selection_approach(self, group) -> List[str]:
+        """
+        Find the partner stocks for one group of df.groupby("TARGET_STOCK").apply(...)
+        :param: group (pd.group) The group of n most correlated stocks, group.name is the target stock
+        :return: (List[str]) the quadruple (target stock first) with the highest score
+        """
+        target_stock = group.name
+        partner_stocks = group.STOCK_PAIR.tolist()
+        stock_selection = [target_stock] + partner_stocks
+        # We create a subset of our ecdf dataframe to increase lookup speed.
+        data_subset = self.ecdf_df[stock_selection].copy()
+        # Positional index quadruples into stock_selection, one row per quadruple
+        quadruples_combinations = self._prepare_combinations_of_partners(stock_selection)
+        # Indexing the column axis with them yields shape (n, number of quadruples, d)
+        quadruples_combinations_data = data_subset.values[:, quadruples_combinations]
+        # n is the number of return observations and d the number of stocks per quadruple;
+        # the middle axis (19600 quadruples for 50 partners) is not needed by name.
+        # All three estimators below are evaluated for every quadruple at once
+        n, _, d = quadruples_combinations_data.shape
+        # We split up the given formula into its three estimators
+        # For reference:
+        # https://github.com/hudson-and-thames/march_applications_21/blob/main/Guide%20for%20the%20Extended%20Approach.pdf
+        hd = (d + 1) / (2 ** d - d - 1)
+        # est1 sums the product of (1 - u) over the d stocks, est2 the product of u
+        est1 = hd * (-1 + (2 ** d / n) * np.prod(1 - quadruples_combinations_data, axis=-1).sum(axis=0))
+        est2 = hd * (-1 + (2 ** d / n) * np.prod(quadruples_combinations_data, axis=-1).sum(axis=0))
+        # All pairs (k, l) with k < l; est3 averages the pairwise terms over these comb(d, 2) pairs
+        idx = np.array([(k, l) for l in range(0, d) for k in range(0, l)])
+        est3 = -3 + (12 / (n * scipy.special.comb(d, 2, exact=True))) * (
+                (1 - quadruples_combinations_data[:, :, idx[:, 0]]) * (
+                1 - quadruples_combinations_data[:, :, idx[:, 1]])).sum(axis=(0, 2))
+        quadruples_scores = (est1 + est2 + est3) / 3
+        # quadruples_scores holds one score per quadruple
+        max_index = np.argmax(quadruples_scores)
+        return data_subset.columns[list(quadruples_combinations[max_index])].tolist()
+
+    def _preprocess(self, close: pd.DataFrame) -> None:
+        """
+        Helper function for preparing the data. Here we already prepare the ecdf
+        :param close: (pd.DataFrame) the closing prices; a sorted copy is used, the caller's frame is not modified
+        """
+        close = close.sort_index(axis=1)
+        self.close_returns = self.calculate_returns(close)
+        self.ranked_correlation = self._ranked_correlation(self.close_returns)
+        self.corr_returns_top_n = self._top_n_correlations(self.ranked_correlation)
+        self.ecdf_df = self.close_returns.apply(lambda x: ECDF(x)(x), axis=0)
+
+    def find_partners(self, close: pd.DataFrame, target_stocks: List[str] = None):
+        """
+        Find partners based on an extension of the Spearman correlation. Mentioned in section 3.1
+        of the paper "Statistical arbitrage with vine copulas"
+        https://www.econstor.eu/bitstream/10419/147450/1/870932616.pdf
+        :param: close (pd.DataFrame) The close prices of the SP500
+        :param: target_stocks (List[str]) A list of target stocks to analyze (default: all stocks)
+        :return: the highest scoring quadruple for every analyzed target stock
+        """
+        self._preprocess(close)
+        # None means "all stocks"; the base helper is handed an empty list in that case
+        return self._find_partners([] if target_stocks is None else target_stocks)
diff --git a/FranzKrekeler/vinecopulaslab/partnerselection/extremal.py b/FranzKrekeler/vinecopulaslab/partnerselection/extremal.py
@@ -0,0 +1,99 @@
+from typing import List
+import itertools
+import logging
+import pandas as pd
+import numpy as np
+import scipy.special
+import scipy.linalg
+from vinecopulaslab.partnerselection.base import SelectionBase
+
+
+class ExtremalSelection(SelectionBase):
+    """
+    Class for partner selection based on "A multivariate linear rank test of independence
+    based on a multiparametric copula with cubic sections"
+    Mangold 2015
+    """
+
+    def __init__(self):
+        """Initialization
+        """
+        super().__init__()
+        self.corr_returns_top_n = None
+
+    def _partner_selection_approach(self, group) -> List[str]:
+        """
+        Approach function: partner selection based on "A multivariate linear rank test of independence based on
+        a multiparametric copula with cubic sections"
+        for df.groupby("TARGET_STOCK").apply(...)
+        This has only been implemented for performance testing.
+        References:
+        https://www.researchgate.net/publication/309408947_A_multivariate_linear_rank_test_of_independence_based_on_a_multiparametric_copula_with_cubic_sections
+        https://pypi.org/project/Independence-test/
+        :param group: (group) The group of 50 most correlated stocks, group.name is the target stock
+        :return: (List[str]) the quadruple (target stock first) with the highest test statistic
+        """
+        logging.warning("Extremal approach is still under construction")
+        target_stock = group.name
+        partner_stocks = group.STOCK_PAIR.tolist()
+        stock_selection = [target_stock] + partner_stocks
+        # We create a subset of our ranked returns dataframe to increase lookup speed.
+        data_subset = self.ranked_returns[stock_selection].copy()
+        # Positional index quadruples into stock_selection, one row per quadruple
+        quadruples_combinations = self._prepare_combinations_of_partners(stock_selection)
+        # Indexing the column axis with them yields shape (n, number of quadruples, d)
+        quadruples_combinations_data = data_subset.values[:, quadruples_combinations]
+        # n is the number of return observations and d the number of stocks per quadruple;
+        # the middle axis (the quadruples) is not needed by name.
+        # The ranks are scaled by (n + 1) further below
+        n, _, d = quadruples_combinations_data.shape
+        # Here the math from the Mangold 2015 paper begins: all 2**d sign vectors, one per row
+        permut_mat = np.array(list(itertools.product([-1, 1], repeat=d)), dtype=np.int8)
+        sub_mat = permut_mat @ permut_mat.T
+        F = (d + sub_mat) / 2  # number of positions in which two sign vectors agree
+        D = (d - sub_mat) / 2  # number of positions in which they differ
+        cov_mat = ((2 / 15) ** F) * ((1 / 30) ** D)
+        cov_mat_inv = scipy.linalg.inv(cov_mat)
+        rank_df_norm = quadruples_combinations_data / (n + 1)
+        pos_rank_df = (rank_df_norm - 1) * (3 * rank_df_norm - 1)
+        neg_rank_df = rank_df_norm * (2 - 3 * rank_df_norm)
+        # For every sign vector take pos_rank_df where its sign is +1 and neg_rank_df where it is -1
+        # Proposition 3.3. from the paper; the result has shape (quadruples, 2**d, n, d)
+        pos_neg_combined = np.add(np.einsum('ijk,lmk->jmik', pos_rank_df, np.expand_dims(permut_mat > 0, axis=0)),
+                                  np.einsum('ijk,lmk->jmik', neg_rank_df, np.expand_dims(permut_mat < 0, axis=0)))
+        TNP = pos_neg_combined.prod(axis=-1).mean(-1)
+        # Quadratic form TNP_i @ cov_mat_inv @ TNP_i for every quadruple i. Only these diagonal terms are
+        # needed, so the (quadruples x quadruples) matrix of all cross products is never built
+        T = (TNP @ cov_mat_inv) * TNP
+        T_results = T.sum(axis=1) * n
+        max_index = np.argmax(T_results)
+        partners = data_subset.columns[list(quadruples_combinations[max_index])].tolist()
+        # NOTE: this is a proof of concept, please take the result with a grain of salt.
+        return partners
+
+    def _preprocess(self, close: pd.DataFrame) -> None:
+        """
+        Helper function for preparing the data.
+        :param close: (pd.DataFrame) the closing prices; a sorted copy is used, the caller's frame is not modified
+        """
+        close = close.sort_index(axis=1)
+        self.close_returns = self.calculate_returns(close)
+        self.ranked_correlation = self._ranked_correlation(self.close_returns)
+        self.corr_returns_top_n = self._top_n_correlations(self.ranked_correlation)
+        self.ranked_returns = self.close_returns.rank()
+
+    def find_partners(self, close: pd.DataFrame, target_stocks: List[str] = None):
+        """
+        Find partners based on the extremal approach mentioned in section 3.1
+        of the paper "Statistical arbitrage with vine copulas"
+        https://www.econstor.eu/bitstream/10419/147450/1/870932616.pdf
+        Based on the paper "A multivariate linear rank test of independence
+        based on a multiparametric copula with cubic sections" Mangold 2015
+        :param: close (pd.DataFrame) The close prices of the SP500
+        :param: target_stocks (List[str]) A list of target stocks to analyze (default: all stocks)
+        :return: the highest scoring quadruple for every analyzed target stock
+        """
+        self._preprocess(close)
+        # None means "all stocks"; the base helper is handed an empty list in that case.
+        # find_partners could be moved to the base class but then it wouldn't have the right docstring...
+        return self._find_partners([] if target_stocks is None else target_stocks)