-
Notifications
You must be signed in to change notification settings - Fork 8
/
idiosyncratic_volatility.py
157 lines (137 loc) · 6.35 KB
/
idiosyncratic_volatility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# ----------------------------------------------------------------
# Idiosyncratic volatility
#
# Idiosyncratic volatility is defined as the standard deviation
# of residuals from asset pricing models. There are two models
# available: CAPM and Fama-French 3-factor. Regression is
# estimated within each month, and at least 15 daily observations
# are required. If the entire CRSP daily data set is used, there
# will be more than 3.5 million regressions from 1926 to 2020.
# To speed up the estimation, multiprocessing is applied.
#
# Ang, Hodrick, Xing and Zhang (2006)
# "To examine trading strategies based on idiosyncratic volatility,
# we describe portfolio formation strategies based on an estimation
# period of L months, a waiting period of M months, and a holding
# period of N months. We describe an L/M/N strategy as follows.
# At month t, we compute idiosyncratic volatilities from the
# regression (8) on daily data over an L-month period from month
# t−L−M to month t−M."
# "We use a 1/0/1 strategy in both cases."
# ----------------------------------------------------------------
import wrds
import configparser as cp
import pandas as pd
import numpy as np
from joblib import Parallel, delayed, parallel_backend
import multiprocessing as mp
import itertools
import time
import os
class ap_ivol:
    """Estimate monthly idiosyncratic volatility (IVOL).

    IVOL is the standard deviation of daily residuals from an asset
    pricing model (CAPM or Fama-French 3-factor), estimated one
    regression per (permno, month). Data is pulled from WRDS once at
    construction; estimation is parallelized over permno chunks.
    """

    def __init__(self):
        """Connect to WRDS and cache CRSP daily returns and FF daily factors.

        Reads the WRDS username from ~/.pass/credentials.cfg (section
        [wrds]). Stores `self.dsf` (permno, date, ret) and `self.ff3`
        (date, mktrf, smb, hml, rf).
        """
        start_time = time.time()
        pass_dir = '~/.pass'
        cfg = cp.ConfigParser()
        cfg.read(os.path.join(os.path.expanduser(pass_dir), 'credentials.cfg'))
        conn = wrds.Connection(wrds_username=cfg['wrds']['username'])
        try:
            # CRSP daily returns, restricted to ordinary common shares
            # (shrcd 10-11) on major exchanges (exchcd -2..3; negative
            # codes are halted/suspended variants of those exchanges).
            dsf = conn.raw_sql("""
                select a.permno, a.date, a.ret
                from crsp.dsf a left join crsp.msenames b
                on a.permno=b.permno and a.date>=b.namedt and a.date<=b.nameendt
                where b.exchcd between -2 and 3 and b.shrcd between 10 and 11
                """, date_cols=['date'])
            # Daily Fama-French factors; available from 1926-07-01.
            ff3 = conn.raw_sql("""
                select date, mktrf, smb, hml, rf
                from ff.factors_daily
                order by date
                """, date_cols=['date'])
        finally:
            # Release the WRDS connection even if a query fails
            # (the original code left it open).
            conn.close()
        # The msenames join can duplicate (permno, date) rows.
        dsf = dsf.drop_duplicates(['permno', 'date'], keep='last')
        # Returns at or below -100% are invalid; treat as missing.
        dsf.loc[dsf['ret'] <= -1, 'ret'] = np.nan
        dsf['permno'] = dsf['permno'].astype(int)
        self.dsf = dsf.copy()
        self.ff3 = ff3.copy()
        end_time = time.time()
        print('\n--------- Extract data from WRDS ---------')
        print(f'Time used: {(end_time-start_time)/60: 3.1f} mins\n')

    def ols_b(self, data, x_var, y_var):
        """Return OLS coefficients of `y_var` on `x_var` plus an intercept.

        The constant column is appended AFTER the regressors, so the
        returned array is ordered [b(x_var[0]), ..., b(x_var[-1]), a].

        Parameters
        ----------
        data : pd.DataFrame with columns x_var + [y_var]
        x_var : list of regressor column names
        y_var : dependent-variable column name
        """
        x = data[x_var].copy()
        x.loc[:, 'a'] = 1  # intercept column, appended last
        x = x.to_numpy()
        y = data[y_var].to_numpy()
        # lstsq is numerically stabler than explicitly inverting X'X
        # and gives identical coefficients for full-rank X.
        b, *_ = np.linalg.lstsq(x, y, rcond=None)
        return b

    def groupby_ols(self, data, x_var, i, l_res):
        """Run per-(permno, yyyymm) OLS for the permnos in chunk `i`.

        Results are appended to the shared manager list `l_res` as a
        tuple of (permno, yyyymm, coefficient-array) tuples.
        """
        df = data[data['permno'].isin(i)].copy()
        df = (df.groupby(['permno', 'yyyymm'])
              .apply(self.ols_b, x_var=x_var, y_var='retx').reset_index())
        # Tuples of plain rows are cheap to ship back through the
        # manager proxy; dataframes would be much slower.
        l_res.append(tuple(df.itertuples(index=False, name=None)))

    def ivol_est(self, model, outvar, n_chunks=7):
        """Estimate monthly IVOL as the std of daily model residuals.

        Parameters
        ----------
        model : 'capm' or 'ffo3' is invalid -- must be 'capm' or 'ff3'
        outvar : name of the output IVOL column
        n_chunks : number of permno chunks handed to parallel workers
            (default 7, matching the original hard-coded split)

        Returns
        -------
        pd.DataFrame with columns ['permno', 'yyyymm', outvar]

        Raises
        ------
        ValueError if `model` is not recognized.
        """
        start_time = time.time()
        if model == 'capm':
            factors = ['mktrf']
        elif model == 'ff3':
            factors = ['mktrf', 'smb', 'hml']
        else:
            raise ValueError(f"model must be 'capm' or 'ff3', got {model!r}")
        df = self.dsf.merge(self.ff3, how='left', on='date')
        df['retx'] = df['ret'] - df['rf']  # excess return
        df['yyyymm'] = df['date'].dt.year*100 + df['date'].dt.month
        # Require at least 15 days in a month, and a strictly positive
        # std so the daily returns within a month are not all identical
        # (a zero-variance month makes the regression degenerate).
        df = df.dropna()
        df['n'] = df.groupby(['permno', 'yyyymm'])['retx'].transform('count')
        df['std'] = df.groupby(['permno', 'yyyymm'])['retx'].transform('std')
        df = df.query('n>=15 & std>0').copy()
        df = df.drop(columns=['ret', 'rf', 'n', 'std'])
        df = df.sort_values(['permno', 'date'], ignore_index=True)
        # Python is slow when running a large number of regressions by
        # group, so the permnos are split into chunks run in parallel.
        # TODO: pure numpy should be faster than pandas
        permno_list = df['permno'].unique()
        permno_split = np.array_split(permno_list, n_chunks)
        manager = mp.Manager()
        l_res = manager.list()
        with parallel_backend('loky', n_jobs=mp.cpu_count()-1):
            Parallel()(delayed(self.groupby_ols)(df, factors, i, l_res)
                       for i in permno_split)
        # Flatten the chunk results and build one dataframe in a single
        # pass -- much faster than appending dataframes.
        l_res = list(itertools.chain.from_iterable(list(l_res)))
        b = pd.DataFrame(l_res)
        b.columns = ['permno', 'yyyymm', 'est']
        res_df = df.merge(b, how='inner', on=['permno', 'yyyymm'])
        res_df = res_df.sort_values(['permno', 'yyyymm'], ignore_index=True)
        # ols_b appends the intercept last, hence the index maps below:
        # capm: est = [b_mkt, a]; ff3: est = [b_mkt, b_smb, b_hml, a].
        if model == 'capm':
            for i, j in zip(['a', 'b1'], [1, 0]):
                res_df[i] = res_df['est'].apply(lambda x: x[j])
            res_df['p'] = res_df['a'] + res_df['b1']*res_df['mktrf']
        elif model == 'ff3':
            for i, j in zip(['a', 'b1', 'b2', 'b3'], [3, 0, 1, 2]):
                res_df[i] = res_df['est'].apply(lambda x: x[j])
            res_df['p'] = (res_df['a'] + res_df['b1']*res_df['mktrf']
                + res_df['b2']*res_df['smb'] + res_df['b3']*res_df['hml'])
        res_df['resid'] = res_df['retx'] - res_df['p']
        # IVOL = std of daily residuals within each (permno, month).
        res_df = (res_df.groupby(['permno', 'yyyymm'])['resid']
                  .std().to_frame(outvar).reset_index())
        res_df = res_df.sort_values(['permno', 'yyyymm'], ignore_index=True)
        end_time = time.time()
        print(f'--------- IVOL estimation: {model} ---------')
        print(f'Time used: {(end_time-start_time)/60: 3.1f} mins')
        print(f'number of stocks: {len(permno_list)}')
        print(f'number of regressions: {len(res_df)}\n')
        return res_df
if __name__ == '__main__':
    # Build CAPM and FF3 idiosyncratic volatility by (permno, yyyymm)
    # and write a single tab-separated output file.
    db = ap_ivol()
    ivol_capm = db.ivol_est('capm', 'ivol_capm')
    ivol_ff3 = db.ivol_est('ff3', 'ivol_ff3')
    # Left-join the FF3 estimates onto the CAPM sample.
    ivol = ivol_capm.merge(ivol_ff3, how='left', on=['permno', 'yyyymm'])
    ivol = ivol.sort_values(['permno', 'yyyymm'], ignore_index=True)
    # NOTE(review): hard-coded personal drive path -- consider moving
    # to a config file or environment variable.
    data_dir = '/Volumes/Seagate/asset_pricing_data'
    # Create the output directory if missing so the run does not crash
    # at the very end after hours of estimation.
    os.makedirs(data_dir, exist_ok=True)
    ivol.to_csv(os.path.join(data_dir, 'ivol.txt'), sep='\t', index=False)
    print('Done: data is generated')