-
Notifications
You must be signed in to change notification settings - Fork 0
/
HistBinDistance.py
141 lines (108 loc) · 4.34 KB
/
HistBinDistance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import numpy
import csv
import math
### Module-level state shared by the functions and the script further down.
### these are the 2 arrays that will supply the values to the hist function
### NOTE(review): neither array is referenced anywhere in the visible part of
### this file -- confirm they are still needed.
store_array_A = []
store_array_B = []
### placeholders; the script rebinds these to the numpy.histogram results of
### the two series currently being compared
hist_A = []
hist_B = []
### Sparse, column-by-column view of similarity_matrix, filled by calcIRS_JCS_SR():
###   _IRS - row index of every kept entry (entries >= 0.1)
###   _SR  - the VALUE of that same entry
###   _JCS - offsets into _IRS/_SR at which the entries of each column start
_IRS = []
_JCS = []
_SR = []
### lower / upper edge of the value range handed to numpy.histogram
bin_min = -0.15
bin_max = 0.15
### rng+1 is both the number of histogram bins and the side of similarity_matrix
rng = 30
### exponent applied to the normalising term inside _calcQCDist
normFactor = 0.9
### (rng+1) x (rng+1) grid of zeros; calcIRS_JCS_SR() fills in every cell
similarity_matrix = [ [ 0 for i in range(rng+1) ] for j in range(rng+1) ]
### bad hard coding below
### NOTE(review): the original note does not say which value is hard coded;
### possibly the column count and file path used when store_dailyChg is
### filled -- confirm.
### one list of floats per CSV column, appended by the script
store_dailyChg = []
### list of rows of pairwise distances between the series in store_dailyChg
distance_matrix = []
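### fills similarity_matrix, then records column by column the row index (_IRS)
### and value (_SR) of every entry that reaches the minimum threshold of 0.1;
### _JCS holds the offset at which each column starts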
def calcIRS_JCS_SR(min_similarity=0.1):
    """Fill ``similarity_matrix`` and rebuild its sparse column index.

    The similarity between bins ``row`` and ``col`` halves with every bin of
    separation, i.e. it is ``0.5 ** abs(row - col)`` (1 on the diagonal).

    The module-level lists are then rebuilt, column by column:

    * ``_IRS`` -- row index of every entry that is >= ``min_similarity``
    * ``_SR``  -- value of that same entry
    * ``_JCS`` -- ``_JCS[c]`` is the offset in ``_IRS``/``_SR`` where column
      ``c`` starts and ``_JCS[c + 1]`` is where it ends; the final entry is
      therefore ``len(_SR)``.

    Differences from the previous version:

    * the three lists are emptied (in place) first, so calling the function
      again no longer appends a second copy of the index;
    * every column gets exactly one start pointer, so ``_JCS`` has
      ``len(similarity_matrix) + 1`` entries (the duplicated last pointer is
      gone) and a column without any kept entry still keeps its slot.

    Args:
        min_similarity: smallest similarity that is stored in the sparse
            index. Defaults to the original hard-coded 0.1.

    Returns:
        Always 1, as before.
    """
    size = rng + 1
    ### one pass covers both triangles: the matrix is symmetric
    for row in range(size):
        for col in range(size):
            similarity_matrix[row][col] = 0.5 ** abs(row - col)

    ### empty the lists in place so any other reference to them stays valid
    del _IRS[:]
    del _JCS[:]
    del _SR[:]

    side = len(similarity_matrix)
    for col in range(side):
        ### offset at which this column's entries begin
        _JCS.append(len(_IRS))
        for row in range(side):
            value = similarity_matrix[row][col]
            if value >= min_similarity:
                _IRS.append(row)
                _SR.append(value)
    ### final entry in _JCS is the number of stored cells, i.e. len(_SR)
    _JCS.append(len(_SR))
    return 1
### distance calc function
def _calcQCDist(hist_A, hist_B, similarity_matrix, normFactor, sizeOfHist):
Distance_matrix = []
sparseInd= 0
for i in range(sizeOfHist):
zi= 0.0;
cb= _JCS[i]
ce= _JCS[i+1]
for c in range(cb, ce):
zi+= (hist_A[_IRS[c]] + hist_B[_IRS[c]])*_SR[sparseInd]
++sparseInd
if (zi!=0.0):
Distance_matrix.append( (hist_A[i]-hist_B[i])/(pow(zi,normFactor)) )
dist= 0.0
sparseInd= 0
for i in range(len(Distance_matrix)):
cb= _JCS[i]
ce= _JCS[i+1]
for c in range(cb, ce):
if( _IRS[c]>=0 and _IRS[c] < len( Distance_matrix )):
dist+= Distance_matrix[i]*Distance_matrix[_IRS[c]]*_SR[sparseInd]
++sparseInd;
if dist<0:
return 0.0
else:
return math.sqrt(dist);
return 1
### read daily change values into array: one list of floats per CSV column
### the csv module needs a binary file on Python 2 and a text file on Python 3
csv_mode = 'rb' if str is bytes else 'r'
### raw string: exactly the same characters as before, but the backslashes no
### longer depend on '\P', '\D' and '\h' happening not to be escape sequences
with open(r'D:\Portfolio-for-masses\DATA\hist_test.csv', csv_mode) as csvfile:
    ### parse the file once and reuse the rows for every column
    csv_rows = list(csv.reader(csvfile, delimiter=','))
for colCtr in range(0, 7):
    ### blank cells (and rows too short to have this column) are skipped, so
    ### 2 distributions with unequal number of observations come through
    locArr = [float(row[colCtr]) for row in csv_rows
              if colCtr < len(row) and row[colCtr] != '']
    store_dailyChg.append(locArr)
### similarity matrix is independent of any loop and need be built only once
### now call the function that populates irs, jcs and sr
calcIRS_JCS_SR( )
### histogram every series once up front instead of once per pair;
### numpy.histogram returns (counts, bin edges)
histograms = [numpy.histogram(series, rng+1, (bin_min, bin_max))
              for series in store_dailyChg]
for outer in range(len(histograms)):
    distArr = []
    for inner in range(len(histograms)):
        if outer != inner:
            hist_A = histograms[outer]
            hist_B = histograms[inner]
            ### pass the 0th element of each hist: the frequency counts
            ### (the other element holds the bin edges)
            distArr.append( _calcQCDist(hist_A[0], hist_B[0], similarity_matrix, normFactor, len(hist_A[0])) )
        else:
            ### a series is at distance 0 from itself
            distArr.append(0)
    distance_matrix.append(distArr)
w, v = numpy.linalg.eig( distance_matrix )
### single-argument print calls behave the same on Python 2 and Python 3
print(w)
### NOTE(review): v[5] is row 5 of the eigenvector matrix; numpy.linalg.eig
### stores the eigenvectors as columns (v[:, i]) -- confirm which was intended
print(v[5])