# main.py
# import numpy as np
import pandas as pd
# import gym
# from stable_baselines3 import PPO
# from stable_baselines3.common.evaluation import evaluate_policy
# from stable_baselines3.common.monitor import Monitor
# from stable_baselines3.common.vec_env import DummyVecEnv
# from stable_baselines3.ppo import MlpPolicy
# from imitation.algorithms import bc
# from imitation.data import rollout
# from imitation.data.wrappers import RolloutInfoWrapper
from supervised_classifiers import SupervisedClassifier
from file_utility import FileUtility
from load_file import YahooDataLoader
from tqdm import tqdm
from line_printer import LinePrinter
import timeit
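

# Overview: this script walks the pre-processed Yahoo ETF folders, loads each CSV, and accumulates the
# rows into train/test DataFrames. Once enough rows are available it hands the split to
# SupervisedClassifier, which (judging by the method name) fits and evaluates a random forest, and the
# buffers are then cleared so the next batch can be collected.


# run_classifier asks the classifier to build a train/test split from the accumulated frames. If the
# split succeeds, the model is evaluated and the retry counter is reset to 0; otherwise the counter is
# incremented so the caller knows to gather more data before trying again.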
def run_classifier(classifiers, train_data, test_data, extra_classification_counter):
    if classifiers.generate_test_train(train_data, test_data):
        # y_scores = classifiers.evaluate_model()
        y_scores = classifiers.load_random_forest_and_evaluate()
        extra_classification_counter = 0
    else:
        extra_classification_counter += 1
    return extra_classification_counter

if __name__ == '__main__':
    starting_time = timeit.default_timer()
    print("Start time :", starting_time)
    data_path = '../Data_Source/Yahoo/Processed_Yahoo_Data/Stock_Binary_tolerance_half_std/ETF'
    # folder_path = 'AMS'
    # This parameter is used to prevent memory problems: if a folder contains more than 1000 files,
    # the system loads 1000 files first, does the training, and then continues with the rest of the files.
    line_printer = LinePrinter()
    # no_of_files_to_start_training = 1000
    min_df_size_to_start_training = 3000
    # actions = [-1, 0, 1]
    actions = [2, 0, 1]
    batch_size_to_load_if_min_df_size_does_not_have_enough_sample_data = 5000
    # action_description = ['Long Put', 'Short stock', 'Short Call', 'Flat', 'Short Put', 'Long stock', 'Long Call']
    action_description = ['Short Call', 'Short Strangle', 'Short Put']
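    # Assumption from the two parallel lists above: each integer in actions is a class label whose
    # human-readable meaning is the matching entry in action_description (three option-selling
    # strategies here, replacing the larger commented-out seven-action set).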
    intervals = 4
    sentence_length = 32
    include_volatility = True
    train_percent = 0.5
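    # Assumption based on the parameter names: sentence_length is the number of consecutive intervals in
    # each sample window, intervals is the granularity passed to the loader below, and
    # include_volatility toggles an extra volatility feature for the classifier.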
    file_info = {'source_data_path': data_path,
                 'save_destination_path': 'results',
                 'file_formats_to_load': 'csv',
                 'file_format_to_save': 'csv',
                 'verbose': True
                 }
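    # With sentence_length = 32 the comprehension below yields the labels '-31' down to '-2', and '0' is
    # appended afterwards; note that range(sentence_length - 1, 1, -1) stops at 2, so no '-1' label is
    # generated.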
    data_col = ['-' + str(i) for i in range(sentence_length - 1, 1, -1)]
    data_col.append('0')
    # train_percent controls the share of the data used for training: for example, a value of 0.75 means
    # 75% of the files in each folder are used for training and the rest of the files are used for testing.
    file_util = FileUtility(**file_info)
    classifiers = SupervisedClassifier(actions, action_description, include_volatility)
    classifiers.generate_pipeline()
    data_loader = YahooDataLoader()
    # Load all folders, then iterate through each folder and perform training:
    list_of_folders = file_util.load_all_sub_directories()
    # print('list_of_folders[0]: ', list_of_folders[0])
    # This counter tracks how many extra batches of size
    # batch_size_to_load_if_min_df_size_does_not_have_enough_sample_data have been requested; it is
    # multiplied by that batch size and added to min_df_size_to_start_training to make sure there is
    # enough data before calling the classifier.
    extra_classification_counter = 0
    test_data = pd.DataFrame()
    train_data = pd.DataFrame()
    total_row_counter = 0
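    # Main loop: for every file in every folder, append the loaded rows to train_data until the training
    # share of the threshold is met, then to test_data, and call run_classifier once the combined size
    # crosses the threshold.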
    for folder_counter in tqdm(range(len(list_of_folders))):
    # for folder_counter in tqdm(range(1, 2)):
        folder_name = list_of_folders[folder_counter]
        # folder_name = list_of_folders[4]
        list_of_files = file_util.load_file_names_in_directory(folder_name)
        no_of_files_in_folder = len(list_of_files)
        for file_counter in tqdm(range(no_of_files_in_folder)):
        # for file_counter in tqdm(range(20)):
            # print('Loading: ' + data_path + "/" + folder_name, list_of_files[file_counter])
            loaded_data = data_loader.load_file(data_path + "/" + folder_name, list_of_files[file_counter], True,
                                                interval=intervals)
            # try:
            # If there is information available in the file we have just loaded, then proceed.
            if len(loaded_data) > 0:
                total_row_counter += len(loaded_data)
                # threshold is the minimum number of rows of data required before starting the classifier.
                threshold = (min_df_size_to_start_training + (extra_classification_counter *
                             batch_size_to_load_if_min_df_size_does_not_have_enough_sample_data))
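                # Each time run_classifier fails to build a split, extra_classification_counter grows by
                # one, so the threshold stretches by another batch of 5000 rows before the next attempt.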
                # print('threshold: ', threshold)
                # Collect train data first: only once there is enough train data do we start collecting
                # test data.
                if len(train_data) < (threshold * train_percent):
                    train_data = pd.concat([loaded_data, train_data])
                    # print('Train DATA::::::')
                    # print(train_data)
                    # line_printer.print_line()
                else:
                    # Collect test data.
                    test_data = pd.concat([loaded_data, test_data])
                # If we have enough data, then we can run the classifier.
                if (len(test_data) + len(train_data)) > threshold:
                    # line_printer.print_text('Calling the Classifier')
                    extra_classification_counter = run_classifier(classifiers, train_data, test_data,
                                                                  extra_classification_counter)
                    # This frees up memory: if run_classifier managed to conduct training, we can start
                    # collecting data for a new train and test dataset.
                    if extra_classification_counter == 0:
                        train_data = pd.DataFrame()
                        test_data = pd.DataFrame()
print("Total Row Counter: ", total_row_counter)
print('len (train_data): ', len(train_data))
print('len (test_data): ', len(test_data))
            # except:
            #     print("could not load this file: ", data_path + "/" + folder_name, list_of_files[file_counter])
    print('Before Final Run len (train_data): ', len(train_data))
    print('Before Final Run len (test_data): ', len(test_data))
    final_length = len(train_data) + len(test_data)
    print('Final total: ', final_length)
    # This part processes whatever data is left after the last file has been loaded while the combined
    # length of the test and train datasets is still below the threshold.
    combined_df = pd.concat([train_data, test_data])
    combined_df.reset_index(inplace=True, drop=True)
    if final_length > 0:
        no_of_test_data_required = int(final_length * (1 - train_percent))
        no_of_train_data_required = int(final_length * train_percent)
        if len(test_data) < no_of_test_data_required:
            test_data = pd.concat([test_data, combined_df.iloc[no_of_train_data_required:]])
            train_data = combined_df.iloc[:no_of_train_data_required]
        final_classification_counter = run_classifier(classifiers, train_data, test_data, extra_classification_counter)
    print('After Final Run len (train_data): ', len(train_data))
    print('After Final Run len (test_data): ', len(test_data))
    print("Total Row Counter: ", total_row_counter)