Skip to content

Commit

Permalink
v 0.2.3
Browse files Browse the repository at this point in the history
  • Loading branch information
niklases committed Aug 27, 2022
1 parent 9416890 commit a2cdf03
Show file tree
Hide file tree
Showing 8 changed files with 68 additions and 50 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
build/
dist/
pypef.egg-info/
workflow/.ipynb_checkpoints/
workflow/.ipynb_checkpoints/
.idea/
2 changes: 1 addition & 1 deletion pypef/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
# §Equal contribution


VERSION = '0.2.2-alpha'
VERSION = '0.2.3-alpha'
2 changes: 1 addition & 1 deletion pypef/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@
[--conc]
pypef ml --encoding ENCODING_TECHNIQUE --ls LEARNING_SET --ts TEST_SET
[--save NUMBER] [--regressor TYPE] [--nofft] [--all] [--params PLMC_FILE]
[--sort METRIC_INT] [--threads THREADS] [--color]
[--sort METRIC_INT] [--threads THREADS]
pypef ml --show
[MODELS]
pypef ml --encoding ENCODING_TECHNIQUE --model MODEL12345 --figure TS_FOR_PLOTTING
Expand Down
53 changes: 28 additions & 25 deletions pypef/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,30 +828,29 @@ def save_model(
os.mkdir('Pickles')
except FileExistsError:
pass
cv_filename, name = '', ''
train_sequences, train_variants, y_train = get_sequences_from_file(training_set)
test_sequences, test_variants, y_test = get_sequences_from_file(test_set)
for i, t in enumerate(range(threshold)):
try:
idx = performance_list[t][0]
parameter = performance_list[t][7]
if i == 0:
if encoding == 'onehot' or encoding == 'dca':
name = idx
else:
name = get_basename(idx)
cv_filename = os.path.join('CV_performance', f'{name}_{regressor}_CV_Results.txt')
try:
os.remove(cv_filename)
except FileNotFoundError:
pass
file = open(cv_filename, 'w')
file.write('5-fold cross-validated performance of top '
'models for validation set across all data.\n\n')
if no_fft:
file.write("No FFT used in this model construction, performance represents"
" model accuracies on raw encoded sequence data.\n\n")
file.close()

if encoding == 'onehot' or encoding == 'dca':
name = idx
else:
name = get_basename(idx)
cv_filename = os.path.join('CV_performance', f'{name}_{regressor.upper()}_CV_Results.txt')
try:
os.remove(cv_filename)
except FileNotFoundError:
pass
file = open(cv_filename, 'w')
file.write('5-fold cross-validated performance of top '
'models for validation set across all data.\n\n')
if no_fft:
file.write("No FFT used in this model construction, performance represents"
" model accuracies on raw encoded sequence data.\n\n")
file.close()

# Estimating the CV performance of the n_component-fitted model on all data
if encoding == 'aaidx': # AAindex encoding technique
Expand All @@ -877,9 +876,11 @@ def save_model(
x_train_ = dca_encoder.collect_encoded_sequences(train_variants)
x_test_ = dca_encoder.collect_encoded_sequences(test_variants)
x_train, y_train = remove_nan_encoded_positions(copy.copy(x_train_), y_train)
x_train, train_variants = remove_nan_encoded_positions(copy.copy(x_train_), y_train)
x_train, train_variants = remove_nan_encoded_positions(copy.copy(x_train_), train_variants)
x_test, y_test = remove_nan_encoded_positions(copy.copy(x_test_), y_test)
x_test, test_variants = remove_nan_encoded_positions(copy.copy(x_test_), y_test)
x_test, test_variants = remove_nan_encoded_positions(copy.copy(x_test_), test_variants)
assert len(x_train) == len(y_train) == len(train_variants)
assert len(x_test) == len(y_test) == len(test_variants)

x = np.concatenate([x_train, x_test])
y = np.concatenate([y_train, y_test])
Expand Down Expand Up @@ -912,7 +913,7 @@ def save_model(
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.legend(prop={'size': 8})
plt.savefig(os.path.join('CV_performance', f'{name}_{regressor}_{n_samples}-fold-CV.png'), dpi=300)
plt.savefig(os.path.join('CV_performance', f'{name}_{regressor.upper()}_{n_samples}-fold-CV.png'), dpi=300)
plt.close('all')

if train_on_all: # Train model hyperparameters based on all available data (training + test set)
Expand All @@ -933,7 +934,7 @@ def save_model(
variants=test_variants,
label=True,
hybrid=False,
name=os.path.join('CV_performance', f'{name}_{regressor}_')
name=f'{name}_{regressor.upper()}_'
)

file = open(os.path.join(path, 'Pickles', name), 'wb')
Expand Down Expand Up @@ -1141,7 +1142,7 @@ def predict_and_plot(
round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3)) + \
r'$\rho$ = {}'.format(round(spearman_rho, 3)) + \
'\n' + r'($N$ = {})'.format(len(y_test))
x = np.linspace(min(y_pred) - 1, max(y_pred) + 1, 100)
x = np.linspace(min(y_pred), max(y_pred), 100)
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred,
label=legend, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.8)
Expand Down Expand Up @@ -1194,7 +1195,7 @@ def predict_and_plot(
plt.hlines(y=(y_wt + limit_y_pred) - (((y_wt + limit_y_pred) - (y_wt - limit_y_pred)) / 2),
xmin=y_wt - limit_y_true, xmax=y_wt + limit_y_true, color='grey', linewidth=0.5)
crossline = np.linspace(y_wt - limit_y_true, y_wt + limit_y_true)
plt.plot(crossline, crossline, color='black', linewidth=0.5)
plt.plot(crossline, crossline, color='black', linewidth=0.25)
steps = float(abs(max(y_pred)))
gradient = []
for x in np.linspace(0, steps, 100):
Expand Down Expand Up @@ -1242,6 +1243,8 @@ def plot_y_true_vs_y_pred(
fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_true)})'
)
file_name = name + 'ML_Model_LS_TS_Performance.png'
x = np.linspace(min(y_pred), max(y_pred), 100)
ax.plot(x, x, color='black', linewidth=0.25) # plot diagonal line
ax.legend(prop={'size': 8})
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
Expand All @@ -1262,4 +1265,4 @@ def plot_y_true_vs_y_pred(
# while os.path.isfile(file_name):
# i += 1 # iterate until finding an unused file name
# file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png'
plt.savefig(file_name, dpi=500)
plt.savefig(file_name, dpi=500)
1 change: 1 addition & 0 deletions pypef/ml/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def run_pypef_pure_ml(arguments):
no_fft=arguments['--nofft'],
minimum_r2=0.0
)

save_model(
path=path,
performance_list=encoding_performance_list,
Expand Down
50 changes: 31 additions & 19 deletions pypef/utils/learning_test_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,21 +366,28 @@ def make_sub_ls_ts_randomly(

def make_fasta_ls_ts(
filename,
wt,
sub,
val
): # sub = substitution, val = value
wt_seq,
substitutions,
fitness_values
):
"""
Creates learning and test sets (.fasta style files)
sub: list
filename: str
String for defining the filename for the learning and test set "fasta-like" files.
wt_seq: str
Wild-type sequence as string
substitutions: list
List of substitutions of a single variant of the format:
- Single substitution variant, e.g. variant A123C: [['A123C']]
- Higher variants, e.g. variant A123C/D234E/F345G: [['A123C'], ['D234E], ['F345G']]
- Single substitution variant, e.g. variant A123C: ['A123C']
- Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E', 'F345G']
--> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E', 'F345G']]
fitness_values: list
List of ints/floats of the variant fitness values, e.g. for two variants: [1.4, 0.8]
"""
myfile = open(filename, 'w')
for i, var in enumerate(sub): # var are lists of (single or multiple) substitutions
temp = list(wt)
for i, var in enumerate(substitutions): # var are lists of (single or multiple) substitutions
temp = list(wt_seq)
name = ''
separation = 0
if var == ['WT']:
Expand All @@ -397,28 +404,33 @@ def make_fasta_ls_ts(
name += '/' + single_var
separation += 1
print(f'>{name}', file=myfile)
print(f';{val[i]}', file=myfile)
print(f';{fitness_values[i]}', file=myfile)
print(''.join(temp), file=myfile)
# print(name+';'+str(val[i])+';'+''.join(temp), file=myfile) # output: CSV format
myfile.close()


def get_seqs_from_var_name(
wt,
sub,
val
wt_seq,
substitutions,
fitness_values
) -> tuple[list, list, list]:
"""
Similar to function above but just returns sequences
variant: list
wt_seq: str
Wild-type sequence as string
substitutions: list
List of substitutions of a single variant of the format:
- Single substitution variant, e.g. variant A123C: [['A123C']]
- Higher variants, e.g. variant A123C/D234E/F345G: [['A123C'], ['D234E], ['F345G']]
- Single substitution variant, e.g. variant A123C: ['A123C']
- Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E', 'F345G']
--> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E', 'F345G']]
fitness_values: list
List of ints/floats of the variant fitness values, e.g. for two variants: [1.4, 0.8]
"""
variant, values, sequences = [], [], []
for i, var in enumerate(sub): # var are lists of (single or multiple) substitutions
temp = list(wt)
for i, var in enumerate(substitutions): # var are lists of (single or multiple) substitutions
temp = list(wt_seq)
name = ''
separation = 0
if var == ['WT']:
Expand All @@ -435,7 +447,7 @@ def get_seqs_from_var_name(
name += '/' + single_var
separation += 1
variant.append(name)
values.append(val[i])
values.append(fitness_values[i])
sequences.append(''.join(temp))

return variant, values, sequences
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ adjustText
scikit-learn
biopython
schema
ray[default]
ray[default]<2.0.0
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
# for installation run me with: pip install . --use-feature=in-tree-build
# for installation run me with: pip install .


from setuptools import setup, find_packages
Expand All @@ -10,7 +10,7 @@

setup(
name='pypef',
version='0.2.2',
version='0.2.3',
author='Niklas Siedhoff & Alexander-Maurice Illig',
author_email='[email protected]',
license='CC BY-NC-SA 4.0',
Expand All @@ -34,6 +34,7 @@
'Development Status :: 3 - Alpha',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Topic :: Scientific/Engineering :: Bio-Informatics',
'Topic :: Scientific/Engineering :: Artificial Intelligence'
],
Expand Down

0 comments on commit a2cdf03

Please sign in to comment.