Skip to content

Commit

Permalink
update v0.2.1
Browse files Browse the repository at this point in the history
  • Loading branch information
niklases committed Jul 9, 2022
1 parent 0b0b909 commit df7839d
Show file tree
Hide file tree
Showing 10 changed files with 202 additions and 127 deletions.
2 changes: 1 addition & 1 deletion pypef/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
# §Equal contribution


VERSION = '0.2.0-alpha'
VERSION = '0.2.1-alpha'
14 changes: 7 additions & 7 deletions pypef/dca/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,14 +285,14 @@ def __read_plmc_v2(self, filename, precision):
self.wt_aa_pos = []
for aa, pos in zip(self._target_seq, self.index_list):
self.wt_aa_pos.append(str(aa) + str(pos))
print(f'Evaluating gap content of PLMC parameter file... '
f'First amino acid position used in the MSA (PLMC params file) is '
f'{self._target_seq[0]}{self.index_list[0]} and the last position '
f'used is {self._target_seq[-1]}{self.index_list[-1]}.')
if len(not_valid) > 0:
print(f'\nFurther, non-included positions are:')
print(str(not_valid)[1:-1])
if self.verbose:
print(f'Evaluating gap content of PLMC parameter file... '
f'First amino acid position used in the MSA (PLMC params file) is '
f'{self._target_seq[0]}{self.index_list[0]} and the last position '
f'used is {self._target_seq[-1]}{self.index_list[-1]}.')
if len(not_valid) > 0:
print(f'\nFurther, non-included positions are:')
print(str(not_valid)[1:-1])
print(f'\nSummary of all effective positions represented in the MSA '
f'based on wild-type sequence ({len(valid)} encoded positions):')
for aa_pos in self.wt_aa_pos:
Expand Down
72 changes: 25 additions & 47 deletions pypef/dca/hybrid_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
import matplotlib.pyplot as plt
from pypef.utils.variant_data import get_sequences_from_file, remove_nan_encoded_positions
from pypef.dca.encoding import DCAEncoding, get_dca_data_parallel, get_encoded_sequence, ActiveSiteError
from pypef.ml.regression import predictions_out
from pypef.ml.regression import predictions_out, plot_y_true_vs_y_pred

np.warnings.filterwarnings('error', category=np.VisibleDeprecationWarning)

Expand Down Expand Up @@ -570,7 +570,7 @@ def run(
"""


def get_model_and_save_pkl(
def generate_model_and_save_pkl(
xs: list,
ys: list,
dca_encoder: DCAEncoding,
Expand Down Expand Up @@ -650,51 +650,17 @@ def get_model_and_save_pkl(
pass
print(f'Save model as Pickle file... HYBRIDMODEL')
pickle.dump(
{'hybrid_model': hybrid_model, 'beta_1': beta_1, 'beta_2': beta_2,
'spearman_rho': test_spearman_r, 'regressor': reg},
{
'hybrid_model': hybrid_model,
'beta_1': beta_1,
'beta_2': beta_2,
'spearman_rho': test_spearman_r,
'regressor': reg
},
open('Pickles/HYBRIDMODEL', 'wb')
)


def plot_y_true_vs_y_pred(
y_true: np.ndarray,
y_pred: np.ndarray,
variants: np.ndarray, # just required for labeling
label=False
):
"""
Plots predicted versus true values using the hybrid model for prediction.
Function called by function predict_ps.
"""
spearman_rho = spearmanr(y_true, y_pred)[0]

figure, ax = plt.subplots()
ax.scatter(y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7,
label=fr'$\rho$ = {spearman_rho:.3f}')
ax.legend()
ax.set_xlabel(r'$y_\mathrm{true}$' + fr' ($N$ = {len(y_true)})')
ax.set_ylabel(r'$y_\mathrm{pred}$' + fr' ($N$ = {len(y_pred)})')
print('\nPlotting...')
if label:
from adjustText import adjust_text
print('Adjusting variant labels for plotting can take some '
'time (the limit for labeling is 150 data points)...')
if len(y_true) < 150:
texts = [ax.text(y_true[i], y_pred[i], txt, fontsize=4)
for i, txt in enumerate(variants)]
adjust_text(
texts, only_move={'points': 'y', 'text': 'y'}, force_points=0.5, lim=250)
else:
print("Terminating label process. Too many variants "
"(> 150) for plotting (labels would overlap).")
file_name = 'DCA_Hybrid_Model_LS_TS_Performance.png'
# i = 1
# while os.path.isfile(file_name):
# i += 1 # iterate until finding an unused file name
# file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png'
plt.savefig(file_name, dpi=500)


def performance_ls_ts(
ls_fasta: str,
ts_fasta: str,
Expand Down Expand Up @@ -799,13 +765,18 @@ def performance_ls_ts(
pass
print(f'Save model as Pickle file... HYBRIDMODEL')
pickle.dump(
{'hybrid_model': hybrid_model, 'beta_1': beta_1, 'beta_2': beta_2,
'spearman_rho': test_spearman_r, 'regressor': reg},
{
'hybrid_model': hybrid_model,
'beta_1': beta_1,
'beta_2': beta_2,
'spearman_rho': test_spearman_r,
'regressor': reg
},
open('Pickles/HYBRIDMODEL', 'wb')
)


def predict_ps( # "predict pmult"
def predict_ps( # also predicting "pmult" dirs
prediction_dict: dict,
params_file: str,
threads: int,
Expand Down Expand Up @@ -974,7 +945,14 @@ def predict_ps( # "predict pmult"
print('Testing performance...')
print(f'Spearman\'s rho = {spearmanr(y_true, y_pred)[0]:.3f}')
if figure is not None:
plot_y_true_vs_y_pred(np.array(y_true), np.array(y_pred), np.array(variants), label)
plot_y_true_vs_y_pred(
np.array(y_true), np.array(y_pred), np.array(variants), label, hybrid=True
)
else:
raise SystemError(
'Define set(s) for prediction (e.g. \'-p PS.fasta\' or '
'created prediction set folder, e.g. \'--pmult --drecomb\')'
)


def predict_directed_evolution(
Expand Down
7 changes: 4 additions & 3 deletions pypef/dca/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from pypef.utils.variant_data import read_csv, remove_nan_encoded_positions
from pypef.dca.encoding import DCAEncoding, get_dca_data_parallel
from pypef.dca.hybrid_model import performance_ls_ts, predict_ps, get_model_and_save_pkl
from pypef.dca.hybrid_model import performance_ls_ts, predict_ps, generate_model_and_save_pkl
from pypef.utils.low_n_mutation_extrapolation import performance_mutation_extrapolation, low_n


Expand Down Expand Up @@ -77,7 +77,8 @@ def run_pypef_hybrid_modeling(arguments):
if arguments['train_and_save']:
dca_encode = DCAEncoding(
params_file=arguments['--params'],
separator=arguments['--mutation_sep']
separator=arguments['--mutation_sep'],
verbose=False
)

variants, fitnesses, _ = read_csv(arguments['--input'])
Expand All @@ -95,7 +96,7 @@ def run_pypef_hybrid_modeling(arguments):
encoded_sequences, fitnesses = remove_nan_encoded_positions(copy.copy(encoded_sequences_), fitnesses)
assert len(encoded_sequences) == len(variants) == len(fitnesses)

get_model_and_save_pkl(
generate_model_and_save_pkl(
xs=encoded_sequences,
ys=fitnesses,
dca_encoder=dca_encode,
Expand Down
3 changes: 2 additions & 1 deletion pypef/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@
[--params PLMC_FILE] [--threads THREADS] [--nofft] [--negative]
pypef ml --encoding ENCODING_TECHNIQUE --model MODEL12345 --pmult
[--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
[--ddiverse] [--tdiverse] [--qdiverse] [--nofft] [--negative] [--params PLMC_FILE] [--threads THREADS]
[--ddiverse] [--tdiverse] [--qdiverse]
[--regressor TYPE] [--nofft] [--negative] [--params PLMC_FILE] [--threads THREADS]
pypef ml --encoding ENCODING_TECHNIQUE directevo --model MODEL12345 --wt WT_SEQ
[--input CSV_FILE] [--y_wt WT_FITNESS] [--numiter NUM_ITER] [--numtraj NUM_TRAJ] [--temp TEMPERATURE]
[--nofft] [--negative] [--usecsv] [--csvaa] [--drop THRESHOLD] [--params PLMC_FILE]
Expand Down
Loading

0 comments on commit df7839d

Please sign in to comment.