diff --git a/ms2pip/__main__.py b/ms2pip/__main__.py
index 2dfd3226..55ecaa4a 100644
--- a/ms2pip/__main__.py
+++ b/ms2pip/__main__.py
@@ -146,8 +146,24 @@ def correlate(*args, **kwargs):
 
 
 @cli.command(help=ms2pip.core.get_training_data.__doc__)
+@click.argument("psms", required=True)
+@click.argument("spectrum_file", required=True)
+@click.option("--output-name", "-o", type=str)
+@click.option("--spectrum-id-pattern", "-p")
+@click.option("--model", type=click.Choice(MODELS), default="HCD")
+@click.option("--ms2-tolerance", type=float, default=0.02)
+@click.option("--processes", "-n", type=int)
 def get_training_data(*args, **kwargs):
-    ms2pip.core.get_training_data(*args, **kwargs)
+    # Parse arguments
+    output_name = kwargs.pop("output_name")
+    output_name = _infer_output_name(kwargs["psms"], output_name).with_suffix(".feather")
+
+    # Run
+    training_data = ms2pip.core.get_training_data(*args, **kwargs)
+
+    # Write output
+    logger.info(f"Writing training data to {output_name}")
+    training_data.to_feather(output_name)
 
 
 def main():
diff --git a/ms2pip/core.py b/ms2pip/core.py
index 98e14ebd..e40448fd 100644
--- a/ms2pip/core.py
+++ b/ms2pip/core.py
@@ -20,6 +20,7 @@
 from ms2pip import spectrum_output
 from ms2pip._cython_modules import ms2pip_pyx
 from ms2pip._utils.encoder import Encoder
+from ms2pip._utils.feature_names import get_feature_names
 from ms2pip._utils.psm_input import read_psms
 from ms2pip._utils.retention_time import RetentionTime
 from ms2pip._utils.xgb_models import get_predictions_xgb, validate_requested_xgb_model
@@ -197,6 +198,7 @@ def get_training_data(
     psms: Union[PSMList, str, Path],
     spectrum_file: Union[str, Path],
     spectrum_id_pattern: Optional[str] = None,
+    model: Optional[str] = "HCD",
     ms2_tolerance: float = 0.02,
     processes: Optional[int] = None,
 ):
@@ -212,6 +214,9 @@ def get_training_data(
     spectrum_id_pattern
         Regular expression pattern to apply to spectrum titles before matching to peptide file
         ``spec_id`` entries.
+    model
+        Model to use as reference for the ion types that are extracted from the observed spectra.
+        Default: "HCD", which results in the extraction of singly charged b- and y-ions.
     ms2_tolerance
         MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da.
     processes
@@ -223,13 +228,13 @@ def get_training_data(
         :py:class:`pandas.DataFrame` with feature vectors and targets.
 
     """
-    raise NotImplementedError
     psm_list = read_psms(psms)
     spectrum_id_pattern = spectrum_id_pattern if spectrum_id_pattern else "(.*)"
 
     with Encoder.from_psm_list(psm_list) as encoder:
         ms2pip_parallelized = _Parallelized(
             encoder=encoder,
+            model=model,
             ms2_tolerance=ms2_tolerance,
             processes=processes,
         )
@@ -237,10 +242,11 @@ def get_training_data(
         results = ms2pip_parallelized.process_spectra(
             psm_list, spectrum_file, spectrum_id_pattern, vector_file=True
         )
 
-        logger.debug("Merging results")
-        vectors = ms2pip_parallelized.write_vector_file(results)
-        return vectors
+        logger.info("Assembling training data in DataFrame...")
+        training_data = _assemble_training_data(results, model)
+
+        return training_data
 
 
 def download_models(
@@ -485,7 +491,7 @@ def _add_xgboost_predictions(self, results: List[ProcessingResult]) -> List[Proc
         multiprocessing.
 
         """
-        if not "xgboost_model_files" in MODELS[self.model].keys():
+        if "xgboost_model_files" not in MODELS[self.model].keys():
             raise ValueError("XGBoost model files not found in MODELS dictionary.")
 
         logger.debug("Converting feature vectors to XGBoost DMatrix...")
@@ -514,45 +520,6 @@ def _add_xgboost_predictions(self, results: List[ProcessingResult]) -> List[Proc
 
         return results
 
-    # TODO IMPLEMENT
-    def write_vector_file(self, results: List[ProcessingResult]):
-        raise NotImplementedError
-        all_results = []
-        for r in results:
-            psmids, df, dtargets = r.get()
-
-            # dtargets is a dict, containing targets for every ion type (keys are int)
-            for i, t in dtargets.items():
-                df["targets_{}".format(MODELS[self.model]["ion_types"][i])] = np.concatenate(
-                    t, axis=None
-                )
-            df["psmid"] = psmids
-
-            all_results.append(df)
-
-        # Only concat DataFrames with content (we get empty ones if more CPUs than peptides)
-        all_results = pd.concat([df for df in all_results if len(df) != 0])
-
-        logger.info("Writing vector file %s...", self.vector_file)
-        # TODO Consider writing to DMatrix XGBoost binary file instead.
-        # write result. write format depends on extension:
-        ext = self.vector_file.split(".")[-1]
-        if ext == "pkl":
-            all_results.to_pickle(self.vector_file + ".pkl")
-        elif ext == "csv":
-            try:
-                all_results.to_csv(self.vector_file, lineterminator="\n")
-            except TypeError:  # Pandas < 1.5 (Required for Python 3.7 support)
-                all_results.to_csv(self.vector_file, line_terminator="\n")
-        else:
-            raise ValueError("Unknown vector file extension: {}".format(ext))
-        # Avoid PyTables dependency
-        # else:
-        #     "table" is a tag used to read back the .h5
-        #     all_results.to_hdf(self.vector_file, "table")
-
-        return all_results
-
     # TODO IMPLEMENT
     def write_predictions(
         self, all_preds: pd.DataFrame, peptides: pd.DataFrame, output_filename: str
@@ -706,7 +673,7 @@ def _process_spectra(
     # Restructure PeptideRecord entries as spec_id -> [(id, psm_1), (id, psm_2), ...]
     psms_by_specid = defaultdict(list)
     for psm_index, psm in enumerated_psm_list:
-        psms_by_specid[psm.spectrum_id].append((psm_index, psm))
+        psms_by_specid[str(psm.spectrum_id)].append((psm_index, psm))
 
     # Track progress for only one worker (good approximation of all workers' progress)
     for spectrum in read_spectrum_file(spec_file):
@@ -782,3 +749,33 @@ def _process_spectra(
         results.append(result)
 
     return results
+
+
+def _assemble_training_data(results: List[ProcessingResult], model: str) -> pd.DataFrame:
+    """Assemble training data from results list to single pandas DataFrame."""
+    # Get ion types
+    ion_types = [it.lower() for it in MODELS[model]["ion_types"]]
+
+    # Assemble feature vectors, PSM indices, and targets
+    training_data = pd.DataFrame(
+        np.vstack([r.feature_vectors for r in results if r.feature_vectors is not None]),
+        columns=get_feature_names(),
+    )
+    training_data["psm_index"] = np.concatenate(
+        [
+            np.repeat(r.psm_index, r.feature_vectors.shape[0])
+            for r in results
+            if r.feature_vectors is not None
+        ]
+    )
+    for ion_type in ion_types:
+        training_data[f"target_{ion_type}"] = np.concatenate(
+            [r.observed_intensity[ion_type] for r in results if r.feature_vectors is not None]
+        )
+
+    # Reorder columns
+    training_data = training_data[
+        ["psm_index"] + get_feature_names() + [f"target_{it}" for it in ion_types]
+    ]
+
+    return training_data
diff --git a/pyproject.toml b/pyproject.toml
index 0b0364b9..17400e6e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ requires-python = ">=3.8"
 dependencies = [
     "numpy>=1.16,<2",
     "pandas>=1,<2",
+    "pyarrow",
    "pyteomics>=3.5,<5",
     "tomlkit>=0.5,<1",
     "sqlalchemy>=1.3,<2",
@@ -90,8 +91,9 @@ include = ["ms2pip*"]
 line-length = 99
 target-version = ['py38']
 
-[tool.isort]
-profile = "black"
+[tool.ruff]
+line-length = 99
+target-version = 'py38'
 
 [tool.cibuildwheel]
 build = "cp3*-manylinux_x86_64 cp3*-win_amd64 cp3*-macosx_x86_64"
diff --git a/tests/test_core.py b/tests/test_core.py
new file mode 100644
index 00000000..d2759476
--- /dev/null
+++ b/tests/test_core.py
@@ -0,0 +1,15 @@
+import pandas as pd
+
+from ms2pip.core import get_training_data
+
+
+def _test_get_training_data():
+    expected_df = pd.read_feather("tests/test_data/massivekb_selected_500_training_data.feather")
+    output_df = get_training_data(
+        "tests/test_data/massivekb_selected_500.peprec",
+        "tests/test_data/massivekb_selected_500.mgf",
+        model="HCD",
+        ms2_tolerance=0.02,
+        processes=1
+    )
+    pd.testing.assert_frame_equal(expected_df, output_df)
diff --git a/tests/test_data/massivekb_selected_500_targetvectors.h5 b/tests/test_data/massivekb_selected_500_targetvectors.h5
deleted file mode 100644
index 1cda4fd0..00000000
Binary files a/tests/test_data/massivekb_selected_500_targetvectors.h5 and /dev/null differ
diff --git a/tests/test_data/massivekb_selected_500_training_data.feather b/tests/test_data/massivekb_selected_500_training_data.feather
new file mode 100644
index 00000000..b1109e1f
Binary files /dev/null and b/tests/test_data/massivekb_selected_500_training_data.feather differ
diff --git a/tests/test_features.py b/tests/test_features.py
deleted file mode 100644
index 87bdca01..00000000
--- a/tests/test_features.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import os
-
-import pandas as pd
-import numpy as np
-
-from ms2pip.ms2pipC import MS2PIP
-
-
-TEST_DIR = os.path.dirname(__file__)
-
-
-class TestFeatureExtraction:
-    def _assert_get_feature_vectors(self, test_data, target_data):
-        assert test_data[test_data.columns[:-3]].equals(
-            target_data[target_data.columns[:-3]]
-        )
-
-    def _assert_get_targetsB(self, test_data, target_data):
-        for i in range(3):
-            assert np.isclose(test_data["targets_B"][i], target_data["targets_B"][i])
-
-    def _assert_get_targetsY(self, test_data, target_data):
-        for i in range(3):
-            assert np.isclose(test_data["targets_Y"][i], target_data["targets_Y"][i])
-
-    def _assert_get_psmid(self, test_data, target_data):
-        assert test_data["psmid"].equals(target_data["psmid"])
-
-    def test_dummy_spectrum(self):
-        # Run ms2pipC to extract features and targets from an .mgf and .PEPREC files
-        params = {
-            "ms2pip": {
-                "ptm": [],
-                "sptm": [],
-                "gptm": [],
-                "frag_method": "HCD2019",
-                "frag_error": 0.02,
-                "out": "csv",
-            }
-        }
-        ms2pip = MS2PIP(
-            os.path.join(TEST_DIR, "test_data/test.peprec"),
-            spec_file=os.path.join(TEST_DIR, "test_data/hard_test.mgf"),
-            vector_file=os.path.join(TEST_DIR, "test_data/test.h5"),
-            params=params,
-        )
-        ms2pip.run()
-
-        # Load target values
-        test_data = pd.read_hdf(os.path.join(TEST_DIR, "test_data/test.h5"), "table")
-        target_data = pd.read_hdf(
-            os.path.join(TEST_DIR, "test_data/hard_test_targetvectors.h5"), "table"
-        )
-
-        # Test
-        self._assert_get_feature_vectors(test_data, target_data)
-        self._assert_get_targetsB(test_data, target_data)
-        self._assert_get_targetsY(test_data, target_data)
-        self._assert_get_psmid(test_data, target_data)
-
-        os.remove(os.path.join(TEST_DIR, "test_data/test.h5"))
-
-    def test_real_spectra(self):
-        # Run ms2pipC to extract features and targets from an .mgf and .PEPREC files
-        params = {
-            "ms2pip": {
-                "ptm": [
-                    "Oxidation,15.994915,opt,M",
-                    "Carbamidomethyl,57.021464,opt,C",
-                    "Pyro_glu,-18.010565,opt,E",
-                    "Deamidation,0.984016,opt,N",
-                    "Acetyl,42.010565,opt,N-term",
-                    "Carbamyl,43.005814,opt,N-term",
-                ],
-                "sptm": [],
-                "gptm": [],
-                "frag_method": "HCD2019",
-                "frag_error": 0.02,
-                "out": "csv",
-            }
-        }
-        ms2pip = MS2PIP(
-            os.path.join(TEST_DIR, "test_data/massivekb_selected_500.peprec"),
-            spec_file=os.path.join(TEST_DIR, "test_data/massivekb_selected_500.mgf"),
-            vector_file=os.path.join(
-                TEST_DIR, "test_data/massivekb_selected_500_test.h5"
-            ),
-            params=params,
-            num_cpu=1,
-        )
-        ms2pip.run()
-
-        # Load target values
-        test_data = pd.read_hdf(
-            os.path.join(TEST_DIR, "test_data/massivekb_selected_500_test.h5"), "table"
-        )
-        target_data = pd.read_hdf(
-            os.path.join(TEST_DIR, "test_data/massivekb_selected_500_targetvectors.h5"),
-            "table",
-        )
-
-        # Test
-        self._assert_get_feature_vectors(test_data, target_data)
-        self._assert_get_targetsB(test_data, target_data)
-        self._assert_get_targetsY(test_data, target_data)
-        self._assert_get_psmid(test_data, target_data)
-
-        os.remove(os.path.join(TEST_DIR, "test_data/massivekb_selected_500_test.h5"))
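# Usage sketch: one way the reworked get_training_data API introduced above could be
# called from Python. This is an illustration, not part of the diff; the input paths are
# borrowed from the new test in tests/test_core.py, and the output file name is an
# assumption.
from ms2pip.core import get_training_data

training_data = get_training_data(
    "tests/test_data/massivekb_selected_500.peprec",  # PSMs
    "tests/test_data/massivekb_selected_500.mgf",  # observed spectra
    model="HCD",  # reference model; for HCD, singly charged b- and y-ion targets are extracted
    ms2_tolerance=0.02,  # MS2 tolerance in Da for peak annotation
    processes=1,
)

# Per _assemble_training_data, the returned DataFrame columns are ordered as
# ["psm_index"] + the feature names from get_feature_names() + the per-ion-type targets
# ("target_b" and "target_y" for HCD).
training_data.to_feather("training_data.feather")  # Feather output needs pyarrow, added as a dependency above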