Implement get_training_data
RalfG committed Sep 4, 2023
1 parent 5725c26 commit a76619a
Showing 7 changed files with 78 additions and 156 deletions.
18 changes: 17 additions & 1 deletion ms2pip/__main__.py
@@ -146,8 +146,24 @@ def correlate(*args, **kwargs):


@cli.command(help=ms2pip.core.get_training_data.__doc__)
@click.argument("psms", required=True)
@click.argument("spectrum_file", required=True)
@click.option("--output-name", "-o", type=str)
@click.option("--spectrum-id-pattern", "-p")
@click.option("--model", type=click.Choice(MODELS), default="HCD")
@click.option("--ms2-tolerance", type=float, default=0.02)
@click.option("--processes", "-n", type=int)
def get_training_data(*args, **kwargs):
ms2pip.core.get_training_data(*args, **kwargs)
# Parse arguments
output_name = kwargs.pop("output_name")
output_name = _infer_output_name(kwargs["psms"], output_name).with_suffix(".feather")

# Run
training_data = ms2pip.core.get_training_data(*args, **kwargs)

# Write output
logger.info(f"Writing training data to {output_name}")
training_data.to_feather(output_name)


def main():
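
The new CLI command writes the assembled training data to a Feather file derived from the PSM file name (via `_infer_output_name`, which is not shown in this diff). A minimal sketch of reading that output back with pandas; the file name below is hypothetical:

```python
import pandas as pd

# Hypothetical output path; the actual name is the PSM file name with a
# ".feather" suffix, as produced by the command above.
training_data = pd.read_feather("massivekb_selected_500.feather")

# One row per fragmentation position: feature columns plus one target column
# per ion type (b and y for the default HCD model).
print(training_data.columns.tolist())
print(training_data.head())
```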
87 changes: 42 additions & 45 deletions ms2pip/core.py
@@ -20,6 +20,7 @@
from ms2pip import spectrum_output
from ms2pip._cython_modules import ms2pip_pyx
from ms2pip._utils.encoder import Encoder
from ms2pip._utils.feature_names import get_feature_names
from ms2pip._utils.psm_input import read_psms
from ms2pip._utils.retention_time import RetentionTime
from ms2pip._utils.xgb_models import get_predictions_xgb, validate_requested_xgb_model
@@ -197,6 +198,7 @@ def get_training_data(
psms: Union[PSMList, str, Path],
spectrum_file: Union[str, Path],
spectrum_id_pattern: Optional[str] = None,
model: Optional[str] = "HCD",
ms2_tolerance: float = 0.02,
processes: Optional[int] = None,
):
@@ -212,6 +214,9 @@ def get_training_data(
spectrum_id_pattern
Regular expression pattern to apply to spectrum titles before matching to
peptide file ``spec_id`` entries.
model
Model to use as reference for the ion types that are extracted from the observed spectra.
Default: "HCD", which results in the extraction of singly charged b- and y-ions.
ms2_tolerance
MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da.
processes
@@ -223,24 +228,25 @@ def get_training_data(
:py:class:`pandas.DataFrame` with feature vectors and targets.
"""
raise NotImplementedError
psm_list = read_psms(psms)
spectrum_id_pattern = spectrum_id_pattern if spectrum_id_pattern else "(.*)"

with Encoder.from_psm_list(psm_list) as encoder:
ms2pip_parallelized = _Parallelized(
encoder=encoder,
model=model,
ms2_tolerance=ms2_tolerance,
processes=processes,
)
logger.info("Processing spectra and peptides...")
results = ms2pip_parallelized.process_spectra(
psm_list, spectrum_file, spectrum_id_pattern, vector_file=True
)
logger.debug("Merging results")
vectors = ms2pip_parallelized.write_vector_file(results)

return vectors
logger.info("Assembling training data in DataFrame...")
training_data = _assemble_training_data(results, model)

return training_data
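
For reference, a minimal sketch of calling the new API directly from Python, mirroring the test added below; the file paths are illustrative:

```python
from ms2pip.core import get_training_data

# Illustrative input files: a PSM file and the matching spectrum file.
training_data = get_training_data(
    "massivekb_selected_500.peprec",
    "massivekb_selected_500.mgf",
    model="HCD",         # determines which ion types are extracted (singly charged b/y)
    ms2_tolerance=0.02,  # in Da
    processes=1,
)
training_data.to_feather("massivekb_selected_500.feather")
```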


def download_models(
@@ -485,7 +491,7 @@ def _add_xgboost_predictions(self, results: List[ProcessingResult]) -> List[Proc
multiprocessing.
"""

if not "xgboost_model_files" in MODELS[self.model].keys():
if "xgboost_model_files" not in MODELS[self.model].keys():
raise ValueError("XGBoost model files not found in MODELS dictionary.")

logger.debug("Converting feature vectors to XGBoost DMatrix...")
@@ -514,45 +520,6 @@ def _add_xgboost_predictions(self, results: List[ProcessingResult]) -> List[Proc

return results
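
The corrected membership check guards the XGBoost prediction path. As a standalone sketch of the DMatrix conversion mentioned in the log message (the feature shape and model file name are hypothetical, not the actual MS²PIP model files):

```python
import numpy as np
import xgboost as xgb

# Hypothetical feature matrix; in MS²PIP these vectors come from the Cython
# feature-extraction step, with one booster per ion type.
feature_vectors = np.random.rand(10, 5).astype(np.float32)
dmatrix = xgb.DMatrix(feature_vectors)

booster = xgb.Booster()
booster.load_model("hcd_b_ions.xgboost")  # placeholder file name
predictions = booster.predict(dmatrix)
```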

# TODO IMPLEMENT
def write_vector_file(self, results: List[ProcessingResult]):
raise NotImplementedError
all_results = []
for r in results:
psmids, df, dtargets = r.get()

# dtargets is a dict, containing targets for every ion type (keys are int)
for i, t in dtargets.items():
df["targets_{}".format(MODELS[self.model]["ion_types"][i])] = np.concatenate(
t, axis=None
)
df["psmid"] = psmids

all_results.append(df)

# Only concat DataFrames with content (we get empty ones if more CPUs than peptides)
all_results = pd.concat([df for df in all_results if len(df) != 0])

logger.info("Writing vector file %s...", self.vector_file)
# TODO Consider writing to DMatrix XGBoost binary file instead.
# write result. write format depends on extension:
ext = self.vector_file.split(".")[-1]
if ext == "pkl":
all_results.to_pickle(self.vector_file + ".pkl")
elif ext == "csv":
try:
all_results.to_csv(self.vector_file, lineterminator="\n")
except TypeError: # Pandas < 1.5 (Required for Python 3.7 support)
all_results.to_csv(self.vector_file, line_terminator="\n")
else:
raise ValueError("Unknown vector file extension: {}".format(ext))
# Avoid PyTables dependency
# else:
# "table" is a tag used to read back the .h5
# all_results.to_hdf(self.vector_file, "table")

return all_results

# TODO IMPLEMENT
def write_predictions(
self, all_preds: pd.DataFrame, peptides: pd.DataFrame, output_filename: str
@@ -706,7 +673,7 @@ def _process_spectra(
# Restructure PeptideRecord entries as spec_id -> [(id, psm_1), (id, psm_2), ...]
psms_by_specid = defaultdict(list)
for psm_index, psm in enumerated_psm_list:
psms_by_specid[psm.spectrum_id].append((psm_index, psm))
psms_by_specid[str(psm.spectrum_id)].append((psm_index, psm))

# Track progress for only one worker (good approximation of all workers' progress)
for spectrum in read_spectrum_file(spec_file):
@@ -782,3 +749,33 @@ def _process_spectra(
results.append(result)

return results
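
The change above casts `psm.spectrum_id` to `str` before grouping, so PSMs whose spectrum IDs were parsed as integers still match the string titles read from the spectrum file. A tiny illustration of the grouping step with made-up IDs:

```python
from collections import defaultdict

# Hypothetical (psm_index, spectrum_id) pairs where the IDs were parsed as ints.
enumerated_psms = [(0, 1001), (1, 1001), (2, 1002)]

psms_by_specid = defaultdict(list)
for psm_index, spectrum_id in enumerated_psms:
    psms_by_specid[str(spectrum_id)].append(psm_index)

# Lookup with the string spectrum title from the file now succeeds.
assert psms_by_specid["1001"] == [0, 1]
```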


def _assemble_training_data(results: List[ProcessingResult], model: str) -> pd.DataFrame:
"""Assemble training data from results list to single pandas DataFrame."""
# Get ion types
ion_types = [it.lower() for it in MODELS[model]["ion_types"]]

# Assemble feature vectors, PSM indices, and targets
training_data = pd.DataFrame(
np.vstack([r.feature_vectors for r in results if r.feature_vectors is not None]),
columns=get_feature_names(),
)
training_data["psm_index"] = np.concatenate(
[
np.repeat(r.psm_index, r.feature_vectors.shape[0])
for r in results
if r.feature_vectors is not None
]
)
for ion_type in ion_types:
training_data[f"target_{ion_type}"] = np.concatenate(
[r.observed_intensity[ion_type] for r in results if r.feature_vectors is not None]
)

# Reorder columns
training_data = training_data[
["psm_index"] + get_feature_names() + [f"target_{it}" for it in ion_types]
]

return training_data
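
For the default HCD model, `_assemble_training_data` thus returns one row per fragmentation position with the column order sketched below; `FEATURE_NAMES` stands in for the list returned by `get_feature_names()`:

```python
# Sketch of the resulting column layout for model="HCD" (ion types b and y).
FEATURE_NAMES = ["feat_1", "feat_2"]  # hypothetical placeholders

ion_types = ["b", "y"]
columns = ["psm_index"] + FEATURE_NAMES + [f"target_{it}" for it in ion_types]
print(columns)
# ['psm_index', 'feat_1', 'feat_2', 'target_b', 'target_y']
```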
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -35,6 +35,7 @@ requires-python = ">=3.8"
dependencies = [
"numpy>=1.16,<2",
"pandas>=1,<2",
"pyarrow",
"pyteomics>=3.5,<5",
"tomlkit>=0.5,<1",
"sqlalchemy>=1.3,<2",
@@ -90,8 +91,9 @@ include = ["ms2pip*"]
line-length = 99
target-version = ['py38']

[tool.isort]
profile = "black"
[tool.ruff]
line-length = 99
target-version = 'py38'

[tool.cibuildwheel]
build = "cp3*-manylinux_x86_64 cp3*-win_amd64 cp3*-macosx_x86_64"
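
The new `pyarrow` dependency is what pandas uses for the Feather I/O introduced above; without it, `to_feather`/`read_feather` raise an ImportError. A quick check with a throwaway file:

```python
import pandas as pd

df = pd.DataFrame({"psm_index": [0, 1], "target_b": [0.1, 0.2], "target_y": [0.3, 0.4]})
df.to_feather("example.feather")  # requires pyarrow
print(pd.read_feather("example.feather"))
```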
15 changes: 15 additions & 0 deletions tests/test_core.py
@@ -0,0 +1,15 @@
import pandas as pd

from ms2pip.core import get_training_data


def _test_get_training_data():
expected_df = pd.read_feather("tests/test_data/massivekb_selected_500.feather")
output_df = get_training_data(
"tests/test_data/massivekb_selected_500.peprec",
"tests/test_data/massivekb_selected_500.mgf",
model="HCD",
ms2_tolerance=0.02,
processes=1
)
pd.testing.assert_frame_equal(expected_df, output_df)
Binary file not shown.
Binary file not shown.
108 changes: 0 additions & 108 deletions tests/test_features.py

This file was deleted.
