From 8e99eabbf5b1828a45655a55add77b25ed398960 Mon Sep 17 00:00:00 2001
From: Ruslan Forostianov
Date: Fri, 1 Nov 2024 16:30:48 +0100
Subject: [PATCH] Add script to filter mutations in MAF file

This script uses the same filters that are used during the load
(except the filter by gene, which requires a connection to the database)
---
 .../portal/scripts/FilterMutationData.java    | 184 ++++++++++++++++++
 .../scripts/TestFilterMutationData.java       | 106 ++++++++++
 2 files changed, 290 insertions(+)
 create mode 100644 src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java
 create mode 100644 src/test/java/org/mskcc/cbio/portal/scripts/TestFilterMutationData.java

diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java
new file mode 100644
index 0000000..6934bf6
--- /dev/null
+++ b/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2015 - 2022 Memorial Sloan Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package org.mskcc.cbio.portal.scripts;
+
+import joptsimple.OptionException;
+import joptsimple.OptionParser;
+import joptsimple.OptionSet;
+import joptsimple.OptionSpec;
+import org.mskcc.cbio.maf.MafRecord;
+import org.mskcc.cbio.maf.MafUtil;
+import org.mskcc.cbio.portal.util.*;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Reads MAF records, keeps the records accepted by {@link MutationFilter} and writes them back to the
+ * file. The script backs up the original file under {filename with extension}_backup.
+ *
+ * @author Ruslan Forostianov
+ */
+public class FilterMutationData extends ConsoleRunnable {
+
+    /**
+     * Instantiates a ConsoleRunnable to run with the given command line args.
+     *
+     * @param args the command line arguments to be used
+     * @see #run()
+     */
+    public FilterMutationData(String[] args) {
+        super(args);
+    }
+
+    /**
+     * Parses the {@code --data} and optional {@code --meta} arguments, writes the accepted records to a
+     * temporary {@code _filtered} file and then swaps that file in place of the original MAF file.
+     *
+     * @throws UsageException if the arguments cannot be parsed or {@code --data} is missing
+     * @throws RuntimeException if the meta file cannot be read, filtering fails or a rename fails
+     */
+    @Override
+    public void run() {
+        String description = "Filter MAF file for records of interest and rewrites it with selected mutations.";
+        OptionParser parser = new OptionParser();
+        OptionSpec<String> data = parser.accepts( "data",
+            "MAF data file" ).withRequiredArg().describedAs( "data_mutations.txt" ).ofType( String.class );
+        OptionSpec<String> meta = parser.accepts( "meta",
+            "meta (description) file" ).withOptionalArg().describedAs( "meta_mutations.txt" ).ofType( String.class );
+
+        String dataPath;
+        String metaPath;
+        try {
+            OptionSet options = parser.parse( args );
+            dataPath = options.valueOf(data);
+            metaPath = options.valueOf(meta);
+        } catch (OptionException e) {
+            throw new UsageException(
+                this.getClass().getName(), description, parser,
+                e.getMessage());
+        }
+        // Raised outside any catch-all block so the caller gets a UsageException, not a wrapped NPE.
+        if (dataPath == null) {
+            throw new UsageException(
+                this.getClass().getName(), description, parser,
+                "'data' argument required.");
+        }
+
+        Set<String> namespaces = null;
+        Set<String> filteredMutations = null;
+        // valueOf() yields null both when --meta is absent and when it is given without a value.
+        if (metaPath != null) {
+            File descriptorFile = new File(metaPath);
+            try {
+                filteredMutations = GeneticProfileReader.getVariantClassificationFilter(descriptorFile);
+                namespaces = GeneticProfileReader.getNamespaces(descriptorFile);
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        ProgressMonitor.setCurrentMessage("Start filtering mutation records in the MAF file ...");
+        File originalMutationFile = new File(dataPath);
+        File resultMutationFile = new File(originalMutationFile.getAbsolutePath() + "_filtered");
+        final MutationFilter mutationFilter = new MutationFilter();
+        try {
+            writeAcceptedRecords(originalMutationFile, resultMutationFile, mutationFilter,
+                namespaces, filteredMutations);
+        } catch (IOException e) {
+            // The original file is still untouched here: drop the partial output (best effort) and stop,
+            // rather than replacing the input with a truncated copy.
+            resultMutationFile.delete();
+            throw new RuntimeException("Failed to filter MAF file ("
+                + originalMutationFile.getAbsolutePath() + ").", e);
+        }
+        replaceOriginalWithResult(originalMutationFile, resultMutationFile);
+        ProgressMonitor.setCurrentMessage(mutationFilter.getStatistics());
+    }
+
+    /**
+     * Copies {@code source} to {@code target} line by line, leaving out the records rejected by the filter.
+     * Lines that TsvUtil does not report as data lines are copied unchanged, and so is the first data line,
+     * which is only used to set up the MafUtil parser (presumably the MAF column header).
+     */
+    private static void writeAcceptedRecords(File source, File target, MutationFilter mutationFilter,
+                                             Set<String> namespaces, Set<String> filteredMutations)
+            throws IOException {
+        try (
+            BufferedReader reader = new BufferedReader(new FileReader(source));
+            BufferedWriter writer = new BufferedWriter(new FileWriter(target))
+        ) {
+            String line;
+            MafUtil mafUtil = null;
+            while ((line = reader.readLine()) != null) {
+                ProgressMonitor.incrementCurValue();
+                ConsoleUtil.showProgress();
+
+                if (TsvUtil.isDataLine(line)) {
+                    if (mafUtil == null) {
+                        // First data line: builds the parser and is always written through.
+                        mafUtil = new MafUtil(line, namespaces);
+                    } else {
+                        MafRecord record = mafUtil.parseRecord(line);
+                        if (!mutationFilter.acceptMutation(record, filteredMutations)) {
+                            continue;
+                        }
+                    }
+                }
+                writer.write(line);
+                writer.write(System.lineSeparator());
+            }
+        }
+    }
+
+    /**
+     * Renames {@code original} to its {@code _backup} name and then renames {@code result} to {@code original}.
+     *
+     * @throws RuntimeException if either rename fails
+     */
+    private static void replaceOriginalWithResult(File original, File result) {
+        File backupMutationFile = new File(original.getAbsolutePath() + "_backup");
+        if (!original.renameTo(backupMutationFile)) {
+            throw new RuntimeException("Failed to rename MAF file ("
+                + original.getAbsolutePath() + ") for backup.");
+        }
+        ProgressMonitor.setCurrentMessage("The original file is backed up to:"
+            + backupMutationFile.getAbsolutePath());
+        if (!result.renameTo(original)) {
+            throw new RuntimeException("Failed to rename the filtered MAF file ("
+                + result.getAbsolutePath() + ") to the input MAF file ("
+                + original.getAbsolutePath() + ").");
+        }
+        ProgressMonitor.setCurrentMessage("The MAF file has been overwritten with filtered records.");
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/TestFilterMutationData.java b/src/test/java/org/mskcc/cbio/portal/scripts/TestFilterMutationData.java
new file mode 100644
index 0000000..6a4b961
--- /dev/null
+++ b/src/test/java/org/mskcc/cbio/portal/scripts/TestFilterMutationData.java
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package org.mskcc.cbio.portal.scripts;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.BeforeEach;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.util.Comparator;
+import java.util.List;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * JUnit tests for FilterMutationData step
+ */
+public class TestFilterMutationData {
+
+    public static final String SRC_MAF_DATA_FILE_PATH = "src/test/resources/data_mutations_extended.txt";
+    private Path tempDir;
+
+    /** Copies the MAF and meta fixtures into a fresh temporary directory so the test can rewrite them. */
+    @BeforeEach
+    public void setUp() throws IOException {
+        // Create a temporary directory for each test
+        tempDir = Files.createTempDirectory("tempTestDir");
+
+        // Copy files to the temporary directory
+        Path dataFile = Paths.get(SRC_MAF_DATA_FILE_PATH);
+        Path copiedDataFile = tempDir.resolve(dataFile.getFileName());
+        Files.copy(dataFile, copiedDataFile, StandardCopyOption.REPLACE_EXISTING);
+
+        Path metaFile = Paths.get("src/test/resources/meta_mutations_extended.txt");
+        Path copiedMetaFile = tempDir.resolve(metaFile.getFileName());
+        Files.copy(metaFile, copiedMetaFile, StandardCopyOption.REPLACE_EXISTING);
+    }
+
+    /** Deletes the temporary directory together with everything the test left in it. */
+    @AfterEach
+    public void tearDown() throws IOException {
+        // Files.walk holds open directory handles until the stream is closed.
+        try (Stream<Path> paths = Files.walk(tempDir)) {
+            // Reverse order visits children before their parent directory.
+            paths.sorted(Comparator.reverseOrder())
+                .map(Path::toFile)
+                .forEach(File::delete);
+        }
+    }
+
+    /** The MAF file is replaced by a non-empty subset of its lines and the original is kept as a backup. */
+    @Test
+    public void testFilterMutationData() throws IOException {
+        String mafFile = tempDir + "/data_mutations_extended.txt";
+        String[] args = {
+            "--data", mafFile,
+            "--meta", tempDir + "/meta_mutations_extended.txt"
+        };
+        FilterMutationData runner = new FilterMutationData(args);
+        runner.run();
+
+        List<String> filteredDataFileLines = Files.readAllLines(Paths.get(mafFile));
+        List<String> backedUpDataFileLines = Files.readAllLines(Paths.get(mafFile + "_backup"));
+        List<String> originalDataFileLines = Files.readAllLines(Paths.get(SRC_MAF_DATA_FILE_PATH));
+        assertEquals(originalDataFileLines, backedUpDataFileLines);
+        assertFalse(filteredDataFileLines.isEmpty());
+        assertTrue(originalDataFileLines.size() > filteredDataFileLines.size());
+        assertTrue(originalDataFileLines.containsAll(filteredDataFileLines));
+    }
+}