Skip to content

Commit

Permalink
Fixed bug related to picking top peaks. Changed methyl replacement al…
Browse files Browse the repository at this point in the history
…gorithm. Fixed shuffling bug and changed algorithm.
  • Loading branch information
jsonbrooks committed Oct 17, 2019
1 parent b472752 commit 4ae9565
Show file tree
Hide file tree
Showing 150 changed files with 1,523,152 additions and 417,745 deletions.
3 changes: 3 additions & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ dependencies {
exclude("org.apache.logging.log4j")
}
compile("org.apache.commons", "commons-math3", "3.6.1")
compile("org.eclipse.collections", "eclipse-collections-api", "10.0.0")
compile("org.eclipse.collections", "eclipse-collections", "10.0.0")

implementation("com.squareup.moshi", "moshi-kotlin", "1.8.0")
testImplementation("org.junit.jupiter", "junit-jupiter", "5.4.0")
testCompile("org.assertj", "assertj-core", "3.11.1")
Expand Down
123 changes: 76 additions & 47 deletions src/main/kotlin/App.kt
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,34 @@ fun main(args: Array<String>) = Cli().main(args)
class Cli : CliktCommand() {

private val peaks by option("--peaks", help = "path to peaks in narrowPeak format")
.path(exists = true).required()
.path(exists = true).required()
private val twoBit by option("--twobit", help = "path to two-bit file for this assembly")
.path(exists = true).required()
.path(exists = true).required()
private val chromInfo by option("--chrom-info", help = "path to chromosome lengths for this assembly")
.path(exists = true).required()
.path(exists = true).required()
private val offset by option("--offset", help = "offset, in bp, to shift peaks")
.int().default(0)
.int().default(0)
private val outputDir by option("--output-dir", help = "path to write output")
.path().required()
.path().required()
private val chrFilter by option("--chrom-filter",
help = "chromosomes to filter out before running MEME.").multiple()
private val methylBed by option("--methyl-bed", help = "path to optional methylation state @CpG bed file")
private val shuffleOutputsPerInput by option("--shuffle-outputs-per-input",
help = "Number of shuffled sequences to fetch per input sequence")
.int().default(100)
private val shuffleGCTolerance by option("--shuffle-gc-tolerance",
help = "Acceptable distance from input gc content for fetching output sequences during shuffle as percentage (as integer 0-100)")
.int().default(10)
private val methylBeds by option("--methyl-beds", help = "path to optional methylation state @CpG bed file")
.path(exists = true)
.multiple()
private val methylPercentThreshold by option("--methyl-percent-threshold",
help = "the percentage over which we will use a methylation site from the methylation bed file.")
.int()
.int().default(0)

override fun run() {
val cmdRunner = DefaultCmdRunner()
cmdRunner.runTask(peaks, twoBit, chromInfo, offset, outputDir, chrFilter.toSet(), methylBed, methylPercentThreshold)
cmdRunner.runTask(peaks, twoBit, chromInfo, offset, outputDir, chrFilter.toSet(), shuffleOutputsPerInput,
shuffleGCTolerance, methylBeds, methylPercentThreshold)
}
}

Expand All @@ -44,11 +52,12 @@ class Cli : CliktCommand() {
* @param offset amount to shift peaks when creating summits
* @param outputDir directory to put output files
* @param chrFilter set of chromosomes to filter out before running
* @param methylBed methylated state @CpG file used for runs that create motifs with methyl base pairs
* @param methylBeds methylated state @CpG files used for runs that create motifs with methyl base pairs
* @param methylPercentThreshold the percentage over which we will use a methylation site
*/
fun CmdRunner.runTask(peaks: Path, twoBit: Path, chromInfo: Path, offset: Int, outputDir: Path,
chrFilter: Set<String>? = null, methylBed: Path? = null, methylPercentThreshold: Int? = null) {
chrFilter: Set<String>? = null, shuffleOutputsPerInput: Int, shuffleGCTolerance: Int,
methylBeds: List<Path> = listOf(), methylPercentThreshold: Int = 0) {
log.info {
"""
Running Meme task for
Expand All @@ -58,103 +67,123 @@ fun CmdRunner.runTask(peaks: Path, twoBit: Path, chromInfo: Path, offset: Int, o
offset: $offset
outputDir: $outputDir
chromFilter: $chrFilter
methylBed: $methylBed
methylBed: $methylBeds
methylPercentThreshold: $methylPercentThreshold
""".trimIndent()
}
val outPrefix = peaks.fileName.toString().split(".").first()
val chromSizes = parseChromSizes(chromInfo)

val methylData = if (methylBeds.isNotEmpty()) parseMethylBeds(methylBeds, methylPercentThreshold) else null

// Rewrite peak names, apply the chromosome filter, and drop peaks without methylated states (if methyl beds are given).
// The name rewrite is necessary because the given peaks input may not include peak names.
log.info { "Creating cleaned peaks file..." }
val cleanedPeaks = outputDir.resolve("$outPrefix$CLEANED_BED_SUFFIX")
cleanPeaks(peaks, chrFilter, methylBed, methylPercentThreshold, cleanedPeaks)
cleanPeaks(peaks, chrFilter, methylData, cleanedPeaks)
log.info { "Cleaned peaks file complete!" }

runMemeSteps(outPrefix, cleanedPeaks, twoBit, chromSizes, offset, outputDir, chrFilter,
methylBed, methylPercentThreshold)
runMemeSteps(outPrefix, cleanedPeaks, twoBit, chromSizes, offset, outputDir, chrFilter, methylData)

val summitsFile = outputDir.resolve("$outPrefix$SUMMITS_FILE_SUFFIX")
val memeOutDir = outputDir.resolve("$outPrefix$MEME_DIR_SUFFIX")
val top500CenterSeqsFile = outputDir.resolve("$outPrefix$TOP500_SEQS_CENTER_SUFFIX")
runQualitySteps(outPrefix, summitsFile, memeOutDir, top500CenterSeqsFile, twoBit, outputDir, chromSizes,
methylBed, methylPercentThreshold)

runOccurrencesSteps(outPrefix, cleanedPeaks, twoBit, outputDir, methylBed, methylPercentThreshold)
runPostMemeSteps(outPrefix, summitsFile, memeOutDir, cleanedPeaks, top500CenterSeqsFile, twoBit,
outputDir, chromSizes, shuffleOutputsPerInput, shuffleGCTolerance, methylData)
}

/**
* Run Meme pre-processing and Meme steps
*/
fun CmdRunner.runMemeSteps(outPrefix: String, cleanedPeaks: Path, twoBit: Path, chromSizes: Map<String, Int>,
offset: Int, outputDir: Path, chrFilter: Set<String>? = null,
methylBed: Path? = null, methylPercentThreshold: Int? = null) {
offset: Int, outputDir: Path, chrFilter: Set<String>? = null, methylData: MethylData? = null) {
// Create summits file
log.info { "Creating peak summits file..." }
val summitsFile = outputDir.resolve("$outPrefix$SUMMITS_FILE_SUFFIX")
summits(cleanedPeaks, chromSizes, 150, summitsFile, offset, chrFilter)
log.info { "Peak summits File creation complete!" }

// Run MEME on top 500 peaks
log.info { "Creating fasta file from top 500 summits..." }
val top500SeqsFile = outputDir.resolve("$outPrefix$TOP500_SEQS_SUFFIX")
val top500CenterSeqsFile = outputDir.resolve("$outPrefix$TOP500_SEQS_CENTER_SUFFIX")
peaksToFasta(summitsFile, twoBit, top500SeqsFile, methylBed, methylPercentThreshold,
0 until 500)
peaksToFasta(summitsFile, twoBit, top500SeqsFile, methylData, 0 until 500)
log.info { "Top 500 fasta file creation complete!" }

log.info { "Centering top 500 sequences..." }
fastaCenter(top500SeqsFile, 100, top500CenterSeqsFile)
log.info { "Centering top 500 sequences complete!" }

log.info { "Running meme on top 500 centered peaks..." }
val memeOutDir = outputDir.resolve("$outPrefix$MEME_DIR_SUFFIX")
val useMotifAlphabet = methylBed != null
val useMotifAlphabet = methylData != null
meme(top500CenterSeqsFile, memeOutDir, useMotifAlphabet)
log.info { "Top 500 centered peaks meme run complete!" }
}

// Length of sequences to use for peak centers, flanks, and shuffled regions
const val SEQUENCE_LENGTH = 100

/**
* Run post-Meme quality related steps
*/
fun CmdRunner.runQualitySteps(outPrefix: String, summitsFile: Path, memeDir: Path, top500CenterSeqsFile: Path,
twoBit: Path, outputDir: Path, chromSizes: Map<String, Int>,
methylBed: Path? = null, methylPercentThreshold: Int? = null) {
fun CmdRunner.runPostMemeSteps(outPrefix: String, summitsFile: Path, memeDir: Path, cleanedPeaks: Path,
top500CenterSeqsFile: Path, twoBit: Path, outputDir: Path, chromSizes: Map<String, Int>,
shuffleOutputsPerInput: Int, shuffleGCTolerance: Int, methylData: MethylData? = null) {
// Run FIMO against peaks 501-1000 center and flanks
log.info { "Generating 501-1000 peaks centers and flanks..." }
val next500SeqsFile = outputDir.resolve("$outPrefix$NEXT500_SEQS_SUFFIX")
val next500CenterSeqsFile = outputDir.resolve("$outPrefix$NEXT500_SEQS_CENTER_SUFFIX")
val next500FlankSeqsFile = outputDir.resolve("$outPrefix$NEXT500_SEQS_FLANK_SUFFIX")
peaksToFasta(summitsFile, twoBit, next500SeqsFile, methylBed, methylPercentThreshold,
500 until 1000)
fastaCenter(next500SeqsFile, 100, next500CenterSeqsFile, next500FlankSeqsFile)
peaksToFasta(summitsFile, twoBit, next500SeqsFile, methylData, 500 until 1000)
fastaCenter(next500SeqsFile, SEQUENCE_LENGTH, next500CenterSeqsFile, next500FlankSeqsFile)
log.info { "501-1000 peaks centers and flanks generation complete!" }

log.info { "Running FIMO on 501-1000 peaks centers..." }
val memeTxtFile = memeDir.resolve(MEME_TXT_FILENAME)
val next500CenterFimoDir = outputDir.resolve(CENTER_FIMO_DIR_SUFFIX)
val next500CenterFimoDir = outputDir.resolve("$outPrefix$CENTER_FIMO_DIR_SUFFIX")
fimo(memeTxtFile, next500CenterSeqsFile, next500CenterFimoDir)
log.info { "FIMO run on 501-1000 peaks centers complete!" }

log.info { "Running FIMO on 501-1000 peaks flanks..." }
val next500FlankFimoDir = outputDir.resolve(FLANK_FIMO_DIR_SUFFIX)
fimo(memeTxtFile, next500FlankSeqsFile, next500FlankFimoDir)
log.info { "FIMO run on 501-1000 peaks flanks complete!" }

// Run FIMO against 100x random sequences from reference genome (with matching length and gc content)
val randomSeqFile = outputDir.resolve("$outPrefix$SHUFFLED_SEQS_SUFFIX")
val randomFimoDir = outputDir.resolve(SHUFFLED_FIMO_DIR_SUFFIX)
randomSequences(twoBit, top500CenterSeqsFile, randomSeqFile, 100, chromSizes, 0.05,
methylBed, methylPercentThreshold)
log.info { "Generating Shuffled sequences..." }
randomSequences(twoBit, top500CenterSeqsFile, randomSeqFile, shuffleOutputsPerInput, chromSizes, SEQUENCE_LENGTH,
shuffleGCTolerance, methylData)
log.info { "Shuffled sequence generation complete!" }
log.info { "Running FIMO on shuffled sequences..." }
fimo(memeTxtFile, randomSeqFile, randomFimoDir)
log.info { "FIMO run on shuffled sequences complete!" }

// Run Motif Quality step
val memeXmlFile = memeDir.resolve(MEME_XML_FILENAME)
val outJsonFile = outputDir.resolve("$outPrefix$MOTIFS_JSON_SUFFIX")
motifQuality(memeXmlFile, next500CenterFimoDir, randomFimoDir, next500FlankFimoDir, outJsonFile)
}

/**
* Run occurrences file creation related steps
*/
fun CmdRunner.runOccurrencesSteps(outPrefix: String, cleanedPeaks: Path, twoBit: Path, outputDir: Path,
methylBed: Path? = null, methylPercentThreshold: Int? = null) {
// Create fasta file containing sequences for original input peaks file
log.info { "Creating fasta from original cleaned peaks..." }
val originalPeaksFastaFile = outputDir.resolve("$outPrefix$SEQS_SUFFIX")
peaksToFasta(cleanedPeaks, twoBit, originalPeaksFastaFile, methylBed, methylPercentThreshold, null)
peaksToFasta(cleanedPeaks, twoBit, originalPeaksFastaFile, methylData, null)
log.info { "Fasta from original cleaned peaks complete!" }

// Run FIMO against original peaks sequences
val memeOutDir = outputDir.resolve("$outPrefix$MEME_DIR_SUFFIX")
val memeTxtFile = memeOutDir.resolve(MEME_TXT_FILENAME)
log.info { "Running FIMO on original peaks fasta..." }
val originalPeaksFimoDir = outputDir.resolve("$outPrefix$FIMO_SUFFIX")
fimo(memeTxtFile, originalPeaksFastaFile, originalPeaksFimoDir)
log.info { "FIMO run on original peaks fasta complete!" }

// Convert FIMO Occurrences to custom Occurrences TSV with absolute positioned ranges
log.info { "Creation occurrences.tsv..." }
val originalPeaksFimoTsv = originalPeaksFimoDir.resolve(FIMO_TSV_FILENAME)
val occurrencesTsv = outputDir.resolve("$outPrefix$OCCURRENCES_SUFFIX")
occurrencesTsv(originalPeaksFimoTsv, cleanedPeaks, occurrencesTsv)
}
log.info { "occurrences.tsv creation complete!" }

// Create motifs json
log.info { "Creating motifs.json file..." }
val memeXmlFile = memeDir.resolve(MEME_XML_FILENAME)
val outJsonFile = outputDir.resolve("$outPrefix$MOTIFS_JSON_SUFFIX")
motifJson(memeXmlFile, originalPeaksFimoDir, next500CenterFimoDir, randomFimoDir, next500FlankFimoDir, outJsonFile)
log.info { "motifs.json file creation complete!" }
}
8 changes: 3 additions & 5 deletions src/main/kotlin/step/CleanPeaks.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,14 @@ import java.nio.file.Path
/**
* Rewrites narrowPeaks files with new names.
* Optionally filters out chromosomes that do not match given filter
* If a methylation state bed file is given, filters out peaks that don't intersect with any bases
* in a methyl bed file
* If methylation state data is given, filters out peaks that don't intersect with any of its bases
*
* @param peaksBed the peaks file
* @param chrFilter Optional set of chromosomes to filter against. Anything not included is filtered out.
* @param methylBed the methyl bed file
* @param methylData the parsed methylation state data, or null if none was given
* @param out the file to write the filtered peaks results to
*/
fun cleanPeaks(peaksBed: Path, chrFilter: Set<String>?, methylBed: Path?, methylPercentThreshold: Int?, out: Path) {
val methylData = if (methylBed != null) parseMethylBed(methylBed, methylPercentThreshold) else null
fun cleanPeaks(peaksBed: Path, chrFilter: Set<String>?, methylData: MethylData?, out: Path) {
val filteredPeaks = mutableListOf<PeaksRow>()
var peakCount = 0
readPeaksFile(peaksBed) { row ->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ data class OutputMotif(
@Json(name = "e_value") val eValue: Double,
// the number of occurrences found by MEME in the top 500
val sites: Int,
// Ratio of occurrences for this motif to total regions
@Json(name = "occurrences_ratio") val occurrencesRatio: Double,
// Number of original peaks
@Json(name = "original_peaks") val originalPeaks: Int,
// Number of occurrences in original peaks
@Json(name = "original_peaks_occurrences") val originalPeaksOccurrences: Int,
// Ratio of occurrences to sequences in top 501-1000 for this motif
@Json(name = "lesser_peaks_occurrences_ratio") val lesserPeaksOccurrencesRatio: Double,
// Control data from flank sequences
@Json(name = "flank_control_data") val flankControlData: MotifControlData,
// Control data from shuffled sequences
Expand All @@ -37,44 +41,56 @@ data class MotifControlData(
* Runs our final step, which puts together the MEME output quality scores we calculate based on our FIMO results.
*
* @param memeXml path to results of MEME run. This contains our motifs for top 500 peaks, centered.
* @param peaksFimoDir path to results of FIMO run against 501-1000 peaks centers.
* @param origPeaksFimoDir path to results of FIMO run against original (cleaned) peaks.
* @param next500FimoDir path to results of FIMO run against 501-1000 peaks centers.
* @param shuffledFimoDir path to results of FIMO run against shuffled sequences pulled randomly from reference genome.
* @param flankFimoDir path to results of FIMO run against 501-1000 peaks flanks.
* @param outJson path for output json file.
*/
fun motifQuality(memeXml: Path, peaksFimoDir: Path, shuffledFimoDir: Path, flankFimoDir: Path, outJson: Path) {
fun motifJson(memeXml: Path, origPeaksFimoDir: Path, next500FimoDir: Path, shuffledFimoDir: Path,
flankFimoDir: Path, outJson: Path) {
val memeMotifs = parseMotifs(memeXml)
val memeMotifNames = memeMotifs.map { it.name }
val peaksOccurrenceRatios = motifOccurrencesRatios(peaksFimoDir, memeMotifNames)

val origPeaksFimoXml = origPeaksFimoDir.resolve(FIMO_XML_FILENAME)
val origPeaks = parseNumSequences(origPeaksFimoXml)

val origPeaksFimoTsv = origPeaksFimoDir.resolve(FIMO_TSV_FILENAME)
val origPeaksOccurrences = motifOccurrencesCounts(origPeaksFimoTsv)

val lesserPeaksOccurrenceRatios = motifOccurrencesRatios(next500FimoDir, memeMotifNames)
val flankOccurrenceRatios = motifOccurrencesRatios(flankFimoDir, memeMotifNames)
val shuffledOccurrenceRatios = motifOccurrencesRatios(shuffledFimoDir, memeMotifNames)

val outputMotifs = mutableListOf<OutputMotif>()
for (memeMotif in memeMotifs) {
val motifName = memeMotif.name
val peaksOccurrenceRatioData = peaksOccurrenceRatios.getValue(motifName)
val lesserPeaksOccurrenceRatioData = lesserPeaksOccurrenceRatios.getValue(motifName)

val flankOccurrenceRatioData = flankOccurrenceRatios.getValue(motifName)
val flankZScore = compareOccurrenceProportions(peaksOccurrenceRatioData, flankOccurrenceRatioData)
val flankZScore = compareOccurrenceProportions(lesserPeaksOccurrenceRatioData, flankOccurrenceRatioData)
val flankPValue = zScoreToPValue(flankZScore)

val shuffledOccurrenceRatio = shuffledOccurrenceRatios.getValue(motifName)
val shuffledZScore = compareOccurrenceProportions(peaksOccurrenceRatioData, shuffledOccurrenceRatio)
val shuffledZScore = compareOccurrenceProportions(lesserPeaksOccurrenceRatioData, shuffledOccurrenceRatio)
val shuffledPValue = zScoreToPValue(shuffledZScore)

outputMotifs += OutputMotif(
name = motifName,
pwm = memeMotif.pwm,
eValue = memeMotif.eValue,
sites = memeMotif.sites,
occurrencesRatio = peaksOccurrenceRatios.getValue(motifName).ratio,
originalPeaks = origPeaks,
originalPeaksOccurrences = origPeaksOccurrences.getValue(motifName),
lesserPeaksOccurrencesRatio = lesserPeaksOccurrenceRatios.getValue(motifName).ratio,
flankControlData = MotifControlData(flankOccurrenceRatioData.ratio, flankZScore, flankPValue),
shuffledControlData = MotifControlData(shuffledOccurrenceRatio.ratio, shuffledZScore, shuffledPValue)
)
}
val moshi = Moshi.Builder().add(KotlinJsonAdapterFactory()).build()
val adapter = moshi.adapter<List<OutputMotif>>(
newParameterizedType(List::class.java, OutputMotif::class.java))
.indent(" ")
val outputJsonText = adapter.toJson(outputMotifs)
Files.createDirectories(outJson.parent)
Files.newBufferedWriter(outJson, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
Expand Down
Loading

0 comments on commit 4ae9565

Please sign in to comment.