From b3f7dd8a554d1eeb65d9cdd7071e18efc6ab280a Mon Sep 17 00:00:00 2001 From: jgranja24 Date: Fri, 10 Jan 2020 22:17:04 -0800 Subject: [PATCH] update --- .DS_Store | Bin 14340 -> 14340 bytes DESCRIPTION | 6 +- NAMESPACE | 37 +- R/AllClasses.R | 206 +++- R/ArchRBrowser.R | 42 +- R/ArchRProjectMethods.R | 897 ++++++++------- R/ArrowRead.R | 16 +- R/Clustering.R | 46 +- R/CoAccessibility.R | 35 +- R/ColorPalettes.R | 19 +- R/ComputeEmbedding.R | 50 +- R/CreateArrow.R | 37 +- R/DoubletsScores.R | 219 ++-- R/FilterCells.R | 47 +- R/Footprinting.R | 158 +-- R/GRangesUtils.R | 217 ++-- R/GgplotHelper.R | 198 ++-- R/GroupCoverages.R | 51 +- R/HelperUtils.R | 47 +- R/Imputation.R | 50 +- R/LatentSemanticIndexing.R | 53 +- R/MarkerFeatures.R | 254 +++-- R/MarkerHeatmap.R | 578 ---------- R/MatrixCNV.R | 31 +- R/MatrixDeviations.R | 113 +- R/MatrixFeatures.R | 46 +- R/MatrixGeneScores.R | 371 +----- R/MatrixTiles.R | 33 +- R/PeakAnnotation.R | 338 ++++++ R/ReproduciblePeakSet.R | 73 +- R/Trajectory.R | 119 +- R/VisualizeData.R | 90 +- README.md | 44 +- _pkgdown.yml | 67 +- docs/404.html | 27 +- docs/LICENSE-text.html | 27 +- docs/articles/ArchR.html | 264 ----- docs/articles/Articles/annotations.html | 156 +++ docs/articles/Articles/clusterComputing.html | 154 +++ docs/articles/Articles/cnvIdentification.html | 152 +++ docs/articles/Articles/dimReduction.html | 33 +- docs/articles/Articles/doubletRemoval.html | 36 +- docs/articles/{ => Articles}/faq.html | 46 +- docs/articles/Articles/footprinting.html | 38 +- docs/articles/Articles/geneScores.html | 45 +- docs/articles/Articles/inputFiles.html | 171 +++ docs/articles/Articles/markerFeatures.html | 176 +++ docs/articles/Articles/peakCalling.html | 47 +- .../Articles/pseudobulkGeneration.html | 225 ++++ docs/articles/Articles/qcFilters.html | 36 +- docs/articles/Articles/signalTracks.html | 27 +- docs/articles/Articles/trajectories.html | 152 +++ docs/articles/Articles/tutorial.html | 348 ++++++ docs/articles/index.html | 40 +- 
docs/authors.html | 33 +- docs/favicon-16x16.png | Bin 0 -> 1143 bytes docs/favicon-32x32.png | Bin 0 -> 1910 bytes docs/favicon.ico | Bin 0 -> 15086 bytes docs/images/ArchRProject_Schematic.png | Bin 0 -> 208482 bytes docs/images/ArchR_FunctionSchematic.png | Bin 0 -> 87932 bytes docs/images/Frags_vs_TSS.png | Bin 0 -> 228431 bytes docs/images/GeneActivityScore_Schematic.png | Bin 0 -> 66289 bytes docs/images/TSS_vs_FRiP.png | Bin 0 -> 82509 bytes docs/images/background_postNorm.png | Bin 0 -> 72690 bytes docs/images/background_preNorm.png | Bin 0 -> 28907 bytes .../Articles => images}/doubletRemoval.png | Bin docs/images/footprintingMethods.png | Bin 0 -> 34597 bytes docs/images/footprintingSchematic.png | Bin 0 -> 78529 bytes docs/images/iterativeLSI.png | Bin 0 -> 115655 bytes docs/images/markerFeature_schematic.png | Bin 0 -> 75541 bytes docs/images/peakCalling_ClusteredOverlap.png | Bin 0 -> 62950 bytes docs/images/peakCalling_Comparison.png | Bin 0 -> 20764 bytes docs/images/peakCalling_IterativeOverlap.png | Bin 0 -> 115594 bytes docs/images/peakCalling_RawOverlap.png | Bin 0 -> 59161 bytes .../pseudobulkReplicate_DecisionTree.png | Bin 0 -> 365734 bytes docs/index.html | 34 +- docs/logo.svg | 24 + docs/pkgdown.yml | 11 +- ...ArchR_palettes.html => ArchRPalettes.html} | 44 +- .../{ggLine.html => ArchRProject.html} | 104 +- docs/reference/ArchRRegionTrack.html | 90 +- docs/reference/FilterCells.html | 72 +- docs/reference/addArchRThreads.html | 207 ++++ docs/reference/addBgdPeaks.html | 233 ++++ docs/reference/addCNVMatrix.html | 246 ++++ docs/reference/addCellColData.html | 48 +- .../{IterativeLSI.html => addClusters.html} | 140 +-- docs/reference/addCoAccessibility.html | 254 +++++ docs/reference/addDemuxletResults.html | 42 +- docs/reference/addDeviationsMatrix.html | 65 +- docs/reference/addDoubletScores.html | 91 +- docs/reference/addEmbedding.html | 254 +++++ docs/reference/addFeatureMatrix.html | 56 +- docs/reference/addGeneScoreMatrix.html | 74 +- 
docs/reference/addGroupCoverages.html | 81 +- ...teEmbedding.html => addImputeWeights.html} | 128 +-- docs/reference/addIterativeLSI.html | 315 +++++ docs/reference/addMotifAnnotations.html | 54 +- docs/reference/addPeakMatrix.html | 42 +- docs/reference/addPeakSet.html | 44 +- docs/reference/addProjectSummary.html | 219 ++++ docs/reference/addReproduciblePeakSet.html | 100 +- docs/reference/addSampleColData.html | 56 +- docs/reference/addSeqLengths.html | 45 +- docs/reference/addSeqLengthsGR.html | 211 ++++ docs/reference/addTileMatrix.html | 55 +- docs/reference/addTrajectory.html | 252 ++++ docs/reference/alignCellsToTrajectory.html | 190 ---- docs/reference/columnOverlaps.html | 53 +- docs/reference/constructGR.html | 219 ++++ docs/reference/constructGRanges.html | 49 +- docs/reference/createArrowFiles.html | 104 +- docs/reference/createGeneAnnnotation.html | 228 ++++ docs/reference/createGenomeAnnotation.html | 220 ++++ docs/reference/dot-fileExtension.html | 189 --- docs/reference/extendGR.html | 215 ++++ docs/reference/extendGRanges.html | 53 +- docs/reference/figures/ArchR_Workflow.png | Bin 0 -> 292222 bytes docs/reference/figures/ArchR_dartLogo.jpg | Bin 76285 -> 0 bytes .../figures/apple-touch-icon-120x120.png | Bin 0 -> 6713 bytes .../figures/apple-touch-icon-152x152.png | Bin 0 -> 8642 bytes .../figures/apple-touch-icon-180x180.png | Bin 0 -> 10249 bytes .../figures/apple-touch-icon-60x60.png | Bin 0 -> 3441 bytes .../figures/apple-touch-icon-76x76.png | Bin 0 -> 4280 bytes docs/reference/figures/apple-touch-icon.png | Bin 0 -> 10249 bytes docs/reference/figures/favicon-16x16.png | Bin 0 -> 1143 bytes docs/reference/figures/favicon-32x32.png | Bin 0 -> 1910 bytes docs/reference/figures/favicon.ico | Bin 0 -> 15086 bytes docs/reference/filterChrGR.html | 224 ++++ ...ntifyClusters.html => filterDoublets.html} | 80 +- docs/reference/filterPlot.html | 222 ++++ ...uteCoAccessibility.html => findMacs2.html} | 44 +- .../{computeKNN.html => 
getArchRThreads.html} | 43 +- docs/reference/getArrowFiles.html | 40 +- docs/reference/getBlacklist.html | 40 +- docs/reference/getCellColData.html | 44 +- docs/reference/getCellNames.html | 40 +- docs/reference/getChromLengths.html | 32 +- docs/reference/getChromSizes.html | 32 +- docs/reference/getEmbedding.html | 48 +- docs/reference/getExons.html | 44 +- ...{VisualizeGroups.html => getFeatures.html} | 94 +- docs/reference/getFragmentsFromArrow.html | 54 +- docs/reference/getGeneAnnotation.html | 40 +- docs/reference/getGenes.html | 42 +- docs/reference/getGenome.html | 40 +- docs/reference/getGenomeAnnotation.html | 40 +- ...tAnnotation.html => getImputeWeights.html} | 48 +- .../{projectLSI.html => getInputFiles.html} | 63 +- docs/reference/getMatches.html | 44 +- docs/reference/getMatrixFromArrow.html | 63 +- docs/reference/getOutputDirectory.html | 40 +- ...computeLSI.html => getPeakAnnotation.html} | 67 +- docs/reference/getPeakSet.html | 40 +- docs/reference/getPositions.html | 44 +- docs/reference/getProjectSummary.html | 215 ++++ docs/reference/getReducedDims.html | 58 +- docs/reference/getSampleColData.html | 44 +- docs/reference/getSampleNames.html | 40 +- docs/reference/getTSS.html | 40 +- docs/reference/getTrajectory.html | 257 +++++ docs/reference/getTutorialData.html | 211 ++++ docs/reference/getValidBarcodes.html | 215 ++++ docs/reference/getVarDeviations.html | 220 ++++ docs/reference/ggAlignPlots.html | 56 +- docs/reference/ggHex.html | 84 +- docs/reference/ggOneToOne.html | 93 +- docs/reference/ggPoint.html | 123 +- docs/reference/ggViolin.html | 86 +- ...checkPath.html => grapes-bcin-grapes.html} | 54 +- ...nrichment.html => grapes-bcni-grapes.html} | 54 +- docs/reference/grapes-ni-grapes.html | 38 +- docs/reference/index.html | 1009 ++++++++++++++--- docs/reference/keepFilteredChromosomes.html | 49 +- docs/reference/loadArchRProject.html | 215 ++++ ...otFootprint.html => markerAnnoEnrich.html} | 101 +- docs/reference/markerFeatures.html | 90 
+- docs/reference/markerGR.html | 215 ++++ docs/reference/markerHeatmap.html | 71 +- docs/reference/markerPlot.html | 225 ++++ docs/reference/markerRanges.html | 218 ++++ docs/reference/mergeGR.html | 211 ++++ docs/reference/mergeGRanges.html | 45 +- .../{availableFeatures.html => nCells.html} | 61 +- docs/reference/nOverlapGR.html | 215 ++++ docs/reference/nonOverlappingGR.html | 220 ++++ docs/reference/nonOverlappingGRanges.html | 57 +- docs/reference/overlappingBP.html | 47 +- docs/reference/overlapsMany.html | 49 +- docs/reference/overlapsManyGR.html | 220 ++++ docs/reference/paletteContinuous.html | 39 +- docs/reference/paletteDiscrete.html | 52 +- docs/reference/plotEmbedding.html | 286 +++++ docs/reference/plotFootprints.html | 293 +++++ ...isualizeEmbedding.html => plotGroups.html} | 119 +- docs/reference/plotPDF.html | 67 +- docs/reference/plotTrajectory.html | 292 +++++ docs/reference/saveArchRProject.html | 211 ++++ docs/reference/shuffleGR.html | 219 ++++ docs/reference/shuffleGRanges.html | 49 +- docs/reference/subsetSeqnames.html | 51 +- ...t-ArchRLogo.html => subsetSeqnamesGR.html} | 50 +- docs/reference/theme_ArchR.html | 76 +- docs/reference/trajectoryHeatmap.html | 246 ++++ images/ArchRProject_Schematic.png | Bin 0 -> 208482 bytes images/ArchR_FunctionSchematic.png | Bin 0 -> 87932 bytes images/Frags_vs_TSS.png | Bin 0 -> 228431 bytes images/GeneActivityScore_Schematic.png | Bin 0 -> 66289 bytes images/TSS_vs_FRiP.png | Bin 0 -> 82509 bytes images/background_postNorm.png | Bin 0 -> 72690 bytes images/background_preNorm.png | Bin 0 -> 28907 bytes .../Articles => images}/doubletRemoval.png | Bin images/footprintingMethods.png | Bin 0 -> 34597 bytes images/footprintingSchematic.png | Bin 0 -> 78529 bytes images/iterativeLSI.png | Bin 0 -> 115655 bytes .../iterativePeakCalling.png | Bin images/markerFeature_schematic.png | Bin 0 -> 75541 bytes images/peakCalling_ClusteredOverlap.png | Bin 0 -> 62950 bytes images/peakCalling_Comparison.png | Bin 0 -> 
20764 bytes images/peakCalling_IterativeOverlap.png | Bin 0 -> 115594 bytes images/peakCalling_RawOverlap.png | Bin 0 -> 59161 bytes images/pseudobulkReplicate_DecisionTree.png | Bin 0 -> 365734 bytes images/tutorial_1_UMAP-Clusters.pdf | Bin 0 -> 1697013 bytes images/tutorial_2_tracks.pdf | Bin 0 -> 427791 bytes images/tutorial_3_MarkerGeneScores.pdf | Bin 0 -> 2067776 bytes images/tutorial_4_MarkerGeneHeatmap.pdf | Bin 0 -> 232115 bytes index.md | 48 + logo.svg | 24 + man/ArchRPalettes.Rd | 4 +- man/ArchRProject.Rd | 26 + man/ArchRRegionTrack.Rd | 30 +- man/addArchRThreads.Rd | 14 + man/addBgdPeaks.Rd | 28 + man/addCNVMatrix.Rd | 25 +- man/addCellColData.Rd | 10 +- man/addClusters.Rd | 32 +- man/addCoAccessibility.Rd | 30 +- man/addDemuxletResults.Rd | 8 +- man/addDeviationsMatrix.Rd | 25 +- man/addDoubletScores.Rd | 22 +- man/addEmbedding.Rd | 48 +- man/addFeatureMatrix.Rd | 21 +- man/addGeneScoreMatrix.Rd | 36 +- man/addGroupCoverages.Rd | 46 +- man/addImputeWeights.Rd | 25 +- man/addIterativeLSI.Rd | 51 +- man/addMotifAnnotations.Rd | 16 +- man/addPeakMatrix.Rd | 14 +- man/addPeakSet.Rd | 6 +- man/addProjectSummary.Rd | 6 +- man/addReproduciblePeakSet.Rd | 52 +- man/addSampleColData.Rd | 10 +- man/addSeqLengths.Rd | 16 - man/addSeqLengthsGR.Rd | 16 + man/addTileMatrix.Rd | 24 +- man/addTrajectory.Rd | 30 +- man/columnOverlaps.Rd | 21 - man/computeKNN.Rd | 12 - man/constructGR.Rd | 20 + man/constructGRanges.Rd | 20 - man/createArrowFiles.Rd | 34 +- man/createGeneAnnnotation.Rd | 25 + man/createGenomeAnnotation.Rd | 21 + man/dot-ArchRLogo.Rd | 14 - man/dot-checkPath.Rd | 18 - man/dot-fileExtension.Rd | 14 - man/dot-tempfile.Rd | 19 - man/extendGR.Rd | 18 + man/extendGRanges.Rd | 18 - man/figures/ArchR_Workflow.png | Bin 0 -> 292222 bytes man/figures/ArchR_dartLogo.jpg | Bin 76285 -> 0 bytes man/filterCells.Rd | 12 +- man/filterChrGR.Rd | 23 + man/filterDoublets.Rd | 12 +- man/filterPlot.Rd | 13 +- man/findMacs2.Rd | 11 + man/getAnnotation.Rd | 18 - 
man/getArchRThreads.Rd | 11 + man/getArrowFiles.Rd | 2 +- man/getBlacklist.Rd | 2 +- man/getCellColData.Rd | 6 +- man/getCellNames.Rd | 2 +- man/getChromLengths.Rd | 2 +- man/getChromSizes.Rd | 2 +- man/getEmbedding.Rd | 10 +- man/getExons.Rd | 6 +- man/getFeatures.Rd | 10 +- man/getFragmentsFromArrow.Rd | 6 +- man/getGeneAnnotation.Rd | 4 +- man/getGenes.Rd | 6 +- man/getGenome.Rd | 2 +- man/getGenomeAnnotation.Rd | 4 +- man/getImputeWeights.Rd | 6 +- man/getInputFiles.Rd | 4 +- man/getMatches.Rd | 12 +- man/getMatrixFromArrow.Rd | 10 +- man/getOutputDirectory.Rd | 2 +- man/getPeakAnnotation.Rd | 18 + man/getPeakSet.Rd | 2 +- man/getPositions.Rd | 12 +- man/getProjectSummary.Rd | 4 +- man/getReducedDims.Rd | 12 +- man/getSampleColData.Rd | 8 +- man/getSampleNames.Rd | 2 +- man/getTSS.Rd | 2 +- man/getTrajectory.Rd | 22 +- man/getTutorialData.Rd | 4 +- man/getValidBarcodes.Rd | 4 +- man/getVarDeviations.Rd | 21 + man/ggAlignPlots.Rd | 16 +- man/ggHex.Rd | 36 +- man/ggOneToOne.Rd | 20 +- man/ggPoint.Rd | 38 +- man/ggViolin.Rd | 30 +- man/grapes-bcin-grapes.Rd | 16 + man/grapes-bcni-grapes.Rd | 16 + man/grapes-ni-grapes.Rd | 8 +- man/keepFilteredChromosomes.Rd | 21 - man/loadArchRProject.Rd | 18 + man/markerAnnoEnrich.Rd | 28 + man/markerFeatures.Rd | 52 +- man/markerGR.Rd | 18 + man/markerHeatmap.Rd | 75 +- man/markerPlot.Rd | 24 + man/mergeGR.Rd | 16 + man/mergeGRanges.Rd | 16 - man/nCells.Rd | 2 +- man/nOverlapGR.Rd | 18 + man/nonOverlappingGR.Rd | 21 + man/nonOverlappingGRanges.Rd | 21 - man/overlappingBP.Rd | 18 - man/overlapsMany.Rd | 20 - man/overlapsManyGR.Rd | 21 + man/paletteContinuous.Rd | 4 +- man/paletteDiscrete.Rd | 8 +- man/plotEmbedding.Rd | 39 +- man/plotFootprints.Rd | 42 +- man/plotGroups.Rd | 21 +- man/plotPDF.Rd | 8 +- man/plotTrajectory.Rd | 46 +- man/saveArchRProject.Rd | 16 + man/shuffleGR.Rd | 20 + man/shuffleGRanges.Rd | 20 - man/subsetSeqnames.Rd | 16 - man/subsetSeqnamesGR.Rd | 16 + man/theme_ArchR.Rd | 30 +- man/trajectoryHeatmap.Rd | 20 
+- pkgdown/favicon/apple-touch-icon-120x120.png | Bin 0 -> 6713 bytes pkgdown/favicon/apple-touch-icon-152x152.png | Bin 0 -> 8642 bytes pkgdown/favicon/apple-touch-icon-180x180.png | Bin 0 -> 10249 bytes pkgdown/favicon/apple-touch-icon-60x60.png | Bin 0 -> 3441 bytes pkgdown/favicon/apple-touch-icon-76x76.png | Bin 0 -> 4280 bytes pkgdown/favicon/apple-touch-icon.png | Bin 0 -> 10249 bytes pkgdown/favicon/favicon-16x16.png | Bin 0 -> 1143 bytes pkgdown/favicon/favicon-32x32.png | Bin 0 -> 1910 bytes pkgdown/favicon/favicon.ico | Bin 0 -> 15086 bytes vignettes/ArchR.Rmd | 143 --- vignettes/Articles/annotations.Rmd | 11 +- vignettes/Articles/clusterComputing.Rmd | 9 + vignettes/Articles/cnvIdentification.Rmd | 5 + vignettes/Articles/dimReduction.Rmd | 16 +- vignettes/Articles/doubletRemoval.Rmd | 25 +- vignettes/Articles/faq.Rmd | 9 + vignettes/Articles/footprinting.Rmd | 22 +- vignettes/Articles/geneScores.Rmd | 25 +- vignettes/Articles/inputFiles.Rmd | 27 + vignettes/Articles/iterativePeakCalling.png | Bin 125505 -> 0 bytes vignettes/Articles/markerFeatures.Rmd | 35 + vignettes/Articles/peakCalling.Rmd | 28 +- vignettes/Articles/pseudobulkGeneration.Rmd | 87 ++ vignettes/Articles/qcFilters.Rmd | 18 +- vignettes/Articles/signalTracks.Rmd | 4 +- vignettes/Articles/trajectories.Rmd | 5 + vignettes/Articles/tutorial.Rmd | 227 ++++ vignettes/faq.Rmd | 6 - 375 files changed, 17717 insertions(+), 6106 deletions(-) delete mode 100644 R/MarkerHeatmap.R create mode 100644 R/PeakAnnotation.R delete mode 100644 docs/articles/ArchR.html create mode 100644 docs/articles/Articles/annotations.html create mode 100644 docs/articles/Articles/clusterComputing.html create mode 100644 docs/articles/Articles/cnvIdentification.html rename docs/articles/{ => Articles}/faq.html (68%) create mode 100644 docs/articles/Articles/inputFiles.html create mode 100644 docs/articles/Articles/markerFeatures.html create mode 100644 docs/articles/Articles/pseudobulkGeneration.html create mode 100644 
docs/articles/Articles/trajectories.html create mode 100644 docs/articles/Articles/tutorial.html create mode 100644 docs/favicon-16x16.png create mode 100644 docs/favicon-32x32.png create mode 100644 docs/favicon.ico create mode 100644 docs/images/ArchRProject_Schematic.png create mode 100644 docs/images/ArchR_FunctionSchematic.png create mode 100644 docs/images/Frags_vs_TSS.png create mode 100644 docs/images/GeneActivityScore_Schematic.png create mode 100644 docs/images/TSS_vs_FRiP.png create mode 100644 docs/images/background_postNorm.png create mode 100644 docs/images/background_preNorm.png rename docs/{articles/Articles => images}/doubletRemoval.png (100%) create mode 100644 docs/images/footprintingMethods.png create mode 100644 docs/images/footprintingSchematic.png create mode 100644 docs/images/iterativeLSI.png create mode 100644 docs/images/markerFeature_schematic.png create mode 100644 docs/images/peakCalling_ClusteredOverlap.png create mode 100644 docs/images/peakCalling_Comparison.png create mode 100644 docs/images/peakCalling_IterativeOverlap.png create mode 100644 docs/images/peakCalling_RawOverlap.png create mode 100644 docs/images/pseudobulkReplicate_DecisionTree.png create mode 100644 docs/logo.svg rename docs/reference/{ArchR_palettes.html => ArchRPalettes.html} (73%) rename docs/reference/{ggLine.html => ArchRProject.html} (64%) create mode 100644 docs/reference/addArchRThreads.html create mode 100644 docs/reference/addBgdPeaks.html create mode 100644 docs/reference/addCNVMatrix.html rename docs/reference/{IterativeLSI.html => addClusters.html} (51%) create mode 100644 docs/reference/addCoAccessibility.html create mode 100644 docs/reference/addEmbedding.html rename docs/reference/{ComputeEmbedding.html => addImputeWeights.html} (62%) create mode 100644 docs/reference/addIterativeLSI.html create mode 100644 docs/reference/addProjectSummary.html create mode 100644 docs/reference/addSeqLengthsGR.html create mode 100644 
docs/reference/addTrajectory.html delete mode 100644 docs/reference/alignCellsToTrajectory.html create mode 100644 docs/reference/constructGR.html create mode 100644 docs/reference/createGeneAnnnotation.html create mode 100644 docs/reference/createGenomeAnnotation.html delete mode 100644 docs/reference/dot-fileExtension.html create mode 100644 docs/reference/extendGR.html create mode 100644 docs/reference/figures/ArchR_Workflow.png delete mode 100644 docs/reference/figures/ArchR_dartLogo.jpg create mode 100644 docs/reference/figures/apple-touch-icon-120x120.png create mode 100644 docs/reference/figures/apple-touch-icon-152x152.png create mode 100644 docs/reference/figures/apple-touch-icon-180x180.png create mode 100644 docs/reference/figures/apple-touch-icon-60x60.png create mode 100644 docs/reference/figures/apple-touch-icon-76x76.png create mode 100644 docs/reference/figures/apple-touch-icon.png create mode 100644 docs/reference/figures/favicon-16x16.png create mode 100644 docs/reference/figures/favicon-32x32.png create mode 100644 docs/reference/figures/favicon.ico create mode 100644 docs/reference/filterChrGR.html rename docs/reference/{IdentifyClusters.html => filterDoublets.html} (65%) create mode 100644 docs/reference/filterPlot.html rename docs/reference/{computeCoAccessibility.html => findMacs2.html} (73%) rename docs/reference/{computeKNN.html => getArchRThreads.html} (73%) rename docs/reference/{VisualizeGroups.html => getFeatures.html} (65%) rename docs/reference/{getAnnotation.html => getImputeWeights.html} (74%) rename docs/reference/{projectLSI.html => getInputFiles.html} (73%) rename docs/reference/{computeLSI.html => getPeakAnnotation.html} (72%) create mode 100644 docs/reference/getProjectSummary.html create mode 100644 docs/reference/getTrajectory.html create mode 100644 docs/reference/getTutorialData.html create mode 100644 docs/reference/getValidBarcodes.html create mode 100644 docs/reference/getVarDeviations.html rename 
docs/reference/{dot-checkPath.html => grapes-bcin-grapes.html} (72%) rename docs/reference/{featureEnrichment.html => grapes-bcni-grapes.html} (73%) create mode 100644 docs/reference/loadArchRProject.html rename docs/reference/{plotFootprint.html => markerAnnoEnrich.html} (62%) create mode 100644 docs/reference/markerGR.html create mode 100644 docs/reference/markerPlot.html create mode 100644 docs/reference/markerRanges.html create mode 100644 docs/reference/mergeGR.html rename docs/reference/{availableFeatures.html => nCells.html} (74%) create mode 100644 docs/reference/nOverlapGR.html create mode 100644 docs/reference/nonOverlappingGR.html create mode 100644 docs/reference/overlapsManyGR.html create mode 100644 docs/reference/plotEmbedding.html create mode 100644 docs/reference/plotFootprints.html rename docs/reference/{VisualizeEmbedding.html => plotGroups.html} (58%) create mode 100644 docs/reference/plotTrajectory.html create mode 100644 docs/reference/saveArchRProject.html create mode 100644 docs/reference/shuffleGR.html rename docs/reference/{dot-ArchRLogo.html => subsetSeqnamesGR.html} (71%) create mode 100644 docs/reference/trajectoryHeatmap.html create mode 100644 images/ArchRProject_Schematic.png create mode 100644 images/ArchR_FunctionSchematic.png create mode 100644 images/Frags_vs_TSS.png create mode 100644 images/GeneActivityScore_Schematic.png create mode 100644 images/TSS_vs_FRiP.png create mode 100644 images/background_postNorm.png create mode 100644 images/background_preNorm.png rename {vignettes/Articles => images}/doubletRemoval.png (100%) create mode 100644 images/footprintingMethods.png create mode 100644 images/footprintingSchematic.png create mode 100644 images/iterativeLSI.png rename {docs/articles/Articles => images}/iterativePeakCalling.png (100%) create mode 100644 images/markerFeature_schematic.png create mode 100644 images/peakCalling_ClusteredOverlap.png create mode 100644 images/peakCalling_Comparison.png create mode 100644 
images/peakCalling_IterativeOverlap.png create mode 100644 images/peakCalling_RawOverlap.png create mode 100644 images/pseudobulkReplicate_DecisionTree.png create mode 100644 images/tutorial_1_UMAP-Clusters.pdf create mode 100644 images/tutorial_2_tracks.pdf create mode 100644 images/tutorial_3_MarkerGeneScores.pdf create mode 100644 images/tutorial_4_MarkerGeneHeatmap.pdf create mode 100644 index.md create mode 100644 logo.svg create mode 100644 man/ArchRProject.Rd create mode 100644 man/addArchRThreads.Rd create mode 100644 man/addBgdPeaks.Rd delete mode 100644 man/addSeqLengths.Rd create mode 100644 man/addSeqLengthsGR.Rd delete mode 100644 man/columnOverlaps.Rd delete mode 100644 man/computeKNN.Rd create mode 100644 man/constructGR.Rd delete mode 100644 man/constructGRanges.Rd create mode 100644 man/createGeneAnnnotation.Rd create mode 100644 man/createGenomeAnnotation.Rd delete mode 100644 man/dot-ArchRLogo.Rd delete mode 100644 man/dot-checkPath.Rd delete mode 100644 man/dot-fileExtension.Rd delete mode 100644 man/dot-tempfile.Rd create mode 100644 man/extendGR.Rd delete mode 100644 man/extendGRanges.Rd create mode 100644 man/figures/ArchR_Workflow.png delete mode 100644 man/figures/ArchR_dartLogo.jpg create mode 100644 man/filterChrGR.Rd create mode 100644 man/findMacs2.Rd delete mode 100644 man/getAnnotation.Rd create mode 100644 man/getArchRThreads.Rd create mode 100644 man/getPeakAnnotation.Rd create mode 100644 man/getVarDeviations.Rd create mode 100644 man/grapes-bcin-grapes.Rd create mode 100644 man/grapes-bcni-grapes.Rd delete mode 100644 man/keepFilteredChromosomes.Rd create mode 100644 man/loadArchRProject.Rd create mode 100644 man/markerAnnoEnrich.Rd create mode 100644 man/markerGR.Rd create mode 100644 man/markerPlot.Rd create mode 100644 man/mergeGR.Rd delete mode 100644 man/mergeGRanges.Rd create mode 100644 man/nOverlapGR.Rd create mode 100644 man/nonOverlappingGR.Rd delete mode 100644 man/nonOverlappingGRanges.Rd delete mode 100644 
man/overlappingBP.Rd delete mode 100644 man/overlapsMany.Rd create mode 100644 man/overlapsManyGR.Rd create mode 100644 man/saveArchRProject.Rd create mode 100644 man/shuffleGR.Rd delete mode 100644 man/shuffleGRanges.Rd delete mode 100644 man/subsetSeqnames.Rd create mode 100644 man/subsetSeqnamesGR.Rd create mode 100644 pkgdown/favicon/apple-touch-icon-120x120.png create mode 100644 pkgdown/favicon/apple-touch-icon-152x152.png create mode 100644 pkgdown/favicon/apple-touch-icon-180x180.png create mode 100644 pkgdown/favicon/apple-touch-icon-60x60.png create mode 100644 pkgdown/favicon/apple-touch-icon-76x76.png create mode 100644 pkgdown/favicon/apple-touch-icon.png create mode 100644 pkgdown/favicon/favicon-16x16.png create mode 100644 pkgdown/favicon/favicon-32x32.png create mode 100644 pkgdown/favicon/favicon.ico delete mode 100644 vignettes/ArchR.Rmd create mode 100644 vignettes/Articles/clusterComputing.Rmd create mode 100644 vignettes/Articles/cnvIdentification.Rmd create mode 100644 vignettes/Articles/faq.Rmd create mode 100644 vignettes/Articles/inputFiles.Rmd delete mode 100644 vignettes/Articles/iterativePeakCalling.png create mode 100644 vignettes/Articles/markerFeatures.Rmd create mode 100644 vignettes/Articles/pseudobulkGeneration.Rmd create mode 100644 vignettes/Articles/trajectories.Rmd create mode 100644 vignettes/Articles/tutorial.Rmd delete mode 100644 vignettes/faq.Rmd diff --git a/.DS_Store b/.DS_Store index 6b58703617372fd68dc3e58fda974eb6b5bbac49..97c180ac117e5b4f5136633ed0e70b79bd84d4ef 100644 GIT binary patch delta 160 zcmZoEXeroW&BEg4?V&a~PWc7&w$23>d&b*+B!wfY1yfd5O8HlmD|$ zXS}g_I-94OfRT||t&T#qp`nq1j)Iw~@#GIWvCKd-CkN=>WIR7vOYgoS0n-E+f#%6h VcGs_Be$V)2auvJ$=EwSq902<7Cp!QD delta 123 zcmZoEXeroW&BEg0<*hb3P4=Rndb&LqG%`9G`dWOw~4=GTm0 OCRee`Z+@(=$N>Nbi6SWg diff --git a/DESCRIPTION b/DESCRIPTION index 302748e6..c68eecbd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,9 +3,11 @@ Type: Package Date: 2019-10-17 Title: Analyzing regulatory chromatin 
in R Version: 0.1.4 -Author: Jeffrey Granja [aut, cre], Ryan Corces [aut] -Maintainer: Jeffrey Granja +Authors@R: c( + person("Jeffrey", "Granja", email = "jgranja.stanford@gmail.com", role = c("aut","cre")), + person("Ryan", "Corces", role = "aut")) Description: This package is designed to streamline scATAC analyses in R. +Roxygen: list(markdown = TRUE) License: GPL (>= 2) LinkingTo: Rcpp LazyData: TRUE diff --git a/NAMESPACE b/NAMESPACE index 90dd0f94..8e5a7b7b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(.batchlapply) export(.centerRollMean) export(.checkPath) export(.cleanParams) +export(.computeKNN) export(.computeROC) export(.confusionMatrix) export(.fileExtension) @@ -61,7 +62,8 @@ export(.validTxDb) export(ArchRPalettes) export(ArchRProject) export(ArchRRegionTrack) -export(addBdgPeaks) +export(addArchRThreads) +export(addBgdPeaks) export(addCNVMatrix) export(addCellColData) export(addClusters) @@ -81,21 +83,22 @@ export(addPeakSet) export(addProjectSummary) export(addReproduciblePeakSet) export(addSampleColData) -export(addSeqLengths) +export(addSeqLengthsGR) export(addTileMatrix) export(addTrajectory) -export(columnOverlaps) -export(computeKNN) -export(constructGRanges) +export(constructGR) export(createArrowFiles) -export(extendGRanges) +export(createGeneAnnnotation) +export(createGenomeAnnotation) +export(extendGR) export(filterCells) +export(filterChrGR) export(filterDoublets) export(filterPlot) export(findMacs2) -export(getAnnotation) +export(getArchRThreads) export(getArrowFiles) -export(getBdgPeaks) +export(getBgdPeaks) export(getBlacklist) export(getCellColData) export(getCellNames) @@ -114,6 +117,7 @@ export(getInputFiles) export(getMatches) export(getMatrixFromArrow) export(getOutputDirectory) +export(getPeakAnnotation) export(getPeakSet) export(getPositions) export(getProjectSummary) @@ -130,17 +134,17 @@ export(ggHex) export(ggOneToOne) export(ggPoint) export(ggViolin) -export(keepFilteredChromosomes) 
+export(loadArchRProject) export(markerAnnoEnrich) export(markerFeatures) +export(markerGR) export(markerHeatmap) export(markerPlot) -export(markerRanges) -export(mergeGRanges) +export(mergeGR) export(nCells) -export(nonOverlappingGRanges) -export(overlappingBP) -export(overlapsMany) +export(nOverlapGR) +export(nonOverlappingGR) +export(overlapsManyGR) export(paletteContinuous) export(paletteDiscrete) export(plotEmbedding) @@ -148,8 +152,9 @@ export(plotFootprints) export(plotGroups) export(plotPDF) export(plotTrajectory) -export(shuffleGRanges) -export(subsetSeqnames) +export(saveArchRProject) +export(shuffleGR) +export(subsetSeqnamesGR) export(theme_ArchR) export(trajectoryHeatmap) importFrom(Rcpp,sourceCpp) diff --git a/R/AllClasses.R b/R/AllClasses.R index 8f270a61..a504f66d 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -4,7 +4,6 @@ NULL setClassUnion("characterOrNull", c("character", "NULL")) setClassUnion("GRangesOrNull", c("GRanges", "NULL")) -setClassUnion("matrixOrNull",members = c("dgCMatrix","NULL")) setClass("ArchRProject", representation( @@ -13,13 +12,13 @@ setClass("ArchRProject", sampleColData = "DataFrame", sampleMetadata = "SimpleList", cellColData = "DataFrame", - cellMetadata = "SimpleList", #Where clustering output will go to - reducedDims = "SimpleList", #Where clustering output will go to - embeddings = "SimpleList", #Where clustering output will go to + cellMetadata = "SimpleList", + reducedDims = "SimpleList", + embeddings = "SimpleList", peakSet = "GRangesOrNull", - annotations = "SimpleList", #MotifMatches ETC go here - geneAnnotation = "SimpleList", #genes exons TSS - genomeAnnotation = "SimpleList", #genome chromSizes BSgenome blacklist + peakAnnotation = "SimpleList", + geneAnnotation = "SimpleList", + genomeAnnotation = "SimpleList", imputeWeights = "SimpleList" ) ) @@ -44,15 +43,25 @@ setMethod("show", "ArchRProject", } ) +#' Create ArchRProject from ArrowFiles +#' +#' This function will create an ArchRProject with given 
ArrowFiles. +#' +#' @param ArrowFiles A character vector containing the names of ArrowFiles to be used. +#' @param outputDirectory A name for the relative path of the outputDirectory for ArchR results +#' @param copyArrows A boolean indicating whether ArrowFiles should be copied into outputDirectory +#' @param geneAnnotation The geneAnnotation (see createGeneAnnotation) is used for downstream analyses such as calculating TSS Enrichment Scores, Gene Scores, etc. +#' @param genomeAnnotation The genomeAnnotation (see createGenomeAnnotation) is used for downstream analyses for genome information such as nucleotide information or chromosome sizes. +#' @param showLogo A boolean indicating whether to show ArchR Logo after successful creation of an ArchRProject. #' @export ArchRProject <- function( - ArrowFiles=NULL, - sampleNames=NULL, - outputDirectory = "ArchR_Results", + ArrowFiles = NULL, + outputDirectory = "ArchR_Output", copyArrows = FALSE, geneAnnotation = NULL, genomeAnnotation = NULL, - showLogo = TRUE){ + showLogo = TRUE + ){ if(is.null(ArrowFiles)){ stop("Need to Provide Arrow Files!") @@ -62,10 +71,8 @@ ArchRProject <- function( message("Validating Arrows...") ArrowFiles <- unlist(lapply(ArrowFiles, .validArrow)) - if(is.null(sampleNames)){ - message("Getting SampleNames...") - sampleNames <- unlist(lapply(seq_along(ArrowFiles), function(x) .sampleName(ArrowFiles[x]))) - } + message("Getting SampleNames...") + sampleNames <- unlist(lapply(seq_along(ArrowFiles), function(x) .sampleName(ArrowFiles[x]))) if(any(duplicated(sampleNames))){ stop("Error cannot have duplicate sampleNames, please add sampleNames that will overwrite the current sample name in Arrow file!") @@ -74,7 +81,7 @@ ArchRProject <- function( if(length(sampleNames) != length(ArrowFiles)) stop("Samples is not equal to input ArrowFiles!") dir.create(outputDirectory,showWarnings=FALSE) - sampleDirectory <- file.path(normalizePath(outputDirectory),"InputArrows") + sampleDirectory <- 
file.path(normalizePath(outputDirectory), "ArrowFiles") dir.create(sampleDirectory,showWarnings=FALSE) if(copyArrows){ @@ -102,9 +109,12 @@ ArchRProject <- function( cellMetadata = SimpleList(), reducedDims = SimpleList(), embeddings = SimpleList(), - annotations = SimpleList(), + peakSet = NULL, + peakAnnotation = SimpleList(), geneAnnotation = geneAnnotation, - genomeAnnotation = genomeAnnotation) + genomeAnnotation = genomeAnnotation + ) + if(showLogo){ .ArchRLogo(ascii = "Logo") } @@ -115,3 +125,163 @@ ArchRProject <- function( } +#Validity +.validArchRProject <- function(ArchRProj, ...){ + if(!inherits(ArchRProj, "ArchRProject")){ + stop("Not a valid ArchRProject as input!") + }else{ + ArchRProj + } +} + +#' Save ArchRProject for Later Usage +#' +#' This function will organize arrows and project output into a directory and save the ArchRProject for later usage. +#' +#' @param ArchRProj An `ArchRProject` object. +#' @param copyArrows A boolean indicating whether the original ArrowFiles are kept after being copied into the ArrowFiles directory of the project (`TRUE`), or removed after copying (`FALSE`), prior to saving the ArchRProject. +#' @export +saveArchRProject <- function( + ArchRProj = NULL, + copyArrows = TRUE + ){ + + outputDir <- getOutputDirectory(ArchRProj) + + #Set Up Arrow Files + ArrowDir <- file.path(basename(outputDir), "ArrowFiles") + dir.create(ArrowDir, showWarnings = FALSE) + + ArrowFiles <- getArrowFiles(ArchRProj) + ArrowFilesNew <- file.path(ArrowDir, basename(ArrowFiles)) + names(ArrowFilesNew) <- names(ArrowFiles) + + for(i in seq_along(ArrowFiles)){ + cf <- file.copy(ArrowFiles[i], ArrowFilesNew[i]) + if(!copyArrows){ + file.remove(ArrowFiles[i]) + } + } + + ArchRProj@sampleColData$ArrowFiles <- ArrowFilesNew[rownames(ArchRProj@sampleColData)] + + saveRDS(ArchRProj, file.path(outputDir, "Save-ArchR-Project.rds")) + +} + +#' Load Previous ArchRProject into R +#' +#' This function will load a previously saved ArchRProject and re-normalize paths for usage. 
+#' +#' @param path A character path to an ArchRProject directory that was previously saved. +#' @param force A boolean indicating whether, when re-normalizing paths, a missing peakAnnotation or `bgdPeaks` file should be skipped with a message (`TRUE`) rather than raising an error (`FALSE`). +#' @param showLogo A boolean indicating whether to show the ArchR logo upon completion. +#' @export +loadArchRProject <- function( + path = "./", + force = FALSE, + showLogo = TRUE + ){ + + path2Proj <- file.path(path, "Save-ArchR-Project.rds") + + if(!file.exists(path2Proj)){ + stop("Could not find previously saved ArchRProject in the path specified!") + } + + ArchRProj <- readRDS(path2Proj) + + outputDir <- getOutputDirectory(ArchRProj) + outputDirNew <- normalizePath(path) + + #1. Arrows Paths + ArrowFilesNew <- file.path(outputDirNew, gsub(paste0(basename(outputDir),"/"),"",ArchRProj@sampleColData$ArrowFiles)) + if(!all(file.exists(ArrowFilesNew))){ + stop("ArrowFiles do not exist in saved ArchRProject!") + } + ArchRProj@sampleColData$ArrowFiles <- ArrowFilesNew + + #2. Annotations Paths + + if(length(ArchRProj@peakAnnotation) > 0){ + + keepAnno <- rep(TRUE, length(ArchRProj@peakAnnotation)) + + for(i in seq_along(ArchRProj@peakAnnotation)){ + #Positions + if(!is.null(ArchRProj@peakAnnotation[[i]]$Positions)){ + + PositionsNew <- gsub(outputDir, outputDirNew, ArchRProj@peakAnnotation[[i]]$Positions) + if(!all(file.exists(PositionsNew))){ + if(force){ + keepAnno[i] <- FALSE + message("Positions for peakAnnotation do not exist in saved ArchRProject!") + }else{ + stop("Positions for peakAnnotation do not exist in saved ArchRProject!") + } + } + ArchRProj@peakAnnotation[[i]]$Positions <- PositionsNew + + } + + #Matches + if(!is.null(ArchRProj@peakAnnotation[[i]]$Matches)){ + + MatchesNew <- gsub(outputDir, outputDirNew, ArchRProj@peakAnnotation[[i]]$Matches) + if(!all(file.exists(MatchesNew))){ + if(force){ + message("Matches for peakAnnotation do not exist in saved ArchRProject!") + keepAnno[i] <- FALSE + }else{ + stop("Matches for peakAnnotation do not exist in saved ArchRProject!") + }
+ } + ArchRProj@peakAnnotation[[i]]$Matches <- MatchesNew + + } + + } + + ArchRProj@peakAnnotation <- ArchRProj@peakAnnotation[keepAnno] + + } + + + #3. Background Peaks Paths + if(!is.null(getPeakSet(ArchRProj))){ + + if(!is.null(metadata(getPeakSet(ArchRProj))$bgdPeaks)){ + + bgdPeaksNew <- gsub(outputDir, outputDirNew, metadata(getPeakSet(ArchRProj))$bgdPeaks) + + if(!all(file.exists(bgdPeaksNew))){ + + if(force){ + message("BackgroundPeaks do not exist in saved ArchRProject!") + metadata(ArchRProj@peakSet)$bgdPeaks <- NULL + }else{ + stop("BackgroundPeaks do not exist in saved ArchRProject!") + } + + }else{ + + metadata(ArchRProj@peakSet)$bgdPeaks <- bgdPeaksNew + + } + + } + + } + + #4. Set Output Directory + + ArchRProj@projectMetadata$outputDirectory <- outputDirNew + + message("Successfully loaded ArchRProject!") + if(showLogo){ + .ArchRLogo(ascii = "Logo") + } + + ArchRProj + +} diff --git a/R/ArchRBrowser.R b/R/ArchRBrowser.R index ee19d509..2e2d7c0c 100644 --- a/R/ArchRBrowser.R +++ b/R/ArchRBrowser.R @@ -1,34 +1,40 @@ +#################################################################### +# Signal Track Plotting Methods +#################################################################### + #' Plot ArchR Region Track #' #' This function will plot the coverage at an input region in the style of a browser track. It allows for normalization of the signal which enables direct comparison across samples. #' -#' @param ArchRProj An ArchRProject object. -#' @param region A GRanges region that indicates the region to be plotted. If more than one region exists in the GRanges object, all will be plotted. If no region is supplied, then the "geneSymbol" argument can be used to center the plot window at the transcription start site of the gene passed to "geneSymbol". -#' @param useGroups A boolean value that indicates whether samples should be grouped together to produce average tracks. Only TRUE/FALSE are accepted. 
-#' @param groupBy A string that indicates how samples should be grouped. This string corresponds to one of the standard or user-supplied metadata columns (for example, "Clusters"). Samples with the same value annotated in this metadata column will be grouped together and the average signal will be used. -#' @param useCoverages QQQ use group coverages for track plotting +#' @param ArchRProj An `ArchRProject` object. +#' @param region A `GRanges` region that indicates the region to be plotted. If more than one region exists in the `GRanges` object, all will be plotted. If no region is supplied, then the `geneSymbol` argument can be used to center the plot window at the transcription start site of the supplied gene. +#' @param groupBy A string that indicates how cells should be grouped. This string corresponds to one of the standard or user-supplied `cellColData` metadata columns (for example, "Clusters"). Cells with the same value annotated in this metadata column will be grouped together and the average signal will be used. +#' @param useGroups A character vector that is used to select a subset of groups by name from the designated `groupBy` column in `cellColData`. This limits the groups to be plotted. +#' @param useCoverages A boolean indicating whether to use group coverages of pseudobulks (see createGroupCoverages) for track plotting instead of ArrowFiles #' @param plotSummary A character vector containing the features to be potted. Possible values include "bulkTrack" (the ATAC-seq signal), "featureTrack" (i.e. the peak bed regions), and "geneTrack" (line diagrams of genes with introns and exons shown. Blue-colored genes are on the minus strand and red-colored genes are on the plus strand). -#' @param sizes A numeric vector containing 3 values that indicate the sizes of the individual components passed in plotSummary. The order must be the same as plotSummary. 
-#' @param features A GRanges object containing the "features" to be plotted (This should be thought of as a bed track. i.e. the set of peaks obtained using getPeakSet(ArchRProj)). +#' @param sizes A numeric vector containing 3 values that indicate the sizes of the individual components passed in `plotSummary`. The order must be the same as `plotSummary`. +#' @param features A `GRanges` object containing the "features" to be plotted (This should be thought of as a bed track. i.e. the set of peaks obtained using `getPeakSet(ArchRProj)`). #' @param geneSymbol If "region" is not supplied, plotting can be centered at the transcription start site corresponding to the gene symbol(s) passed here. -#' @param upstream The number of basepairs upstream of the transcription start site of "geneSymbol" to extend the plotting window. If "region" is supplied, this argument is ignored. -#' @param downstream The number of basepairs downstream of the transcription start site of "geneSymbol" to extend the plotting window. If "region" is supplied, this argument is ignored. +#' @param upstream The number of basepairs upstream of the transcription start site of `geneSymbol` to extend the plotting window. If `region` is supplied, this argument is ignored. +#' @param downstream The number of basepairs downstream of the transcription start site of `geneSymbol` to extend the plotting window. If `region` is supplied, this argument is ignored. #' @param tileSize The numeric width of the tile/bin in basepairs for plotting ATAC-seq signal tracks. All insertions in a single bin will be summed. +#' @param minCells The minimum number of cells contained within a cell group to allow for this cell group to be plotted.
+#' @param normMethod The name of the column in `cellColData` object by which normalization should be performed. The recommended and default value is "ReadsInTSS" which simultaneously normalizes tracks based on sequencing depth and sample data quality. #' @param threads The number of threads to use for parallel execution. -#' @param ylim QQQ The numeric y-axis limit to be used for for bulkTrack plotting. If not provided, the y-axis limit will be QQQ. -#' @param baseSize QQQ The numeric font size to be used in the plot. This applies to all plot labels. +#' @param ylim The numeric quantile y-axis limit to be used for "bulkTrack" plotting. If not provided, the y-axis limit will be c(0, 0.999). +#' @param baseSize The numeric font size to be used in the plot. This applies to all plot labels. #' @param borderWidth The numeric line width to be used for plot borders. #' @param tickWidth The numeric line width to be used for axis tick marks. -#' @param geneAnno QQQ The geneAnnotation objection to be used for plotting the "geneTrack" object. This must be a TxDB object. -#' @param title verbose sections +#' @param facetbaseSize The numeric font size to be used in the facets (gray boxes) of the plot. +#' @param geneAnno The `geneAnnotation` object to be used for plotting the "geneTrack" object. See createGeneAnnotation() for more info. +#' @param title The title to add at the top of the plot next to the plot's genomic coordinates. #' @param ...
additional args #' @export ArchRRegionTrack <- function( ArchRProj = NULL, region = NULL, groupBy = "Clusters", - useGroups = NULL, + useGroups = NULL, useCoverages = FALSE, plotSummary = c("bulkTrack", "featureTrack", "geneTrack"), sizes = c(10, 2, 4), @@ -62,7 +68,7 @@ ArchRRegionTrack <- function( region <- region[which(tolower(mcols(region)$symbol) %in% tolower(geneSymbol))] region <- resize(region, 1, "start") strand(region) <- "*" - region <- extendGRanges(region, upstream = upstream, downstream = downstream) + region <- extendGR(region, upstream = upstream, downstream = downstream) } } region <- .validGRanges(region) @@ -508,7 +514,7 @@ ArchRRegionTrack <- function( #only take first region region <- ArchR::.validGRanges(region) - region <- subsetSeqnames(region[1],as.character(seqnames(region[1]))) + region <- subsetSeqnamesGR(region[1], as.character(seqnames(region[1]))) genes <- sort(sortSeqlevels(geneAnnotation$genes), ignore.strand = TRUE) exons <- sort(sortSeqlevels(geneAnnotation$exons), ignore.strand = TRUE) @@ -641,7 +647,7 @@ ArchRRegionTrack <- function( #only take first region region <- ArchR::.validGRanges(region) - region <- subsetSeqnames(region[1],as.character(seqnames(region[1]))) + region <- subsetSeqnamesGR(region[1], as.character(seqnames(region[1]))) if(!inherits(features,"GRangesList") & !inherits(features,"GenomicRangesList")){ features <- ArchR::.validGRanges(features) diff --git a/R/ArchRProjectMethods.R b/R/ArchRProjectMethods.R index a93f2d75..c3a2c064 100644 --- a/R/ArchRProjectMethods.R +++ b/R/ArchRProjectMethods.R @@ -1,21 +1,364 @@ ########################################################################################## -# Validation Methods +# Parallel Information ########################################################################################## -.validArchRProject <- function(ArchRProj, ...){ - if(!inherits(ArchRProj, "ArchRProject")){ - stop("Not a valid ArchRProject as input!") +#' Add global number of threads for 
default parallel computing. +#' +#' This function will set the global number of threads to be used for ArchR functions. +#' +#' @param threads The default number of threads to be used for parallel execution across all ArchR functions. This value is stored as a global environment variable, not part of the `ArchRProject`. This can be overwritten on a per-function basis using the given function's parameters. +#' @export +addArchRThreads <- function(threads = floor(parallel::detectCores()/ 2)){ + if(tolower(.Platform$OS.type) == "windows"){ + message("Detected windows OS, setting threads to 1.") + threads <- 1 + } + message("Setting default number of Parallel threads to ", threads, ".") + assign("ArchRThreads", as.integer(threads), envir = .GlobalEnv) +} + +#' Get global number of threads for default parallel computing. +#' +#' This function will get the global number of threads to be used for ArchR functions. +#' +#' @export +getArchRThreads <- function(){ + if(exists("ArchRThreads")){ + if(!is.integer(ArchRThreads)){ + 1 + }else{ + ArchRThreads + } + }else{ + 1 + } +} + +########################################################################################## +# Create Gene/Genome Annotation +########################################################################################## + +#' Create Genome Annotation for ArchR +#' +#' This function will create a genome annotation that can be used for createArrowFiles, ArchRProject, etc. +#' +#' @param genome A string that indicates a valid `BSgenome` or a `BSgenome` object (ie "hg38" or "BSgenome.Hsapiens.UCSC.hg38"). +#' @param chromSizes A `GRanges` containing chromosome start and end coordinates. +#' @param blacklist A `GRanges` containing regions that should be excluded from analyses due to unwanted biases. +#' @param filter A boolean indicating whether non-standard chromosome scaffolds should be excluded. 
+#' @export +createGenomeAnnotation <- function( + genome = NULL, + chromSizes = NULL, + blacklist = NULL, + filter = TRUE + ){ + + if(is.null(genome) | is.null(blacklist) | is.null(chromSizes)){ + + ################## + message("Getting genome...") + bsg <- .validBSgenome(genome) + genome <- bsg@pkgname + + ################## + message("Getting chromSizes...") + chromSizes <- GRanges(names(seqlengths(bsg)), IRanges(1, seqlengths(bsg))) + if(filter){ + chromSizes <- filterChrGR(chromSizes) + } + seqlengths(chromSizes) <- end(chromSizes) + + ################## + message("Getting blacklist...") + blacklist <- .getBlacklist(genome = bsg@provider_version) + }else{ - ArchRProj + + bsg <- .validBSgenome(genome) + genome <- bsg@pkgname + + chromSizes <- .validGRanges(chromSizes) + + blacklist <- .validGRanges(blacklist) + } + + SimpleList(genome = genome, chromSizes = chromSizes, blacklist = blacklist) + +} + +#' Create Gene Annotation for ArchR +#' +#' This function will create a gene annotation that can be used to create ArrowFiles or an ArchRProject, etc. +#' +#' @param genome A string that specifies the genome (ie "hg38", "hg19", "mm10", "mm9"). If `genome` is not supplied, `TxDb` and `OrgDb` are required. If `genome` is supplied, `TxDb` and `OrgDb` are looked up from `genome` and any supplied values are replaced. +#' @param TxDb A `TxDb` object (transcript database) from Bioconductor which contains information for gene/transcript coordinates. For example, from `txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene` +#' @param OrgDb An organism database from Bioconductor which contains information for gene/transcript symbols from ids. +#' @param genes A GRanges of gene coordinates (start to end). Needs to have a symbols column matching the exons symbols column. +#' @param exons A GRanges of gene exon coordinates. Needs to have a symbols column matching the genes symbols column. +#' @param TSS A GRanges of transcription start sites (stranded) for computing TSS enrichment scores downstream.
+#' @export +createGeneAnnnotation <- function( + genome = NULL, + TxDb = NULL, + OrgDb = NULL, + genes = NULL, + exons = NULL, + TSS = NULL + ){ + + if(is.null(genes) | is.null(exons) | is.null(TSS)){ + + inGenes <- genes + inExons <- exons + inTSS <- TSS + + .requirePackage("GenomicFeatures") + + if(is.null(genome)) { + if (is.null(TxDb) | is.null(OrgDb)) { + stop("If no provided genome then you need TxDb and OrgDb!") + } + } + + if(!is.null(genome)){ + TxDb <- .getTxDb(genome) + OrgDb <- .getOrgDb(genome) + } + + ########################### + message("Getting Genes...") + genes <- GenomicFeatures::genes(TxDb) + mcols(genes)$symbol <- suppressMessages(AnnotationDbi::mapIds(OrgDb, keys = mcols(genes)$gene_id, + column = "SYMBOL", keytype = "ENTREZID", multiVals = "first")) + names(genes) <- NULL + genes <- sort(sortSeqlevels(genes), ignore.strand = TRUE) + + ########################### + message("Getting Exons...") + exons <- unlist(GenomicFeatures::exonsBy(TxDb, by = "tx")) + exons$tx_id <- names(exons) + mcols(exons)$gene_id <- suppressMessages(AnnotationDbi::select(TxDb, keys = paste0(mcols(exons)$tx_id), + column = "GENEID", keytype = "TXID")[, "GENEID"]) + exons <- exons[!is.na(mcols(exons)$gene_id), ] + mcols(exons)$symbol <- suppressMessages(AnnotationDbi::mapIds(OrgDb, keys = mcols(exons)$gene_id, + column = "SYMBOL", keytype = "ENTREZID", multiVals = "first")) + names(exons) <- NULL + mcols(exons)$exon_id <- NULL + mcols(exons)$exon_name <- NULL + mcols(exons)$exon_rank <- NULL + mcols(exons)$tx_id <- NULL + exons <- sort(sortSeqlevels(exons), ignore.strand = TRUE) + + ########################### + message("Getting TSS...") + TSS <- unique(resize(GenomicFeatures::transcripts(TxDb), width = 1, fix = "start")) + + if(!is.null(inGenes)){ + genes <- .validGRanges(inGenes) + } + + if(!is.null(inExons)){ + exons <- .validGRanges(inExons) + } + + if(!is.null(inTSS)){ + TSS <- .validGRanges(inTSS) + } + + }else{ + + genes <- .validGRanges(genes) + exons <- 
.validGRanges(exons) + TSS <- unique(.validGRanges(TSS)) + + } + + SimpleList(genes = genes, exons = exons, TSS = TSS) + +} + +.getBlacklist <- function(genome){ + + encodeBL <- c( + "hg19" = "https://github.com/Boyle-Lab/Blacklist/raw/master/lists/hg19-blacklist.v2.bed.gz", + "hg38" = "https://github.com/Boyle-Lab/Blacklist/raw/master/lists/hg38-blacklist.v2.bed.gz", + "mm10" = "https://github.com/Boyle-Lab/Blacklist/raw/master/lists/mm10-blacklist.v2.bed.gz", + "mm9" = "https://github.com/Boyle-Lab/Blacklist/raw/master/lists/Blacklist_v1/mm9-blacklist.bed.gz", + "ce10" = "http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/ce10-C.elegans/ce10-blacklist.bed.gz", + "dm3" = "http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/dm3-D.melanogaster/dm3-blacklist.bed.gz" + ) + + if(tolower(genome) %in% names(encodeBL)){ + bl <- tryCatch({ + blacklist <- import.bed(encodeBL[tolower(genome)]) + }, error = function(x){ + message("Blacklist not downloaded! Continuing without, be careful for downstream biases...") + GRanges() + }) + }else{ + message("Blacklist not downloaded! 
Continuing without, be careful for downstream biases...") + bl <- GRanges() + } + + bl + +} + +.getTxDb <- function(genome, filter = TRUE, install = TRUE){ + + if(toupper(genome) == "HG19"){ + if(suppressWarnings(!require(TxDb.Hsapiens.UCSC.hg19.knownGene))){ + if(install){ + message("Package does not exist, now trying bioconductor...") + BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene", update=FALSE) + }else{ + stop("TxDb.Hsapiens.UCSC.hg19.knownGene is not installed!") + } + } + library(TxDb.Hsapiens.UCSC.hg19.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene + }else if(toupper(genome) == "HG38"){ + if(suppressWarnings(!require(TxDb.Hsapiens.UCSC.hg38.knownGene))){ + if(install){ + message("Package does not exist, now trying bioconductor...") + BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene", update=FALSE) + }else{ + stop("TxDb.Hsapiens.UCSC.hg38.knownGene is not installed!") + } + } + library(TxDb.Hsapiens.UCSC.hg38.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene + }else if(toupper(genome) == "MM9"){ + if(suppressWarnings(!require(TxDb.Mmusculus.UCSC.mm9.knownGene))){ + if(install){ + message("Package does not exist, now trying bioconductor...") + BiocManager::install("TxDb.Mmusculus.UCSC.mm9.knownGene", update=FALSE) + }else{ + stop("TxDb.Mmusculus.UCSC.mm9.knownGene is not installed!") + } + } + library(TxDb.Mmusculus.UCSC.mm9.knownGene) + txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene + }else if(toupper(genome) == "MM10"){ + if(suppressWarnings(!require(TxDb.Mmusculus.UCSC.mm10.knownGene))){ + if(install){ + message("Package does not exist, now trying bioconductor...") + BiocManager::install("TxDb.Mmusculus.UCSC.mm10.knownGene", update=FALSE) + }else{ + stop("TxDb.Mmusculus.UCSC.mm10.knownGene is not installed!") + } + } + library(TxDb.Mmusculus.UCSC.mm10.knownGene) + txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene + }else if(toupper(genome) == "SACCER3"){ + if(suppressWarnings(!require(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene))){ + 
if(install){ + message("Package does not exist, now trying bioconductor...") + BiocManager::install("TxDb.Scerevisiae.UCSC.sacCer3.sgdGene", update=FALSE) + }else{ + stop("TxDb.Scerevisiae.UCSC.sacCer3.sgdGene is not installed!") + } + } + library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene) + txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene + }else if(toupper(genome) == "RHEMAC8"){ + if(suppressWarnings(!require(TxDb.Mmulatta.UCSC.rheMac8.refGene))){ + if(install){ + message("Package does not exist, now trying bioconductor...") + BiocManager::install("TxDb.Mmulatta.UCSC.rheMac8.refGene", update=FALSE) + }else{ + stop("TxDb.Mmulatta.UCSC.rheMac8.refGene is not installed!") + } + } + library(TxDb.Mmulatta.UCSC.rheMac8.refGene) + txdb <- TxDb.Mmulatta.UCSC.rheMac8.refGene + }else{ + stop("Genome not recognized!") + } + + if(filter){ + txdb <- filterChrGR(txdb) + } + + return(txdb) + +} + +.getOrgDb <- function(genome){ + + if(toupper(genome) == "HG19" | toupper(genome) == "HG38"){ + if(suppressWarnings(!require(org.Hs.eg.db))){ + message("Package does not exist, now trying bioconductor...") + BiocManager::install("org.Hs.eg.db", update=FALSE) + } + library(org.Hs.eg.db) + annodb <- org.Hs.eg.db + }else if(toupper(genome) == "MM9" | toupper(genome) == "MM10"){ + if(suppressWarnings(!require(org.Mm.eg.db))){ + message("Package does not exist, now trying bioconductor...") + BiocManager::install("org.Mm.eg.db", update=FALSE) + } + library(org.Mm.eg.db) + annodb <- org.Mm.eg.db + }else{ + stop("Genome not recognized!") + } + return(annodb) + } .validGeneAnnotation <- function(geneAnnotation, ...){ - # Need to put this code + + if(!inherits(geneAnnotation, "SimpleList")){ + if(inherits(geneAnnotation, "list")){ + geneAnnotation <- as(geneAnnotation, "SimpleList") + }else{ + stop("geneAnnotation must be a list/SimpleList of 3 GRanges for : Genes GRanges, Exons GRanges and TSS GRanges!") + } + } + if(identical(sort(tolower(names(geneAnnotation))), c("exons", "genes", "tss"))){ + + gA 
<- SimpleList() + gA$genes <- .validGRanges(geneAnnotation[[grep("genes", names(geneAnnotation), ignore.case = TRUE)]]) + gA$exons <- .validGRanges(geneAnnotation[[grep("exons", names(geneAnnotation), ignore.case = TRUE)]]) + gA$TSS <- .validGRanges(geneAnnotation[[grep("TSS", names(geneAnnotation), ignore.case = TRUE)]]) + + }else{ + stop("geneAnnotation must be a list/SimpleList of 3 GRanges for : Genes GRanges, Exons GRanges and TSS GRanges!") + } + + gA + } .validGenomeAnnotation <- function(genomeAnnotation, ...){ - # Need to put this code + + if(!inherits(genomeAnnotation, "SimpleList")){ + if(inherits(genomeAnnotation, "list")){ + genomeAnnotation <- as(genomeAnnotation, "SimpleList") + }else{ + stop("genomeAnnotation must be a list/SimpleList of 3 GRanges for : blacklist GRanges, chromSizes GRanges and genome BSgenome package string (ie hg38 or BSgenome.Hsapiens.UCSC.hg38)!") + } + } + + if(identical(sort(tolower(names(genomeAnnotation))), c("blacklist", "chromsizes", "genome"))){ + + gA <- SimpleList() + gA$blacklist <- .validGRanges(genomeAnnotation[[grep("blacklist", names(genomeAnnotation), ignore.case = TRUE)]]) + bsg <- .validBSgenome(genomeAnnotation[[grep("genome", names(genomeAnnotation), ignore.case = TRUE)]]) + gA$genome <- bsg@pkgname + gA$chromSizes <- .validGRanges(genomeAnnotation[[grep("chromsizes", names(genomeAnnotation), ignore.case = TRUE)]]) + + }else{ + + stop("genomeAnnotation must be a list/SimpleList of 3 GRanges for : blacklist GRanges, chromSizes GRanges and genome BSgenome package string (ie hg38 or BSgenome.Hsapiens.UCSC.hg38)!") + + } + + gA + } ########################################################################################## @@ -26,7 +369,7 @@ #' #' This function gets the outputDirectory from a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... 
additional args #' @export getOutputDirectory <- function(ArchRProj, ...){ @@ -43,7 +386,7 @@ getOutputDirectory <- function(ArchRProj, ...){ #' #' This function gets the names of all ArrowFiles associated with a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getArrowFiles <- function(ArchRProj, ...){ @@ -57,7 +400,7 @@ getArrowFiles <- function(ArchRProj, ...){ #' #' This function gets the sampleNames from a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getSampleNames <- function(ArchRProj, ...){ @@ -70,7 +413,7 @@ getSampleNames <- function(ArchRProj, ...){ #' #' This function gets number of cells in ArchRProject/ArrowFile #' -#' @param input ArchRProject or ArrowFiles +#' @param input An `ArchRProject` object or ArrowFile. #' @param ... additional args #' @export nCells <- function(input, ...){ @@ -89,11 +432,11 @@ nCells <- function(input, ...){ #' Get sampleColData from an ArchRProject #' -#' This function gets the sampleColData from a given ArchRProject. +#' This function gets the `sampleColData` from a given `ArchRProject`. #' -#' @param ArchRProj An ArchRProject object. -#' @param select select a subset of column names from sampleColData -#' @param drop drop if selecting only one column name +#' @param ArchRProj An `ArchRProject` object. +#' @param select A character vector containing the column names to select from sampleColData. +#' @param drop A boolean value that indicates whether to drop the `dataframe` structure and convert to a vector if selecting only one column. #' @param ... 
additional args #' @export getSampleColData <- function(ArchRProj, select = NULL, drop = FALSE, ...){ @@ -113,11 +456,11 @@ getSampleColData <- function(ArchRProj, select = NULL, drop = FALSE, ...){ #' #' This function adds new data to sampleColData in ArchRProject. #' -#' @param ArchRProj An ArchRProject object. -#' @param data The data to add to sampleColData. -#' @param name The column header name to be used for this new data in sampleColData. If a column with this name already exists, you may set "force" equal to TRUE to overwrite the data in this column. -#' @param samples The names of the samples corresponding to data. Typically new data is added to all samples but you may use this argument to only add data to a subset of samples. Samples here data is not added are set to NA. -#' @param force A boolean (TRUE/FALSE) argument that indicates whether or not to overwrite data in a given column when the value passed to "name" already exists as a column name in sampleColData. +#' @param ArchRProj An `ArchRProject` object. +#' @param data The data to add to `sampleColData`. +#' @param name The column header name to be used for this new data in `sampleColData`. If a column with this name already exists, you may set `force` equal to TRUE to overwrite the data in this column. +#' @param samples The names of the samples corresponding to `data`. Typically new data is added to all samples but you may use this argument to only add data to a subset of samples. Samples where `data` is not added are set to `NA`. +#' @param force A boolean value that indicates whether or not to overwrite data in a given column when the value passed to `name` already exists as a column name in `sampleColData`. #' @param ... 
additional args #' @export addSampleColData <- function(ArchRProj, data = NULL, name = NULL, samples = NULL, force = FALSE){ @@ -154,7 +497,7 @@ addSampleColData <- function(ArchRProj, data = NULL, name = NULL, samples = NULL #' #' This function gets the cellNames from a given ArchRProject object. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getCellNames <- function(ArchRProj, ...){ @@ -167,9 +510,9 @@ getCellNames <- function(ArchRProj, ...){ #' #' This function gets the cellColData from a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. -#' @param select A character vector of column names to select from cellColData if you would like to subset the returned data. -#' @param drop A boolean argument to indicate whether additional data.frame information should be dropped if selecting only a single column name. +#' @param ArchRProj An `ArchRProject` object. +#' @param select A character vector of column names to select from `cellColData` if you would like to subset the returned data. +#' @param drop A boolean value that indicates whether to drop the `dataframe` structure and convert to a vector if selecting only one column. #' @param ... additional args #' @export getCellColData <- function(ArchRProj, select = NULL, drop = FALSE, ...){ @@ -198,11 +541,11 @@ getCellColData <- function(ArchRProj, select = NULL, drop = FALSE, ...){ #' #' This function adds new data to cellColData in a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. -#' @param data The data to add to cellColData. -#' @param name The column header name to be used for this new data in cellColData. If a column with this name already exists, you may set "force" equal to TRUE to overwrite the data in this column. -#' @param cells The names of the cells corresponding to "data". 
Typically new data is added to all cells but you may use this argument to only add data to a subset of cells. Cells where data is not added are set to NA. -#' @param force A boolean (TRUE/FALSE) argument that indicates whether or not to overwrite data in a given column when the value passed to "name" already exists as a column name in cellColData. +#' @param ArchRProj An `ArchRProject` object. +#' @param data The data to add to `cellColData`. +#' @param name The column header name to be used for this new data in `cellColData`. If a column with this name already exists, you may set `force` equal to TRUE to overwrite the data in this column. +#' @param cells The names of the cells corresponding to `data`. Typically new data is added to all cells but you may use this argument to only add data to a subset of cells. Cells where `data` is not added are set to `NA`. +#' @param force A boolean value indicating whether or not to overwrite data in a given column when the value passed to `name` already exists as a column name in `cellColData`. #' @param ... additional args #' @export addCellColData <- function(ArchRProj, data = NULL, name = NULL, cells = getCellNames(ArchRProj), force = FALSE, ...){ @@ -245,7 +588,7 @@ addCellColData <- function(ArchRProj, data = NULL, name = NULL, cells = getCellN #' Get the peak set from an ArchRProject #' -#' This function gets the peakSet as a GRanges object from an ArchRProject. +#' This function gets the peak set as a GRanges object from an ArchRProject. #' #' @param ArchRProj An ArchRProject object. #' @param ... additional args @@ -259,9 +602,9 @@ getPeakSet <- function(ArchRProj, ...){ #' #' This function adds a peak set as a GRanges object to a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. -#' @param peakSet A GRanges object containing the set of regions that define all peaks in the desired peak set. 
-#' @param force If a peakSet object has already been added to the given ArchRProject, the value of "force" determines whether or not to overwrite this peakSet. +#' @param ArchRProj An `ArchRProject` object. +#' @param peakSet A `GRanges` object containing the set of regions that define all peaks in the desired peak set. +#' @param force If a `peakSet` object has already been added to the given `ArchRProject`, the value of `force` determines whether or not to overwrite this `peakSet`. #' @param ... additional args #' @export addPeakSet <- function(ArchRProj, peakSet, force = FALSE, ...){ @@ -285,9 +628,9 @@ addPeakSet <- function(ArchRProj, peakSet, force = FALSE, ...){ #' Get genomeAnnotation from an ArchRProject #' -#' This function gets the genomeAnnotation (in format QQQ) from a given ArchRProject. +#' This function gets the genomeAnnotation (see createGenomeAnnotation) from a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getGenomeAnnotation <- function(ArchRProj, ...){ @@ -299,7 +642,7 @@ getGenomeAnnotation <- function(ArchRProj, ...){ #' #' This function gets the blacklist (the regions to be excluded from analysis) as a GRanges from the genomeAnnotation of a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getBlacklist <- function(ArchRProj, ...){ @@ -311,7 +654,7 @@ getBlacklist <- function(ArchRProj, ...){ #' #' This function gets the name of the genome from the genomeAnnotation used by a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... 
additional args #' @export getGenome <- function(ArchRProj, ...){ @@ -323,7 +666,7 @@ getGenome <- function(ArchRProj, ...){ #' #' This function gets the chromosome lengths as a GRanges onject from the genomeAnnotation of a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getChromSizes <- function(ArchRProj, ...){ @@ -335,7 +678,7 @@ getChromSizes <- function(ArchRProj, ...){ #' #' This function gets the chromosome lengths as a vector from the genomeAnnotation of a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getChromLengths <- function(ArchRProj, ...){ @@ -361,9 +704,9 @@ getChromLengths <- function(ArchRProj, ...){ #' Get geneAnnotation from an ArchRProject #' -#' This function gets the geneAnnotation (in format QQQ) from a given ArchRProject +#' This function gets the geneAnnotation (see createGeneAnnotation) from a given ArchRProject #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getGeneAnnotation <- function(ArchRProj, ...){ @@ -375,7 +718,7 @@ getGeneAnnotation <- function(ArchRProj, ...){ #' #' This function gets the transcription start sites (TSSs) as a GRanges object of all genes from the geneAnnotation of a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getTSS <- function(ArchRProj, ...){ @@ -385,10 +728,10 @@ getTSS <- function(ArchRProj, ...){ #' Get the genes from an ArchRProject #' -#' This function gets the genes (in format QQQ) from the geneAnnotation of a given ArchRProject. +#' This function gets the genes start to end coordinates as a GRanges from the geneAnnotation of a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. 
-#' @param symbols gene symbols to subset +#' @param ArchRProj An `ArchRProject` object. +#' @param symbols A character vector containing the gene symbols to subset from the `geneAnnotation`. #' @param ... additional args #' @export getGenes <- function(ArchRProj, symbols = NULL, ...){ @@ -402,9 +745,10 @@ getGenes <- function(ArchRProj, symbols = NULL, ...){ #' Get the exons from an ArchRProject #' -#' This function gets the exons (in format QQQ) from the geneAnnotation of a given ArchRProject. +#' This function gets the exons coordinates as a GRanges from the geneAnnotation of a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object. +#' @param symbols A character vector containing the gene symbols for the genes where exons should be extracted. #' @param ... additional args #' @export getExons <- function(ArchRProj, symbols = NULL, ...){ @@ -431,9 +775,11 @@ getExons <- function(ArchRProj, symbols = NULL, ...){ #' #' This function gets a dimensionality reduction object (i.e. UMAP, tSNE, etc) from a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. -#' @param reducedDims QQQ The name of the reducedDims object to retrieve from the designated ArchRProject. Options include QQQ. -#' @param return If set to "mat" or "matrix", the function will return the reducedDims object as a matrix. Otherwise, it will return the full reducedDims object. +#' @param ArchRProj An `ArchRProject` object. +#' @param reducedDims The name of the `reducedDims` object (i.e. IterativeLSI) to retrieve from the designated `ArchRProject`. +#' @param returnMatrix If set to "mat" or "matrix", the function will return the `reducedDims` object as a matrix with entries for each individual cell. Otherwise, it will return the full `reducedDims` object. +#' @param dimsToUse A vector containing the dimensions (i.e. 1:25) to return from the `reducedDims` object. 
+#' @param corCutOff A numeric cutoff for the correlation of each dimension to the sequencing depth. If the dimension has a correlation to sequencing depth that is greater than the corCutOff, it will be excluded. #' @param ... additional args #' @export getReducedDims <- function( @@ -470,11 +816,11 @@ getReducedDims <- function( #' Get embedding information stored in an ArchRProject #' -#' QQQ This function gets an embedding (i.e. QQQ) from a given ArchRProject. +#' This function gets an embedding (i.e. UMAP) from a given ArchRProject. #' -#' @param ArchRProj An ArchRProject object. -#' @param embedding QQQ The name of the embedding object to retrieve from the designated ArchRProject. Options include QQQ. -#' @param return If set to "df", the function will return the embedding object as a data.frame. Otherwise, it will return the full embedding object. +#' @param ArchRProj An `ArchRProject` object. +#' @param embedding The name of the `embeddings` object (i.e. UMAP, TSNE see embeddingOut of addEmbeddings) to retrieve from the designated `ArchRProject`. +#' @param returnDF A boolean value indicating whether to return the embedding object as a `data.frame`. Otherwise, it will return the full embedding object. #' @param ... additional args #' @export getEmbedding <- function(ArchRProj, embedding = "UMAP", returnDF = TRUE, ...){ @@ -499,8 +845,8 @@ getEmbedding <- function(ArchRProj, embedding = "UMAP", returnDF = TRUE, ...){ #' #' This function prints the projectSummary from an ArchRProject #' -#' @param ArchRProj ArchRProject -#' @param returnSummary return summary or just print +#' @param ArchRProj An `ArchRProject` object. +#' @param returnSummary A boolean value indicating whether to return a summary of the `ArchRProject` or to just print the summary. #' @param ... 
additional args #' @export getProjectSummary <- function(ArchRProj, returnSummary = FALSE, ...){ @@ -524,9 +870,9 @@ getProjectSummary <- function(ArchRProj, returnSummary = FALSE, ...){ #' #' This function adds info to the projectSummary from an ArchRProject #' -#' @param ArchRProj ArchRProject -#' @param name name of summary input -#' @param summary summary vector +#' @param ArchRProj An `ArchRProject` object. +#' @param name The name of the summary information to add to the `ArchRProject` object. +#' @param summary A vector to add as summary information to the `ArchRProject` object. #' @param ... additional args #' @export addProjectSummary <- function(ArchRProj, name, summary, ...){ @@ -539,340 +885,18 @@ addProjectSummary <- function(ArchRProj, name, summary, ...){ ArchRProj } -########################################################################################## -# Annotation Methods -########################################################################################## - -#' Get annotation from an ArchRProject -#' -#' This function gets an annotation from a given ArchRProject. -#' -#' @param ArchRProj An ArchRProject object. -#' @param name QQQ The name of the annotation object to retrieve from the designated ArchRProject. Options include QQQ. -#' @param ... additional args -#' @export -getAnnotation <- function(ArchRProj, name = NULL, ...){ - ArchRProj <- .validArchRProject(ArchRProj) - if(is.null(name)){ - name <- 1 - }else{ - if(name %ni% names(ArchRProj@annotations)){ - stop("Name is not in Annotations!") - } - } - ArchRProj@annotations[[name]] -} - -#' Get annotation positions from an ArchRProject -#' -#' This function gets the annotation positions from a given ArchRProject. -#' -#' @param ArchRProj An ArchRProject object. -#' @param name QQQ The name of the annotation object to retrieve from the designated ArchRProject. Options include QQQ. -#' @param annoName QQQ name to subset with annotations -#' @param ... 
additional args -#' @export -getPositions <- function(ArchRProj, name = NULL, annoName = NULL, ...){ - ArchRProj <- .validArchRProject(ArchRProj) - if(is.null(name)){ - name <- 1 - }else{ - if(name %ni% names(ArchRProj@annotations)){ - stop("Name is not in Annotations!") - } - } - anno <- ArchRProj@annotations[[name]] - idx <- grep("positions", names(anno), ignore.case=TRUE) - if(length(idx)==0){ - stop("Annotation does not contain positions!") - } - positions <- readRDS(anno[[idx]]) - if(!is.null(annoName)){ - idx <- grep(annoName, names(positions), ignore.case=TRUE) - if(length(idx)==0){ - stop("Positons do not contain annoName!") - } - positions <- positions[idx] - } - positions -} - -#' Get annotation matches from an ArchRProject -#' -#' This function gets annotation matches from a given ArchRProject. -#' -#' @param ArchRProj An ArchRProject object. -#' @param name name of annotations -#' @param annoName name to subset with annotations -#' @param ... additional args -#' @export -getMatches <- function(ArchRProj, name = NULL, annoName = NULL, ...){ - ArchRProj <- .validArchRProject(ArchRProj) - if(is.null(name)){ - name <- 1 - }else{ - if(name %ni% names(ArchRProj@annotations)){ - stop("Name is not in Annotations!") - } - } - anno <- ArchRProj@annotations[[name]] - idx <- grep("matches", names(anno), ignore.case=TRUE) - if(length(idx)==0){ - stop("Annotation does not contain positions!") - } - matches <- readRDS(anno[[idx]]) - if(!is.null(annoName)){ - idx <- grep(annoName, colnames(matches), ignore.case=TRUE) - if(length(idx)==0){ - stop("Matches do not contain annoName!") - } - matches <- matches[, idx, drop=FALSE] - } - matches -} - -#' Add motif annotations to an ArchRProject -#' -#' This function adds information about which peaks contain motifs to a given ArchRProject. For each peak, a binary value is stored indicating whether each motif is observed within the peak region. -#' -#' @param ArchRProj An ArchRProject object. 
-#' @param motifSet The motif set to be used for annotation. Options include: (i) "JASPAR2016", which gives the 2016 version of JASPAR motifs, (ii) "JASPAR2018", which gives the 2018 version of JASPAR motifs, or (iii) one of "human", "mouse", "encode", or "homer" which gives the corresponding motif sets from the chromVAR package. -#' @param name QQQ of annotations to store as in ArchRProject -#' @param species QQQ The name of the species relevant to the supplied ArchRProject. This is used for QQQ. By default, this function will attempt to guess the species based on the value from getGenome. -#' @param collection QQQ If one of the JASPAR motif sets is used via "motifSet", this parameter allows you to indicate the JASPAR collection to be used. Possible options include "CORE", QQQ. -#' @param cutOff The p-value cutoff to be used for motif search (see the motimatchr package for more information). -#' @param w The width in basepairs to consider for motif matches (see the motimatchr package for more information). -#' @param ... additional args -#' @export -addMotifAnnotations <- function( - ArchRProj = NULL, - motifSet = "cisbp", - name = "Motif", - species = NULL, - collection = "CORE", - cutOff = 5e-05, - w = 7, - ... 
- ){ - - .requirePackage("motifmatchr", installInfo='BiocManager::install("motifmatchr")') - ArchRProj <- .validArchRProject(ArchRProj) - - if(grepl("JASPAR|CISBP", motifSet, ignore.case = TRUE) & is.null(species)){ - if(grepl("hg19",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){ - species <- "Homo sapiens" - } - if(grepl("hg38",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){ - species <- "Homo sapiens" - } - if(grepl("mm9",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){ - species <- "Mus musculus" - } - if(grepl("mm10",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){ - species <- "Mus musculus" - } - } - - ############################################################# - # Get PWM List adapted from chromVAR! - ############################################################# - tstart <- Sys.time() - .messageDiffTime(paste0("Gettting Motif Set, Species : ", species), tstart) - - if(tolower(motifSet)=="jaspar2020"){ - .requirePackage("JASPAR2020",installInfo='BiocManager::install("JASPAR2020")') - args <- list(species = species, collection = collection, ...) - motifs <- TFBSTools::getMatrixSet(JASPAR2020::JASPAR2020, args) - obj <- .summarizeJASPARMotifs(motifs) - motifs <- obj$motifs - motifSummary <- obj$motifSummary - }else if(tolower(motifSet)=="jaspar2016"){ - .requirePackage("JASPAR2016",installInfo='BiocManager::install("JASPAR2018")') - args <- list(species = species, collection = collection, ...) - motifs <- TFBSTools::getMatrixSet(JASPAR2016::JASPAR2016, args) - obj <- .summarizeJASPARMotifs(motifs) - motifs <- obj$motifs - motifSummary <- obj$motifSummary - }else if(tolower(motifSet)=="jaspar2016"){ - .requirePackage("JASPAR2016",installInfo='BiocManager::install("JASPAR2018")') - args <- list(species = species, collection = collection, ...) 
- motifs <- TFBSTools::getMatrixSet(JASPAR2016::JASPAR2016, args) - obj <- .summarizeJASPARMotifs(motifs) - motifs <- obj$motifs - motifSummary <- obj$motifSummary - }else if(tolower(motifSet)=="cisbp"){ - .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")') - if(tolower(species) == "mus musculus"){ - data("mouse_pwms_v2") - motifs <- mouse_pwms_v2 - obj <- .summarizeChromVARMotifs(motifs) - motifs <- obj$motifs - motifSummary <- obj$motifSummary - }else if(tolower(species) == "homo sapiens"){ - data("human_pwms_v2") - motifs <- human_pwms_v2 - obj <- .summarizeChromVARMotifs(motifs) - motifs <- obj$motifs - motifSummary <- obj$motifSummary - }else{ - stop("Species not recognized homo sapiens, mus musculus supported by CisBP!") - } - }else if(tolower(motifSet)=="encode"){ - .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")') - data("encode_pwms") - motifs <- encode_pwms - obj <- .summarizeChromVARMotifs(motifs) - motifs <- obj$motifs - motifSummary <- obj$motifSummary - }else if(tolower(motifSet)=="homer"){ - .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")') - data("homer_pwms") - motifs <- homer_pwms - obj <- .summarizeChromVARMotifs(motifs) - motifs <- obj$motifs - motifSummary <- obj$motifSummary - }else{ - stop("Error MotifSet Not Recognized!") - } - - ############################################################# - # Get BSgenome Information! 
- ############################################################# - genome <- ArchRProj@genomeAnnotation$genome - .requirePackage(genome) - BSgenome <- eval(parse(text = genome)) - BSgenome <- .validBSgenome(BSgenome) - - ############################################################# - # Calculate Motif Positions - ############################################################# - .messageDiffTime("Finding Motif Positions with motifmatchr!", tstart) - peakSet <- ArchRProj@peakSet - motifPositions <- motifmatchr::matchMotifs( - pwms = motifs, - subject = peakSet, - genome = BSgenome, - out = "positions", - p.cutoff = cutOff, - w = w - ) - - ############################################################# - # Motif Overlap Matrix - ############################################################# - .messageDiffTime("Creating Motif Overlap Matrix", tstart) - allPositions <- unlist(motifPositions) - overlapMotifs <- findOverlaps(peakSet, allPositions, ignore.strand=TRUE) - motifMat <- Matrix::sparseMatrix( - i = queryHits(overlapMotifs), - j = match(names(allPositions),names(motifPositions))[subjectHits(overlapMotifs)], - x = rep(TRUE, length(overlapMotifs)), - dims = c(length(peakSet), length(motifPositions)) - ) - colnames(motifMat) <- names(motifPositions) - motifMat <- SummarizedExperiment::SummarizedExperiment(assays=SimpleList(matches = motifMat), rowRanges = peakSet) - .messageDiffTime("Finished Getting Motif Info!", tstart) - - out <- SimpleList( - motifSummary = motifSummary, - motifMatches = motifMat, - motifPositions = motifPositions, - motifList = motifs, - date = Sys.Date() - ) - - dir.create(file.path(getOutputDirectory(ArchRProj), "Annotations"), showWarnings=FALSE) - savePositions <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Positions-In-Peaks.rds")) - saveMatches <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Matches-In-Peaks.rds")) - - ArchRProj@annotations[[name]]$Name <- name - 
ArchRProj@annotations[[name]]$motifs <- motifs - ArchRProj@annotations[[name]]$motifSummary <- motifSummary - ArchRProj@annotations[[name]]$Positions <- savePositions - ArchRProj@annotations[[name]]$Matches <- saveMatches - - saveRDS(out, file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-In-Peaks-Summary.rds")), compress = FALSE) - saveRDS(out$motifPositions, savePositions, compress = FALSE) - saveRDS(out$motifMatches, saveMatches, compress = FALSE) - - return(ArchRProj) - -} - -.summarizeJASPARMotifs <- function(motifs){ - - motifNames <- lapply(seq_along(motifs), function(x){ - namex <- make.names(motifs[[x]]@name) - if(substr(namex,nchar(namex),nchar(namex))=="."){ - namex <- substr(namex,1,nchar(namex)-1) - } - namex <- paste0(namex, "_", x) - namex - }) %>% unlist(.) - - motifDF <- lapply(seq_along(motifs), function(x){ - data.frame( - row.names = motifNames[x], - name = motifs[[x]]@name[[1]], - ID = motifs[[x]]@ID, - strand = motifs[[x]]@strand, - symbol = ifelse(!is.null(motifs[[x]]@tags$symbol[1]), motifs[[x]]@tags$symbol[1], NA) , - family = ifelse(!is.null(motifs[[x]]@tags$family[1]), motifs[[x]]@tags$family[1], NA), - alias = ifelse(!is.null(motifs[[x]]@tags$alias[1]), motifs[[x]]@tags$alias[1], NA), - stringsAsFactors = FALSE - ) - }) %>% Reduce("rbind", .) %>% DataFrame - - names(motifs) <- motifNames - - out <- list(motifs = motifs, motifSummary = motifDF) - - return(out) - -} - -.summarizeChromVARMotifs <- function(motifs){ - - motifNames <- lapply(seq_along(motifs), function(x){ - namex <- make.names(motifs[[x]]@name) - if(substr(namex,nchar(namex),nchar(namex))=="."){ - namex <- substr(namex,1,nchar(namex)-1) - } - namex <- paste0(namex, "_", x) - namex - }) %>% unlist(.) 
- - motifDF <- lapply(seq_along(motifs), function(x){ - data.frame( - row.names = motifNames[x], - name = motifs[[x]]@name[[1]], - ID = motifs[[x]]@ID, - strand = motifs[[x]]@strand, - tags = motifs[[x]]@tags, - stringsAsFactors = FALSE - ) - }) %>% Reduce("rbind", .) %>% DataFrame - - names(motifs) <- motifNames - - out <- list(motifs = motifs, motifSummary = motifDF) - - return(out) - -} - ########################################################################################## # Additional Methods ########################################################################################## -#' Return the available features that could be selected from a given data matrix within an ArchRProject +#' Get the features that could be selected from a given data matrix within an ArchRProject #' #' This function will identify available features from a given data matrix (i.e. "GeneScoreMatrix", or "TileMatrix") and return them for downstream plotting utilities. #' -#' @param ArchRProj An ArchRProject object. -#' @param useMatrix QQQ The name of the data matrix as stored in the ArrowFiles of the ArchRProject. Options include "TileMatrix", "GeneScoreMatrix", QQQ. -#' @param select QQQ select a specific name with grep -#' @param ignore.case A boolean value indicating whether or not to ignore the case (upper-case / lower-case) when searching via grep for the string passed to "select". +#' @param ArchRProj An `ArchRProject` object. +#' @param useMatrix The name of the data matrix as stored in the ArrowFiles of the `ArchRProject`. Options include "TileMatrix", "GeneScoreMatrix", etc. +#' @param select A string specifying a specific featureName (or rowname) found with grep +#' @param ignore.case A boolean value indicating whether or not to ignore the case (upper-case / lower-case) when searching via grep for the string passed to `select`. #' @param ... 
additional args #' @export getFeatures <- function(ArchRProj, useMatrix = "GeneScoreMatrix", select = NULL, ignore.case = TRUE, ...){ @@ -902,13 +926,15 @@ getFeatures <- function(ArchRProj, useMatrix = "GeneScoreMatrix", select = NULL, #' #' This function will save a plot or set of plots as a PDF file in the output directory of a given ArchRProject. #' +#' @param ... vector of plots to be plotted (if input is a list use plotList instead) #' @param name The file name to be used for the output PDF file. #' @param width The width in inches to be used for the output PDF file. #' @param height The height in inches to be used for the output PDF. -#' @param ArchRProj An ArchRProject object. +#' @param ArchRProj An `ArchRProject` object to be used for getting plotDirectory in outputDirectory. #' @param addDOC A boolean variable that determines whether to add the date of creation to end of the PDF file name. This is useful for preventing overwritting of old plots. #' @param useDingbats A boolean variable that determines wheter to use dingbats characters for plotting points. -#' @param ... additional args to pdf +#' @param plotList A `list` of plots to be printed to the output PDF file. Each element of `plotList` should be a printable plot formatted object (ggplot2, plot, heatmap, etc). 
+#' @param useSink use sink to hide messages from plotting #' @export plotPDF <- function(..., name = "Plot", width = 6, height = 6, ArchRProj = NULL, addDOC = TRUE, @@ -953,60 +979,71 @@ plotPDF <- function(..., name = "Plot", width = 6, sink(tmpFile) } - pdf(filename, width = width, height = height, useDingbats = useDingbats) - for(i in seq_along(plotList)){ - - if(inherits(plotList[[i]], "gg")){ + o <- tryCatch({ + + pdf(filename, width = width, height = height, useDingbats = useDingbats) + for(i in seq_along(plotList)){ - print("plotting ggplot!") + if(inherits(plotList[[i]], "gg")){ + + print("plotting ggplot!") + + if(!is.null(attr(plotList[[i]], "ratioYX"))){ + print(.fixPlotSize(plotList[[i]], plotWidth = width, plotHeight = height, height = attr(plotList[[i]], "ratioYX"), newPage = FALSE)) + }else{ + print(.fixPlotSize(plotList[[i]], plotWidth = width, plotHeight = height, newPage = FALSE)) + } + + if(i != length(plotList)){ + grid::grid.newpage() + } + + }else if(inherits(plotList[[i]], "gtable")){ + + print(grid::grid.draw(plotList[[i]])) + if(i != length(plotList)){ + grid::grid.newpage() + } + }else if(inherits(plotList[[i]], "HeatmapList") | inherits(plotList[[i]], "Heatmap") ){ + padding <- 45 + draw(plotList[[i]], + padding = unit(c(padding, padding, padding, padding), "mm"), + heatmap_legend_side = "bot", + annotation_legend_side = "bot" + ) - if(!is.null(attr(plotList[[i]], "ratioYX"))){ - print(.fixPlotSize(plotList[[i]], plotWidth = width, plotHeight = height, height = attr(plotList[[i]], "ratioYX"), newPage = FALSE)) }else{ - print(.fixPlotSize(plotList[[i]], plotWidth = width, plotHeight = height, newPage = FALSE)) - } - if(i != length(plotList)){ - grid::grid.newpage() - } - - }else if(inherits(plotList[[i]], "gtable")){ - - print(grid::grid.draw(plotList[[i]])) - if(i != length(plotList)){ - grid::grid.newpage() - } - }else if(inherits(plotList[[i]], "HeatmapList") | inherits(plotList[[i]], "Heatmap") ){ - padding <- 45 - draw(plotList[[i]], 
- padding = unit(c(padding, padding, padding, padding), "mm"), - heatmap_legend_side = "bot", - annotation_legend_side = "bot" - ) + print("plotting with print") + + print(plotList[[i]]) - }else{ + } - print("plotting with print") - - print(plotList[[i]]) + } + dev.off() + if(useSink){ + sink() + file.remove(tmpFile) } - } - dev.off() + }, error = function(x){ - if(useSink){ - sink() - file.remove(tmpFile) - } + suppressWarnings(sink()) + message(x) + + }) + + return(0) } #' Get Tutorial Data For ArchR #' -#' This function will download data for a given tutorial and return the inputFiles for ArchR +#' This function will download data for a given tutorial and return the input files required for ArchR #' -#' @param tutorial Available tutorials are Hematopoiesis, PBMC, FreshFrozen +#' @param tutorial The name of the available tutorial for which to retrieve the tutorial data. Options are "Hematopoiesis", "PBMC", "FreshFrozen". "Hematopoiesis" refers to hematopoietic scATAC hierarchy. "PBMC" refers to a small standard PBMC scATAC dataset. "FreshFrozen" refers to a PBMC fresh and frozen scATAC dataset. #' @param ... additional args #' @export getTutorialData <- function(tutorial = "hematopoiesis", ...){ @@ -1073,9 +1110,9 @@ getTutorialData <- function(tutorial = "hematopoiesis", ...){ #' Get Input Files from paths to create arrows #' -#' This function will look for fragment files and bam files in the input paths and return the full path and sampleNames +#' This function will look for fragment files and bam files in the input paths and return the full path and sample names #' -#' @param paths vector of paths for searching for input files +#' @param paths A character vector of paths to search for usable input files. #' @param ... 
additional args #' @export getInputFiles <- function(paths, ...){ @@ -1107,8 +1144,8 @@ getInputFiles <- function(paths, ...){ #' #' This function will read in processed 10x cell ranger files and identify barcodes that are associated with a cell that passed QC. #' -#' @param csvFiles vector of filenames for reading in and identifying valid cell barcodes -#' @param sampleNames sample names associated with each individual csv file +#' @param csvFiles A character vector of file names to be read in for identification of valid cell barcodes. +#' @param sampleNames A character vector containing the sample names to be associated with each individual entry in `csvFiles`. #' @param ... additional args #' @export getValidBarcodes <- function(csvFiles, sampleNames, ...){ diff --git a/R/ArrowRead.R b/R/ArrowRead.R index 932b06af..dcee427f 100644 --- a/R/ArrowRead.R +++ b/R/ArrowRead.R @@ -7,9 +7,9 @@ #' This function retrieves the fragments from a given ArrowFile as a GRanges object. #' #' @param ArrowFile The ArrowFile object from which fragments should be obtained. -#' @param chr A name of a chromosome to be used to subset the fragments GRanges object to a specific chromsome if desired. -#' @param cellNames QQQ matrix output name in ArrowFiles cannot be a protected matrix name -#' @param verbose A boolean variable indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output. +#' @param chr A name of a chromosome to be used to subset the fragments `GRanges` object to a specific chromosome if desired. +#' @param cellNames A string vector of cell names to extract fragments for. By default will extract all cells getCellNames(ArchRProject) for cells only in current project. +#' @param verbose A boolean value indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output. #' @param ... 
additional params #' @export getFragmentsFromArrow <- function( @@ -147,10 +147,12 @@ getFragmentsFromArrow <- function( #' This function gets a given data matrix from an individual ArrowFile. #' #' @param ArrowFile The ArrowFile object from which the selected data matrix should be obtained. -#' @param useMatrix QQQ The name of the data matrix to retrieve from the given ArrowFile. Options include "TileMatrix", "GeneScoreMatrix", QQQ. +#' @param useMatrix The name of the data matrix to retrieve from the given ArrowFile. Options include "TileMatrix", "GeneScoreMatrix", etc. #' @param useSeqnames A character vector of chromosome names to be used to subset the data matrix being obtained. -#' @param cellNames QQQ ceiling for the number of counts per feature -#' @param verbose A boolean variable indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output. +#' @param cellNames A string vector of cell names to extract the matrix for. By default will extract all cells getCellNames(ArchRProject) for cells only in current project. +#' @param ArchRProj An ArchRProject object to be used for getting additional information for cells in cellColData. +#' @param verbose A boolean value indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output. +#' @param binarize A boolean value indicating whether the matrix should be binarized before return. This is often desired when working with insertion counts. #' @param ... 
additional params #' @export getMatrixFromArrow <- function( @@ -581,7 +583,7 @@ getMatrixFromArrow <- function( rownames(featureDF) <- paste0("f", seq_len(nrow(featureDF))) fnames <- rownames(featureDF) - featureDF <- split(featureDF, as.character(featureDF$seqnames)) + featureDF <- S4Vectors::split(featureDF, as.character(featureDF$seqnames)) ns <- lapply(seq_along(ArrowFiles), function(y){ length(.availableCells(ArrowFiles[y], useMatrix)) diff --git a/R/Clustering.R b/R/Clustering.R index 7cd68651..e6a851be 100644 --- a/R/Clustering.R +++ b/R/Clustering.R @@ -1,15 +1,25 @@ +########################################################################################## +# Clustering Methods +########################################################################################## + +#' Add cluster information to an ArchRProject #' -#' This function will identify clusters for single cell reduced dimensions supplied or from and ArchRProject +#' This function will identify clusters from a reduced dimensions object in an ArchRProject or from a supplied reduced dimensions matrix. #' -#' @param input ArchRProject or matrix for cluster identification -#' @param reducedDims reducedDims of ArchRProject if provided -#' @param name name of cluster column if input is ArchRProject -#' @param method supported methods are Seurat and LouvainJaccard -#' @param dimsToUse reduced dims to use -#' @param knnAssign number of nearest neighbors for assignment of outliers and estimation -#' @param nOutlier number of cells required for a cluster to be called if not then these will be considered an outlier -#' @param seed seed -#' @param ... arguments to provide Seurat::FindClusters or ArchR:::.clustLouvain (knn = 50, jaccard = TRUE) +#' @param input Either (i) an `ArchRProject` object containing the dimensionality reduction matrix passed by `reducedDims` or (ii) a dimensionality reduction matrix. This object will be used for cluster identification. 
+#' @param reducedDims The name of the `reducedDims` object (e.g. IterativeLSI) to retrieve from the designated `ArchRProject`. Not required if input is a matrix. +#' @param name The column name of the column to be added to `cellColData` if `input` is an `ArchRProject` object. +#' @param sampleCells An integer specifying the number of cells to subsample for clustering; the remaining cells are then assigned to clusters by euclidean distance. +#' @param seed A number to be used as the seed for random number generation required in cluster determination. It is recommended to keep track of the seed used so that you can reproduce results downstream. +#' @param method A string indicating the clustering method to be used. Supported methods are "Seurat" and "Scran". +#' @param dimsToUse A vector containing the dimensions from the `reducedDims` object to use in clustering. +#' @param corCutOff A numeric cutoff for the correlation of each dimension to the sequencing depth. If the dimension has a correlation to sequencing depth that is greater than the corCutOff, it will be excluded from analysis. +#' @param knnAssign The number of nearest neighbors to be used during clustering for assignment of outliers (clusters with fewer than nOutlier cells). +#' @param nOutlier The minimum number of cells required for a group of cells to be called as a cluster. If a group of cells does not reach this threshold, then the cells will be considered outliers and assigned to nearby clusters. +#' @param verbose A boolean value indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output. +#' @param tstart The time at which the function run was started. Useful for keeping track of how long clustering takes relative to a start time. +#' @param force A boolean value that indicates whether or not to overwrite data in a given column when the value passed to `name` already exists as a column name in `cellColData`. +#' @param ... 
Additional arguments to be provided to Seurat::FindClusters or scran::buildSNNGraph (for example, knn = 50, jaccard = TRUE) #' @export #' addClusters <- function( @@ -40,7 +50,6 @@ addClusters <- function( stop("Error reducedDims not available!") } matDR <- getReducedDims(ArchRProj = input, reducedDims = reducedDims, dimsToUse = dimsToUse, corCutOff = corCutOff) - print(dim(matDR)) }else if(inherits(input, "matrix")){ matDR <- input }else{ @@ -86,6 +95,7 @@ addClusters <- function( }else if(grepl("louvainjaccard",tolower(method))){ + stop("LouvainJaccard method not currently functional!") clust <- .clustLouvain(matDR, ...) }else{ @@ -99,7 +109,7 @@ addClusters <- function( ################################################################################# if(estimatingClusters == 1){ .messageDiffTime("Finding Nearest Clusters", tstart, verbose = verbose) - knnAssigni <- computeKNN(matDR, matDRAll[-idx,], knnAssign) + knnAssigni <- .computeKNN(matDR, matDRAll[-idx,], knnAssign) clustUnique <- unique(clust) clustMatch <- match(clust, clustUnique) knnAssigni <- apply(knnAssigni, 2, function(x) clustMatch[x]) @@ -128,7 +138,7 @@ addClusters <- function( for(i in seq_along(clustAssign)){ clusti <- names(clustAssign[i]) idxi <- which(clust==clusti) - knni <- computeKNN(matDR[-idxi,], matDR[idxi,], knnAssign) + knni <- .computeKNN(matDR[-idxi,], matDR[idxi,], knnAssign) clustf <- unlist(lapply(seq_len(nrow(knni)), function(x) names(sort(table(clust[-idxi][knni[x,]]),decreasing=TRUE)[1]))) clust[idxi] <- clustf } @@ -149,7 +159,7 @@ addClusters <- function( } .messageDiffTime(sprintf("Assigning Cluster Names to %s Clusters", length(unique(clust))), tstart, verbose = verbose) meanSVD <- t(.groupMeans(t(matDR), clust)) - meanKNN <- computeKNN(meanSVD, meanSVD, nrow(meanSVD)) + meanKNN <- .computeKNN(meanSVD, meanSVD, nrow(meanSVD)) idx <- sample(seq_len(nrow(meanSVD)), 1) clustOld <- c() clustNew <- c() @@ -325,9 +335,8 @@ addClusters <- function( # } -#' Group Means #' 
@export -computeKNN <- function(data, query = NULL, k = 50, method = NULL, includeSelf = FALSE, ...){ +.computeKNN <- function(data, query = NULL, k = 50, method = NULL, includeSelf = FALSE, ...){ if(is.null(query)){ query <- data @@ -388,8 +397,3 @@ computeKNN <- function(data, query = NULL, k = 50, method = NULL, includeSelf = } - - - - - diff --git a/R/CoAccessibility.R b/R/CoAccessibility.R index 318cf43e..4b2ce0c3 100644 --- a/R/CoAccessibility.R +++ b/R/CoAccessibility.R @@ -1,18 +1,22 @@ -#' Add Peak Co-Accessibility to ArchR Project +########################################################################################## +# Co-accessibility Methods +########################################################################################## + +#' Add Peak Co-Accessibility to an ArchRProject #' -#' This function will randomly group cells and compute correlations of knn groupings +#' This function will add co-accessibility scores to peaks in a given ArchRProject #' -#' @param ArchRProj ArchRProject -#' @param reducedDims reduced dimensions for KNN groupings -#' @param k k-nearest neighbors -#' @param knnIteration number of KNN groupings to test overlapCutoff -#' @param overlapCutoff overlap maximum between group and previous groups to be added to group list -#' @param maxDist maximum distance in bp between peaks for co-accessibility -#' @param scaleTo scale group accessibility to prior to computing correlations -#' @param log2Norm log2 normalize prior to computing correlations -#' @param seed seed for sampling -#' @param knnMethod method for KNN computations -#' @param threads number of threads +#' @param ArchRProj An `ArchRProject` object. +#' @param reducedDims The name of the `reducedDims` object (i.e. "IterativeLSI") to retrieve from the designated `ArchRProject`. +#' @param k The number of k-nearest neighbors to use for creating single cell groups for correlation. 
+#' @param knnIteration The number of KNN groupings to test for passing the supplied `overlapCutoff`. +#' @param overlapCutoff The maximum allowable overlap between the current group and all previous groups to permit the current group to be added to the group list during k-nearest neighbor calculations. +#' @param maxDist The maximum allowable distance in basepairs between two peaks to consider for co-accessibility. +#' @param scaleTo A numeric value indicating how to scale the accessibility of a single cell group prior to computing co-accessibility correlations. +#' @param log2Norm A boolean value indicating whether to log2 transform the single cell groups prior to computing co-accessibility correlations. +#' @param seed A number to be used as the seed for random number generation required in cluster determination. It is recommended to keep track of the seed used so that you can reproduce results downstream. +#' @param knnMethod The method to be used for k-nearest neighbor computations. Options are "nabor", "RANN", and "FNN" and the corresponding package is required. +#' @param threads The number of threads to be used for parallel computing. #' @param ... additional args #' @export addCoAccessibility <- function( @@ -25,7 +29,8 @@ addCoAccessibility <- function( scaleTo = 10^4, log2Norm = TRUE, seed = 1, - knnMethod = "nabor", + knnMethod = "nabor", + threads = 1, ... 
){ @@ -41,7 +46,7 @@ addCoAccessibility <- function( #KNN Matrix .messageDiffTime("Computing KNN", tstart) - knnObj <- computeKNN(data = rD, query = rD[idx,], k = k, method = knnMethod) + knnObj <- .computeKNN(data = rD, query = rD[idx,], k = k, method = knnMethod) #Determin Overlap .messageDiffTime("Identifying Non-Overlapping KNN pairs", tstart) diff --git a/R/ColorPalettes.R b/R/ColorPalettes.R index 1a195e3a..ae527d64 100644 --- a/R/ColorPalettes.R +++ b/R/ColorPalettes.R @@ -1,4 +1,11 @@ -#' List of palettes to be used in plots +########################################################################################## +# Plot Aesthetics Objects and Methods +########################################################################################## + +#' List of color palettes that can be used in plots +#' +#' A collection of some original and some borrowed color palettes to provide appealing color aesthetics for plots in ArchR +#' #' @export ArchRPalettes <- list( @@ -82,12 +89,11 @@ ArchRPalettes <- list( #' Optimized discrete color palette generation #' -#' This function assesses the number of inputs and returns a discrete color palette that is tailored to provide the most posible color contrast from the designated set. +#' This function assesses the number of inputs and returns a discrete color palette that is tailored to provide the most possible color contrast from the designated color set. #' -#' @param set The name or numeric index of a color palette provided in ArchR_palettes. -#' @param values A character vector containing the sample names that will be used. Each entry in this character vector will be given a unique color from the designated pallete set. +#' @param set The name or numeric index of a color palette provided in the `ArchRPalettes` list object. +#' @param values A character vector containing the sample names that will be used. Each entry in this character vector will be given a unique color from the designated palette set. 
#' @param reverse A boolean variable that indicates whether to return the palette colors in reverse order. -#' @param returnStructure QQQ return structure palette #' @export paletteDiscrete <- function( set = "stallion", @@ -133,10 +139,9 @@ paletteDiscrete <- function( #' Continuous Color Palette #' -#' @param set The name or numeric index of a color palette provided in ArchR_palettes. +#' @param set The name or numeric index of a color palette provided in the `ArchRPalettes` list object. #' @param n The number of unique colors to generate as part of this continuous color palette. #' @param reverse A boolean variable that indicates whether to return the palette colors in reverse order. -#' @param returnStructure QQQ return structure palette #' @export paletteContinuous <- function( set = "solar_extra", diff --git a/R/ComputeEmbedding.R b/R/ComputeEmbedding.R index 8ca463fd..4a1c7eb2 100644 --- a/R/ComputeEmbedding.R +++ b/R/ComputeEmbedding.R @@ -1,27 +1,22 @@ -#' Add embedding of a reduced dimensions object in an ArchRProject +########################################################################################## +# Embedding Methods +########################################################################################## + +#' Add an embedding of a reduced dimensions object to an ArchRProject #' -#' This function will compute an embedding and add to an ArchRProject. +#' This function will compute an embedding and add it to an ArchRProject. #' -#' @param ArchRProj An ArchRProject object. -#' @param reducedDims QQQ The name of the reducedDims object to use. Possible options include "IterativeLSI", QQQ. -#' @param embedding QQQ The name of the embedding to create. Possible options include "UMAP", "TUMAP", "RTSNE", and "FFRTSNE". 
-#' @param colorBy QQQ colorBy cellColData or Arrays in Arrows (ie GeneScoreMatrix) -#' @param name QQQ name of column in cellColData or Feature in Array in Arrows -#' @param log2Norm A boolean value that indicates whether log2 Normalization should be performed on the features if they are continuous. -#' @param pal The name or numeric index of a custom palette from ArchR_palettes to use for plotting the individual points of the embedding visualization. -#' @param size The numeric size of points to plot. -#' @param rastr A boolean valut that indicates that the plot should be rasterized. This does not rasterize lines and labels, just the internal portions of the plot. -#' @param quantCut If this is not null, a quantile cut is performed to threshold the top and bottom of the distribution. This prevents skewed color scales caused by strong outliers. The format of this should be c(x,y) where x is the upper threshold and y is the lower threshold. For example, quantileCut = c(0.975,0.025) will take the top and bottom 2.5% of values and set them to the value of the 97.5th and 2.5th percentile values respectively. -#' @param quantHex QQQ quantile evaluation for each hex in geom_hex -#' @param discreteSet QQQ The name or numeric index of a discrete palette from ArchR_palettesdiscrete to use for plotting QQQ. -#' @param continuousSet QQQcontinuous palette for visualizing embedding -#' @param randomize A boolean value that determines whether to randomly order the plotting of points to avoid (for ex.) all points from a single cluster being plotted as the top-most layer of the plot. -#' @param keepAxis QQQ keep x and y axis for plot -#' @param baseSize QQQ The numeric font size to be used in the plot. This applies to all plot labels. 
-#' @param plotContinuous QQQ how to plot continuous features (points and hex) -#' @param plotParams QQQ additional params to pass to ggPoint/ggHex -#' @param plotWidth QQQ plot width used for creating a consistent plot independent of legend size -#' @param plotHeight QQQ plot height used for creating a consistent plot independent of legend size +#' @param ArchRProj An `ArchRProject` object. +#' @param reducedDims The name of the `reducedDims` object (i.e. IterativeLSI) to use from the designated `ArchRProject`. +#' @param embedding The type of the embedding to add to the `ArchRProject` object. Possible options include "UMAP", "TUMAP", "RTSNE", and "FFRTSNE". +#' @param embeddingOut The name for the embedding to be stored as in the `ArchRProject` object. +#' @param dimsToUse A vector containing the dimensions from the `reducedDims` object to use in computing the embedding. +#' @param corCutOff A numeric cutoff for the correlation of each dimension to the sequencing depth. If the dimension has a correlation to sequencing depth that is greater than the corCutOff, it will be excluded from analysis. +#' @param saveModel A boolean value indicating whether to save UMAP model for later usage such as projection. Only relevant for UMAP. +#' @param seed A number to be used as the seed for random number generation required in cluster determination. It is recommended to keep track of the seed used so that you can reproduce results downstream. +#' @param force A boolean value that indicates whether or not to overwrite the relevant data in the `ArchRProject` object if the given `embedding` already exists. +#' @param threads The number of threads to use for embedding generation computations. +#' @param embeddingParams A list of extra parameters to pass to the designated `embedding` function. #' @param ... 
additional args #' @export addEmbedding <- function( @@ -34,7 +29,7 @@ addEmbedding <- function( saveModel = TRUE, seed = 1, force = FALSE, - threads = floor(detectCores()/2), + threads = 1, embeddingParams = list(), ... ){ @@ -170,12 +165,3 @@ addEmbedding <- function( } - - - - - - - - - diff --git a/R/CreateArrow.R b/R/CreateArrow.R index cade7718..f40011f3 100644 --- a/R/CreateArrow.R +++ b/R/CreateArrow.R @@ -2,38 +2,43 @@ #' #' This function will create Arrow Files from input files. These Arrow Files are the main constituent for downstream analysis in ArchR. #' -#' @param inputFiles The names of the input files to use to generate the arrow files. These files can be in any of the following formats: tabix QQQ, QQQ BAM, or a fragments file). The precise format of each file type QQQ... +#' @param inputFiles The names of the input files to use to generate the arrow files. These files can be in any of the following formats: scATAC tabix fragment files or bam files. #' @param sampleNames The names to assign to the samples that correspond to the "inputFiles". Each input file should receive a unique sample name. This list should be in the same order as "inputFiles". #' @param outputNames The prefix to use for output files. Each input file should receive a unique output file name. This list should be in the same order as "inputFiles". For example, if the predix is "PBMC" the output file will be named "PBMC.arrow" -#' @param geneAnno QQQ The geneAnnotation in QQQ format to associate with these arrow files. This is used downstream to calculate TSS Scores etc. -#' @param genomeAnno QQQ The genomeAnnotation in QQQ format to associate with these arrow files. This is used downstream to collect chromosome sizes and nucleotide information etc. +#' @param validBarcodes A list of validBarcode strings to be used for filtering cells read from each input file (see getValidBarcodes for 10x fragment files). 
+#' @param geneAnno The geneAnnotation (see createGeneAnnotation) to associate with these arrow files. This is used downstream to calculate TSS Enrichment Scores etc. +#' @param genomeAnno The genomeAnnotation (see createGenomeAnnotation) format to associate with these arrow files. This is used downstream to collect chromosome sizes and nucleotide information etc. #' @param filterFrags The minimum number of mapped ATAC-seq fragments required per cell to pass filtering for use in downstream analyses. #' @param filterTSS The minimum numeric transcription start site (TSS) enrichment score required for a cell to pass filtering for use in downstream analyses. TSS enrichment score is a measurement of signal-to-background in ATAC-seq. #' @param removeFilteredCells A boolean value that determines whether to remove fragments corresponding to cells that do not pass filterFrags and filterTSS. -#' @param minFrags QQQ min fragments per cell to be immediately filtered +#' @param minFrags The minimum fragments per cell to be filtered immediately before any QC calculations (such as TSS Enrichment). #' @param outDir The name or path for the output directory for QC-level information and plots for each sample/arrow. #' @param nucLength The length in basepairs that wraps around a nucleosome. This number is used for identifying fragments as sub-nucleosome, mono-nucleosome, or multi-nucleosome spanning -#' @param TSSParams QQQ TSS parameters for computing TSS scores +#' @param TSSParams TSS parameters for computing TSS Enrichment scores. This includes `window` which describes the window centered at each TSS (default 101), the `flank` which describes the +/- window size to compute TSS enrichment (default 2000) , the `norm` which describes the normalization window size at the flanks to compute TSS enrichment (default 100 i.e. -2000:-1901 and 1901:2000). #' @param excludeChr The names of chromosomes to be excluded from downstream analyses. 
In most human/mouse analyses, this includes the mitochondrial DNA (chrM) and the male sex chromosome (chrY). This does, however, not exclude the corresponding fragments from being stored in the .arrow file. #' @param nChunk The number of chunks to divide each chromosome into reading in input files. Higher numbers reduce memory usage but increase compute time. #' @param bcTag The name of the field in the input bam file containing the barcode tag information. See ScanBam in Rsamtools. -#' @param bamFlag QQQ A list of bam flags to be used for reading in fragments from input bam files. Fromat should be QQQ. See ScanBam in Rsamtools. +#' @param gsubExpression A regular expression to clean up the barcode tag read in from a bam file. For example if the barcode is appended to the qname (read name) like for Shendure mouse data the gsubExpression would be ":.*" for getting the string after the colon in the qname. +#' @param bamFlag A list of bam flags to be used for reading in fragments from input bam files. Format should be scanBamFlag for ScanBam in Rsamtools. #' @param offsetPlus The numeric offset to apply to a "+" stranded Tn5 insertion to account for the precise Tn5 binding site. See Buenrostro et al. Nature Methods 2013. -#' @param offsetMinus The numeric offset to apply to a "-" stranded Tn5 insertion to account for the precise Tn5 binding site. Ssee Buenrostro et al. Nature Methods 2013. +#' @param offsetMinus The numeric offset to apply to a "-" stranded Tn5 insertion to account for the precise Tn5 binding site. See Buenrostro et al. Nature Methods 2013. #' @param addTileMat A boolean value indicating whether to add a "Tile Matrix" to each Arrow file. A Tile Matrix is a counts matrix that, instead of using peaks, uses a fixed-width sliding window of bins across the whole genome. -#' @param TileMatParams A list of parameters to pass to the addTileMatrix function. See addTileMatrix for options. 
+#' @param TileMatParams A list of parameters to pass to the `addTileMatrix()` function. See `ArchR::addTileMatrix()` for options. #' @param addGeneScoreMat A boolean value indicating whether to add a Gene-Score Matrix to each Arrow file. A Gene-Score Matrix uses ATAC-seq signal proximal to the TSS to estimate gene activity. -#' @param GeneScoreMatParams A list of parameters to pass to the addGeneScoreMatrix function. See addGeneScoreMatrix for options. -#' @param force A bollean value indicating whether to force arrow files to be overwritten if already exist in outDir. -#' @param threads The number threads to be used for parallel computing. -#' @param parallelParam QQQ A list of parameters to be used for batch-style parallel computing. +#' @param GeneScoreMatParams A list of parameters to pass to the [addGeneScoreMatrix()] function. See `ArchR::addGeneScoreMatrix()` for options. +#' @param force A boolean value indicating whether to force arrow files to be overwritten if they already exist in `outDir`. +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param verboseHeader A boolean value that determines whether standard output includes verbose sections. +#' @param verboseAll A boolean value that determines whether standard output includes verbose subsections. #' @param ... 
additional args #' @export +#' createArrowFiles <- function( inputFiles = NULL, sampleNames = NULL, outputNames = paste0("./", sampleNames), - validBaroces = NULL, + validBarcodes = NULL, geneAnno = NULL, genomeAnno = NULL, filterFrags = 1000, @@ -253,6 +258,7 @@ createArrowFiles <- function( }, error = function(x){ + suppressWarnings(sink()) .messageDiffTime("Continuing through after error ggplot for Fragment Size Distribution", tstart) print(x) message("\n") @@ -308,6 +314,7 @@ createArrowFiles <- function( }, error = function(x) { + suppressWarnings(sink()) .messageDiffTime("Continuing through after error ggplot for TSS by Frags", tstart) print(x) message("\n") @@ -588,7 +595,7 @@ createArrowFiles <- function( } ######################################################################################################### -# Methods to Turn Input File into a Temp File that can then be Efficiently converted to an Arrow! +# Methods to turn input file into a temp file that can then be efficiently converted to an ArrowFile! ######################################################################################################### .isTabix <- function(file){ tryCatch({ @@ -864,7 +871,7 @@ createArrowFiles <- function( ######################################################################################################### -# Methods to temp file to arrow! +# Methods to convert temp file to an ArrowFile! 
######################################################################################################### .tmpToArrow <- function( diff --git a/R/DoubletsScores.R b/R/DoubletsScores.R index 8704d0ba..ce989e26 100644 --- a/R/DoubletsScores.R +++ b/R/DoubletsScores.R @@ -1,20 +1,25 @@ +########################################################################################## +# Doublet Identification Methods +########################################################################################## + #' Add Doublet Scores to a collection of Arrow files or an ArchRProject #' #' For each sample in the Arrow files or ArchRProject provided, this function will independently assign inferred doublet information #' to each cell. This allows for removing strong heterotypic doublet-based clusters downstream. A doublet results from a droplet that #' contained two cells, causing the ATAC-seq data to be a mixture of the signal from each cell. #' -#' @param input An ArchRProject object or a set of ArrowFiles. -#' @param useMatrix The name of the matrix to be used for performing doublet identification analyses. +#' @param input An `ArchRProject` object or a character vector containing the names of ArrowFiles to be used. +#' @param useMatrix The name of the matrix to be used for performing doublet identification analyses. Options include "TileMatrix", "PeakMatrix". #' @param k The number of cells neighboring a simulated doublet to be considered as putative doublets. -#' @param nTrials The number of trials (in terms of the number of input cells) to simulate doublets when calculating doublet scores. A value of 5 would utilize 5N trials. +#' @param nTrials The number of trials (in terms of the number of input cells) to simulate doublets when calculating doublet scores. A value of 5 would utilize 5 trials. +#' @param dimsToUse A vector containing the dimensions from the `reducedDims` object to use in clustering. 
+#' @param corCutOff A numeric cutoff for the correlation of each dimension to the sequencing depth. If the dimension has a correlation to sequencing depth that is greater than the corCutOff, it will be excluded from analysis. #' @param knnMethod The name of the dimensionality reduction method to be used for k-nearest neighbors calculation. Possible values are "UMAP" or "SVD". -#' @param UMAPParams The list of parameters to pass to the UMAP function. See the function umap in the uwot package. -#' @param LSIParams The list of parameters to pass to the IterativeLSI. See IterativeLSI. -#' @param useClusters QQQ A boolean value that determins QQQ -#' @param outDir The name or path for the output directory for writing information on doublet identification, -#' @param threads The number threads to be used for parallel computing. -#' @param parallelParam QQQ A list of parameters to be used for batch-style parallel computing. +#' @param UMAPParams The list of parameters to pass to the UMAP function if "UMAP" is designated to `knnMethod`. See the function umap in the uwot package. +#' @param LSIParams The list of parameters to pass to the IterativeLSI function. See IterativeLSI. +#' @param outDir The name or path for the output directory for plot/result information on doublet identification, +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. #' @param verboseHeader A boolean value that determines whether standard output includes verbose sections. #' @param verboseAll A boolean value that determines whether standard output includes verbose subsections. #' @param ... 
additional args @@ -132,7 +137,6 @@ addDoubletScores <- function( dir.create(tmpDir) proj <- suppressMessages(ArchRProject( ArrowFiles = ArrowFile, - sampleNames = .sampleName(ArrowFile), outputDirectory = tmpDir, copyArrows = FALSE, showLogo = FALSE, @@ -242,99 +246,109 @@ addDoubletScores <- function( dfDoub$color <- dfDoub$density tmpFile <- .tempfile() - sink(tmpFile) - #Plot Doublet Summary - pdf(file.path(outDir, paste0(.sampleName(ArrowFile), "-Doublet-Summary.pdf")), width = 6, height = 6) + o <- tryCatch({ - #Plot Doublet Density - xlim <- range(df$X1) %>% extendrange(f = 0.05) - ylim <- range(df$X2) %>% extendrange(f = 0.05) - - if(!requireNamespace("ggrastr", quietly = TRUE)){ - - message("ggrastr is not available for rastr of points, continuing without rastr!") - - pdensity <- ggplot() + - geom_point(data = df, aes(x=X1,y=X2),color="lightgrey", size = 0.5) + - geom_point(data = dfDoub, aes(x=x,y=y,colour=color), size = 0.5) + - scale_colour_gradientn(colors = pal) + - xlab("UMAP Dimension 1") + ylab("UMAP Dimension 2") + - guides(fill = FALSE) + theme_ArchR(baseSize = 6) + - labs(color = "Simulated Doublet Density") + - theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), - axis.text.y = element_blank(), axis.ticks.y = element_blank()) + - coord_equal(ratio = diff(xlim)/diff(ylim), xlim = xlim, ylim = ylim, expand = FALSE) + - ggtitle("Doublet Density Overlayed") + theme(legend.direction = "horizontal", - legend.box.background = element_rect(color = NA)) + sink(tmpFile) - }else{ + #Plot Doublet Summary + pdf(file.path(outDir, paste0(.sampleName(ArrowFile), "-Doublet-Summary.pdf")), width = 6, height = 6) - .requirePackage("ggrastr") - - pdensity <- ggplot() + - geom_point_rast(data = df, aes(x=X1,y=X2),color="lightgrey", size = 0.5) + - geom_point_rast(data = dfDoub, aes(x=x,y=y,colour=color), size = 0.5) + - scale_colour_gradientn(colors = pal) + - xlab("UMAP Dimension 1") + ylab("UMAP Dimension 2") + - labs(color = "Simulated Doublet 
Density") + - guides(fill = FALSE) + theme_ArchR(baseSize = 6) + - theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), - axis.text.y = element_blank(), axis.ticks.y = element_blank()) + - coord_equal(ratio = diff(xlim)/diff(ylim), xlim = xlim, ylim = ylim, expand = FALSE) + - ggtitle("Doublet Density Overlayed") + theme(legend.direction = "horizontal", - legend.box.background = element_rect(color = NA)) + #Plot Doublet Density + xlim <- range(df$X1) %>% extendrange(f = 0.05) + ylim <- range(df$X2) %>% extendrange(f = 0.05) + + if(!requireNamespace("ggrastr", quietly = TRUE)){ + + message("ggrastr is not available for rastr of points, continuing without rastr!") + + pdensity <- ggplot() + + geom_point(data = df, aes(x=X1,y=X2),color="lightgrey", size = 0.5) + + geom_point(data = dfDoub, aes(x=x,y=y,colour=color), size = 0.5) + + scale_colour_gradientn(colors = pal) + + xlab("UMAP Dimension 1") + ylab("UMAP Dimension 2") + + guides(fill = FALSE) + theme_ArchR(baseSize = 6) + + labs(color = "Simulated Doublet Density") + + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), + axis.text.y = element_blank(), axis.ticks.y = element_blank()) + + coord_equal(ratio = diff(xlim)/diff(ylim), xlim = xlim, ylim = ylim, expand = FALSE) + + ggtitle("Doublet Density Overlayed") + theme(legend.direction = "horizontal", + legend.box.background = element_rect(color = NA)) + + }else{ + + .requirePackage("ggrastr") + + pdensity <- ggplot() + + geom_point_rast(data = df, aes(x=X1,y=X2),color="lightgrey", size = 0.5) + + geom_point_rast(data = dfDoub, aes(x=x,y=y,colour=color), size = 0.5) + + scale_colour_gradientn(colors = pal) + + xlab("UMAP Dimension 1") + ylab("UMAP Dimension 2") + + labs(color = "Simulated Doublet Density") + + guides(fill = FALSE) + theme_ArchR(baseSize = 6) + + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), + axis.text.y = element_blank(), axis.ticks.y = element_blank()) + + coord_equal(ratio = 
diff(xlim)/diff(ylim), xlim = xlim, ylim = ylim, expand = FALSE) + + ggtitle("Doublet Density Overlayed") + theme(legend.direction = "horizontal", + legend.box.background = element_rect(color = NA)) - } - - print(.fixPlotSize(pdensity, plotWidth = 6, plotHeight = 6)) - - #Plot Doublet Score - pscore <- ggPoint( - x = df[,1], - y = df[,2], - color = .quantileCut(df$score, 0, 0.95), - xlim = xlim, - ylim = ylim, - discrete = FALSE, - size = 0.5, - xlab = "UMAP Dimension 1", - ylab = "UMAP Dimension 2", - pal = pal, - title = "Doublet Scores -log10(FDR)", - colorTitle = "Doublet Scores -log10(FDR)", - rastr = TRUE, - baseSize = 6 - ) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), - axis.text.y = element_blank(), axis.ticks.y = element_blank()) - - grid::grid.newpage() - print(.fixPlotSize(pscore, plotWidth = 6, plotHeight = 6)) - - #Plot Enrichment Summary - penrich <- ggPoint( - x = df[,1], - y = df[,2], - color = .quantileCut(df$enrichment, 0, 0.95), - xlim = xlim, - ylim = ylim, - discrete = FALSE, - size = 0.5, - xlab = "UMAP Dimension 1", - ylab = "UMAP Dimension 2", - pal = pal, - title = "Doublet Enrichment", - colorTitle = "Doublet Enrichment", - rastr = TRUE - ) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), - axis.text.y = element_blank(), axis.ticks.y = element_blank()) - - grid::grid.newpage() - print(.fixPlotSize(penrich, plotWidth = 6, plotHeight = 6)) + } + + print(.fixPlotSize(pdensity, plotWidth = 6, plotHeight = 6)) + + #Plot Doublet Score + pscore <- ggPoint( + x = df[,1], + y = df[,2], + color = .quantileCut(df$score, 0, 0.95), + xlim = xlim, + ylim = ylim, + discrete = FALSE, + size = 0.5, + xlab = "UMAP Dimension 1", + ylab = "UMAP Dimension 2", + pal = pal, + title = "Doublet Scores -log10(FDR)", + colorTitle = "Doublet Scores -log10(FDR)", + rastr = TRUE, + baseSize = 6 + ) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), + axis.text.y = element_blank(), axis.ticks.y = 
element_blank()) + + grid::grid.newpage() + print(.fixPlotSize(pscore, plotWidth = 6, plotHeight = 6)) + + #Plot Enrichment Summary + penrich <- ggPoint( + x = df[,1], + y = df[,2], + color = .quantileCut(df$enrichment, 0, 0.95), + xlim = xlim, + ylim = ylim, + discrete = FALSE, + size = 0.5, + xlab = "UMAP Dimension 1", + ylab = "UMAP Dimension 2", + pal = pal, + title = "Doublet Enrichment", + colorTitle = "Doublet Enrichment", + rastr = TRUE + ) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), + axis.text.y = element_blank(), axis.ticks.y = element_blank()) + + grid::grid.newpage() + print(.fixPlotSize(penrich, plotWidth = 6, plotHeight = 6)) - dev.off() - sink() - file.remove(tmpFile) + dev.off() + sink() + file.remove(tmpFile) + + }, error = function(x){ + + suppressWarnings(sink()) + message(x) + + }) summaryList <- SimpleList( originalDataUMAP = df, @@ -468,11 +482,11 @@ addDoubletScores <- function( #Compute KNN if(toupper(knnMethod) == "SVD"){ - knnDoub <- computeKNN(LSI$matSVD, simLSI, k) + knnDoub <- .computeKNN(LSI$matSVD, simLSI, k) }else if(toupper(knnMethod) == "UMAP"){ - knnDoub <- computeKNN(uwotUmap[[1]], umapProject, k) + knnDoub <- .computeKNN(uwotUmap[[1]], umapProject, k) }else{ @@ -518,9 +532,10 @@ addDoubletScores <- function( #' This function will read in the .best file output from demuxlet and add the doublet #' classifications into the cellColData for the ArchR Project #' -#' @param ArchRProj An ArchRProject object. -#' @param bestFiles The file path to the .best files created by Demuxlet. There should be one .best file for each sample in the ArchRProject. -#' @param sampleNames The sample names corresponding to the .best files. These must match the sample names present in the ArchRProject. +#' @param ArchRProj An `ArchRProject` object. +#' @param bestFiles The file path to the .best files created by Demuxlet. There should be one .best file for each sample in the `ArchRProject`. 
+#' @param sampleNames The sample names corresponding to the .best files. These must match the sample names present in the `ArchRProject`. +#' @param ... additional args #' @export addDemuxletResults <- function(ArchRProj, bestFiles, sampleNames, ...){ diff --git a/R/FilterCells.R b/R/FilterCells.R index 1fc4b8bd..226ceb71 100644 --- a/R/FilterCells.R +++ b/R/FilterCells.R @@ -1,9 +1,19 @@ +########################################################################################## +# Cell Filtering Methods +########################################################################################## + #' Filter cells in an ArchRProject #' -#' This function plots a list of filters to see which cells would pass filter. +#' This function returns an ArchRProject object that has been filtered to remove cells that do not pass the filter criteria supplied in filterList. #' -#' @param ArchRProj an ArchR Project object -#' @param filterList list of filters for filtering cells from cellColData +#' @param ArchRProj An `ArchRProject` object +#' @param filterList A list of filters based on cellColData to apply when filtering cells. +#' Format should be a named list where the name corresponds to a column name in cellColData and the value corresponds to the filter criteria. +#' If numeric, a lower threshold is expected, below which cells are filtered (a higher threshold is optional). +#' If a character vector, rows in cellColData corresponding to the supplied character values are kept. +#' If a list or simpleList, the user can additionally supply filters that are specific to each sample +#' (i.e. apply to all samples : list("TSSEnrichment" = c(4,25)), +#' apply to specific samples : list(TSSEnrichment = list("Sample1" = c(4, 25), "Sample2" = c(5, 25)))). #' @param ... 
additional params #' @export filterCells <- function(ArchRProj, filterList, ...){ @@ -73,13 +83,16 @@ filterCells <- function(ArchRProj, filterList, ...){ } -#' Filter Plot for cells in an ArchRProject +#' Filter plot for cells in an ArchRProject #' -#' This function plots a list of filters to see which cells would pass filter. +#' This function plots a list of attributes with filter criteria to visualize which cells would pass filter. #' -#' @param ArchRProj an ArchR Project object -#' @param filterList list of filters for filtering cells from cellColData (up to 2 will be plotted) -#' @param sampleNames specific samples to include +#' @param ArchRProj An `ArchRProject` object +#' @param filterList A list of filters based on `cellColData` to apply when filtering cells. +#' Format should be a named numeric list where the name corresponds to a column name in cellColData +#' and the value corresponds to the filter criteria. A lower threshold is expected, below which +#' cells are filtered (a higher threshold is optional). Only the first 2 filters will be plotted. +#' @param sampleNames The sample names corresponding to the subset of samples to plot. If NULL, all samples are included. #' @param ... additional params #' @export filterPlot <- function(ArchRProj, filterList, sampleNames = NULL, ...){ @@ -132,14 +145,14 @@ filterPlot <- function(ArchRProj, filterList, sampleNames = NULL, ...){ } -#' Filter Doublets From an ArchR Project +#' Filter Doublets From an ArchRProject #' -#' This function wil filter doublets from an ArchRProject after addDoubletScores has been ran +#' This function wil filter doublets from an ArchRProject after addDoubletScores has been run #' -#' @param ArchRProj an ArchR Project object -#' @param cutEnrich minimum cutoff for doubletEnrichment which represents number of simulated doublets nearest a cell over the expected if uniform. -#' @param cutScore minimum cutoff for doubletScore which represents -log10 binomial adjusted p-value. 
-#' @param filterRatio filter ratio for max number of inferred doublets to remove based on the number of cells PF. If there are 10,000 cells the maximum would be filterRatio * 10,000^2 / (1000 * 100). +#' @param ArchRProj An `ArchRProject` object +#' @param cutEnrich The minimum numeric cutoff for `DoubletEnrichment`. This number is equivalent to the number of simulated doublets identified as a nearest neighbor to the cell divided by the expected number given a random uniform distribution. +#' @param cutScore The minimum numeric cutoff for `DoubletScore` which represents the -log10(binomial adjusted p-value) for the `DoubletEnrichment`. +#' @param filterRatio JJJ The maximum ratio of predicted doublets to filter based on the number of pass-filter cells. If there are 10,000 cells the maximum would be filterRatio * 10,000^2 / (100,000). #' @param ... additional params #' @export filterDoublets <- function(ArchRProj, cutEnrich = 1, cutScore = -Inf, filterRatio = 1, ...){ @@ -180,9 +193,3 @@ filterDoublets <- function(ArchRProj, cutEnrich = 1, cutScore = -Inf, filterRati } - - - - - - diff --git a/R/Footprinting.R b/R/Footprinting.R index 5a6645a4..2582ec59 100644 --- a/R/Footprinting.R +++ b/R/Footprinting.R @@ -1,24 +1,35 @@ +########################################################################################## +# Transcription Factor Footprinting Methods +########################################################################################## + #' Plot footprints for an ArchRProject #' #' This function will plot footprints for all samples in a given ArchRProject or a properly-formatted Summarized Experiment #' -#' @param input An ArchRProject object or Footprint Summarized Experiment -#' @param positions QQQ A GenomicRangesList, a list, or a SimpleList object containing the positions to incorporate into the footprint. Each position should be QQQ. 
-#' @param groupBy QQQ The column name in sampleColData to use for grouping multiple samples together prior to footprinting. -#' @param useGroups QQQ -#' @param pal The name or numeric index of a custom palette from ArchR_palettes to use for plotting the lines corresponding to the footprints. -#' @param flank QQQ The number of basepairs from the position center to consider as the flank. -#' @param flankNorm QQQ The number of basepairs to consider at the edge of the flank region to be used for footprint normalization. -#' @param smoothWindow QQQ The size in basepairs of the sliding window to be used for smoothing of the footprint signal. -#' @param nTop QQQ The number of positions to consider. Only the top nTop positions based on QQQ will be considered for the footprint. -#' @param normMethod QQQ The name of the normalization method to use to normalize the footprint relative to the Tn5 insertion bias. Options include QQQ. -#' @param threads The number threads to be used for parallel computing. +#' @param ArchRProj An `ArchRProject` object +#' @param positions A GenomicRangesList, a list, or a SimpleList object containing the positions to incorporate into the footprint. Each position should be stranded. +#' @param plotName The prefix to add to the file name for the output PDF file. +#' @param groupBy The name of the column in `cellColData` to use for grouping multiple samples together prior to footprinting. +#' @param useGroups A character vector that is used to select a subset of groups by name from the designated `groupBy` column in `cellColData`. This limits the groups used to perform footprinting. +#' @param pal The name of a custom palette from `ArchRPalettes` to use for plotting the lines corresponding to the footprints. +#' @param flank The number of basepairs from the position center (+/-) to consider as the flank. +#' @param flankNorm The number of basepairs to consider at the edge of the flank region (+/-) to be used for footprint normalization. 
+#' @param smoothWindow The size in basepairs of the sliding window to be used for smoothing of the footprint signal. +#' @param minCells The minimum number of cells required in a given cell group to permit footprint generation. +#' @param nTop The number of genomic regions to consider. Only the top `nTop` genomic regions based on the "score" column in the GRanges will be considered for the footprint. +#' @param normMethod The name of the normalization method to use to normalize the footprint relative to the Tn5 insertion bias. Options include "None", "Subtract", "Divide". +#' @param inputSE Input a previous footprint Summarized Experiment to be plotted instead of being regenerated. +#' @param height The height in inches to be used for the output PDF. +#' @param width The width in inches to be used for the output PDF file. +#' @param addDOC A boolean variable that determines whether to add the date of creation to end of the PDF file name. This is useful for preventing overwritting of old plots. +#' @param useSink Use sink to hide messages during plotting. +#' @param threads The number of threads to be used for parallel computing. #' @param verboseHeader A boolean value that determines whether standard output includes verbose sections. #' @param verboseAll A boolean value that determines whether standard output includes verbose subsections. #' @param ... 
additional args #' @export plotFootprints <- function( - input = NULL, + ArchRProj = NULL, positions = NULL, plotName = "Plot-Footprints", groupBy = "Clusters", @@ -30,6 +41,7 @@ plotFootprints <- function( minCells = 25, nTop = NULL, normMethod = "none", + inputSE = NULL, height = 6, width = 4, addDOC = TRUE, @@ -41,7 +53,7 @@ plotFootprints <- function( ){ tstart <- Sys.time() - if(inherits(input, "ArchRProject")){ + if(is.null(inputSE)){ #Validate Positions if(!inherits(positions, "GenomicRangesList") & !inherits(positions, "list") & !inherits(positions, "SimpleList")){ @@ -65,7 +77,7 @@ plotFootprints <- function( #Get Footprints .messageDiffTime("Summarizing Footprints", tstart, addHeader = verboseAll) seFoot <- .summarizeFootprints( - ArchRProj = input, + ArchRProj = ArchRProj, positions = positions, groupBy = groupBy, useGroups = useGroups, @@ -76,15 +88,13 @@ plotFootprints <- function( verboseAll = verboseAll ) - ArchRProj <- input - }else{ ArchRProj <- NULL - if(inherits(input, "SummarizedExperiment")){ - seFoot <- input - rm(input) + if(inherits(inputSE, "SummarizedExperiment")){ + seFoot <- inputSE + rm(inputSE) gc() if(!is.null(useGroups)){ if(sum(SummarizedExperiment::colData(seFoot)[,1] %in% useGroups) == 0){ @@ -92,6 +102,8 @@ plotFootprints <- function( } seFoot <- seFoot[,SummarizedExperiment::colData(seFoot)[,1] %in% useGroups] } + }else{ + stop("inputSE must be a footprint summarized experiment!") } } @@ -99,51 +111,60 @@ plotFootprints <- function( ############################################################################################ # Plot Helper ############################################################################################ - if(useSink){ - tmpFile <- .tempfile() - sink(tmpFile) - } - name <- gsub("\\.pdf", "", plotName) - if(is.null(ArchRProj)){ - outDir <- "Plots" - }else{ - ArchRProj <- .validArchRProject(ArchRProj) - outDir <- file.path(getOutputDirectory(ArchRProj), "Plots") - } + o <- tryCatch({ + if(useSink){ + 
tmpFile <- .tempfile() + sink(tmpFile) + } - dir.create(outDir, showWarnings = FALSE) - if(addDOC){ - doc <- gsub(":","-",stringr::str_split(Sys.time(), pattern=" ",simplify=TRUE)[1,2]) - filename <- file.path(outDir, paste0(name, "_Date-", Sys.Date(), "_Time-", doc, ".pdf")) - }else{ - filename <- file.path(outDir, paste0(name, ".pdf")) - } + name <- gsub("\\.pdf", "", plotName) + if(is.null(ArchRProj)){ + outDir <- "Plots" + }else{ + ArchRProj <- .validArchRProject(ArchRProj) + outDir <- file.path(getOutputDirectory(ArchRProj), "Plots") + } - pdf(filename, width = width, height = height, useDingbats = FALSE) - - for(i in seq_along(seFoot@assays)){ - print( - grid::grid.draw(.ggFootprint( - seFoot = seFoot, - name = names(seFoot@assays)[i], - pal = pal, - smoothWindow = smoothWindow, - flank = flank, - flankNorm = flankNorm, - normMethod=normMethod - ) - )) - if(i != length(seFoot@assays)){ - grid::grid.newpage() + dir.create(outDir, showWarnings = FALSE) + if(addDOC){ + doc <- gsub(":","-",stringr::str_split(Sys.time(), pattern=" ",simplify=TRUE)[1,2]) + filename <- file.path(outDir, paste0(name, "_Date-", Sys.Date(), "_Time-", doc, ".pdf")) + }else{ + filename <- file.path(outDir, paste0(name, ".pdf")) } - } - dev.off() - if(useSink){ - sink() - file.remove(tmpFile) - } + pdf(filename, width = width, height = height, useDingbats = FALSE) + + for(i in seq_along(seFoot@assays)){ + print( + grid::grid.draw(.ggFootprint( + seFoot = seFoot, + name = names(seFoot@assays)[i], + pal = pal, + smoothWindow = smoothWindow, + flank = flank, + flankNorm = flankNorm, + normMethod=normMethod + ) + )) + if(i != length(seFoot@assays)){ + grid::grid.newpage() + } + } + dev.off() + + if(useSink){ + sink() + file.remove(tmpFile) + } + + }, error = function(x){ + + suppressWarnings(sink()) + message(x) + + }) seFoot @@ -456,22 +477,3 @@ plotFootprints <- function( return(kmers) } - - - - - - - - - - - - - - - - - - - diff --git a/R/GRangesUtils.R b/R/GRangesUtils.R index 
10592d86..70824b08 100644 --- a/R/GRangesUtils.R +++ b/R/GRangesUtils.R @@ -1,20 +1,31 @@ -#-------------------------------------------------------------------------------------------- +########################################################################################## # Helper Functions for GenomicRanges -#-------------------------------------------------------------------------------------------- +########################################################################################## -#' Filters unwanted chr mainly underscores -#' @param x GRanges or something with seqlevels -#' @param remove remove vector -#' @param underscore remove all underscores? -#' @param standard keep standard chromosomes +#' Filters unwanted seqlevels from a Genomic Ranges object or similar object +#' +#' This function allows for removal of manually designated or more broadly undesirable seqlevels from a Genomic Ranges object or similar object +#' +#' @param gr A `GRanges` object or another object containing seqlevels. +#' @param remove A character vector indicating the seqlevels that should be removed if manual removal is desired for certain seqlevels. +#' @param underscore A boolean value indicating whether to remove all seqlevels whose name contains an underscore (for example "chr11_KI270721v1_random"). +#' @param standard A boolean value indicating whether only standard chromosomes should be kept. Standard chromosomes are defined by `GenomeInfoDb::keepStandardChromosomes()`. +#' @param pruning.mode See ?seqinfo for a description of the pruning modes. 
#' @export -keepFilteredChromosomes <- function(x, remove = c("chrM"), underscore = TRUE, standard = TRUE, pruning.mode="coarse"){ +filterChrGR <- function( + gr = NULL, + remove = c("chrM"), + underscore = TRUE, + standard = TRUE, + pruning.mode="coarse" + ){ + #first we remove all non standard chromosomes if(standard){ - x <- GenomeInfoDb::keepStandardChromosomes(x, pruning.mode = pruning.mode) + gr <- GenomeInfoDb::keepStandardChromosomes(gr, pruning.mode = pruning.mode) } #Then check for underscores or specified remove - seqNames <- seqlevels(x) + seqNames <- seqlevels(gr) chrRemove <- c() #first we remove all chr with an underscore if(underscore){ @@ -28,39 +39,27 @@ keepFilteredChromosomes <- function(x, remove = c("chrM"), underscore = TRUE, st chrKeep <- seqNames } #this function restores seqlevels - seqlevels(x, pruning.mode=pruning.mode) <- chrKeep - return(x) -} + seqlevels(gr, pruning.mode=pruning.mode) <- chrKeep + + return(gr) -#' Instead of counting overlaps get columns like max score or etc in query -#' @param query granges query -#' @param subject granges subject -#' @param colname mcols(gr)[[colname]] cannot be null -#' @param decreasing for order -#' @export -columnOverlaps <- function(query, subject, colname = "score", ignore.strand = TRUE, decreasing = TRUE){ - #First get overlaps - o <- data.frame(findOverlaps(query, subject, ignore.strand = ignore.strand)) - #Then append information - o$col <- mcols(subject)[[colname]][o[,2]] - #Order it by the factor to rank - o <- o[order(o$col, decreasing = decreasing),] - #Deduplicate - o <- o[!duplicated(o$queryHits),] - #Initialize - val <- rep(0, length(query)) - #Fill Values - val[o[,1]] <- o$col - return(val) } -#' Instead of counting overlaps get columns like max score or etc in query -#' @param query granges query -#' @param subject granges subject -#' @param colname mcols(gr)[[colname]] cannot be null -#' @param decreasing for order +#' Retreive a non-overlapping set of regions from a Genomic 
Ranges object +#' +#' This function returns a GRanges object containing a non-overlapping set regions derived from a supplied Genomic Ranges object. +#' +#' @param gr A `GRanges` object. +#' @param by The name of a column in `mcols(gr)` that should be used to determine how overlapping regions should be resolved. The resolution of overlapping regions also depends on `decreasing`. For example, if a column named "score" is used for `by`, `decreasing = TRUE` means that the highest "score" in the overlap will be retained and `decreasing = FALSE` means that the lowest "score" in the overlap will be retained. +#' @param decreasing A boolean value indicating whether the values in the column indicated via `by` should be ordered in decreasing order. If `TRUE`, the higher value in `by` will be retained. +#' @param verbose A boolean value that determines whether the output should include extra reporting. #' @export -nonOverlappingGRanges <- function(gr, by = "score", decreasing = TRUE, verbose = FALSE){ +nonOverlappingGR <- function( + gr = NULL, + by = "score", + decreasing = TRUE, + verbose = FALSE + ){ stopifnot(by %in% colnames(mcols(gr))) gr <- .validGRanges(gr) @@ -121,26 +120,28 @@ nonOverlappingGRanges <- function(gr, by = "score", decreasing = TRUE, verbose = } -#' Subset by Seqnames -#' @param gr grange -#' @param seqnames seqnames to subset +#' Subset a Genomic Ranges object by the provided seqnames +#' +#' This function returns a subsetted Genomic Ranges object based on a vector of provided seqnames +#' +#' @param gr A `GRanges` object to be subsetted. +#' @param names A character vector containing the `seqnames` to keep from the provided `GRanges` object. 
#' @export -subsetSeqnames <- function(gr, seqNames, useNames = FALSE){ +subsetSeqnamesGR <- function(gr = NULL, names = NULL){ gr <- .validGRanges(gr) - gr <- gr[which(as.character(seqnames(gr)) %in% seqNames),] - if(useNames){ - seqlevels(gr) <- seqNames - }else{ - seqlevels(gr) <- as.character(unique(seqnames(gr))) - } + gr <- gr[which(as.character(seqnames(gr)) %in% names),] + seqlevels(gr) <- as.character(unique(seqnames(gr))) return(gr) } -#' Add Seqlengths to genomic ranges -#' @param gr see validGRanges -#' @param genome see validBSgenome +#' Adds seqlength information to the seqnames of a Genomic Ranges object +#' +#' This function adds seqlength information for each of the seqnames in the provided Genomic Ranges object. +#' +#' @param gr A `GRanges` object. +#' @param genome The name of a valid genome (for example "hg38", "hg19", or "mm10"). See `ArchR::validBSgenome()`. #' @export -addSeqLengths <- function(gr, genome){ +addSeqLengthsGR <- function(gr = NULL, genome = NULL){ gr <- .validGRanges(gr) genome <- validBSgenome(genome) stopifnot(all(as.character(seqnames(gr)) %in% as.character(seqnames(genome)))) @@ -148,18 +149,21 @@ addSeqLengths <- function(gr, genome){ return(gr) } -#' Shuffle Genomic Ranges -#' @param subject see validGRanges -#' @param genome see validBSgenome -#' @param n nPermutations -#' @param shuffleChr shuffle across chromosomes randomly vs using previous knowledge of chromosome distribution +#' Randomly shuffle a Genomic Ranges object +#' +#' This function randomly shuffles a Genomic Ranges object. +#' +#' @param gr A `GRanges` object. +#' @param genome The name of a valid genome (for example "hg38", "hg19", or "mm10"). See `ArchR::validBSgenome()`. +#' @param n The number of permutations to perform during shuffling. +#' @param shuffleChr A boolean value indicating whether to shuffle across chromosomes randomly based on length of chromosomes or use previous knowledge of chromosome distribution. 
#' @export -shuffleGRanges <- function(subject, genome, n, shuffleChr=TRUE){ +shuffleGR <- function(gr = NULL, genome = NULL, n = 100, shuffleChr = TRUE){ #adapted from ChIPseeker's shuffle cs <- getChromSizes(genome) seqL <- seqlengths(cs) seqL <- seqL[sort(names(seqL))] - sub <- subsetSeqnames(subject, seqNames = names(seqL)) #change + sub <- subsetSeqnamesGR(gr, names = names(seqL)) #change sub <- sub[order(as.character(seqnames(sub)))] #stopifnot(identical(unique(as.character(seqnames(sub))), names(seqL))) w <- width(sub) @@ -195,11 +199,14 @@ shuffleGRanges <- function(subject, genome, n, shuffleChr=TRUE){ return(grL) } -#' Merge Genomic Ranges -#' @param gr see validGRanges -#' @param ignore.strand ignore strandedness for merging +#' Merge genomic regions within a single Genomic Ranges object +#' +#' This function merges overlapping regions within a single Genomic Ranges object +#' +#' @param gr A `GRanges` object. +#' @param ignore.strand A boolean value indicating whether strandedness should be ignored in `findOverlaps()`. #' @export -mergeGRanges <- function(gr, ignore.strand = TRUE){ +mergeGR <- function(gr, ignore.strand = TRUE){ gr <- .validGRanges(gr) grR <- reduce(gr,min.gapwidth=0L,ignore.strand = ignore.strand) o <- DataFrame(findOverlaps(grR, gr,ignore.strand = ignore.strand)) @@ -214,30 +221,36 @@ mergeGRanges <- function(gr, ignore.strand = TRUE){ return(mGR) } -#' Merge Genomic Ranges -#' @param query see validGRanges -#' @param subject see validGRanges -#' @param ignore.strand ignore strandedness for overlaps +#' Extend regions from a Genomic Ranges object +#' +#' This function extends each region in a Genomic Ranges object by a designated upstream and downstream extension in a strand-aware fashion +#' +#' @param gr A `GRanges` object. +#' @param upstream The number of basepairs upstream (5') to extend each region in `x`. Strand-aware. +#' @param downstream The number of basepairs downstream (3') to extend each region in `x`. Strand-aware. 
#' @export -extendGRanges <- function(x, upstream, downstream){ +extendGR <- function(gr = NULL, upstream = NULL, downstream = NULL){ #https://bioinformatics.stackexchange.com/questions/4390/expand-granges-object-different-amounts-upstream-vs-downstream - isMinus <- BiocGenerics::which(strand(x) == "-") - isOther <- BiocGenerics::which(strand(x) != "-") + isMinus <- BiocGenerics::which(strand(gr) == "-") + isOther <- BiocGenerics::which(strand(gr) != "-") #Forward - start(x)[isOther] <- start(x)[isOther] - upstream - end(x)[isOther] <- end(x)[isOther] + downstream + start(gr)[isOther] <- start(gr)[isOther] - upstream + end(gr)[isOther] <- end(gr)[isOther] + downstream #Reverse - end(x)[isMinus] <- end(x)[isMinus] + upstream - start(x)[isMinus] <- start(x)[isMinus] - downstream - return(x) + end(gr)[isMinus] <- end(gr)[isMinus] + upstream + start(gr)[isMinus] <- start(gr)[isMinus] - downstream + return(gr) } -#' Merge Genomic Ranges -#' @param query see validGRanges -#' @param subject see validGRanges -#' @param ignore.strand ignore strandedness for overlaps +#' Identify the number of bases that overlap two Genomic Ranges objects +#' +#' This function returns a data.frame describing how many basepairs overlap the provided query and subject Genomic Ranges objects +#' +#' @param query A `GRanges` object to be used as the query in `findOverlaps()`. +#' @param subject A `GRanges` object to be used as the subject in `findOverlaps()`. +#' @param ignore.strand A boolean value indicating whether strandedness should be ignored in `findOverlaps()`. 
#' @export -overlappingBP <- function(query, subject, ignore.strand = TRUE){ +nOverlapGR <- function(query = NULL, subject = NULL, ignore.strand = TRUE){ query <- .validGRanges(query) subject <- .validGRanges(subject) o <- findOverlaps(query, subject, ignore.strand = ignore.strand) @@ -248,36 +261,42 @@ overlappingBP <- function(query, subject, ignore.strand = TRUE){ nBP <- perOverlap * sum(width(subject)) type <- c("queryBP", "sharedBP", "subjectBP") nBases <- c(sum(width(query))-nBP, nBP, sum(width(subject))-nBP) - return(data.frame(type,nBases)) + return(data.frame(type = type, bp = nBases)) } -#' Overlaps Many includes information from mcols(gr) -#' @param query see validGRanges -#' @param subject see validGRanges -#' @param by column in subject to split overlaps by -#' @param ignore.strand ignore strandedness for overlaps +#' Overlap with many genomic regions +#' +#' This function returns a sparse matrix that describes the overlap with each sub-grouped genomic region as specified in the "by" column. +#' +#' @param query A `GRanges` object to be used as the query in `findOverlaps()`. +#' @param subject A `GRanges` object with a column sub-grouping to be used as the subject in `findOverlaps()`. +#' @param by The name of a column in `mcols(gr)` that should be used to determine how overlapping regions should be sub-grouped. +#' @param ignore.strand A boolean value indicating whether strandedness should be ignored in `findOverlaps()`. 
#' @export -overlapsMany <- function(query, subject, by, ignore.strand = TRUE){ +overlapsManyGR <- function(query = NULL, subject = NULL, by = NULL, ignore.strand = TRUE){ o <- DataFrame(findOverlaps(query, subject, ignore.strand = ignore.strand)) - o$name <- mcols(subject)[o$subjectHits,by] + o$name <- mcols(subject)[o$subjectHits, by] o$id <- match(o$name, unique(o$name)) sparse <- Matrix::sparseMatrix( - i=o[,1], - j=o[,4], - x=rep(TRUE,nrow(o)), - dims=c(length(query),length(unique(o$name))) + i = o[,1], + j = o[,4], + x = rep(TRUE,nrow(o)), + dims = c(length(query),length(unique(o$name))) ) colnames(sparse) <- unique(o$name) return(sparse) } -#' Construct GRanges seqnames start end accounting for ends before starts (adding strandedness) -#' @param seqnames seqnames of GRanges -#' @param start start of GRanges -#' @param end end of GRanges -#' @param ignore.strand ignore strandedness for overlaps +#' Construct a Genomic Ranges object taking into account strandedness +#' +#' This function creates a Genomic Ranges object accounting for strandedness indicated by the relative orientation of the provided start and end positions +#' +#' @param seqnames A character vector containing the seqnames to be added to the `GRanges` object. +#' @param start A vector of start positions to be added to the `GRanges` object. +#' @param end A vector of end positions to be added to the `GRanges` object. +#' @param ignore.strand A boolean value indicating whether strandedness should be ignored in `findOverlaps()`. 
#' @export -constructGRanges <- function(seqnames, start, end, ignore.strand = TRUE){ +constructGR <- function(seqnames, start, end, ignore.strand = TRUE){ df <- data.frame(seqnames, start, end) idx <- which(df[,2] > df[,3]) df[idx,2:3] <- df[idx,3:2] diff --git a/R/GgplotHelper.R b/R/GgplotHelper.R index cd773210..a57c729b 100644 --- a/R/GgplotHelper.R +++ b/R/GgplotHelper.R @@ -1,36 +1,40 @@ -#' A ggplot-based dot plot +########################################################################################## +# ggPlot Wrapper Methods For Easy Plotting +########################################################################################## + +#' A ggplot-based dot plot wrapper function #' #' This function is a wrapper around ggplot geom_point to allow for a more intuitive plotting of ArchR data. #' #' @param x A numeric vector containing the x-axis values for each point. #' @param y A numeric vector containing the y-axis values for each point. -#' @param color QQQ -#' @param discrete QQQ A boolean value indicating QQQ -#' @param discreteSet QQQ The name or numeric index of a custom palette from ArchR_palettes to use for QQQ. -#' @param labelMeans QQQ A boolean value indicating QQQ -#' @param continuousSet QQQ The name or numeric index of a custom palette from ArchR_palettes to use for QQQ. -#' @param pal QQQ The name or numeric index of a custom palette from ArchR_palettes to use for QQQ. -#' @param colorDensity QQQ A boolean value indicating whether the density of points on the plot should be indicated colorimetrically. If TRUE, QQQ is used as the color palette. +#' @param color A numeric/categorical vector containing coloring information for each point. +#' @param discrete A boolean value indicating whether the supplied data is discrete (TRUE) or continuous (FALSE). +#' @param discreteSet The name of a custom palette from `ArchRPalettes` to use for categorical/discrete color. 
+#' @param continuousSet The name of a custom palette from `ArchRPalettes` to use for numeric color. +#' @param labelMeans A boolean value indicating whether the mean of each categorical/discrete color should be labeled. +#' @param pal A custom palette used to override discreteSet/continuousSet for coloring vector. +#' @param defaultColor The default color for points that do not have another color applied (i.e. `NA` values). +#' @param colorDensity A boolean value indicating whether the density of points on the plot should be indicated colorimetrically. If TRUE, continuousSet is used as the color palette. #' @param size The numeric size of the points to be plotted. -#' @param xlim A set of numeric values indicating the lower and upper bounds of the x-axis on the plot. -#' @param ylim A set of numeric values indicating the lower and upper bounds of the y-axis on the plot. -#' @param extend QQQ +#' @param xlim A vector of two numeric values indicating the lower and upper bounds of the x-axis on the plot. +#' @param ylim A vector of two numeric values indicating the lower and upper bounds of the y-axis on the plot. +#' @param extend A numeric value indicating the fraction to extend the x-axis and y-axis beyond the maximum and minimum values if `xlim` and `ylim` are not provided. For example, 0.05 will extend the x-axis and y-axis by 5% on each end. #' @param xlabel The label to plot for the x-axis. -#' @param randomize A boolean value indicating whether to randomize the order of the points when plotting. -#' @param seed A numeric seed number for use in randomization. #' @param ylabel The label to plot for the y-axis. #' @param title The title of the plot. -#' @param alpha A number indicating the transparency to use for each point. See ggplot2 for more details. -#' @param baseSize QQQ The size in inches of the plot. +#' @param randomize A boolean value indicating whether to randomize the order of the points when plotting. 
+#' @param seed A numeric seed number for use in randomization. +#' @param alpha A number indicating the transparency to use for each point. See `ggplot2` for more details. +#' @param baseSize The base font size to use in the plot. #' @param ratioYX The aspect ratio of the x and y axes on the plot. -#' @param labelType QQQ A string indicating how to label the points on the plot. Options include "ggrepel", QQQ -#' @param bgColor The background color of the plot. +#' @param labelType A string indicating how to label the points on the plot. Options include "ggrepel", "shadowtext". #' @param fgColor The foreground color of the plot. +#' @param bgColor The background color of the plot. #' @param labelSize The numeric font size of labels. -#' @param addFit QQQ A string indicating if a fit/regression line should be included in the plot and what method to use for this fit. Options include QQQ. -#' @param nullColor The color to be used for points that correspond to null values in either the x or y vectors. -#' @param rastr A boolean valut that indicates that the plot should be rasterized. This does not rasterize lines and labels, just the internal portions of the plot. -#' @param dpi The resolution to use for the plot. +#' @param addFit A string indicating if a fit/regression line (see geom_smooth methods) should be included in the plot and what method to use for this fit. +#' @param rastr A boolean value that indicates whether the plot should be rasterized using ggrastr. This does not rasterize lines and labels, just the internal portions of the plot. +#' @param dpi The resolution in dots per inch to use for the plot. #' @export ggPoint <- function( x = NULL, @@ -251,25 +255,28 @@ ggPoint <- function( } -#' A ggplot-based one-to-one dot plot +#' A ggplot-based one-to-one dot plot wrapper function #' #' This function is a wrapper around ggplot geom_point to allow for plotting one-to-one sample comparisons in ArchR. 
#' #' @param x A numeric vector containing the x-axis values for each point. #' @param y A numeric vector containing the y-axis values for each point. -#' @param nPlot The number of points to plot. When this value is less than the total points, sample is used to extract random data points to plot. -#' @param nKernel The value of n to use the kde2d from the MASS package. #' @param size The numeric size of the points to plot. +#' @param alpha A number indicating the transparency to use for each point. See `ggplot2` for more details. #' @param xlabel The label to plot for the x-axis. #' @param ylabel The label to plot for the y-axis. #' @param title The title of the plot. -#' @param min xmin quantile [0,1] -#' @param max xmax quantile [0,1] -#' @param alpha geom_point alpha -#' @param baseSize base_font size default is 12 -#' @param pal continuous color palette to use +#' @param min x-limits min quantile [0,1] +#' @param max x-limits max quantile [0,1] +#' @param nPlot The number of points to plot. When this value is less than the total points, sample is used to extract random data points to plot. +#' @param nKernel The value of n to use the kde2d from the MASS package. +#' @param densityMax The quantile that should be represented by the maximum color on the continuous scale designated by `pal`. Above this value will be thresholded to the maximum color. +#' @param extend A numeric value indicating the fraction to extend the x-axis and y-axis beyond the maximum and minimum values if `xlim` and `ylim` are not provided. For example, 0.05 will extend the x-axis and y-axis by 5% on each end. +#' @param baseSize The base font size to use in the plot. +#' @param rastr A boolean value that indicates whether the plot should be rasterized. This does not rasterize lines and labels, just the internal portions of the plot. +#' @param pal A custom palette used to override continuousSet for coloring vector. +#' @param ... 
Additional params to be supplied to ggPoint #' @export -#' ggOneToOne <- function ( x = NULL, y = NULL, @@ -287,7 +294,8 @@ ggOneToOne <- function ( baseSize = 6, rastr = TRUE, pal = paletteContinuous(set = "blue_yellow"), - ...){ + ... + ){ #Check is Numeric stopifnot(is.numeric(x)) @@ -351,33 +359,36 @@ ggOneToOne <- function ( return(df) } -#' GG Violin Plot +#' A ggplot-based violin plot wrapper function #' -#' @param x categorical values to each y value -#' @param y numeric values -#' @param xlabel xlabel -#' @param ylabel ylabel -#' @param xOrder custom order of x for plotting -#' @param points add points using ggrastr geom_quasirandom? -#' @param size size of barplot lines -#' @param baseSize base size of fonts in plot -#' @param pal color palette see paletteDiscrete for examples +#' @param x A character vector containing the categorical x-axis values for each y-axis value. +#' @param y A numeric vector containing the y-axis values for each point. +#' @param xlabel The label to plot for the x-axis. +#' @param ylabel The label to plot for the y-axis. +#' @param xOrder A character vector indicating a custom order for plotting x-axis categorical values for plotting. Should contain all possible values of `x` in the desired order. +#' @param addPoints A boolean value indicating whether individual points should be added to the plot using `geom_quasirandom`. +#' @param size The line width for boxplot lines. +#' @param baseSize The base font size to use in the plot. +#' @param ratioYX The aspect ratio of the x and y axes on the plot. +#' @param sampleRatio sampling ratio for number of dots to be shown from original data over violins to prevent over-crowding. Default is set to 0.1. +#' @param title The title of the plot. +#' @param pal A custom palette for discrete coloring. 
#' @export -#' ggViolin <- function( x = NULL, y = NULL, xlabel = NULL, ylabel = NULL, xOrder = NULL, - points = FALSE, + addPoints = FALSE, size = 1, baseSize = 6, ratioYX = NULL, sampleRatio = 0.1, title = "", pal = paletteDiscrete(values=x, set = "stallion"), - ...){ + ... + ){ stopifnot(!is.numeric(x)) stopifnot(is.numeric(y)) @@ -391,7 +402,7 @@ ggViolin <- function( if(!is.null(xOrder)){ if(!all(x %in% xOrder)){ - stop("Not x colors are in xOrder!") + stop("Not all x values are present in xOrder!") } }else{ xOrder <- gtools::mixedsort(unique(x)) @@ -443,25 +454,27 @@ ggViolin <- function( } -#' Ggplot Hexplot summary of points in a standardized manner +#' A ggplot-based Hexplot wrapper function summary of points in a standardized manner #' #' This function will plot x,y coordinates values summarized in hexagons in a standardized manner #' -#' @param x x vector of data to be plot -#' @param y y vector of data to be plot -#' @param color color vector of values to be plot (must be same length as x,y) -#' @param pal custom palette option -#' @param bins number of bins for hexplot -#' @param xlim xlimits for plot -#' @param ylim ylimits for plot -#' @param extend extend limits by this proportion if not set by xlim or ylim -#' @param xlabel label for x-axis -#' @param ylabel label for y-axis -#' @param title title of plot -#' @param colorTitle title for legend corresponding to color -#' @param baseSize baseSize for fonts -#' @param ratioYX ratio of y to x in plot -#' @param FUN function for summarizing hexagons +#' @param x A numeric vector containing the x-axis values for each point. +#' @param y A numeric vector containing the y-axis values for each point. +#' @param color A numeric/categorical vector containing coloring information for each point. +#' @param pal A custom palette for continuous coloring. +#' @param bins The number of bins to be used for plotting the hexplot. 
`bins` indicates the total number of hexagons that will fit within the surface area of the plot. +#' @param xlim A vector of two numeric values indicating the lower and upper bounds of the x-axis on the plot. +#' @param ylim A vector of two numeric values indicating the lower and upper bounds of the y-axis on the plot. +#' @param extend A numeric value indicating the fraction to extend the x-axis and y-axis beyond the maximum and minimum values if `xlim` and `ylim` are not provided. For example, 0.05 will extend the x-axis and y-axis by 5% on each end. +#' @param xlabel The label to plot for the x-axis. +#' @param ylabel The label to plot for the y-axis. +#' @param title The title of the plot. +#' @param colorTitle The label to use for the legend corresponding to `color`. +#' @param baseSize The base font size to use in the plot. +#' @param ratioYX The aspect ratio of the x and y axes on the plot. +#' @param FUN The function to use for summarizing data into hexagons. Typically "mean" or something similar. +#' @param quantCut If this is not null, a quantile cut is performed to threshold the top and bottom of the distribution. This prevents skewed color scales caused by strong outliers. The format of this should be c(x,y) where x is the lower threshold and y is the upper threshold. For example, quantCut = c(0.025,0.975) will take the bottom and top 2.5% of values and set them to the value of the 2.5th and 97.5th percentile values respectively. +#' @param addPoints A boolean value indicating whether individual points should be shown on the hexplot. #' @param ...
additional params to pass #' @export ggHex <- function( @@ -480,7 +493,7 @@ ggHex <- function( baseSize = 6, ratioYX = 1, FUN = "mean", - quantCut = c(0.01,0.99), + quantCut = c(0.01, 0.99), addPoints = FALSE, ...){ @@ -543,41 +556,31 @@ ggHex <- function( } -#' Align Ggplots vertically or horizontally +#' Align ggplot plots vertically or horizontally #' #' This function aligns ggplots vertically or horizontally #' -#' @param ... ggplots -#' @param sizes sizes are a vector or list of values for each ggplot ie c(1,1) for two plots -#' @param type v,vertical or h,horizontal -#' @param plotList add a list of plots to be aligned -#' @param grobList add a list of grobs to be aligned +#' @param ... All additional arguments will be interpreted as `ggplot2` plot objects and used if and only if `plotList` is `NULL`. +#' @param plotList A list of `ggplot2` plot objects to be aligned. +#' @param sizes A numeric vector or list of values indicating the relative size for each of the objects in `plotList` or supplied in `...`. +#' @param type A string indicating whether vertical ("v") or horizontal ("h") alignment should be used for the multi-plot layout. +#' @param draw A boolean value indicating whether to draw plot or return grob. #' @export -#' -ggAlignPlots <- function(..., sizes, type = "v", plotList = NULL, grobList = NULL, draw = TRUE){ +ggAlignPlots <- function( + ..., plotList = NULL, sizes = NULL, type = "v", draw = TRUE){ #http://stackoverflow.com/a/21503904 .requirePackage("gtable") - if(is.null(grobList)){ - - if(is.null(plotList)){ - plotList <- list(...) - } - - ## test that only passing plots - stopifnot(do.call(all, lapply(plotList, inherits, "gg"))) + if(is.null(plotList)){ + plotList <- list(...)
+ } - gl <- lapply(plotList, ggplotGrob) + ## test that only passing plots + stopifnot(do.call(all, lapply(plotList, inherits, "gg"))) - }else{ - - gl <- grobList - rm(grobList) - gc() - - } + gl <- lapply(plotList, ggplotGrob) #if ncols do not match fill with empty gtables_add_cols if(type == "v" | type == "vertical"){ @@ -638,22 +641,20 @@ ggAlignPlots <- function(..., sizes, type = "v", plotList = NULL, grobList = NUL #' #' This function returns a ggplot2 theme that is black borded with black font. #' -#' @param color color of theme -#' @param baseSize is the size of the font for the axis text and title -#' @param baseFamily is family for font -#' @param baseLineSize is the size of line -#' @param baseRectSize is the size of rectangle boxes -#' @param plotMarginCm plot margin in cm -#' @param legendPosition where is the legend default bottom -#' @param legendTextSize 0.75*base_size -#' @param axisTickCm axis tick length in cm -#' @param xText90 rotate x axis text 90 degrees -#' @param yText90 rotate y axis text 90 degrees +#' @param color The color to be used for text, lines, ticks, etc for the plot. +#' @param baseSize The base font size to use in the plot. +#' @param baseLineSize The base line width (in points) to be used throughout the plot. +#' @param baseRectSize The base line width (in points) to use for rectangular boxes throughout the plot. +#' @param plotMarginCm The width in centimeters of the whitespace margin around the plot. +#' @param legendPosition The location to put the legend. Valid options are "bottom", "top", "left", and "right". +#' @param legendTextSize 0.75 times the base_size +#' @param axisTickCm The length in centimeters to make the axis ticks. +#' @param xText90 A boolean value indicating whether the x-axis text should be rotated 90 degrees counterclockwise. +#' @param yText90 A boolean value indicating whether the y-axis text should be rotated 90 degrees counterclockwise.
#' @export theme_ArchR <- function( color = "black", baseSize = 6, - baseFamily = "", baseLineSize = 0.5, baseRectSize = 0.5, plotMarginCm = 1, @@ -680,6 +681,7 @@ theme_ArchR <- function( legend.text = element_text(color = color, size = legendTextSize), legend.background = element_rect(fill = "transparent"), legend.box.background = element_rect(fill = "transparent"), + legend.position = legendPosition, strip.text = element_text(size = baseSize, color="black"), plot.background = element_rect(fill = "transparent", color = NA) ) diff --git a/R/GroupCoverages.R b/R/GroupCoverages.R index 1f22b3cd..b3c5ba65 100644 --- a/R/GroupCoverages.R +++ b/R/GroupCoverages.R @@ -1,27 +1,26 @@ -#' Add Group Coverages to ArchR Project +#' Add Group Coverages to an ArchRProject object #' -#' This function will merge cells within each group into an insertion -#' coverage file +#' This function will merge cells within each designated cell group for the generation of pseudo-bulk replicates and then merge these replicates into a single insertion coverage file. 
#' -#' @param ArchRProj ArchRProject -#' @param groupBy group cells by this column in cellColData -#' @param useLabels use sample labels to create sample guided subgroupings as pseudo replicates -#' @param minCells minimum cells per group for coverage files -#' @param maxCells maximum cells per group for coverage files -#' @param maxFragments maximum fragments per group for coverage files (this prevents large files created for optimizing memory) -#' @param minReplicates minimum replicates for group for coverage files -#' @param maxReplicates maximum replicates for group for coverage files -#' @param sampleRatio sampling ratio for pseudo replicates when needed -#' @param kmerLength kmer length for adding Tn5 bias estimation -#' @param threads number of threads -#' @param parallelParam parallel parameters for batch style execution -#' @param force force creating coverage files if existed -#' @param verboseHeader verbose sections -#' @param verboseAll verbose sections and subsections +#' @param ArchRProj An `ArchRProject` object. +#' @param groupBy The name of the column in `cellColData` to use for grouping multiple cells together prior to generation of the insertion coverage file. +#' @param useLabels A boolean value indicating whether to use sample labels to create sample-aware subgroupings during pseudo-bulk replicate generation. +#' @param minCells The minimum number of cells required in a given cell group to permit insertion coverage file generation. +#' @param maxCells The maximum number of cells to use during insertion coverage file generation. +#' @param maxFragments The maximum number of fragments per cell group to use in insertion coverage file generation. This prevents the generation of excessively large files which would negatively impact memory requirements. +#' @param minReplicates The minimum number of pseudo-bulk replicates to be generated. +#' @param maxReplicates The maximum number of pseudo-bulk replicates to be generated.
+#' @param sampleRatio The fraction of the total cells that can be sampled to generate any given pseudo-bulk replicate. +#' @param kmerLength The length of the kmer used for estimating Tn5 bias. +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param force A boolean value that indicates whether or not to overwrite the relevant data in the `ArchRProject` object if insertion coverage / pseudo-bulk replicate information already exists. +#' @param verboseHeader A boolean value that determines whether standard output includes verbose sections. +#' @param verboseAll A boolean value that determines whether standard output includes verbose subsections. #' @param ... additional args #' @export addGroupCoverages <- function( - ArchRProj, + ArchRProj = NULL, groupBy = "Clusters", useLabels = TRUE, minCells = 40, @@ -31,7 +30,7 @@ addGroupCoverages <- function( maxReplicates = 5, sampleRatio = 0.8, kmerLength = 6, - threads = 16, + threads = 1, parallelParam = "mclapply", force = FALSE, verboseHeader = TRUE, @@ -45,11 +44,11 @@ addGroupCoverages <- function( tstart <- Sys.time() Params <- SimpleList( - groupBy=groupBy, - minCells=minCells, - maxCells=maxCells, - minReplicates=minReplicates, - sampleRatio=sampleRatio, + groupBy = groupBy, + minCells = minCells, + maxCells = maxCells, + minReplicates = minReplicates, + sampleRatio = sampleRatio, kmerLength = kmerLength ) @@ -325,7 +324,7 @@ addGroupCoverages <- function( out <- lapply(seq_len(ncol(maxMat)), function(i){ x[which(maxMat[,i]==1)] }) - return(out) + return(out) } if(is.null(sampleLabels)){ diff --git a/R/HelperUtils.R b/R/HelperUtils.R index ad4f7eaf..4ef6bb2e 100644 --- a/R/HelperUtils.R +++ b/R/HelperUtils.R @@ -8,7 +8,18 @@ if(inherits(genome, "BSgenome")){ return(genome) }else if(is.character(genome)){ - return(BSgenome::getBSgenome(genome, masked = masked)) + genome <- 
tryCatch({ + .requirePackage(genome) + bsg <- eval(parse(text = genome)) + if(inherits(bsg, "BSgenome")){ + return(bsg) + }else{ + stop("genome is not a BSgenome valid class!") + } + }, error = function(x){ + BSgenome::getBSgenome(genome, masked = masked) + }) + return(genome) }else{ stop("Cannot validate BSgenome options are a valid BSgenome or character for getBSgenome") } @@ -54,20 +65,28 @@ #' Negated Value Matching #' -#' This function is the reciprocal of %in% -#' See match funciton in base R -#' x %ni% table +#' This function is the reciprocal of %in%. See the match function in base R. #' -#' @param x x search within table -#' @param table to search x in +#' @param x The value to search for in `table`. +#' @param table The set of values to serve as the base for the match function. #' @export "%ni%" <- function(x, table) !(match(x, table, nomatch = 0) > 0) -#Mainly used for Rle matching generic handling +#' Generic matching function for S4Vector objects +#' +#' This function provides a general matching function for S4Vector objects primarily to avoid ambiguity. +#' +#' @param x An `S4Vector` object to search for in `table`. +#' @param table The set of `S4Vector` objects to serve as the base for the match function. #' @export '%bcin%' <- function(x, table) S4Vectors::match(x, table, nomatch = 0) > 0 -#Mainly used for Rle matching generic handling +#' Negated matching function for S4Vector objects +#' +#' This function provides the reciprocal of %bcin% for S4Vector objects primarily to avoid ambiguity. +#' +#' @param x An `S4Vector` object to search for in `table`. +#' @param table The set of `S4Vector` objects to serve as the base for the match function.
#' @export '%bcni%' <- function(x, table) !(S4Vectors::match(x, table, nomatch = 0) > 0) @@ -487,18 +506,12 @@ return(o) } -#' Get File Extension -#' @param x character string refering to a file you want to get the extension from #' @export .fileExtension <- function (x){ pos <- regexpr("\\.([[:alnum:]]+)$", x) ifelse(pos > -1L, substring(x, pos + 1L), "") } -#' Check path for utility -#' @param u utility that you want to check is in path -#' @param path check on top of path a custom path -#' @param error cause error if not in path #' @export .checkPath <- function(u = NULL, path = NULL, throwError = TRUE){ if(is.null(u)){ @@ -523,10 +536,6 @@ return(out) } -#' Check path for utility -#' @param u utility that you want to check is in path -#' @param path check on top of path a custom path -#' @param error cause error if not in path #' @export .tempfile <- function(pattern = "tmp", tmpdir = "tmp", fileext = "", addDOC = TRUE){ @@ -542,8 +551,6 @@ } -#' This function returns ascii archr LOGO or arrow etc. -#' @param ascii logo, arrow, target #' @export .ArchRLogo <- function(ascii = "Logo"){ Ascii <- list( diff --git a/R/Imputation.R b/R/Imputation.R index 0cc39793..27b9ee44 100644 --- a/R/Imputation.R +++ b/R/Imputation.R @@ -1,16 +1,20 @@ -#' Add TileMatrix to Arrows/ArchRProject +########################################################################################## +# Imputation Methods +########################################################################################## + +#' Add Imputation Weights to ArchRProject #' -#' This function for each sample will independently compute counts for each tile -#' per cell in the Arrow File +#' This function computes imputations weights that describe each cell as a linear combination of many cells based on MAGIC diffusion matrix. 
#' -#' @param input ArchRProject or ArrowFiles -#' @param chromSizes chromomosome sizes used for identifying number of tiles to count -#' @param windowSize size for each window to break up each chromosome -#' @param binarize save as a Sparse.Binary.Matrix or Sparse.Integer.Matrix -#' @param excludeChr exclude chromosomes from this analysis -#' @param threads number of threads -#' @param parallelParam parallel parameters for batch style execution -#' @param force force overwriting previous TileMatrix in ArrowFile +#' @param ArchRProj An `ArchRProject` object. +#' @param reducedDims The name of the `reducedDims` object (i.e. IterativeLSI) to retrieve from the designated `ArchRProject`. +#' @param dimsToUse A vector containing the dimensions from the `reducedDims` object to use in clustering. +#' @param td diffusion time (number of iterations) for MAGIC +#' @param ka kNN autotune parameter for MAGIC +#' @param sampleCells number of cells to sample per block of estimated imputation matrix +#' @param k number of nearest neighbors to use for MAGIC +#' @param epsilon a value for the standard deviation of the kernel for MAGIC +#' @param ... additional params #' @export addImputeWeights <- function( ArchRProj = NULL, @@ -21,7 +25,7 @@ addImputeWeights <- function( sampleCells = max(5000, floor(nCells(ArchRProj) / 10)), k = 15, epsilon = 1, - weighted = TRUE + ... 
){ #Adapted From @@ -83,7 +87,7 @@ addImputeWeights <- function( knnDist <- knnDist / knnDist[,ka] } - if (weighted) { + if(epsilon > 0){ W <- Matrix::sparseMatrix(rep(seq_len(Nx), k), c(knnIdx), x=c(knnDist), dims = c(Nx, Nx)) } else { W <- Matrix::sparseMatrix(rep(seq_len(Nx), k), c(knnIdx), x=1, dims = c(Nx, Nx)) # unweighted kNN graph @@ -135,8 +139,7 @@ addImputeWeights <- function( td = td, k = k, ka = ka, - epsilon = epsilon, - weighted = weighted + epsilon = epsilon ) ) @@ -144,11 +147,11 @@ addImputeWeights <- function( } -#' Get outputDirectory in ArchRProject +#' Get Imputation Weights from ArchRProject #' -#' This function gets outputDirectory from ArchRProject +#' This function gets imputation weights from an ArchRProject to impute numeric values. #' -#' @param ArchRProj ArchRProject +#' @param ArchRProj An `ArchRProject` object. #' @param ... additional args #' @export getImputeWeights <- function(ArchRProj, ...){ @@ -157,14 +160,3 @@ getImputeWeights <- function(ArchRProj, ...){ } - - - - - - - - - - - diff --git a/R/LatentSemanticIndexing.R b/R/LatentSemanticIndexing.R index 9e9cbc7b..af7efac5 100644 --- a/R/LatentSemanticIndexing.R +++ b/R/LatentSemanticIndexing.R @@ -1,30 +1,37 @@ -#' Compute Iterative LSI +########################################################################################## +# LSI Dimensionality Reduction Methods +########################################################################################## + +#' Add an Iterative LSI-based dimensionality reduction to an ArchRProject #' #' This function will compute an iterative LSI dimensionality reduction #' on an ArchRProject. 
#' -#' @param ArchRProj ArchRProject -#' @param useMatrix use matrix for LSI clustering from Arrow -#' @param reducedDimsOut name of dimensionality reduction to be stored as -#' @param iterations number of LSI iterations to perform -#' @param dimsToUse number of dimensions to compute and use from LSI (TFIDF-SVD) for clustering -#' @param binarize binarize matrix prior to LSI -#' @param sampleCells number of cells to sample for LSI estimation -#' @param varFeatures number of variable features to use for LSI -#' @param selectionMethod selection method for variable features (var or vmr) -#' @param scaleTo scaleTo for Cluster Averages for variance calculation -#' @param totalFeatures number of features to consider (ranked by total number of counts) use for LSI -#' @param filterQuantile filter features for initial LSI that are above this quantile -#' @param saveIterations save LSI iterations as rds in the outDir -#' @param outDir output directory for saving LSI iterations -#' @param clusterParams additional params to pass to addClusters -#' @param runHarmony run harmony batch correction through the iterations -#' @param harmonyParams additional params to pass to harmony -#' @param threads number of threads for parallel execution -#' @param seed seed for analysis -#' @param verboseHeader verbose sections -#' @param verboseAll verbose sections and subsections -#' @param force verbose sections and subsections +#' @param ArchRProj An `ArchRProject` object. +#' @param useMatrix The name of the data matrix to retrieve from the given ArrowFile. Valid options are "TileMatrix" or "PeakMatrix". +#' @param reducedDimsOut The name to use for storage of the IterativeLSI dimensionality reduction in the `ArchRProject` as a `reducedDims` object. +#' @param iterations The number of LSI iterations to perform. +#' @param dimsToUse A vector containing the dimensions from the `reducedDims` object to use in clustering. 
+#' @param corCutOff A numeric cutoff for the correlation of each dimension to the sequencing depth. If the dimension has a correlation to sequencing depth that is greater than the corCutOff, it will be excluded from analysis. +#' @param LSIMethod A numeric/character indicating the order of operations in the TF-IDF normalization. +#' The 1st option is 1 or "tf-logidf", 2nd is 2 or "log(tf-idf)", and the 3rd option is 3 or "logtf-logidf". +#' @param binarize A boolean value indicating whether the matrix should be binarized before running LSI. This is often desired when working with insertion counts. +#' @param sampleCells An integer specifying number of cells to subset perform estimatedLSI and clustering. +#' @param varFeatures The number of N variable features to use for LSI. The top N features will be used based on the `selectionMethod`. +#' @param selectionMethod The selection method to be used for identifying the top variable features. Valid options are "var" for log-variability or "vmr" for variance-to-mean ratio. +#' @param scaleTo scaleTo normalization depth for Cluster Averages for variance calculation +#' @param totalFeatures The number of features to consider for use in LSI ranked by the total number of insertion counts. +#' @param filterQuantile Remove features that are above this quantile based on insertion counts prior to initial (1st iteration) LSI. +#' @param saveIterations A boolean value indicating whether the different LSI iterations should be saved as compressed `.rds` files in the designated `outDir`. +#' @param outDir The output directory for saving LSI iterations if desired. Default is in outputDirectory of ArchRProject. +#' @param clusterParams Additional parameters to be passed to `ArchR::addClusters()`. +#' @param runHarmony A boolean value indicating whether harmony-based batch correction should be run during the LSI iterations. +#' @param harmonyParams Additional parameters to be passed to `harmony::HarmonyMatrix()`. 
+#' @param threads The number of threads to be used for parallel computing. +#' @param seed A number to be used as the seed for random number generation required in cluster determination. It is recommended to keep track of the seed used so that you can reproduce results downstream. +#' @param verboseHeader A boolean value that determines whether standard output includes verbose sections. +#' @param verboseAll A boolean value that determines whether standard output includes verbose subsections. +#' @param force A boolean value that indicates whether or not to overwrite relevant data in the `ArchRProject` object. #' @param ... additional args #' @export addIterativeLSI <- function( diff --git a/R/MarkerFeatures.R b/R/MarkerFeatures.R index a2244aaf..9491b0df 100644 --- a/R/MarkerFeatures.R +++ b/R/MarkerFeatures.R @@ -1,50 +1,52 @@ -#' Identify Marker Features for each Group +########################################################################################## +# Marker Feature Methods +########################################################################################## + +#' Identify Marker Features for each cell grouping #' -#' This function will identify a null set of cells that match biases per cell -#' while maintaining the input group proportions. Then it will compute a pairwise -#' test of the group vs the null set. 
+#' This function will identify features that are definitional of each provided cell grouping where possible #' -#' @param ArchRProj ArchR Project -#' @param groupBy group cells by this column in cellColData -#' @param useGroups use subset of groups in group column in cellColData for comparisons -#' @param bdgGroups use subset of groups in group column in cellColData for background -#' @param useMatrix matrix name in Arrow Files that will be used for identifying features -#' @param bias biases to account for in selecting null group using info from cellColData -#' @param normBy normalize by column in cellColData prior to test -#' @param testMethod pairwise test method group vs null -#' @param minCells minimum cells per group for testing -#' @param maxCells maximum cells per group for testing -#' @param k knn for matching cell biases -#' @param bufferRatio buffering ratio for matching cell biases -#' @param binarize binarize prior to testing -#' @param method marker identification method -#' @param useSeqnames specific seqnames to use only -#' @param verboseHeader verbose sections -#' @param verboseAll verbose sections and subsections +#' @param ArchRProj An `ArchRProject` object. +#' @param groupBy The name of the column in `cellColData` to use for grouping cells together for marker feature identification. +#' @param useGroups A character vector that is used to select a subset of groups by name from the designated `groupBy` column in `cellColData`. This limits the groups used to perform marker feature identification. +#' @param bgdGroups A character vector that is used to select a subset of groups by name from the designated `groupBy` column in `cellColData` to be used for background calculations in marker feature identification. +#' @param useMatrix The name of the matrix to be used for performing differential analyses. Options include "GeneScoreMatrix", "PeakMatrix", etc. 
+#' @param bias A character vector indicating the potential bias variables as a function (e.g. c("TSSEnrichment", "log10(nFrags)")) to account for in selecting a matched null group for marker feature identification. These should be column names from `cellColData`. +#' @param normBy The name of a numeric column in `cellColData` that should be normalized across cells (e.g. "ReadsInTSS") prior to performing marker feature identification. +#' @param testMethod The name of the pairwise test method to use in comparing cell groupings to the null cell grouping during marker feature identification. Valid options include "wilcoxon", "ttest", and "binomial". +#' @param maxCells The maximum number of cells to consider from a single cell group when performing marker feature identification. +#' @param scaleTo The normalization depth to scale the `normBy` values to (default is 10,000). +#' @param threads The number of threads to be used for parallel computing. +#' @param k The number of nearby cells to use for selecting bias-matched background while accounting for bgdGroups proportions. +#' @param bufferRatio The buffering ratio of cells to enable the best bias-matched background while accounting for bgdGroups proportions. +#' @param binarize A boolean value indicating whether to binarize the matrix prior to differential testing. +#' @param useSeqnames A character vector that indicates which seqnames should be used in marker feature identification. Features from seqnames that are not listed will be ignored. +#' @param method The name of the method to be used for marker feature identification. Valid options are "ArchR" which will use the default ArchR method or "Venice" which will use the `Signac::VeniceMarker()` function. +#' @param verboseHeader A boolean value that determines whether standard output includes verbose sections. +#' @param verboseAll A boolean value that determines whether standard output includes verbose subsections. #' @param ...
additional args #' @export markerFeatures <- function( - ArchRProj = NULL, - groupBy = "Clusters", - useGroups = NULL, - bdgGroups = NULL, - useMatrix = "GeneScoreMatrix", - bias = c("TSSEnrichment", "log10(nFrags)"), - normBy = NULL, - testMethod = "wilcoxon", - minCells = 50, - maxCells = 500, - scaleTo = 10^4, - threads = 1, - k = 100, - bufferRatio = 0.8, - binarize = FALSE, - useSeqnames = NULL, - method = "ArchR", - verboseHeader = TRUE, - verboseAll = FALSE, - ... - ){ + ArchRProj = NULL, + groupBy = "Clusters", + useGroups = NULL, + bgdGroups = NULL, + useMatrix = "GeneScoreMatrix", + bias = c("TSSEnrichment", "log10(nFrags)"), + normBy = NULL, + testMethod = "wilcoxon", + maxCells = 500, + scaleTo = 10^4, + threads = 1, + k = 100, + bufferRatio = 0.8, + binarize = FALSE, + useSeqnames = NULL, + method = "ArchR", + verboseHeader = TRUE, + verboseAll = FALSE, + ... + ){ args <- append(args, mget(names(formals()),sys.frame(sys.nframe()))) @@ -76,9 +78,8 @@ markerFeatures <- function( ArchRProj = NULL, groupBy = "Clusters", useGroups = NULL, - bdgGroups = NULL, + bgdGroups = NULL, normBy = NULL, - minCells = 50, maxCells = 500, scaleTo = 10^4, bufferRatio = 0.8, @@ -133,7 +134,7 @@ markerFeatures <- function( input = colDat, groups = groups, useGroups = useGroups, - bdgGroups = bdgGroups, + bgdGroups = bgdGroups, bias = bias, k = k, n = maxCells @@ -198,7 +199,7 @@ markerFeatures <- function( Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.), FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.), AUC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$auc)) %>% Reduce("cbind",.), - MeanBDG = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.) + MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.) 
), rowData = featureDF ) @@ -210,8 +211,8 @@ markerFeatures <- function( Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.), Variance = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$var1)) %>% Reduce("cbind",.), FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.), - MeanBDG = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.), - VarianceBDG = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$var2)) %>% Reduce("cbind",.) + MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.), + VarianceBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$var2)) %>% Reduce("cbind",.) ), rowData = featureDF ) @@ -222,7 +223,7 @@ markerFeatures <- function( Log2FC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$log2FC)) %>% Reduce("cbind",.), Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.), FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.), - MeanBDG = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.) + MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.) 
), rowData = featureDF ) @@ -242,11 +243,11 @@ markerFeatures <- function( matchx <- matchObj[[1]][[group]] cellsx <- matchObj[[2]]$cells[matchx$cells] - bdgx <- matchObj[[2]]$cells[matchx$bdg] + bgdx <- matchObj[[2]]$cells[matchx$bgd] if(!is.null(normFactors)){ cellNF <- normFactors[cellsx,1] - bdgNF <- normFactors[bdgx,1] + bgdNF <- normFactors[bgdx,1] } #Add RowNames for Check at the end @@ -262,7 +263,7 @@ markerFeatures <- function( featureDF = featureDFy, threads = threads, useMatrix = useMatrix, - cellNames = c(cellsx, bdgx), + cellNames = c(cellsx, bgdx), progress = FALSE )) rownames(scMaty) <- rownames(featureDFy) @@ -274,10 +275,10 @@ markerFeatures <- function( args <- list() if(!is.null(normFactors)){ args$mat1 <- Matrix::t(Matrix::t(scMaty[, cellsx, drop = FALSE]) * cellNF) - args$mat2 <- Matrix::t(Matrix::t(scMaty[, bdgx, drop = FALSE]) * bdgNF) + args$mat2 <- Matrix::t(Matrix::t(scMaty[, bgdx, drop = FALSE]) * bgdNF) }else{ args$mat1 <- scMaty[, cellsx, drop = FALSE] - args$mat2 <- scMaty[, bdgx, drop = FALSE] + args$mat2 <- scMaty[, bgdx, drop = FALSE] } if(tolower(testMethod) == "wilcoxon"){ @@ -436,7 +437,7 @@ markerFeatures <- function( } -.matchBiasCellGroups <- function(input, groups, useGroups, bdgGroups, bias, k = 100, n = 500, bufferRatio = 0.8){ +.matchBiasCellGroups <- function(input, groups, useGroups, bgdGroups, bias, k = 100, n = 500, bufferRatio = 0.8){ #Summary Function .summarizeColStats <- function(m, name = NULL){ @@ -485,16 +486,16 @@ markerFeatures <- function( useGroups <- gtools::mixedsort(unique(paste0(groups))) } - if(is.null(bdgGroups)){ - bdgGroups <- gtools::mixedsort(unique(paste0(groups))) + if(is.null(bgdGroups)){ + bgdGroups <- gtools::mixedsort(unique(paste0(groups))) } stopifnot(all(useGroups %in% unique(paste0(groups)))) - stopifnot(all(bdgGroups %in% unique(paste0(groups)))) + stopifnot(all(bgdGroups %in% unique(paste0(groups)))) #Get proportion of each group prob <- table(groups) / length(groups) - bdgProb <- 
prob[which(names(prob) %in% bdgGroups)] / sum(prob[which(names(prob) %in% bdgGroups)]) + bgdProb <- prob[which(names(prob) %in% bgdGroups)] / sum(prob[which(names(prob) %in% bgdGroups)]) pb <- txtProgressBar(min=0,max=100,initial=0,style=3) matchList <- lapply(seq_along(useGroups), function(x){ @@ -505,21 +506,21 @@ markerFeatures <- function( # Organize ############# groupx <- useGroups[x] - idx <- which(names(bdgProb) == groupx) - if(length(idx) > 0 & length(idx) != length(bdgProb)){ - bdgProbx <- bdgProb[-idx]/sum(bdgProb[-idx]) + idx <- which(names(bgdProb) == groupx) + if(length(idx) > 0 & length(idx) != length(bgdProb)){ + bgdProbx <- bgdProb[-idx]/sum(bgdProb[-idx]) }else{ - bdgProbx <- bdgProb + bgdProbx <- bgdProb } idF <- which(groups == groupx) - idB <- which(groups %in% names(bdgProbx)) + idB <- which(groups %in% names(bgdProbx)) - knnx <- computeKNN(inputNormQ[idB, ], inputNormQ[idF, ], k = k) + knnx <- .computeKNN(inputNormQ[idB, ], inputNormQ[idF, ], k = k) sx <- sample(seq_len(nrow(knnx)), nrow(knnx)) minTotal <- min(n, length(sx) * bufferRatio) - nx <- sort(floor(minTotal * bdgProbx)) + nx <- sort(floor(minTotal * bgdProbx)) ############### # ID Matching @@ -590,28 +591,28 @@ markerFeatures <- function( ##################### # Matching Stats Groups ##################### - estBdg <- sort(floor(minTotal * bdgProbx)) - obsBdg <- rep(0, length(estBdg)) - names(obsBdg) <- names(estBdg) + estbgd <- sort(floor(minTotal * bgdProbx)) + obsbgd <- rep(0, length(estbgd)) + names(obsbgd) <- names(estbgd) tabGroups <- table(groups[idY]) - obsBdg[names(tabGroups)] <- tabGroups - estBdgP <- round(100 * estBdg / sum(estBdg),3) - obsBdgP <- round(100 * obsBdg / sum(obsBdg),3) + obsbgd[names(tabGroups)] <- tabGroups + estbgdP <- round(100 * estbgd / sum(estbgd),3) + obsbgdP <- round(100 * obsbgd / sum(obsbgd),3) ##################### # Matching Stats Bias Norm Values ##################### forBias <- .summarizeColStats(inputNorm[idX,], name = "foreground") - bdgBias 
<- .summarizeColStats(inputNorm[idY,], name = "background") + bgdBias <- .summarizeColStats(inputNorm[idY,], name = "background") out <- list( cells = idX, - bdg = idY, + bgd = idY, summaryCells = forBias, - summaryBdg = bdgBias, - bdgGroups = rbind(estBdg, obsBdg), - bdgGroupsProbs = rbind(estBdgP, obsBdgP), - corBdgGroups = cor(estBdgP, obsBdgP), + summaryBgd = bgdBias, + bgdGroups = rbind(estbgd, obsbgd), + bgdGroupsProbs = rbind(estbgdP, obsbgdP), + corbgdGroups = cor(estbgdP, obsbgdP), n = length(sx), p = it / length(sx), group = groupx @@ -625,7 +626,7 @@ markerFeatures <- function( message("\n") outList <- SimpleList( - matchBdg = matchList, + matchbgd = matchList, info = SimpleList( cells = rownames(input), groups = groups, @@ -640,28 +641,27 @@ markerFeatures <- function( #################################################################################################### -# # Applications of Markers! -# #################################################################################################### #' Plot a Heatmap of Identified Marker Features #' #' This function will plot a heatmap of the results from markerFeatures #' -#' @param seMarker Summarized Experiment result from markerFeatures -#' @param cutoff Logical Statement for Cutoff to Be called a Marker a statement containing assayNames from seMarker -#' @param log2Norm log2 Normalization prior to plotting set true for counting assays (not DeviationsMatrix!) +#' @param seMarker A `SummarizedExperiment` object returned by `ArchR::markerFeatures()`. +#' @param cutOff A valid-syntax logical statement that defines which marker features from `seMarker` will be plotted in the heatmap. `cutOff` can contain any of the `assayNames` from `seMarker`. +#' @param log2Norm A boolean value indicating whether a log2 transformation should be performed on the values in `seMarker` prior to plotting. Should be set to `TRUE` for counts-based assays (but not assays like `DeviationsMatrix`). 
#' @param scaleTo scale to prior to log2 Normalization, if log2Norm is FALSE this does nothing -#' @param scaleRows compute row z-scores on matrix -#' @param limits heatmap color limits -#' @param grepExclude remove features by grep -#' @param pal palette for heatmap, default will use solar_extra -#' @param binaryClusterRows fast clustering implementation for row clustering by binary sorting -#' @param labelMarkers label specific markers by name on heatmap (matches rownames of seMarker) -#' @param labelTop label the top features for each column in seMarker -#' @param labelRows label all rows -#' @param returnMat return final matrix that is used for plotting heatmap +#' @param scaleRows A boolean value that indicates whether the heatmap should display row-wise z-scores instead of raw values. +#' @param limits A numeric vector of two numbers that represent the lower and upper color limits of the heatmap color scheme. +#' @param grepExclude A character vector or string that indicates the `rownames` or a specific pattern that identifies rownames from `seMarker` to be excluded from the heatmap. +#' @param pal A custom continuous palette (see paletteContinuous) used to override the continuous palette for the heatmap. +#' @param binaryClusterRows A boolean value that indicates whether a binary sorting algorithm should be used for fast clustering of heatmap rows. +#' @param labelMarkers A character vector listing the `rownames` of `seMarker` that should be labeled on the side of the heatmap. +#' @param labelTop A boolean value that indicates whether the top features for each column in `seMarker` should be labeled on the side of the heatmap. +#' @param labelRows A boolean value that indicates whether all rows should be labeled on the side of the heatmap. +#' @param returnMat A boolean value that indicates whether the final heatmap matrix should be returned in lieu of plotting the actual heatmap. 
+#' @param invert A boolean value that indicates whether the heatmap will be inverted, i.e., when looking for down-regulated markers (Log2FC < 0) instead of up-regulated markers (Log2FC > 0). #' @param ... additional args #' @export markerHeatmap <- function( @@ -694,9 +694,9 @@ markerHeatmap <- function( } #Now Get Values if(plotLog2FC){ - mat <- SummarizedExperiment::assays(seMarker)[["Log2FC"]] + mat <- as.matrix(SummarizedExperiment::assays(seMarker)[["Log2FC"]]) }else{ - mat <- SummarizedExperiment::assays(seMarker)[["Mean"]] + mat <- as.matrix(SummarizedExperiment::assays(seMarker)[["Mean"]]) if(log2Norm){ mat <- log2(t(t(mat)/colSums(mat)) * scaleTo + 1) } @@ -751,10 +751,11 @@ markerHeatmap <- function( if(binaryClusterRows){ if(invert){ bS <- .binarySort(-mat, lmat = passMat[rownames(mat), colnames(mat)]) + mat <- -bS[[1]][,colnames(mat)] }else{ bS <- .binarySort(mat, lmat = passMat[rownames(mat), colnames(mat)]) + mat <- bS[[1]][,colnames(mat)] } - mat <- -bS[[1]][,colnames(mat)] clusterRows <- FALSE clusterCols <- bS[[2]] }else{ @@ -843,7 +844,8 @@ markerHeatmap <- function( padding = 45, borderColor = NA, draw = TRUE, - name = ""){ + name = "Heatmap" + ){ #Packages .requirePackage("ComplexHeatmap") @@ -1110,15 +1112,27 @@ markerHeatmap <- function( } +#' Peak Annotation Hypergeometric Enrichment in Marker Peaks. +#' +#' This function will perform hypergeometric enrichment of peakAnnotation within the defined Marker Peaks (see markerFeatures). +#' +#' @param seMarker A `SummarizedExperiment` object returned by `ArchR::markerFeatures()`. +#' @param ArchRProj An `ArchRProject` object. +#' @param peakAnnotation A peakAnnotation in an `ArchRProject` to be used for hypergeometric test. +#' @param matches A custom peakAnnotations matches object used as input (see motifmatchr::matchMotifs). +#' @param cutOff A valid-syntax logical statement that defines which marker features from `seMarker` will be tested for enrichment. `cutOff` can contain any of the `assayNames` from `seMarker`. 
+#' @param background Whether to use background matched peaks "bgdPeaks" to compare against or all peaks "all" (see addBgdPeaks). +#' @param ... additional args #' @export markerAnnoEnrich <- function( seMarker = NULL, ArchRProj = NULL, - annotations = NULL, + peakAnnotation = NULL, matches = NULL, - cutOff = "FDR <= 0.01 & Log2FC >= 0.5", - background = "bdgPeaks", - ...){ + cutOff = "FDR <= 0.1 & Log2FC >= 0.5", + background = "bgdPeaks", + ... + ){ tstart <- Sys.time() if(metadata(seMarker)$Params$useMatrix != "PeakMatrix"){ @@ -1126,7 +1140,7 @@ markerAnnoEnrich <- function( } if(is.null(matches)){ - matches <- getMatches(ArchRProj, annotations) + matches <- getMatches(ArchRProj, peakAnnotation) } r1 <- SummarizedExperiment::rowRanges(matches) @@ -1154,9 +1168,9 @@ markerAnnoEnrich <- function( eval(parse(text=paste0("rm(",an,")"))) } - if(tolower(background) %in% c("backgroundpeaks", "bdgpeaks", "background", "bdg")){ - method <- "bdg" - bdgPeaks <- SummarizedExperiment::assay(getBdgPeaks(ArchRProj)) + if(tolower(background) %in% c("backgroundpeaks", "bgdpeaks", "background", "bgd")){ + method <- "bgd" + bgdPeaks <- SummarizedExperiment::assay(getBgdPeaks(ArchRProj)) }else{ method <- "all" } @@ -1164,8 +1178,8 @@ markerAnnoEnrich <- function( enrichList <- lapply(seq_len(ncol(seMarker)), function(x){ .messageDiffTime(sprintf("Computing Enrichments %s of %s",x,ncol(seMarker)),tstart) idx <- which(passMat[, x]) - if(method == "bdg"){ - .computeEnrichment(matches, idx, c(idx, as.vector(bdgPeaks[idx,]))) + if(method == "bgd"){ + .computeEnrichment(matches, idx, c(idx, as.vector(bgdPeaks[idx,]))) }else{ .computeEnrichment(matches, idx, seq_len(nrow(matches))) } @@ -1229,10 +1243,17 @@ markerAnnoEnrich <- function( } +#' Identify Marker Feature Ranges +#' +#' This function will identify Markers and return a GRangesList for each group of significant marker regions. +#' +#' @param seMarker A `SummarizedExperiment` object returned by `ArchR::markerFeatures()`. 
+#' @param cutOff A valid-syntax logical statement that defines which marker features from `seMarker`. `cutoff` can contain any of the `assayNames` from `seMarker`. +#' @param ... additional args #' @export -markerRanges <- function( +markerGR <- function( seMarker, - cutOff = "FDR <= 0.01 & Log2FC >= 0.5", + cutOff = "FDR <= 0.1 & Log2FC >= 0.5", ... ){ @@ -1269,15 +1290,24 @@ markerRanges <- function( } +#' Plot Differential Markers +#' +#' This function will plot one group/column of a differential markers se as a MA or Volcano plot. +#' +#' @param seMarker A `SummarizedExperiment` object returned by `ArchR::markerFeatures()`. +#' @param name column/group name of seMarker to be plotted. +#' @param cutOff A valid-syntax logical statement that defines which marker features from `seMarker` will be plotted. `cutoff` can contain any of the `assayNames` from `seMarker`. +#' @param plotAs plot as "Volcano" or "MA" plot. +#' @param ... additional args #' @export markerPlot <- function( seMarker, name = NULL, cutOff = "FDR <= 0.01 & abs(Log2FC) >= 0.5", plotAs = "Volcano", - log2Norm = TRUE, scaleTo = 10^4, - ...){ + ... 
+ ){ #Evaluate AssayNames assayNames <- names(SummarizedExperiment::assays(seMarker)) @@ -1301,7 +1331,7 @@ markerPlot <- function( FDR <- as.vector(as.matrix(FDR)) FDR[is.na(FDR)] <- 1 - LM <- log2((assays(seMarker[,name])$Mean + assays(seMarker[,name])$MeanBDG)/2 + 1) + LM <- log2((assays(seMarker[,name])$Mean + assays(seMarker[,name])$MeanBGD)/2 + 1) LM <- as.vector(as.matrix(LM)) color <- ifelse(passMat[, name], "Differential", "Not-Differential") diff --git a/R/MarkerHeatmap.R b/R/MarkerHeatmap.R deleted file mode 100644 index 3aac60b4..00000000 --- a/R/MarkerHeatmap.R +++ /dev/null @@ -1,578 +0,0 @@ -#' Plot a Heatmap of Identified Marker Features -#' -#' This function will plot a heatmap of the results from markerFeatures -#' -#' @param seMarker Summarized Experiment result from markerFeatures -#' @param cutoff Logical Statement for Cutoff to Be called a Marker a statement containing assayNames from seMarker -#' @param log2Norm log2 Normalization prior to plotting set true for counting assays (not DeviationsMatrix!) -#' @param scaleTo scale to prior to log2 Normalization, if log2Norm is FALSE this does nothing -#' @param scaleRows compute row z-scores on matrix -#' @param limits heatmap color limits -#' @param grepExclude remove features by grep -#' @param pal palette for heatmap, default will use solar_extra -#' @param binaryClusterRows fast clustering implementation for row clustering by binary sorting -#' @param labelMarkers label specific markers by name on heatmap (matches rownames of seMarker) -#' @param labelTop label the top features for each column in seMarker -#' @param labelRows label all rows -#' @param returnMat return final matrix that is used for plotting heatmap -#' @param ... 
additional args -#' @export -markerHeatmap <- function( - seMarker, - cutOff = "FDR <= 0.001 & Log2FC >= 0.1", - log2Norm = TRUE, - scaleTo = 10^4, - scaleRows = TRUE, - limits = c(-2,2), - grepExclude = NULL, - pal = NULL, - binaryClusterRows = TRUE, - labelMarkers = NULL, - labelTop = NULL, - labelRows = FALSE, - returnMat = FALSE, - ... - ){ - - #Evaluate AssayNames - assayNames <- names(SummarizedExperiment::assays(seMarker)) - for(an in assayNames){ - eval(parse(text=paste0(an, " <- ", "SummarizedExperiment::assays(seMarker)[['", an, "']]"))) - } - passMat <- eval(parse(text=cutOff)) - for(an in assayNames){ - eval(parse(text=paste0("rm(",an,")"))) - } - - #Now Get Values - mat <- SummarizedExperiment::assays(seMarker)[["Mean"]] - idx <- which(rowSums(passMat, na.rm = TRUE) > 0 & matrixStats::rowVars(mat) != 0) - if(log2Norm){ - mat <- log2(t(t(mat)/colSums(mat)) * scaleTo + 1) - } - mat <- mat[idx,] - passMat <- passMat[idx,] - - if(scaleRows){ - mat <- sweep(mat - rowMeans(mat), 1, matrixStats::rowSds(mat), `/`) - mat[mat > max(limits)] <- max(limits) - mat[mat < min(limits)] <- min(limits) - } - - if(nrow(mat) == 0){ - stop("No Makers Found!") - } - - #add rownames - rd <- SummarizedExperiment::rowData(seMarker)[idx,] - if(is.null(rd$name)){ - rn <- paste0(rd$seqnames,":",rd$start,"-",rd$end) - }else{ - if(sum(duplicated(rd$name)) > 0){ - rn <- paste0(rd$seqnames,":",rd$name) - }else{ - rn <- rd$name - } - } - rownames(mat) <- rn - rownames(passMat) <- rn - - #identify to remove - if(!is.null(grepExclude) & !is.null(rownames(mat))){ - idx2 <- which(!grepl(grepExclude, rownames(mat))) - mat <- mat[idx2,] - } - - if(nrow(mat)==0){ - stop("No Makers Found!") - } - - if(!is.null(labelTop)){ - spmat <- passMat / rowSums(passMat) - idx2 <- lapply(seq_len(ncol(spmat)), function(x){ - head(order(spmat[,x], decreasing = TRUE), labelTop) - }) %>% unlist %>% unique %>% sort - mat <- mat[idx2,] - labelRows <- TRUE - } - - if(binaryClusterRows){ - bS <- .binarySort(mat, 
lmat = passMat[rownames(mat), colnames(mat)]) - mat <- bS[[1]][,colnames(mat)] - clusterRows <- FALSE - clusterCols <- bS[[2]] - }else{ - clusterRows <- TRUE - clusterCols <- TRUE - } - - if(!is.null(labelMarkers)){ - mn <- match(tolower(labelMarkers), tolower(rownames(mat)), nomatch = 0) - mn <- mn[mn > 0] - }else{ - mn <- NULL - } - - if(nrow(mat) == 0){ - stop("No Makers Found!") - } - - message(sprintf("Identified %s markers!", nrow(mat))) - - if(is.null(pal)){ - if(is.null(metadata(seMarker)$Params$useMatrix)){ - pal <- paletteContinuous(set = "solar_extra", n = 100) - }else if(tolower(metadata(seMarker)$Params$useMatrix)=="genescorematrix"){ - pal <- paletteContinuous(set = "blue_yellow", n = 100) - }else{ - pal <- paletteContinuous(set = "solar_extra", n = 100) - } - } - - ht <- .ArchRHeatmap( - mat = mat, - scale = FALSE, - limits = c(min(mat), max(mat)), - color = pal, - clusterCols = clusterCols, - clusterRows = clusterRows, - labelRows = labelRows, - labelCols = TRUE, - customRowLabel = mn, - showColDendrogram = TRUE, - draw = FALSE, - ... 
- ) - - if(returnMat){ - return(mat) - }else{ - return(ht) - } - -} - -######################################################################################################## -# Helpers for Nice Heatmap with Bioconductors ComplexHeamtap -######################################################################################################## - -.ArchRHeatmap <- function( - mat, - scale = FALSE, - limits = c(min(mat), max(mat)), - colData = NULL, - color = paletteContinuous(set = "solar_extra", n = 100), - clusterCols = TRUE, - clusterRows = FALSE, - labelCols = FALSE, - labelRows = FALSE, - colorMap = NULL, - useRaster = TRUE, - rasterQuality = 5, - split = NULL, - fontsize = 6, - colAnnoPerRow = 4, - showRowDendrogram = FALSE, - showColDendrogram = FALSE, - customRowLabel = NULL, - customRowLabelIDs = NULL, - customColLabel = NULL, - customColLabelIDs = NULL, - customLabelWidth = 0.75, - rasterDevice = "png", - padding = 45, - borderColor = NA, - draw = TRUE, - name = ""){ - - #Packages - .requirePackage("ComplexHeatmap") - .requirePackage("circlize") - - #Z-score - if (scale) { - message("Scaling Matrix...") - mat <- .rowZscores(mat, limit = FALSE) - name <- paste0(name," Z-Scores") - } - - #Get A Color map if null - if (is.null(colorMap)) { - colorMap <- .colorMapAnno(colData) - } - - #Prepare ColorMap format for Complex Heatmap - if (!is.null(colData)){ - colData = data.frame(colData) - colorMap <- .colorMapForCH(colorMap, colData) #change - showLegend <- .checkShowLegend(colorMap[match(names(colorMap), colnames(colData))]) #change - }else { - colorMap <- NULL - showLegend <- NULL - } - - #Prepare Limits if needed - breaks <- NULL - if (!is.null(limits)) { - mat[mat > max(limits)] <- max(limits) - mat[mat < min(limits)] <- min(limits) - breaks <- seq(min(limits), max(limits), length.out = length(color)) - color <- circlize::colorRamp2(breaks, color) - } - - if(exists('anno_mark', where='package:ComplexHeatmap', mode='function')){ - anno_check_version_rows <- 
ComplexHeatmap::anno_mark - anno_check_version_cols <- ComplexHeatmap::anno_mark - }else{ - anno_check_version_rows <- ComplexHeatmap::row_anno_link - anno_check_version_cols <- ComplexHeatmap::column_anno_link - } - - #Annotation Heatmap - if(!is.null(colData) & !is.null(customColLabel)){ - message("Adding Annotations...") - if(is.null(customColLabelIDs)){ - customColLabelIDs <- colnames(mat)[customRowLabel] - } - ht1Anno <- HeatmapAnnotation( - df = colData, - col = colorMap, - show_legend = showLegend, - show_annotation_name = TRUE, - gp = gpar(col = "NA"), - annotation_legend_param = - list( - nrow = min(colAnnoPerRow, max(round(nrow(colData)/colAnnoPerRow), 1)) - ), - link = anno_check_version_cols( - at = customColLabel, labels = customColLabelIDs), - width = unit(customLabelWidth, "cm") + max_text_width(customColLabelIDs) - - ) - }else if(!is.null(colData)){ - message("Adding Annotations...") - ht1Anno <- HeatmapAnnotation( - df = colData, - col = colorMap, - show_legend = showLegend, - show_annotation_name = TRUE, - gp = gpar(col = "NA"), - annotation_legend_param = - list( - nrow = min(colAnnoPerRow, max(round(nrow(colData)/colAnnoPerRow), 1)) - ) - ) - }else if(is.null(colData) & !is.null(customColLabel)){ - if(is.null(customColLabelIDs)){ - customColLabelIDs <- colnames(mat)[customRowLabel] - } - message("Adding Annotations...") - ht1Anno <- HeatmapAnnotation( - link = anno_check_version_cols( - at = customColLabel, labels = customColLabelIDs), - width = unit(customLabelWidth, "cm") + max_text_width(customColLabelIDs) - ) - }else{ - ht1Anno <- NULL - } - - message("Preparing Main Heatmap...") - ht1 <- Heatmap( - - #Main Stuff - matrix = mat, - name = name, - col = color, - - #Heatmap Legend - heatmap_legend_param = - list(color_bar = "continuous", - legend_direction = "horizontal", - legend_width = unit(5, "cm") - ), - rect_gp = gpar(col = borderColor), - - #Column Options - show_column_names = labelCols, - cluster_columns = clusterCols, - 
show_column_dend = showColDendrogram, - clustering_method_columns = "ward.D2", - column_names_gp = gpar(fontsize = fontsize), - column_names_max_height = unit(100, "mm"), - - #Row Options - show_row_names = labelRows, - row_names_gp = gpar(fontsize = fontsize), - cluster_rows = clusterRows, - show_row_dend = showRowDendrogram, - clustering_method_rows = "ward.D2", - split = split, - - #Annotation - top_annotation = ht1Anno, - - #Raster Info - use_raster = useRaster, - raster_device = rasterDevice, - raster_quality = rasterQuality - ) - - if(!is.null(customRowLabel)){ - if(is.null(customRowLabelIDs)){ - customRowLabelIDs <- rownames(mat)[customRowLabel] - } - ht1 <- ht1 + rowAnnotation(link = - anno_check_version_rows(at = customRowLabel, labels = customRowLabelIDs), - width = unit(customLabelWidth, "cm") + max_text_width(customRowLabelIDs)) - } - - if(draw){ - draw(ht1, - padding = unit(c(padding, padding, padding, padding), "mm"), - heatmap_legend_side = "bot", - annotation_legend_side = "bot") - }else{ - ht1 - } - -} - -.colorMapForCH <- function(colorMap, colData){ - colorMap <- colorMap[which(names(colorMap) %in% colnames(colData))] - colorMapCH <- lapply(seq_along(colorMap), function(x){ - if(attr(colorMap[[x]],"discrete")){ - colorx <- colorMap[[x]] - }else{ - vals <- colData[[names(colorMap)[x]]][!is.na(colData[[names(colorMap)[x]]])] - s <- seq(min(vals), max(vals), length.out = length(colorMap[[x]])) - colorx <- circlize::colorRamp2(s, colorMap[[x]]) - } - if(any(is.na(names(colorx)))){ - names(colorx)[is.na(names(colorx))] <- paste0("NA",seq_along(names(colorx)[is.na(names(colorx))])) - } - return(colorx) - }) - names(colorMapCH) <- names(colorMap) - return(colorMapCH) -} - -.checkShowLegend <- function(colorMap, max_discrete = 30){ - show <- lapply(seq_along(colorMap), function(x){ - if(attr(colorMap[[x]],"discrete") && length(unique(colorMap[[x]])) > max_discrete){ - sl <- FALSE - }else{ - sl <- TRUE - } - return(sl) - }) %>% unlist - names(show) <- 
names(colorMap) - return(show) -} - -.colorMapAnno <- function(colData, customAnno = NULL, discreteSet = "stallion", continuousSet = "solar_extra"){ - discreteCols <- sapply(colData,function(x) !is.numeric(x)) - if(!is.null(customAnno)){ - colorMap <- lapply(seq_along(discreteCols),function(x){ - if(discreteCols[x]){ - colors <- paletteDiscrete(values = colData[[names(discreteCols[x])]], set = discreteSet) - names(colors) <- unique(colData[[names(discreteCols[x])]]) - attr(colors, "discrete") <- TRUE - }else{ - colors <- paletteContinuous(set = continuousSet) - attr(colors, "discrete") <- FALSE - } - if(length(which(customAnno[,1] %in% names(discreteCols[x]))) > 0){ - if(length(which(customAnno[,2] %in% names(colors))) > 0){ - customAnnox <- customAnno[which(customAnno[,2] %in% names(colors)),] - colors[which(names(colors) %in% customAnnox[,2])] <- paste0(customAnnox[match(names(colors),customAnnox[,2]),3]) - } - } - return(colors) - }) - names(colorMap) <- colnames(colData) - return(colorMap) - }else{ - colorMap <- lapply(seq_along(discreteCols), function(x){ - if(discreteCols[x]){ - colors <- paletteDiscrete(values = colData[[names(discreteCols[x])]], set = discreteSet) - names(colors) <- unique(colData[[names(discreteCols[x])]]) - attr(colors, "discrete") <- TRUE - }else{ - colors <- paletteContinuous(set = continuousSet) - attr(colors, "discrete") <- FALSE - } - return(colors) - }) - names(colorMap) <- colnames(colData) - return(colorMap) - } - -} - -.binarySort <- function(m, scale = FALSE, cutOff = 1, lmat = NULL, clusterCols = TRUE){ - - if(is.null(lmat)){ - #Compute Row-Zscores - if(scale){ - lmat <- sweep(m - rowMeans(m), 1, matrixStats::rowSds(m), `/`) - }else{ - lmat <- m - } - lmat <- lmat >= cutOff - } - - #Transpose - m <- t(m) - lmat <- t(lmat) - - #Identify Column Ordering - if(clusterCols){ - hc <- hclust(dist(m)) - colIdx <- hc$order - m <- t(m[colIdx,]) - lmat <- t(lmat[colIdx,]) - }else{ - m <- t(m) - lmat <- t(lmat) - hc <- NULL - } - - 
#Identify Row Ordering - rowIdx <- do.call("order", c(as.data.frame(lmat)[seq_len(ncol(lmat))], list(decreasing = TRUE))) - m <- t(m[rowIdx,]) - lmat <- t(lmat[rowIdx,]) - - #Transpose - m <- t(m) - lmat <- t(lmat) - - return(list(mat = m, hclust = hc)) - -} - -#' @export -markerAnnoEnrich <- function( - seMarker = NULL, - ArchRProj = NULL, - annotations = NULL, - matches = NULL, - cutOff = "FDR <= 0.01 & Log2FC >= 0", - background = "bdgPeaks", - ...){ - - tstart <- Sys.time() - if(metadata(seMarker)$Params$useMatrix != "PeakMatrix"){ - stop("Only markers identified from PeakMatrix can be used!") - } - - if(is.null(matches)){ - matches <- getMatches(ArchRProj, annotations) - } - - r1 <- SummarizedExperiment::rowRanges(matches) - mcols(r1) <- NULL - - r2 <- getPeakSet(ArchRProj) - mcols(r2) <- NULL - - if(length(which(paste0(seqnames(r1),start(r1),end(r1), sep = "_") %ni% paste0(seqnames(r2),start(r2),end(r2),sep="_"))) != 0){ - stop("Peaks from matches do not match peakSet in ArchRProj!") - } - - r3 <- GRanges(rowData(seMarker)$seqnames,IRanges(rowData(seMarker)$start, rowData(seMarker)$end)) - mcols(r3) <- NULL - rownames(matches) <- paste0(seqnames(matches),start(matches),end(matches),sep="_") - matches <- matches[paste0(seqnames(r3),start(r3),end(r3), sep = "_"), ] - - #Evaluate AssayNames - assayNames <- names(SummarizedExperiment::assays(seMarker)) - for(an in assayNames){ - eval(parse(text=paste0(an, " <- ", "SummarizedExperiment::assays(seMarker)[['", an, "']]"))) - } - passMat <- eval(parse(text=cutOff)) - for(an in assayNames){ - eval(parse(text=paste0("rm(",an,")"))) - } - - if(tolower(background) %in% c("backgroundpeaks", "bdgpeaks", "background", "bdg")){ - method <- "bdg" - bdgPeaks <- SummarizedExperiment::assay(getBdgPeaks(ArchRProj)) - }else{ - method <- "all" - } - - enrichList <- lapply(seq_len(ncol(seMarker)), function(x){ - .messageDiffTime(sprintf("Computing Enrichments %s of %s",x,ncol(seMarker)),tstart) - idx <- which(passMat[, x]) - 
if(method == "bdg"){ - .computeEnrichment(matches, idx, c(idx, as.vector(bdgPeaks[idx,]))) - }else{ - .computeEnrichment(matches, idx, seq_len(nrow(matches))) - } - }) %>% SimpleList - names(enrichList) <- colnames(seMarker) - - assays <- lapply(seq_len(ncol(enrichList[[1]])), function(x){ - d <- lapply(seq_along(enrichList), function(y){ - enrichList[[y]][colnames(matches),x,drop=FALSE] - }) %>% Reduce("cbind",.) - colnames(d) <- names(enrichList) - d - }) %>% SimpleList - names(assays) <- colnames(enrichList[[1]]) - assays <- rev(assays) - out <- SummarizedExperiment::SummarizedExperiment(assays=assays) - - out - -} - -.computeEnrichment <- function(matches, compare, background){ - - matches <- .getAssay(matches, grep("matches", names(assays(matches)), value = TRUE, ignore.case = TRUE)) - - #Compute Totals - matchCompare <- matches[compare, ,drop=FALSE] - matchBackground <- matches[background, ,drop=FALSE] - matchCompareTotal <- Matrix::colSums(matchCompare) - matchBackgroundTotal <- Matrix::colSums(matchBackground) - - #Create Summary DF - pOut <- data.frame( - feature = colnames(matches), - CompareFrequency = matchCompareTotal, - nCompare = nrow(matchCompare), - CompareProportion = matchCompareTotal/nrow(matchCompare), - BackgroundFrequency = matchBackgroundTotal, - nBackground = nrow(matchBackground), - BackgroundProporition = matchBackgroundTotal/nrow(matchBackground) - ) - - #Enrichment - pOut$Enrichment <- pOut$CompareProportion / pOut$BackgroundProporition - - #Get P-Values with Hyper Geometric Test - pOut$mlog10p <- lapply(seq_len(nrow(pOut)), function(x){ - p <- -phyper(pOut$CompareFrequency[x] - 1, # Number of Successes the -1 is due to cdf integration - pOut$BackgroundFrequency[x], # Number of all successes in background - pOut$nBackground[x] - pOut$BackgroundFrequency[x], # Number of non successes in background - pOut$nCompare[x], # Number that were drawn - lower.tail = FALSE, log.p = TRUE)# P[X > x] Returns LN must convert to log10 - 
return(p/log(10)) - }) %>% unlist %>% round(4) - - #Minus Log10 FDR - pOut$mlog10FDR <- -log10(p.adjust(matrixStats::rowMaxs(cbind(10^-pOut$mlog10p, 4.940656e-324)), method = "fdr")) - pOut <- pOut[order(pOut$mlog10p, decreasing = TRUE), , drop = FALSE] - - pOut - -} - - - - - - - - diff --git a/R/MatrixCNV.R b/R/MatrixCNV.R index 226d7da9..74d9395c 100644 --- a/R/MatrixCNV.R +++ b/R/MatrixCNV.R @@ -1,16 +1,21 @@ -#' Add TileMatrix to Arrows/ArchRProject +#################################################################### +# Copy Number Variation Methods +#################################################################### + +#' Add a CNV matrix to ArrowFiles or an ArchRProject #' -#' This function for each sample will independently compute counts for each tile -#' per cell in the Arrow File +#' This function for each sample will predict copy number variation from accessibility #' -#' @param input ArchRProject or ArrowFiles -#' @param chromSizes chromomosome sizes used for identifying number of tiles to count -#' @param windowSize size for each window to break up each chromosome -#' @param binarize save as a Sparse.Binary.Matrix or Sparse.Integer.Matrix -#' @param excludeChr exclude chromosomes from this analysis -#' @param threads number of threads -#' @param parallelParam parallel parameters for batch style execution -#' @param force force overwriting previous TileMatrix in ArrowFile +#' @param input An `ArchRProject` object or character vector of ArrowFiles. +#' @param chromSizes A named numeric vector containing the chromosome names and lengths. The default behavior is to retrieve this from the `ArchRProject` using `ArchR::getChromSizes()`. +#' @param blacklist A `GRanges` object containing genomic regions to blacklist from calling CNVs. The default behavior is to retrieve this from the `ArchRProject` using `ArchR::getBlacklist()`. +#' @param genome The genome used by the `input`. 
The default behavior is to retrieve this from the `ArchRProject` using `ArchR::getGenome()`. +#' @param windowSize The size in basepairs for the sliding window used to break up each chromosome to look for CNVs. +#' @param stepSize The size in basepairs for the step used to create sliding window bins across each chromosome. +#' @param excludeChr A character vector containing the `seqnames` of the chromosomes that should be excluded from CNV analysis. +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param force A boolean value indicating whether to force the CNV matrix to be overwritten if it already exists for `input`. #' @export addCNVMatrix <- function( input, @@ -313,8 +318,4 @@ addCNVMatrix <- function( windowNuc } -# pdf("test.pdf", width = 12, height = 8) -# ArchR:::.ArchRHeatmap(mat = round(t(assays(seWindows)$log2GCSmooth), 3), clusterRows = FALSE, clusterCols = FALSE, limits = c(-2,2)) -# dev.off() -# proj2 <- addCNVMatrix(proj, force = TRUE) diff --git a/R/MatrixDeviations.R b/R/MatrixDeviations.R index 55c61728..79832482 100644 --- a/R/MatrixDeviations.R +++ b/R/MatrixDeviations.R @@ -1,22 +1,25 @@ -#' Add DeviationsMatrix to Arrow Files in ArchRProject +#################################################################### +# Transcription Factor Deviation Methods +#################################################################### + +#' Add a matrix of deviations for a given peakAnnotation to Arrow Files in ArchRProject #' -#' This function for each sample will independently compute counts for each tile -#' per cell and then infer gene activity scores. +#' This function will compute peakAnnotation deviations for each ArrowFile independently while controlling for global biases (low-memory requirement). 
#' -#' @param ArchRProj ArchRProject -#' @param annotations annotaions name stored in ArchRProject -#' @param matrixName matrixName to be stored as in Arrow Files -#' @param out save ouptut matrices deviations and/or z -#' @param binarize binarize peaks prior to computing deviations -#' @param threads number of threads for parallel execution -#' @param parallelParam parallel parameters for batch style execution -#' @param force force overwriting previous TileMatrix in ArrowFile +#' @param ArchRProj An `ArchRProject` object. +#' @param peakAnnotation The name of the peakAnnotation stored in the `ArchRProject`. +#' @param matrixName The name to be used for storage of the deviations matrix in the provided `ArchRProject`. +#' @param out A string or character vector that indicates whether to save the output matrices as deviations ("deviations"), z-scores ("z"), or both (c("deviations","z")). +#' @param binarize A boolean value indicating whether the input matrix should be binarized before calculating deviations. This is often desired when working with insertion counts. +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param force A boolean value indicating whether to force the matrix indicated by `matrixName` to be overwritten if it already exists in the given `ArrowFiles`. 
#' @export addDeviationsMatrix <- function( ArchRProj, - annotations = NULL, + peakAnnotation = NULL, matches = NULL, - bdgPeaks = getBdgPeaks(ArchRProj), + bgdPeaks = getBgdPeaks(ArchRProj), matrixName = NULL, out = c("z", "deviations"), binarize = FALSE, @@ -46,7 +49,7 @@ addDeviationsMatrix <- function( ############################################################## print(matches) if(is.null(matches)){ - anno <- getAnnotation(ArchRProj, annotations) + anno <- getPeakAnnotation(ArchRProj, peakAnnotation) matches <- readRDS(anno$Matches) if(is.null(matrixName)){ matrixName <- paste0(anno$Name, "Matrix") @@ -88,8 +91,9 @@ addDeviationsMatrix <- function( args <- mget(names(formals()),sys.frame(sys.nframe()))#as.list(match.call()) #Add args to list - args$annotations <- NULL - rm(annotations) + args$peakAnnotation <- NULL + rm(peakAnnotation) + args$annotationsMatrix <- annotationsMatrix args$featureDF <- rS args$useMatrix <- useMatrix @@ -117,7 +121,7 @@ addDeviationsMatrix <- function( cellNames = NULL, allCells = NULL, featureDF = NULL, - bdgPeaks = NULL, + bgdPeaks = NULL, binarize = FALSE, useMatrix = "PeakMatrix", matrixName = "Motif", @@ -155,7 +159,7 @@ addDeviationsMatrix <- function( countsMatrix = ., annotationsMatrix = annotationsMatrix, prefix = prefix, - backgroudPeaks = SummarizedExperiment::assay(bdgPeaks), + backgroudPeaks = SummarizedExperiment::assay(bgdPeaks), expectation = featureDF$rowSums/sum(featureDF$rowSums), out = out )} @@ -409,6 +413,14 @@ addDeviationsMatrix <- function( } +#' Get Variable Deviations across cells in ArchRProject. +#' +#' This function will rank the variability of the deviations computed by ArchR and label the top variable annotations. +#' +#' @param ArchRProj An `ArchRProject` object. 
+#' @param name name of DeviationsMatrix see addDeviationsMatrix +#' @param plot plot ranked variability for each annotation +#' @param n number of annotations to label with ggrepel #' @export getVarDeviations <- function(ArchRProj, name = "MotifMatrix", plot = TRUE, n = 25){ @@ -433,8 +445,19 @@ getVarDeviations <- function(ArchRProj, name = "MotifMatrix", plot = TRUE, n = 2 } +#' Add backgroundPeaks to ArchRProject +#' +#' This function will compute backgroundPeaks controlling for total accessibility and GC and add this to an ArchRProject. +#' +#' @param ArchRProj An `ArchRProject` object. +#' @param niterations The number of background peaks to sample (see chromVAR::getBackgroundPeaks). +#' @param w The parameter controlling similarity of background peaks (see chromVAR::getBackgroundPeaks). +#' @param binSize the precision with which the similarity is computed (see chromVAR::getBackgroundPeaks). +#' @param seed A number to be used as the seed for random number generation. It is recommended to keep track of the seed used so that you can reproduce results downstream. +#' @param outFile Path to save backgroundPeaks object to for ArchRProject. +#' @param force Force creation of backgroundPeaks even if file exists. #' @export -addBdgPeaks <- function( +addBgdPeaks <- function( ArchRProj, niterations = 50, w = 0.1, @@ -444,22 +467,22 @@ addBdgPeaks <- function( force = FALSE, ...){ - if(!is.null(metadata(getPeakSet(ArchRProj))$bdgPeaks) & !force){ + if(!is.null(metadata(getPeakSet(ArchRProj))$bgdPeaks) & !force){ - if(file.exists(metadata(getPeakSet(ArchRProj))$bdgPeaks)){ + if(file.exists(metadata(getPeakSet(ArchRProj))$bgdPeaks)){ - stop("Background Peaks Already Exist! set force = TRUE to addBdgPeaks!") + stop("Background Peaks Already Exist! set force = TRUE to addBgdPeaks!") }else{ if(force){ message("Previous Background Peaks file does not exist! 
Identifying Background Peaks!") - bdgPeaks <- .getBdgPeaks(ArchRProj=ArchRProj, niterations=niterations, w=w, binSize=binSize, seed = seed, outFile = outFile) + bgdPeaks <- .computeBgdPeaks(ArchRProj=ArchRProj, niterations=niterations, w=w, binSize=binSize, seed = seed, outFile = outFile) }else{ - stop("Previous Background Peaks file does not exist! set force = TRUE to addBdgPeaks!") + stop("Previous Background Peaks file does not exist! set force = TRUE to addBgdPeaks!") } @@ -468,46 +491,47 @@ addBdgPeaks <- function( }else{ message("Identifying Background Peaks!") - bdgPeaks <- .getBdgPeaks(ArchRProj=ArchRProj, niterations=niterations, w=w, binSize=binSize, seed = seed, outFile = outFile) + bgdPeaks <- .computeBgdPeaks(ArchRProj=ArchRProj, niterations=niterations, w=w, binSize=binSize, seed = seed, outFile = outFile) } - if(length(getPeakSet(ArchRProj)) != nrow(bdgPeaks)){ + if(length(getPeakSet(ArchRProj)) != nrow(bgdPeaks)){ stop("Number of rows in Background Peaks does not match peakSet!") } - metadata(ArchRProj@peakSet)$bdgPeaks <- outFile + metadata(ArchRProj@peakSet)$bgdPeaks <- outFile ArchRProj } #' @export -getBdgPeaks <- function( +getBgdPeaks <- function( ArchRProj, niterations = 50, w = 0.1, binSize = 50, seed = 1, force = FALSE, - ...){ + ... + ){ - if(!is.null(metadata(getPeakSet(ArchRProj))$bdgPeaks) & !force){ + if(!is.null(metadata(getPeakSet(ArchRProj))$bgdPeaks) & !force){ - if(file.exists(metadata(getPeakSet(ArchRProj))$bdgPeaks)){ + if(file.exists(metadata(getPeakSet(ArchRProj))$bgdPeaks)){ message("Using Previous Background Peaks!") - bdgPeaks <- readRDS(metadata(getPeakSet(ArchRProj))$bdgPeaks) + bgdPeaks <- readRDS(metadata(getPeakSet(ArchRProj))$bgdPeaks) }else{ if(force){ message("Previous Background Peaks file does not exist! 
Identifying Background Peaks!") - bdgPeaks <- .getBdgPeaks(ArchRProj=ArchRProj, niterations=niterations, w=w, binSize=binSize, seed = seed, outFile = NULL) + bgdPeaks <- .computeBgdPeaks(ArchRProj=ArchRProj, niterations=niterations, w=w, binSize=binSize, seed = seed, outFile = NULL) }else{ - stop("Previous Background Peaks file does not exist! set add = TRUE to addBdgPeaks!") + stop("Previous Background Peaks file does not exist! set add = TRUE to addBgdPeaks!") } @@ -516,26 +540,27 @@ getBdgPeaks <- function( }else{ message("Identifying Background Peaks!") - bdgPeaks <- .getBdgPeaks(ArchRProj=ArchRProj, niterations=niterations, w=w, binSize=binSize, seed = seed, outFile = NULL) + bgdPeaks <- .computeBgdPeaks(ArchRProj=ArchRProj, niterations=niterations, w=w, binSize=binSize, seed = seed, outFile = NULL) } - if(length(getPeakSet(ArchRProj)) != nrow(bdgPeaks)){ + if(length(getPeakSet(ArchRProj)) != nrow(bgdPeaks)){ stop("Number of rows in Background Peaks does not match peakSet!") } - bdgPeaks + bgdPeaks } -.getBdgPeaks <- function( +.computeBgdPeaks <- function( ArchRProj, niterations = 50, w = 0.1, binSize = 50, seed = 1, outFile = file.path(getOutputDirectory(ArchRProj), "Background-Peaks.rds"), - ...){ + ... 
+ ){ set.seed(1) .requirePackage("chromVAR") @@ -564,7 +589,7 @@ getBdgPeaks <- function( rowData = DataFrame(bias = rS$GC) ) - bdgPeaks <- chromVAR::getBackgroundPeaks( + bgdPeaks <- chromVAR::getBackgroundPeaks( object = se, bias = rowData(se)$bias, niterations = niterations, @@ -572,17 +597,15 @@ getBdgPeaks <- function( bs = binSize ) - bdgPeaks <- SummarizedExperiment(assays = SimpleList(bdgPeaks = bdgPeaks), + bgdPeaks <- SummarizedExperiment(assays = SimpleList(bgdPeaks = bgdPeaks), rowRanges = GRanges(rS$seqnames,IRanges(rS$start,rS$end),value=rS$rowSums,GC=rS$GC)) #Save Background Peaks if(!is.null(outFile)){ - saveRDS(bdgPeaks, outFile, compress = FALSE) + saveRDS(bgdPeaks, outFile, compress = FALSE) } - return(bdgPeaks) + return(bgdPeaks) } - - diff --git a/R/MatrixFeatures.R b/R/MatrixFeatures.R index 09985613..01724d80 100644 --- a/R/MatrixFeatures.R +++ b/R/MatrixFeatures.R @@ -1,16 +1,19 @@ -#' Add FeatureMatrix to Arrows/ArchRProject +#################################################################### +# Peak and Feature Matrix Methods +#################################################################### + +#' Add a feature matrix to an ArchRProject or a set of ArrowFiles #' -#' This function for each sample will independently compute counts for each feature -#' per cell in the Arrow File +#' This function for each sample will independently compute counts for each feature per cell in the provided ArchRProject or set of ArrowFiles. 
#' -#' @param input ArchRProject or ArrowFiles -#' @param features GRanges to count for each cell -#' @param matrixName matrix output name in ArrowFiles cannot be a protected matrix name -#' @param ceiling ceiling for the number of counts per feature -#' @param binarize binarize matrix -#' @param threads number of threads -#' @param parallelParam parallel parameters for batch style execution -#' @param force force overwriting previous TileMatrix in ArrowFile +#' @param input An `ArchRProject` object or character vector of ArrowFiles. +#' @param features A `GRanges` object containing the regions (aka features) to use for counting insertions for each cell. +#' @param matrixName The name to be used for storage of the feature matrix in the provided `ArchRProject` or ArrowFiles. +#' @param ceiling The maximum counts per feature allowed. This is used to prevent large biases in feature counts. +#' @param binarize A boolean value indicating whether the feature matrix should be binarized prior to storage. This can be useful for downstream analyses when working with insertion counts. +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param force A boolean value indicating whether to force the matrix indicated by `matrixName` to be overwritten if it already exists in the given ArrowFiles. #' @export addFeatureMatrix <- function( input, @@ -63,14 +66,14 @@ addFeatureMatrix <- function( #' Add PeakMatrix to Arrows in ArchRProject #' #' This function for each sample will independently compute counts for each peak -#' per cell in the Arrow File +#' per cell in the provided ArchRProject using the "PeakMatrix". 
#' -#' @param ArchRProj ArchRProject -#' @param ceiling ceiling for the number of counts per feature -#' @param binarize binarize matrix -#' @param threads number of threads -#' @param parallelParam parallel parameters for batch style execution -#' @param force force overwriting previous TileMatrix in ArrowFile +#' @param ArchRProj An `ArchRProject` object. +#' @param ceiling The maximum counts per feature allowed. This is used to prevent large biases in peak counts. +#' @param binarize A boolean value indicating whether the feature matrix should be binarized prior to storage. This can be useful for downstream analyses when working with insertion counts. +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param force A boolean value indicating whether to force the matrix indicated by `matrixName` to be overwritten if it already exists in the given ArrowFiles. 
#' @export addPeakMatrix <- function( ArchRProj, @@ -86,6 +89,10 @@ addPeakMatrix <- function( stop("Adding a PeakMatrix is only for ArchRProject!") } + if(is.null(ArchRProj@peakSet)){ + stop("No peakSet found in ArchRProject!") + } + ArrowFiles <- getArrowFiles(ArchRProj) allCells <- rownames(getCellColData(ArchRProj)) outDir <- getOutputDirectory(ArchRProj) @@ -129,6 +136,7 @@ addPeakMatrix <- function( ){ ArrowFile <- ArrowFiles[i] + sampleName <- .sampleName(ArrowFile) o <- h5closeAll() @@ -206,7 +214,7 @@ addPeakMatrix <- function( o <- h5closeAll() chr <- uniqueChr[z] featurez <- features[BiocGenerics::which(seqnames(features)==chr)] - .messageDiffTime(sprintf("Adding %s for Chromosome %s of %s to Arrow File!", matrixName, z, length(uniqueChr)), tstart) + .messageDiffTime(sprintf("Adding %s to %s for Chr (%s of %s)!", sampleName, matrixName, z, length(uniqueChr)), tstart) #Read in Fragments fragments <- .getFragsFromArrow(ArrowFile, chr = chr, out = "IRanges", cellNames = cellNames) diff --git a/R/MatrixGeneScores.R b/R/MatrixGeneScores.R index 1148badd..2fa46079 100644 --- a/R/MatrixGeneScores.R +++ b/R/MatrixGeneScores.R @@ -1,25 +1,31 @@ +#################################################################### +# Gene Activity Score Methods +#################################################################### + #' Add GeneScoreMatrix to Arrows/ArchRProject #' #' This function for each sample will independently compute counts for each tile #' per cell and then infer gene activity scores. 
#' -#' @param input ArchRProject or ArrowFiles -#' @param genes genes as a GRanges object -#' @param geneModel gene model as a string for weighting peaks for gene score calculation (function of x) -#' @param upstream upstream the Gene Start to consider for calculation -#' @param downstream downstream the Gene Start to consider for calculation -#' @param tileSize tileSize for binning counts prior to gene score calculation -#' @param ceiling ceiling of read counts per tile (prevent huge biases) -#' @param scaleTo scale gene scores to -#' @param excludeChr exclude chromosomes from this analysis -#' @param blacklist blacklist GRanges used to remove tiles prior to calculation -#' @param threads number of threads -#' @param parallelParam parallel parameters for batch style execution -#' @param force force overwriting previous TileMatrix in ArrowFile +#' @param input An `ArchRProject` object or character vector of ArrowFiles. +#' @param genes A `GRanges` of all genes coordinates start to end (stranded). +#' @param geneModel A gene model function (function of x describing stranded distance from gene promoter f(x)) as a string for weighting peaks for gene score calculation. +#' @param matrixName The name to be used for storage of the gene activity score matrix in the provided `ArchRProject` or ArrowFiles. +#' @param upstream The number of basepairs upstream of the transcription start site to consider for gene activity score calculation. +#' @param downstream The number of basepairs downstream of the transcription start site to consider for gene activity score calculation. +#' @param tileSize The size of the tiles used for binning counts prior to gene activity score calculation. +#' @param ceiling The maximum counts per tile allowed. This is used to prevent large biases in tile counts. +#' @param useGeneBoundaries A boolean value indicating whether gene boundaries should be employed during gene activity score calculation. 
Gene boundaries refers to the process of preventing tiles from contributing to the gene score of a given gene if there is a second gene's transcription start site between the tile and the gene of interest. +#' @param scaleTo A numeric value indicating what to scale the computed geneScores to across all cells. +#' @param excludeChr A character vector containing the `seqnames` of the chromosomes that should be excluded from this analysis. +#' @param blacklist A `GRanges` object containing genomic regions to blacklist for geneScore biases. +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param force A boolean value indicating whether to force the matrix indicated by `matrixName` to be overwritten if it already exist in the given ArrowFiles. #' @export addGeneScoreMatrix <- function( input = NULL, - genes = NULL, + genes = ifelse(inherits(input, "ArchRProject"), getGenes(input), NULL), geneModel = "max(exp(-abs(x)/5000), exp(-1))", matrixName = "GeneScoreMatrix", upstream = c(5000, 100000), @@ -29,7 +35,7 @@ addGeneScoreMatrix <- function( useGeneBoundaries = TRUE, scaleTo = 10000, excludeChr = c("chrY","chrM"), - blacklist = NULL, + blacklist = ifelse(inherits(input, "ArchRProject"), getBlacklist(input), NULL), threads = 1, parallelParam = NULL, force = FALSE, @@ -101,6 +107,7 @@ addGeneScoreMatrix <- function( ){ ArrowFile <- ArrowFiles[i] + sampleName <- .sampleName(ArrowFile) if(is.null(tmpFile)){ tmpFile <- .tempfile(pattern = paste0("tmp-", .sampleName(ArrowFile))) @@ -158,7 +165,7 @@ addGeneScoreMatrix <- function( geneStarti <- geneStart[[z]] geneStarti <- geneStarti[order(geneStarti$idx)] chri <- paste0(unique(seqnames(geneStarti))) - .messageDiffTime(sprintf("Creating Temporary Gene Score Matrix for Chromosome %s of %s!", z, length(geneStart)), tstart) + .messageDiffTime(sprintf("Creating Temp GeneScoreMatrix for %s, Chr 
(%s of %s)!", sampleName, z, length(geneStart)), tstart) #Read in Fragments frag <- .getFragsFromArrow(ArrowFile, chr = chri, out = "IRanges", cellNames = cellNames) @@ -207,11 +214,11 @@ addGeneScoreMatrix <- function( idx <- which(width(extenedGeneStart) < (min(upstream) + min(downstream))) - extenedGeneStart[idx] <- ranges(suppressWarnings(extendGRanges(geneStarti[idx], upstream = min(upstream), downstream = min(downstream)))) + extenedGeneStart[idx] <- ranges(suppressWarnings(extendGR(geneStarti[idx], upstream = min(upstream), downstream = min(downstream)))) }else{ - extenedGeneStart <- ranges(suppressWarnings(extendGRanges(geneStarti, upstream = max(upstream), downstream = max(downstream)))) + extenedGeneStart <- ranges(suppressWarnings(extendGR(geneStarti, upstream = max(upstream), downstream = max(downstream)))) } @@ -311,7 +318,7 @@ addGeneScoreMatrix <- function( #Get Chromosome chri <- paste0(unique(seqnames(geneStart[[z]]))) - .messageDiffTime(sprintf("Adding Normalized Gene Score Matrix for Chromosome %s of %s to Arrow File!", z, length(geneStart)), tstart) + .messageDiffTime(sprintf("Adding GeneScoreMatrix to %s for Chr (%s of %s)!", sampleName, z, length(geneStart)), tstart) #Re-Create Matrix for that chromosome! matGS <- readRDS(paste0(tmpFile, "-", chri, ".rds")) @@ -345,329 +352,3 @@ addGeneScoreMatrix <- function( } - - - -# #' Add GeneScoreMatrix to Arrows/ArchRProject -# #' -# #' This function for each sample will independently compute counts for each tile -# #' per cell and then infer gene activity scores. 
-# #' -# #' @param input ArchRProject or ArrowFiles -# #' @param genes genes as a GRanges object -# #' @param geneModel gene model as a string for weighting peaks for gene score calculation (function of x) -# #' @param upstream upstream the Gene Start to consider for calculation -# #' @param downstream downstream the Gene Start to consider for calculation -# #' @param tileSize tileSize for binning counts prior to gene score calculation -# #' @param ceiling ceiling of read counts per tile (prevent huge biases) -# #' @param scaleTo scale gene scores to -# #' @param excludeChr exclude chromosomes from this analysis -# #' @param blacklist blacklist GRanges used to remove tiles prior to calculation -# #' @param threads number of threads -# #' @param parallelParam parallel parameters for batch style execution -# #' @param force force overwriting previous TileMatrix in ArrowFile -# #' @export -# addGeneScoreMatrix <- function( -# input = NULL, -# genes = NULL, -# geneModel = "exp(-abs(x)/10000)", -# matrixName = "GeneScoreMatrix", -# upstream = 100000, -# downstream = 100000, -# tileSize = 500, -# ceiling = 4, -# scaleTo = 10000, -# excludeChr = c("chrY","chrM"), -# blacklist = NULL, -# threads = 1, -# parallelParam = NULL, -# force = FALSE, -# ... 
-# ){ - -# matrixName <- .isProtectedArray(matrixName, exclude = "GeneScoreMatrix") - -# if(inherits(input, "ArchRProject")){ -# ArrowFiles <- getArrowFiles(input) -# allCells <- rownames(getCellColData(input)) -# outDir <- getOutputDirectory(input) -# }else if(inherits(input, "character")){ -# outDir <- "" -# ArrowFiles <- input -# allCells <- NULL -# }else{ -# stop("Error Unrecognized Input!") -# } -# if(!all(file.exists(ArrowFiles))){ -# stop("Error Input Arrow Files do not all exist!") -# } - -# #Valid GRanges -# genes <- .validGRanges(genes) - -# #Add args to list -# args <- mget(names(formals()),sys.frame(sys.nframe()))#as.list(match.call()) -# args$ArrowFiles <- ArrowFiles -# args$allCells <- allCells -# args$X <- seq_along(ArrowFiles) -# args$FUN <- .addGeneScoreMat -# args$registryDir <- file.path(outDir, "GeneScoresRegistry") - -# #Run With Parallel or lapply -# outList <- .batchlapply(args) - -# if(inherits(input, "ArchRProject")){ - -# return(input) - -# }else{ - -# return(unlist(outList)) - -# } - -# } - -# .addGeneScoreMat <- function( -# i, -# ArrowFiles, -# genes, -# matrixName = "GeneScoreMatrix", -# cellNames = NULL, -# allCells = NULL, -# upstream = 100000, -# downstream = 100000, -# scaleTo = 10000, -# tileSize = 200, -# ceiling = 4, -# blacklist = NULL, -# geneModel = "exp(-abs(x)/10000)", -# excludeChr = c("chrY","chrM"), -# force = FALSE, -# tmpFile = NULL, -# ... 
-# ){ - -# ArrowFile <- ArrowFiles[i] - -# if(is.null(tmpFile)){ -# tmpFile <- .tempfile(pattern = paste0("tmp-", .sampleName(ArrowFile))) -# } - -# #Check -# if(!suppressMessages(h5createGroup(file = ArrowFile, matrixName))){ -# if(force){ -# o <- h5delete(file = ArrowFile, name = matrixName) -# o <- h5createGroup(ArrowFile, matrixName) -# }else{ -# stop(matrixName, " Already Exists!, set force = TRUE to override!") -# } -# } - -# o <- h5closeAll() - -# #Add Gene Index -# geneStart <- genes[BiocGenerics::which(seqnames(genes) %bcni% excludeChr)] -# geneStart <- sort(sortSeqlevels(geneStart)) -# seqlevels(geneStart) <- as.character(unique(seqnames(geneStart))) -# geneStart <- geneStart[!is.na(mcols(geneStart)$symbol)] -# geneStart <- resize(geneStart, 1, "start") -# geneStart <- split(geneStart, seqnames(geneStart)) -# geneStart <- lapply(geneStart, function(x){ -# mcols(x)$idx <- seq_along(x) -# return(x) -# }) - -# #Blacklist Split -# if(!is.null(blacklist)){ -# blacklist <- split(blacklist, seqnames(blacklist)) -# } - -# #Get all cell ids before constructing matrix -# if(is.null(cellNames)){ -# cellNames <- .availableCells(ArrowFile) -# } -# if(!is.null(allCells)){ -# cellNames <- cellNames[cellNames %in% allCells] -# } - -# tstart <- Sys.time() - -# totalGS <- rep(0, length(cellNames)) -# names(totalGS) <- cellNames - -# ######################################################################################################### -# #First we will write gene scores to a temporary path! rhdf5 delete doesnt actually delete the memory! 
-# ######################################################################################################### -# for(z in seq_along(geneStart)){ - -# #Get Gene Starts -# geneStarti <- geneStart[[z]] -# mcols(geneStarti)$idx <- seq_along(geneStarti) -# chri <- paste0(unique(seqnames(geneStarti))) -# .messageDiffTime(sprintf("Creating Temporary Gene Score Matrix for Chromosome %s of %s!", z, length(geneStart)), tstart) - -# #Read in Fragments -# frag <- .getFragsFromArrow(ArrowFile, chr = chri, out = "IRanges", cellNames = cellNames) -# fragSt <- trunc(start(frag)/tileSize) * tileSize -# fragEd <- trunc(end(frag)/tileSize) * tileSize -# fragBC <- rep(S4Vectors::match(mcols(frag)$RG, cellNames), 2) -# rm(frag) -# gc() - -# #Unique Inserts -# uniqIns <- sort(unique(c(fragSt,fragEd))) - -# #Construct tile by cell mat! -# matGS <- Matrix::sparseMatrix( -# i = match(c(fragSt, fragEd), uniqIns), -# j = as.vector(fragBC), -# x = rep(1, 2*length(fragSt)), -# dims = c(length(uniqIns), length(cellNames)) -# ) - -# if(!is.null(ceiling)){ -# matGS@x[matGS@x > ceiling] <- ceiling -# } - -# #Unique Tiles -# uniqueTiles <- IRanges(start = uniqIns, width = tileSize) - -# #Clean Memory -# rm(uniqIns, fragSt, fragEd, fragBC) -# gc() - -# #Time to Overlap Gene Windows -# extenedGeneStart <- ranges(suppressWarnings(extendGRanges(geneStarti, upstream = upstream, downstream = downstream))) #Warning if beyond chromosome this doesnt matter for this analysis -# tmp <- suppressWarnings(findOverlaps(extenedGeneStart, uniqueTiles)) -# x <- distance(ranges(geneStarti)[queryHits(tmp)], uniqueTiles[subjectHits(tmp)]) - -# #Determine Sign for Distance relative to strand -# isMinus <- BiocGenerics::which(strand(geneStarti) == "-") -# signDist <- sign(start(uniqueTiles)[subjectHits(tmp)] - start(ranges(geneStarti))[queryHits(tmp)]) -# signDist[isMinus] <- signDist[isMinus] * -1 - -# #Correct the orientation for the distance! 
-# x <- x * signDist - -# #Evaluate Input Model -# x <- eval(parse(text=geneModel)) - -# #Remove Blacklisted Tiles! -# if(!is.null(blacklist)){ -# blacklisti <- blacklist[[chri]] -# if(is.null(blacklisti) | length(blacklisti) > 0){ -# tilesBlacklist <- 1 * (!overlapsAny(uniqueTiles, ranges(blacklisti))) -# if(length(tilesBlacklist) > 0){ -# x <- x * tilesBlacklist[subjectHits(tmp)] #Multiply Such That All Blacklisted Tiles weight is now 0! -# } -# } -# } - -# #Clean Memory -# rm(isMinus, signDist, extenedGeneStart, uniqueTiles) -# gc() - -# #Creating Sparse Matrix -# tmp <- Matrix::sparseMatrix( -# i = queryHits(tmp), -# j = subjectHits(tmp), -# x = x, -# dims = c(length(geneStarti), nrow(matGS))) - -# #Calculate Gene Scores -# matGS <- tmp %*% matGS -# colnames(matGS) <- cellNames -# totalGS <- totalGS + Matrix::colSums(matGS) - -# #Save tmp file -# saveRDS(matGS, file = paste0(tmpFile, "-", chri, ".rds"), compress = FALSE) - -# #Clean Memory -# rm(matGS, tmp) -# gc() - -# } - - -# ######################################################################################################### -# #Organize info for ArchR Arrow -# ######################################################################################################### -# featureDF <- Reduce("c",geneStart) %>% -# {data.frame( -# row.names=NULL, -# seqnames=as.character(seqnames(.)), -# start=start(.), -# name=mcols(.)$symbol, -# idx=mcols(.)$idx, -# stringsAsFactors=FALSE)} - -# dfParams <- data.frame( -# upstream = upstream, -# downstream = downstream, -# scaleTo = scaleTo, -# tileSize = tileSize, -# ceiling = ceiling, -# geneModel = geneModel, -# stringsAsFactors=FALSE -# ) - -# ###################################### -# # Initialize SP Mat Group -# ###################################### -# o <- .initializeMat( -# ArrowFile = ArrowFile, -# Group = matrixName, -# Class = "double", -# cellNames = cellNames, -# params = dfParams, -# featureDF = featureDF, -# force = force -# ) - -# #Clean Memory -# 
rm(dfParams, featureDF, genes) -# gc() - -# #Normalize and add to Arrow File! -# for(z in seq_along(geneStart)){ - -# #Get Chromosome -# chri <- paste0(unique(seqnames(geneStart[[z]]))) - -# .messageDiffTime(sprintf("Adding Normalized Gene Score Matrix for Chromosome %s of %s to Arrow File!", z, length(geneStart)), tstart) - -# #Re-Create Matrix for that chromosome! -# matGS <- readRDS(paste0(tmpFile, "-", chri, ".rds")) -# file.remove(paste0(tmpFile, "-", chri, ".rds")) - -# #Normalize -# matGS@x <- as.numeric(scaleTo * matGS@x/rep.int(totalGS, Matrix::diff(matGS@p))) - -# #Round to Reduce Digits After Final Normalization -# matGS@x <- round(matGS@x, 2) -# matGS <- Matrix::drop0(matGS) - -# #Write sparseMatrix to Arrow File! -# o <- .addMatToArrow( -# mat = matGS, -# ArrowFile = ArrowFile, -# Group = paste0(matrixName, "/", chri), -# binarize = FALSE, -# addColSums = TRUE, -# addRowSums = TRUE -# ) -# gc() - -# #Clean Memory -# rm(matGS) -# gc() - -# } - -# return(ArrowFile) - -# } - - - diff --git a/R/MatrixTiles.R b/R/MatrixTiles.R index 724bed13..e0c03def 100644 --- a/R/MatrixTiles.R +++ b/R/MatrixTiles.R @@ -1,21 +1,26 @@ +#################################################################### +# Tile Matrix Methods +#################################################################### + #' Add TileMatrix to Arrows/ArchRProject #' #' This function for each sample will independently compute counts for each tile #' per cell in the Arrow File #' -#' @param input ArchRProject or ArrowFiles -#' @param chromSizes chromomosome sizes used for identifying number of tiles to count -#' @param tileSize size for each tile to break up each chromosome -#' @param binarize save as a Sparse.Binary.Matrix or Sparse.Integer.Matrix -#' @param excludeChr exclude chromosomes from this analysis -#' @param threads number of threads -#' @param parallelParam parallel parameters for batch style execution -#' @param force force overwriting previous TileMatrix in ArrowFile +#' @param input An 
`ArchRProject` object or character vector of ArrowFiles. +#' @param chromSizes A named numeric vector containing the chromosome names and lengths. The default behavior is to retrieve this from the `ArchRProject` using `ArchR::getChromSizes()`. +#' @param blacklist A `GRanges` object containing genomic regions to blacklist counting in these tiles. The default behavior is to retrieve this from the `ArchRProject` using `ArchR::getBlacklist()`. +#' @param tileSize The size of the tiles used for binning counts in the `TileMatrix`. +#' @param binarize A boolean value indicating whether the `TileMatrix` should be binarized prior to storage. +#' @param excludeChr A character vector containing the `seqnames` of the chromosomes that should be excluded from this analysis. +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param force A boolean value indicating whether to force the `TileMatrix` to be overwritten if it already exists in the given ArrowFiles. #' @export addTileMatrix <- function( - input, - chromSizes = getChromSizes(input), - blacklist = getBlacklist(input), + input = NULL, + chromSizes = ifelse(inherits(input, "ArchRProject"), chromSizes(input), NULL), + blacklist = ifelse(inherits(input, "ArchRProject"), getBlacklist(input), NULL), tileSize = 500, binarize = TRUE, excludeChr = c("chrM","chrY"), @@ -72,9 +77,11 @@ addTileMatrix <- function( blacklist = NULL, chromLengths = NULL, force = FALSE, - ...){ + ... 
+ ){ ArrowFile <- ArrowFiles[i] + sampleName <- .sampleName(ArrowFile) o <- h5closeAll() @@ -145,7 +152,7 @@ addTileMatrix <- function( o <- h5closeAll() chr <- names(chromLengths)[z] - .messageDiffTime(sprintf("Adding Tile Matrix for Chromosome %s of %s to Arrow File!", z, length(chromLengths)), tstart) + .messageDiffTime(sprintf("Adding TileMatrix to %s for Chr (%s of %s)!", sampleName, z, length(chromLengths)), tstart) #Read in Fragments fragments <- .getFragsFromArrow(ArrowFile, chr = chr, out = "IRanges", cellNames = cellNames) diff --git a/R/PeakAnnotation.R b/R/PeakAnnotation.R new file mode 100644 index 00000000..ad3a3fd4 --- /dev/null +++ b/R/PeakAnnotation.R @@ -0,0 +1,338 @@ + +########################################################################################## +# Annotation Methods +########################################################################################## + +#' Get peakAnnotation from an ArchRProject +#' +#' This function gets a peakAnnotation from a given ArchRProject. +#' +#' @param ArchRProj An `ArchRProject` object. +#' @param name The name of the peakAnnotation object (i.e. Motifs) to retrieve from the designated `ArchRProject`. +#' @param ... additional args +#' @export +getPeakAnnotation <- function(ArchRProj, name = NULL, ...){ + ArchRProj <- .validArchRProject(ArchRProj) + if(is.null(name)){ + name <- 1 + }else{ + if(name %ni% names(ArchRProj@peakAnnotation)){ + stop("Name is not in peakAnnotation!") + } + } + ArchRProj@peakAnnotation[[name]] +} + +#' Get peakAnnotation positions from an ArchRProject +#' +#' This function gets the peakAnnotation positions (i.e. Motifs) from a given ArchRProject. +#' +#' @param ArchRProj An `ArchRProject` object. +#' @param name The name of the peakAnnotation object (i.e. Motifs) to retrieve from the designated `ArchRProject`. +#' @param annoName The name of a specific annotation to subset within the peakAnnotation. +#' @param ... 
additional args +#' @export +getPositions <- function(ArchRProj, name = NULL, annoName = NULL, ...){ + ArchRProj <- .validArchRProject(ArchRProj) + if(is.null(name)){ + name <- 1 + }else{ + if(name %ni% names(ArchRProj@peakAnnotation)){ + stop("Name is not in peakAnnotation!") + } + } + anno <- ArchRProj@peakAnnotation[[name]] + idx <- grep("positions", names(anno), ignore.case=TRUE) + if(length(idx)==0){ + stop("peakAnnotation does not contain positions!") + } + positions <- readRDS(anno[[idx]]) + if(!is.null(annoName)){ + idx <- grep(annoName, names(positions), ignore.case=TRUE) + if(length(idx)==0){ + stop("Positons do not contain annoName!") + } + positions <- positions[idx] + } + positions +} + +#' Get peakAnnotation matches from an ArchRProject +#' +#' This function gets peakAnnotation matches from a given ArchRProject. +#' +#' @param ArchRProj An `ArchRProject` object. +#' @param name The name of the annotation object (i.e. Motifs) to retrieve from the designated `ArchRProject`. +#' @param annoName The name of a specific annotation to subset within the peakAnnotation. +#' @param ... additional args +#' @export +getMatches <- function(ArchRProj, name = NULL, annoName = NULL, ...){ + ArchRProj <- .validArchRProject(ArchRProj) + if(is.null(name)){ + name <- 1 + }else{ + if(name %ni% names(ArchRProj@peakAnnotation)){ + stop("Name is not in peakAnnotation!") + } + } + anno <- ArchRProj@peakAnnotation[[name]] + idx <- grep("matches", names(anno), ignore.case=TRUE) + if(length(idx)==0){ + stop("peakAnnotation does not contain positions!") + } + matches <- readRDS(anno[[idx]]) + if(!is.null(annoName)){ + idx <- grep(annoName, colnames(matches), ignore.case=TRUE) + if(length(idx)==0){ + stop("Matches do not contain annoName!") + } + matches <- matches[, idx, drop=FALSE] + } + matches +} + +#' Add motif annotations to an ArchRProject +#' +#' This function adds information about which peaks contain motifs to a given ArchRProject. 
For each peak, a binary value is stored indicating whether each motif is observed within the peak region. +#' +#' @param ArchRProj An `ArchRProject` object. +#' @param motifSet The motif set to be used for annotation. Options include: (i) "JASPAR2016", "JASPAR2018", "JASPAR2020" which gives the 2016, 2018 or 2020 version of JASPAR motifs or (ii) one of "cisbp", "encode", or "homer" which gives the corresponding motif sets from the chromVAR package. +#' @param name The name of peakAnnotations to be stored as in `ArchRProject` +#' @param species The name of the species relevant to the supplied `ArchRProject`. This is used for identifying which motif to be used from CisBP/JASPAR. By default, this function will attempt to guess the species based on the value from `getGenome()`. +#' @param collection If one of the JASPAR motif sets is used via `motifSet`, this parameter allows you to indicate the JASPAR collection to be used. Possible options include "CORE", etc. +#' @param cutOff The p-value cutoff to be used for motif search (see the `motifmatchr` package for more information). +#' @param w The width in basepairs to consider for motif matches (see the `motifmatchr` package for more information). +#' @param ... additional args +#' @export +addMotifAnnotations <- function( + ArchRProj = NULL, + motifSet = "cisbp", + name = "Motif", + species = NULL, + collection = "CORE", + cutOff = 5e-05, + w = 7, + ... 
+ ){ + + .requirePackage("motifmatchr", installInfo='BiocManager::install("motifmatchr")') + ArchRProj <- .validArchRProject(ArchRProj) + + if(grepl("JASPAR|CISBP", motifSet, ignore.case = TRUE) & is.null(species)){ + if(grepl("hg19",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){ + species <- "Homo sapiens" + } + if(grepl("hg38",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){ + species <- "Homo sapiens" + } + if(grepl("mm9",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){ + species <- "Mus musculus" + } + if(grepl("mm10",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){ + species <- "Mus musculus" + } + } + + ############################################################# + # Get PWM List adapted from chromVAR! + ############################################################# + tstart <- Sys.time() + .messageDiffTime(paste0("Gettting Motif Set, Species : ", species), tstart) + + if(tolower(motifSet)=="jaspar2020"){ + + .requirePackage("JASPAR2020",installInfo='BiocManager::install("JASPAR2020")') + args <- list(species = species, collection = collection, ...) + motifs <- TFBSTools::getMatrixSet(JASPAR2020::JASPAR2020, args) + obj <- .summarizeJASPARMotifs(motifs) + motifs <- obj$motifs + motifSummary <- obj$motifSummary + + }else if(tolower(motifSet)=="jaspar2016"){ + + .requirePackage("JASPAR2016",installInfo='BiocManager::install("JASPAR2018")') + args <- list(species = species, collection = collection, ...) + motifs <- TFBSTools::getMatrixSet(JASPAR2016::JASPAR2016, args) + obj <- .summarizeJASPARMotifs(motifs) + motifs <- obj$motifs + motifSummary <- obj$motifSummary + + }else if(tolower(motifSet)=="jaspar2016"){ + + .requirePackage("JASPAR2016",installInfo='BiocManager::install("JASPAR2018")') + args <- list(species = species, collection = collection, ...) 
+ motifs <- TFBSTools::getMatrixSet(JASPAR2016::JASPAR2016, args) + obj <- .summarizeJASPARMotifs(motifs) + motifs <- obj$motifs + motifSummary <- obj$motifSummary + + }else if(tolower(motifSet)=="cisbp"){ + + .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")') + if(tolower(species) == "mus musculus"){ + data("mouse_pwms_v2") + motifs <- mouse_pwms_v2 + obj <- .summarizeChromVARMotifs(motifs) + motifs <- obj$motifs + motifSummary <- obj$motifSummary + }else if(tolower(species) == "homo sapiens"){ + data("human_pwms_v2") + motifs <- human_pwms_v2 + obj <- .summarizeChromVARMotifs(motifs) + motifs <- obj$motifs + motifSummary <- obj$motifSummary + }else{ + stop("Species not recognized homo sapiens, mus musculus supported by CisBP!") + } + + }else if(tolower(motifSet)=="encode"){ + + .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")') + data("encode_pwms") + motifs <- encode_pwms + obj <- .summarizeChromVARMotifs(motifs) + motifs <- obj$motifs + motifSummary <- obj$motifSummary + + }else if(tolower(motifSet)=="homer"){ + + .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")') + data("homer_pwms") + motifs <- homer_pwms + obj <- .summarizeChromVARMotifs(motifs) + motifs <- obj$motifs + motifSummary <- obj$motifSummary + + }else{ + + stop("Error MotifSet Not Recognized!") + + } + + ############################################################# + # Get BSgenome Information! 
+ ############################################################# + genome <- ArchRProj@genomeAnnotation$genome + .requirePackage(genome) + BSgenome <- eval(parse(text = genome)) + BSgenome <- .validBSgenome(BSgenome) + + ############################################################# + # Calculate Motif Positions + ############################################################# + .messageDiffTime("Finding Motif Positions with motifmatchr!", tstart) + peakSet <- ArchRProj@peakSet + motifPositions <- motifmatchr::matchMotifs( + pwms = motifs, + subject = peakSet, + genome = BSgenome, + out = "positions", + p.cutoff = cutOff, + w = w + ) + + ############################################################# + # Motif Overlap Matrix + ############################################################# + .messageDiffTime("Creating Motif Overlap Matrix", tstart) + allPositions <- unlist(motifPositions) + overlapMotifs <- findOverlaps(peakSet, allPositions, ignore.strand=TRUE) + motifMat <- Matrix::sparseMatrix( + i = queryHits(overlapMotifs), + j = match(names(allPositions),names(motifPositions))[subjectHits(overlapMotifs)], + x = rep(TRUE, length(overlapMotifs)), + dims = c(length(peakSet), length(motifPositions)) + ) + colnames(motifMat) <- names(motifPositions) + motifMat <- SummarizedExperiment::SummarizedExperiment(assays=SimpleList(matches = motifMat), rowRanges = peakSet) + .messageDiffTime("Finished Getting Motif Info!", tstart) + + out <- SimpleList( + motifSummary = motifSummary, + motifMatches = motifMat, + motifPositions = motifPositions, + motifList = motifs, + date = Sys.Date() + ) + + dir.create(file.path(getOutputDirectory(ArchRProj), "Annotations"), showWarnings=FALSE) + savePositions <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Positions-In-Peaks.rds")) + saveMatches <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Matches-In-Peaks.rds")) + + ArchRProj@peakAnnotation[[name]]$Name <- name + 
ArchRProj@peakAnnotation[[name]]$motifs <- motifs + ArchRProj@peakAnnotation[[name]]$motifSummary <- motifSummary + ArchRProj@peakAnnotation[[name]]$Positions <- savePositions + ArchRProj@peakAnnotation[[name]]$Matches <- saveMatches + + saveRDS(out, file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-In-Peaks-Summary.rds")), compress = FALSE) + saveRDS(out$motifPositions, savePositions, compress = FALSE) + saveRDS(out$motifMatches, saveMatches, compress = FALSE) + + return(ArchRProj) + +} + +.summarizeJASPARMotifs <- function(motifs){ + + motifNames <- lapply(seq_along(motifs), function(x){ + namex <- make.names(motifs[[x]]@name) + if(substr(namex,nchar(namex),nchar(namex))=="."){ + namex <- substr(namex,1,nchar(namex)-1) + } + namex <- paste0(namex, "_", x) + namex + }) %>% unlist(.) + + motifDF <- lapply(seq_along(motifs), function(x){ + data.frame( + row.names = motifNames[x], + name = motifs[[x]]@name[[1]], + ID = motifs[[x]]@ID, + strand = motifs[[x]]@strand, + symbol = ifelse(!is.null(motifs[[x]]@tags$symbol[1]), motifs[[x]]@tags$symbol[1], NA) , + family = ifelse(!is.null(motifs[[x]]@tags$family[1]), motifs[[x]]@tags$family[1], NA), + alias = ifelse(!is.null(motifs[[x]]@tags$alias[1]), motifs[[x]]@tags$alias[1], NA), + stringsAsFactors = FALSE + ) + }) %>% Reduce("rbind", .) %>% DataFrame + + names(motifs) <- motifNames + + out <- list(motifs = motifs, motifSummary = motifDF) + + return(out) + +} + +.summarizeChromVARMotifs <- function(motifs){ + + motifNames <- lapply(seq_along(motifs), function(x){ + namex <- make.names(motifs[[x]]@name) + if(substr(namex,nchar(namex),nchar(namex))=="."){ + namex <- substr(namex,1,nchar(namex)-1) + } + namex <- paste0(namex, "_", x) + namex + }) %>% unlist(.) 
+ + motifDF <- lapply(seq_along(motifs), function(x){ + data.frame( + row.names = motifNames[x], + name = motifs[[x]]@name[[1]], + ID = motifs[[x]]@ID, + strand = motifs[[x]]@strand, + tags = motifs[[x]]@tags, + stringsAsFactors = FALSE + ) + }) %>% Reduce("rbind", .) %>% DataFrame + + names(motifs) <- motifNames + + out <- list(motifs = motifs, motifSummary = motifDF) + + return(out) + +} + + diff --git a/R/ReproduciblePeakSet.R b/R/ReproduciblePeakSet.R index a680bb9a..6b6f198d 100644 --- a/R/ReproduciblePeakSet.R +++ b/R/ReproduciblePeakSet.R @@ -1,29 +1,35 @@ -#' Add Reproducible Peak Set to ArchR Project +#################################################################### +# Peak Set Creation Methods +#################################################################### + +#' Add a Reproducible Peak Set to an ArchRProject #' -#' This function will get insertions from coverage files call peaks -#' and merge to get a Union Reproducible Peak Set +#' This function will get insertions from coverage files, call peaks, +#' and merge peaks to get a "Union Reproducible Peak Set" #' -#' @param ArchRProj ArchRProject -#' @param groupBy use groupings for peak calling matching group coverage files -#' @param reproducibility reproducibility for peak calling (string that is a function of n) -#' @param peaksPerCell number of peaks that can be identified per cell on average -#' @param excludeChr exclude chromosomes from peak calling -#' @param pathToMacs2 path to macs2 executable (see Macs2) -#' @param genomeSize genome size for peak calling (see Macs2) -#' @param shift shift of Tn5 insertions (<- | ) (see Macs2) -#' @param extsize extension of Tn5 insertions (|<- ->|) (see Macs2) -#' @param method significance method for Macs2 (see Macs2) -#' @param cutOff significance cutoff for Macs2 (see Macs2) -#' @param extendSummits extend peak summits for final fixed-width peaks -#' @param promoterDist promoter distance from TSS for annotating peaks -#' @param genomeAnno genome 
annotation for ArchRProject -#' @param geneAnno gene annotation for ArchRProject -#' @param additionalParams additional parameters to pass to Macs2 (see Macs2) -#' @param threads number of threads for parallel execution -#' @param parallelParam parallel parameters for batch style execution -#' @param force force creating peakSet if existed -#' @param verboseHeader verbose sections -#' @param verboseAll verbose sections and subsections +#' @param ArchRProj An `ArchRProject` object. +#' @param groupBy The name of the column in `cellColData` to use for grouping cells together for peak calling. +#' @param reproducibility A string that indicates how peak reproducibility should be handled. This string is dynamic and can be a function of `n` where `n` is the number of samples being assessed. For example, `reproducibility = "2"` means at least 2 samples must have a peak call at this locus and `reproducibility = "(n+1)/2"` means that the majority of samples must have a peak call at this locus. +#' @param peaksPerCell The limit of number of peaks that can be identified per cell (this is useful for controlling how many peaks can be called from low cell groups). +#' @param maxPeaks A numeric threshold for the maximum peaks to retain per group in `groupBy` in the union reproducible peak set. +#' @param minCells The minimum number of unique cells that was used to create the coverage files on which peaks are called. This is important to allow for exclusion of pseudo-bulk replicates derived from very low cell numbers. +#' @param excludeChr A character vector containing the `seqnames` of the chromosomes that should be excluded from peak calling. +#' @param pathToMacs2 The full path to the MACS2 executable. +#' @param genomeSize The genome size to be used for MACS2 peak calling (see MACS2 documentation). +#' @param shift The number of basepairs to shift each Tn5 insertion. 
When combined with `extsize` this allows you to create proper fragments, centered at the Tn5 insertion site, for use with MACS2 (see MACS2 documentation). +#' @param extsize The number of basepairs to extend the MACS2 fragment after `shift` has been applied. When combined with `shift` this allows you to create proper fragments, centered at the Tn5 insertion site, for use with MACS2 (see MACS2 documentation). +#' @param method The method to use for significance testing in MACS2. Options are "p" for p-value and "q" for q-value. When combined with `cutOff` this gives the method and significance threshold for peak calling (see MACS2 documentation). +#' @param cutOff The numeric significance cutoff for the testing method indicated by `method` (see MACS2 documentation). +#' @param extendSummits The number of basepairs to extend peak summits (in both directions) to obtain final fixed-width peaks. For example, `extendSummits = 250` will create 501-bp fixed-width peaks from the 1-bp summits. +#' @param promoterDist The maximum distance in basepairs from the peak summit to the nearest transcriptional start site to allow for a peak to be annotated as a "promoter" peak. +#' @param genomeAnno The genomeAnnotation (see createGenomeAnnotation) is used for downstream analyses for genome information such as nucleotide information (GC info) or chromosome sizes. +#' @param geneAnno The geneAnnotation (see createGeneAnnotation) is used for peak labeling as promoter etc. +#' @param additionalParams A string of additional parameters to pass to MACS2 (see MACS2 documentation). +#' @param threads The number of threads to be used for parallel computing. +#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param force A boolean value indicating whether to force the reproducible peak set to be overwritten if it already exists in the given `ArchRProject` peakSet. 
+#' @param verboseHeader A boolean value that determines whether standard output includes verbose sections. +#' @param verboseAll A boolean value that determines whether standard output includes verbose subsections. #' @param ... additional args #' @export addReproduciblePeakSet <- function( @@ -179,7 +185,7 @@ addReproduciblePeakSet <- function( #Construct Union Peak Set .messageDiffTime("Creating Union Peak Set!", tstart) unionPeaks <- Reduce("c",groupPeaks) - unionPeaks <- nonOverlappingGRanges(unionPeaks, by = "groupScoreQuantile", decreasing = TRUE) + unionPeaks <- nonOverlappingGR(unionPeaks, by = "groupScoreQuantile", decreasing = TRUE) #Summarize Output peakDF <- lapply(seq_along(groupPeaks), function(x){ @@ -197,7 +203,7 @@ addReproduciblePeakSet <- function( .messageDiffTime(sprintf("Finished Creating Union Peak Set (%s)!", length(unionPeaks)), tstart) - closeAllConnections() + suppressWarnings(sink()) return(ArchRProj) @@ -289,15 +295,18 @@ addReproduciblePeakSet <- function( grx <- subsetByOverlaps(grx, blacklist, invert = TRUE) #Not Overlapping Blacklist! grx$GroupReplicate <- paste0(summitNames[x]) grx - }) %>% Reduce("c", .) + }) + summits <- Reduce("c", as(summits, "GRangesList")) extendedSummits <- resize(summits, extendSummits * 2 + 1, "center") extendedSummits <- lapply(split(extendedSummits, extendedSummits$GroupReplicate), function(x){ - nonES <- nonOverlappingGRanges(x, by = "score", decreasing = TRUE) + nonES <- nonOverlappingGR(x, by = "score", decreasing = TRUE) nonES$replicateScoreQuantile <- round(.getQuantiles(nonES$score),3) nonES - }) %>% Reduce("c", .) 
- nonOverlapES <- nonOverlappingGRanges(extendedSummits, by = "replicateScoreQuantile", decreasing = TRUE) + }) + extendedSummits <- Reduce("c", as(extendedSummits, "GRangesList")) + + nonOverlapES <- nonOverlappingGR(extendedSummits, by = "replicateScoreQuantile", decreasing = TRUE) overlapMat <- lapply(split(extendedSummits, extendedSummits$GroupReplicate), function(x){ overlapsAny(nonOverlapES, x) @@ -402,6 +411,10 @@ addReproduciblePeakSet <- function( } +#' Find the installed location of the MACS2 executable +#' +#' This function attempts to find the path to the MACS2 executable by serting the path and python's pip. +#' #' @export findMacs2 <- function(){ diff --git a/R/Trajectory.R b/R/Trajectory.R index 145838b4..526823fd 100644 --- a/R/Trajectory.R +++ b/R/Trajectory.R @@ -1,29 +1,32 @@ +#################################################################### +# Trajectory Analysis Methods +#################################################################### + #' Add Supervised Trajectory to an ArchR Project #' #' This function will fit a supervised trajectory in a lower dimensional space that #' can then be used for downstream analyses. #' -#' @param ArchRProj ArchRProject -#' @param name name of fitted trajectory to be added in cellColData -#' @param trajectory trajectory of groups to constrain supervised fitting (in order) +#' @param ArchRProj An `ArchRProject` object. +#' @param name A string indicating the name of the fitted trajectory to be added in `cellColData`. 
+#' @param trajectory Supervised order of groups to constrain supervised fitting (ie c("Cluster1", "Cluster2", "Cluster3") ) #' @param groupBy initial group column in cellColData to constrain supervised fit -#' @param name name of column in cellColData or Feature in Array in Arrows -#' @param reducedDims name of reduced dimensions used for distance computation -#' @param preFilter pre filtering quantile for supervised trajectory fit -#' @param postFilter post filtering quantile for supervised trajectory fit -#' @param dof degrees of freedom -#' @param spar sparsity -#' @param force force addition into ArchRProject if the column name exists +#' @param reducedDims A string indicating the name of the `reducedDims` object from the `ArchRProject` that should be used for distance computation. +#' @param preQuantile Prior to supervised trajectory fitting, the quantile for filtering cells that are far (by euclidean distance) from cluster centers. +#' @param postQuantile Post supervised trajectory fitting, the quantile for determining the cutoff for cells not in the groups to be aligned to the trajectory. +#' @param dof The number of degrees of freedom to be used in spline fit (see smooth.spline) +#' @param spar The sparsity to be used in spline fit (see smooth.spline) +#' @param force A boolean value indicating whether to force the trajectory indicated by `name` to be overwritten if it already exists in the given `ArchRProject`. #' @param ... 
additional args #' @export addTrajectory <- function( - ArchRProj, + ArchRProj = NULL, name = "Trajectory", trajectory = NULL, groupBy = "Clusters", reducedDims = "IterativeLSI", - preFilter = 0.1, - postFilter = 0.1, + preQuantile = 0.1, + postQuantile = 0.1, dof = 250, spar = 1, force = FALSE, @@ -60,7 +63,7 @@ addTrajectory <- function( #Filter Distance matMeanx <- colMeans(matx) diffx <- sqrt(colSums((t(matx) - matMeanx)^2)) - idxKeep <- which(diffx <= quantile(diffx, 1 - preFilter)) + idxKeep <- which(diffx <= quantile(diffx, 1 - preQuantile)) #Filter list(mat = matx[idxKeep,,drop=FALSE], groups = groupsx[idxKeep]) @@ -123,7 +126,7 @@ addTrajectory <- function( knnDistQ <- .getQuantiles(knnDist[,1]) #Filter Outlier Cells to Trajectory for High Resolution - idxKeep <- which(knnDist[,1] <= quantile(knnDist[,1], 1 - postFilter)) + idxKeep <- which(knnDist[,1] <= quantile(knnDist[,1], 1 - postQuantile)) dfTrajectory <- DataFrame( row.names = rownames(mat), Distance = knnDist[, 1], @@ -182,15 +185,17 @@ addTrajectory <- function( #' This function will fit get a supervised trajectory from an ArchRProject and aggregate signal #' from a matrix and smooth across the trajectory #' -#' @param ArchRProj ArchRProject -#' @param name name of fitted trajectory in cellColData -#' @param useMatrix matrix to summarize across the trajectory -#' @param groupEvery group cells every x quantile -#' @param scaleTo scale summarized matrix to -#' @param log2Norm log2 normalize the scaled summarized matrix -#' @param threads number of threads to use -#' @param smooth smooth matrix row-wise? -#' @param smoothFormula smoothing formula for mgcv::gam +#' @param ArchRProj An `ArchRProject` object. +#' @param name A string indicating the name of the fitted trajectory in `cellColData` to retrieve from the given `ArchRProject`. +#' @param useMatrix The name of the data matrix from the `ArrowFiles` to get numerical values for each cell from. 
+#' @param varCutOff Variance Quantile Cutoff for identifying top variable features across trajectory. +#' @param maxFeatures The maximum number of features (ordered by variance) to consider from `useMatrix` when generating a trajectory. +#' @param groupEvery The number of sequential percentiles (0-100 every x percentile) to group together when generating a trajectory (similar to smoothing). +#' @param threads The number of threads to be used for parallel computing. +#' @param scaleTo Scale group data matrix to this value for normalization. +#' @param log2Norm A boolean value that indicates whether the summarized trajectory matrix should be log2 transformed. +#' @param smooth A boolean value indicating whether the summarized trajectory matrix should be smoothed in a row-wise fashion. +#' @param smoothFormula The smoothing formula to use in the generalized additive model. See the `formula` parameter in `mgcv::gam()`. #' @param ... additional args #' @export getTrajectory <- function( @@ -300,15 +305,15 @@ getTrajectory <- function( #' #' This function will plot a heatmap of the results from getTrajectory #' -#' @param seTrajectory Summarized Experiment result from markerFeatures -#' @param varCutOff variance cut off for selecting features varying across trajectory -#' @param scaleRows compute row z-scores on matrix -#' @param limits heatmap color limits -#' @param pal palette for heatmap, default will use solar_extra -#' @param labelMarkers label specific markers by name on heatmap (matches rownames of seTrajectory) -#' @param labelTop label the top features for each column in seTrajectory -#' @param labelRows label all rows -#' @param returnMat return final matrix that is used for plotting heatmap +#' @param seTrajectory A `SummarizedExperiment` object that results from `getTrajectory()`. +#' @param scaleRows A boolean value that indicates whether row-wise z-scores should be computed on matrix in `seTrajectory`. 
+#' @param limits A numeric vector of two numbers that represent the lower and upper color limits of the heatmap color scheme. +#' @param grepExclude A character vector or string that indicates the `rownames` or a specific pattern that identifies rownames from `seTrajectory` to be excluded from the heatmap. +#' @param pal A custom continuous palette (see paletteContinuous) used to override the continuous palette for the heatmap. +#' @param labelMarkers A character vector listing the `rownames` of `seTrajectory` that should be labeled on the side of the heatmap. +#' @param labelTop A boolean value that indicates whether the top features for each column in `seTrajectory` should be labeled on the side of the heatmap. +#' @param labelRows A boolean value that indicates whether all rows should be labeled on the side of the heatmap. +#' @param returnMat A boolean value that indicates whether the final heatmap matrix should be returned in lieu of plotting the actual heatmap. #' @param ... additional args #' @export trajectoryHeatmap <- function( @@ -387,28 +392,32 @@ trajectoryHeatmap <- function( } -#' Visualize Embedding from ArchR Project +#' Visualize a Trajectory from ArchR Project #' -#' This function will plot an embedding that was created from -#' computeEmbedding +#' This function will plot a trajectory that was created from addTrajectory onto an embedding. 
#' -#' @param ArchRProj ArchRProject -#' @param embedding embedding to visualize (see computeEmbedding) -#' @param colorBy colorBy cellColData or Arrays in Arrows (ie GeneScoreMatrix) -#' @param name name of column in cellColData or Feature in Array in Arrows -#' @param log2Norm log2 Normalize features if they are continuous -#' @param pal custom palette to use for plotting -#' @param size size of points in plot -#' @param rastr rastr points in plot -#' @param quantCut quantile cut of continuous features -#' @param quantHex quantile evaluation for each hex in geom_hex -#' @param discreteSet discrete palette for visualizing embedding -#' @param continuousSet continuous palette for visualizing embedding -#' @param randomize randomize points prior to plotting -#' @param keepAxis keep x and y axis for plot -#' @param baseSize base size for text in plot -#' @param plotAs how to plot (points vs hex) -#' @param plotParams additional params to pass to ggPoint/ggHex +#' @param ArchRProj An `ArchRProject` object. +#' @param embedding The name of the embedding to use to visualize the given `trajectory`. See `ArchR::addEmbedding()` for more information. +#' @param trajectory The name of the trajectory as a column in `cellColData` to plot. +#' @param colorBy A string indicating whether points in the plot should be colored by a column in cellColData ("cellColData") or by a data matrix in the ArrowFiles (i.e. "GeneScoreMatrix", "MotifMatrx", "PeakMatrix"). +#' @param name The name of the trajectory as a column in `cellColData` to plot. +#' @param log2Norm A boolean value indicating whether a log2 transformation should be performed on the values prior to plotting. +#' @param pal The name of a custom palette from `ArchRPalettes` to use for coloring cells. +#' @param size A number indicating the size of the points to plot if `plotAs` is set to "points". +#' @param rastr A boolean value that indicates whether the plot should be rasterized. 
This does not rasterize lines and labels, just the internal portions of the plot. +#' @param quantCut If this is not null, a quantile cut is performed to threshold the top and bottom of the distribution of numerical values. +#' This prevents skewed color scales caused by strong outliers. The format of this should be c(x,y) where x is the upper threshold and y is +#' the lower threshold. For example, quantCut = c(0.975,0.025) will take the top and bottom 2.5% of values and set them to the value of +#' the 97.5th and 2.5th percentile values respectively. +#' @param quantHex The quantile to evaluate for each hex in `geom_hex()`. +#' @param discreteSet The name of a discrete palette from `ArchRPalettes` for visualizing colorBy in the embedding. +#' @param continuousSet The name of a continuous palette from `ArchRPalettes` for visualizing colorBy in the embedding. +#' @param randomize A boolean value that indicates whether to randomize points prior to plotting to prevent cells from one cluster being present at the front of the plot. +#' @param keepAxis A boolean value that indicates whether the x and y axis ticks and labels should be plotted. +#' @param baseSize The base font size to use in the plot. +#' @param addArrow A boolean value that indicates whether to add a smoothed arrow in the embedding based on the aligned trajectory. +#' @param plotAs A string that indicates whether points ("points") should be plotted or a hexplot ("hex") should be plotted. +#' @param plotParams Additional parameters to pass to `ggPoint()` or `ggHex()`. #' @param ... 
additional args #' @export plotTrajectory <- function( @@ -439,7 +448,7 @@ plotTrajectory <- function( ############################## # Plot Helpers ############################## - .quantileCut <- function (x, lo = 0, hi = 0.975, rm0 = TRUE){ + .quantileCut0 <- function (x, lo = 0, hi = 0.975, rm0 = TRUE){ q <- quantile(x, probs = c(lo, hi), na.rm = TRUE) x[x < q[1]] <- q[1] x[x > q[2]] <- q[2] @@ -522,7 +531,7 @@ plotTrajectory <- function( plotParams$color[idxRemove] <- NA if(!plotParams$discrete){ - plotParams$color <- .quantileCut(plotParams$color, min(quantCut), max(quantCut)) + plotParams$color <- .quantileCut0(plotParams$color, min(quantCut), max(quantCut)) plotParams$pal <- paletteContinuous(set = plotParams$continuousSet) if(tolower(plotAs) == "hex" | tolower(plotAs) == "hexplot"){ plotParams$addPoints <- TRUE diff --git a/R/VisualizeData.R b/R/VisualizeData.R index 48bdeec9..3c015617 100644 --- a/R/VisualizeData.R +++ b/R/VisualizeData.R @@ -1,25 +1,32 @@ +#################################################################### +# Visualization Methods +#################################################################### + #' Visualize Embedding from ArchR Project #' #' This function will plot an embedding that was created from #' computeEmbedding #' -#' @param ArchRProj ArchRProject -#' @param embedding embedding to visualize (see computeEmbedding) -#' @param colorBy colorBy cellColData or Arrays in Arrows (ie GeneScoreMatrix) -#' @param name name of column in cellColData or Feature in Array in Arrows -#' @param log2Norm log2 Normalize features if they are continuous -#' @param pal custom palette to use for plotting -#' @param size size of points in plot -#' @param rastr rastr points in plot -#' @param quantCut quantile cut of continuous features -#' @param quantHex quantile evaluation for each hex in geom_hex -#' @param discreteSet discrete palette for visualizing embedding -#' @param continuousSet continuous palette for visualizing embedding -#' @param 
randomize randomize points prior to plotting -#' @param keepAxis keep x and y axis for plot -#' @param baseSize base size for text in plot -#' @param plotAs how to plot (points vs hex) -#' @param plotParams additional params to pass to ggPoint/ggHex +#' @param ArchRProj An `ArchRProject` object. +#' @param embedding The name of the embedding to plot. See `ArchR::computeEmbedding()` for more information. +#' @param colorBy A string indicating whether points in the plot should be colored by a column in cellColData ("cellColData") or by a data matrix in the ArrowFiles (i.e. "GeneScoreMatrix", "MotifMatrx", "PeakMatrix"). +#' @param name The name of the column in `cellColData` or the featureName in the data matrix. +#' @param log2Norm A boolean value indicating whether a log2 transformation should be performed on the values (if continuous) in plotting. +#' @param imputeWeights imputation weights for imputing numerical values for each cell as a linear combination of other cells values (see add/getImutationWeights). +#' @param pal A custom palette used to override discreteSet/continuousSet for coloring vector. +#' @param size A number indicating the size of the points to plot if `plotAs` is set to "points". +#' @param rastr A boolean value that indicates whether the plot should be rasterized with ggrastr. This does not rasterize lines and labels, just the internal portions of the plot. +#' @param quantCut If this is not null, a quantile cut is performed to threshold the top and bottom of the distribution of numerical values. +#' This prevents skewed color scales caused by strong outliers. The format of this should be c(x,y) where x is the upper threshold and y is +#' the lower threshold. For example, quantileCut = c(0.975,0.025) will take the top and bottom 2.5% of values and set them to the value of +#' the 97.5th and 2.5th percentile values respectively. +#' @param discreteSet The name of a discrete palette from `ArchRPalettes` for visualizing colorBy in the embedding. 
+#' @param continuousSet The name of a continuous palette from `ArchRPalettes` for visualizing colorBy in the embedding. +#' @param randomize A boolean value that indicates whether to randomize points prior to plotting to prevent cells from one cluster being present at the front of the plot. +#' @param keepAxis A boolean value that indicates whether the x and y axis ticks and labels should be plotted. +#' @param baseSize The base font size to use in the plot. +#' @param plotAs A string that indicates whether points ("points") should be plotted or a hexplot ("hex") should be plotted. +#' @param plotParams Additional parameters to pass to `ggPoint()` or `ggHex()`. #' @param ... additional args #' @export plotEmbedding <- function( @@ -179,19 +186,18 @@ plotEmbedding <- function( #' Visualize Groups from ArchR Project #' -#' This function will plot an embedding that was created from -#' computeEmbedding +#' This function will plot an embedding that was created from computeEmbedding #' -#' @param ArchRProj ArchRProject +#' @param ArchRProj An `ArchRProject` object. #' @param groupBy use groupings in cellColData for summarizing and plotting -#' @param colorBy colorBy cellColData or Arrays in Arrows (ie GeneScoreMatrix) -#' @param name name of column in cellColData or Feature in Array in Arrows -#' @param pal custom palette to use for plotting -#' @param ylim limits for features in plot -#' @param size size of points in ggplot -#' @param baseSize rastr points in ggplot -#' @param ratioYX ratio of Y axis to X axis -#' @param points add points to plot using quasirandom +#' @param colorBy A string indicating whether the numeric values in the violin plot should come from a column in cellColData ("cellColData") or from a data matrix in the ArrowFiles (e.g. "GeneScoreMatrix", "MotifMatrix", "PeakMatrix"). +#' @param name The name of the column in `cellColData` or the featureName in the data matrix. 
+#' @param pal A custom palette used to override discreteSet/continuousSet for coloring vector. +#' @param ylim A vector of two numeric values indicating the lower and upper bounds of the y-axis on the plot. +#' @param size The numeric size of the points to be plotted. +#' @param baseSize The base font size to use in the plot. +#' @param ratioYX The aspect ratio of the x and y axes on the plot. +#' @param points A boolean value that indicates whether points should be added to the plot using `geom_quasirandom()` #' @param ... additional args #' @export plotGroups <- function( @@ -467,36 +473,8 @@ plotGroups <- function( valueOnly = TRUE ) - # message(pw, " ", slw) - # message(ph* 0.1, " ", slh) - # message("\n") - } - # scaleBy <- 1 / max(c(slw/pw, 4 * slh/ph)) - - # gl$heights <- lapply(seq_along(gl$heights), function(x){ - # if(convertHeight(gl$heights[x], unitTo="in", valueOnly = TRUE) != 0){ - # unit(convertHeight(gl$heights[x], unitTo="in", valueOnly = TRUE) * scaleBy, "in") - # }else{ - # if(grepl("null", gl$heights[x])){ - # unit(as.numeric(gsub("null","",gl$heights[x])) * scaleBy, "null") - # } - # gl$heights[x] - # } - # }) %>% Reduce("unit.c", .) - - # gl$widths <- lapply(seq_along(gl$widths), function(x){ - # if(convertHeight(gl$widths[x], unitTo="in", valueOnly = TRUE) != 0){ - # unit(convertHeight(gl$widths[x], unitTo="in", valueOnly = TRUE) * scaleBy, "in") - # }else{ - # if(grepl("null", gl$widths[x])){ - # unit(as.numeric(gsub("null","",gl$widths[x])) * scaleBy, "null") - # } - # gl$widths[x] - # } - # }) %>% Reduce("unit.c", .) - p <- grid.arrange(g, gl, ncol=1, nrow=2, heights = unit.c(unit(sgh,"in"), unit(min(slh, 0.2 * pw), "in")), newpage = newPage diff --git a/README.md b/README.md index 150a1c93..4fd75327 100755 --- a/README.md +++ b/README.md @@ -1,48 +1,14 @@ -During Alpha release, see [this website](https://web.stanford.edu/~mcorces/ArchR/) for documentation. 
- -# ArchR ![](man/figures/ArchR_dartLogo_small.jpg) +# ArchR ![](ArchR_dartLogo_small.jpg) [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental) -ArchR is a full-featured R package for processing and analyzing single-cell ATAC-seq data. Its strength is speed and resource usage, making it possible to analyze 1 million cells in QQQ hours on a macbook pro laptop. It provides facile tools to do just about anything you would want to do with single-cell ATAC-seq data. For a more detailed description of the software, please see the [publication](https://greenleaf.stanford.edu/assets/pdf/) ([pdf](http://greenleaf.stanford.edu/assets/pdf/), [supplement](http://greenleaf.stanford.edu/assets/pdf/)) or the [vignettes](https://web.stanford.edu/~mcorces/ArchR/articles/index.html). - -# Installation of ArchR - -ArchR installation currently requires devtools. The following commands will use the Bioconductor BiocManager to install required dependences: - -```{r} -if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") -if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") - -devtools::install_github("GreenleafLab/ArchR", - auth_token = token, #Need a token to download (see personalized access tokens) - repos = BiocManager::repositories() -) -``` - -### Additional packages that are used from github -To complete installation, you also must maually install the following packages using these devtools commands: - -```{r} -# ggrastr is a package for plotting ggplots with rastr'd points which is super helpful for large UMAPs etc -# # You need to have Cairo installed for ggrastr to work -# # On Mac OSx you need to have XQuartz (https://www.xquartz.org/) -devtools::install_github('VPetukhov/ggrastr') - -# harmony is a package that can correct batch effects -devtools::install_github("immunogenomics/harmony") - -# presto is a package that has 
efficient tools for wilcoxon tests on sparseMatrices -devtools::install_github('immunogenomics/presto') -``` - -# ArchR Workflow -![ArchR Workflow](https://web.stanford.edu/~mcorces/ArchR_Workflow_v1.PNG) +ArchR is a full-featured R package for processing and analyzing single-cell ATAC-seq data. Its strength is speed and resource usage, making it possible to analyze 1 million cells in QQQ hours on a MacBook Pro laptop. It provides facile tools to do just about anything you would want to do with single-cell ATAC-seq data. For a more detailed description of the software, please see the [publication](https://greenleaf.stanford.edu/assets/pdf/) ([pdf](http://greenleaf.stanford.edu/assets/pdf/), [supplement](http://greenleaf.stanford.edu/assets/pdf/)) or the [vignettes](articles/index.html). # Documentation -Please see the navigation bar at the top of this page for links to [a brief ArchR tutorial](https://web.stanford.edu/~mcorces/ArchR/articles/ArchR.html) as well as detailed [vignettes and examples](https://web.stanford.edu/~mcorces/ArchR/articles/index.html) for each of the major ArchR analytical components. + +Please visit [the ArchR website](https://web.stanford.edu/~mcorces/ArchR/) for detailed installation and usage instructions. # Issues using ArchR? -If you find a bug, please report it on [Github](https://github.com/GreenleafLab/ArchR/issues). If you have questions about ArchR usage, please refer to the [publication](https://greenleaf.stanford.edu/assets/pdf/), the [vignettes](https://web.stanford.edu/~mcorces/ArchR/articles/index.html), or the [FAQ section](articles/faq.html). +If you find a bug, please report it on [GitHub](https://github.com/GreenleafLab/ArchR/issues). If you have questions about ArchR usage, please refer to the [publication](https://greenleaf.stanford.edu/assets/pdf/), the [vignettes](articles/index.html), or the [FAQ section](articles/Articles/faq.html). 
diff --git a/_pkgdown.yml b/_pkgdown.yml index 156acc6c..dd3bf184 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -5,11 +5,14 @@ template: authors: Jeffrey Granja: href: https://twitter.com/JeffreyGranja + Ryan Corces: + href: https://twitter.com/doctorcorces + navbar: left: - - text: Get Started - href: articles/ArchR.html + - text: Tutorial + href: articles/Articles/tutorial.html - text: Vignettes menu: - text: Doublet Removal @@ -20,19 +23,75 @@ navbar: href: articles/Articles/peakCalling.html - text: Dimensionality Reduction href: articles/Articles/dimReduction.html - - text: Gene Scores and Marker Genes + - text: Gene Activity Scores href: articles/Articles/geneScores.html + - text: Marker Feature Identification + href: articles/Articles/markerFeatures.html - text: Signal Track Ploting href: articles/Articles/signalTracks.html - text: TF Footprinting href: articles/Articles/footprinting.html + - text: Pseudobulk Replicate Generation + href: articles/Articles/pseudobulkGeneration.html - text: Gene and Genome Annotation href: articles/Articles/annotations.html - text: FAQ - href: articles/faq.html + href: articles/Articles/faq.html - text: Reference href: reference/index.html right: - text: ArchR on GitHub icon: fa-github href: https://github.com/GreenleafLab/ArchR + +reference: + - title: Arrow file creation + desc: Functions that help create / set up an ArrowFile + contents: + - createArrowFiles + - filterCells + - filterDoublets + - filterPlot + - nCells + - title: Add-ers + desc: Functions that add information to ArrowFiles or an ArchRProject + contents: + - starts_with("add") + - title: Get-ers + desc: Functions that get information from ArrowFiles or an ArchRProject + contents: + - starts_with("get") + - title: Plotting / aesthetic functions + desc: Functions for plotting in ArchR + contents: + - ArchRPalettes + - ArchRRegionTrack + - starts_with("gg") + - starts_with("palette") + - starts_with("plot") + - starts_with("theme") + - title: Helper functions + 
desc: Generic helper functions + contents: + - '%ni%' + - '%bcin%' + - '%bcni%' + - findMacs2 + - title: GRanges manipulation + desc: Functions to manipulate GRanges objects + contents: + - columnOverlaps + - constructGRanges + - extendGRanges + - keepFilteredChromosomes + - mergeGRanges + - nonOverlappingGRanges + - overlappingBP + - overlapsMany + - shuffleGRanges + - subsetSeqnames + - title: Marker feature / trajectory identification + desc: Functions to identify and visualize marker features and cellular trajectories + contents: + - starts_with("marker") + - trajectoryHeatmap diff --git a/docs/404.html b/docs/404.html index ad2f7d4f..70c16dd4 100644 --- a/docs/404.html +++ b/docs/404.html @@ -8,6 +8,13 @@ Page not found (404) • ArchR + + + + + + + @@ -34,6 +41,9 @@ + + + @@ -70,7 +80,7 @@ @@ -156,7 +182,7 @@

All vignettes