From bbacea4319c64e6c5cef5f9a023a2e90e3fdc72e Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Sun, 31 Jul 2022 11:46:41 -0700 Subject: [PATCH] [feat] Add an option to sort by sequence role in CollectAlternateContigNames --- .../fasta/CollectAlternateContigNames.scala | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/fasta/CollectAlternateContigNames.scala b/src/main/scala/com/fulcrumgenomics/fasta/CollectAlternateContigNames.scala index d94dff186..88dcca6ca 100644 --- a/src/main/scala/com/fulcrumgenomics/fasta/CollectAlternateContigNames.scala +++ b/src/main/scala/com/fulcrumgenomics/fasta/CollectAlternateContigNames.scala @@ -96,8 +96,9 @@ object SequenceRole extends FgBioEnum[SequenceRole] { SequenceRole(roleValue) } - case object AltScaffold extends SequenceRole { val key: String = "alt-scaffold" } + // NB: the order in which these are defined is important for when `--sort-by-sequence-role` is used case object AssembledMolecule extends SequenceRole { val key: String = "assembled-molecule" } + case object AltScaffold extends SequenceRole { val key: String = "alt-scaffold" } case object FixPatch extends SequenceRole { val key: String = "fix-patch" } case object NovelPatch extends SequenceRole { val key: String = "novel-patch" } case object UnlocalizedScaffold extends SequenceRole { val key: String = "unlocalized-scaffold" } @@ -131,9 +132,11 @@ class CollectAlternateContigNames @arg(flag='a', doc="The assembly report column(s) for the alternate contig name(s)", minElements=1) val alternates: Seq[AssemblyReportColumn], @arg(flag='s', doc="Only output sequences with the given sequence roles. If none given, all sequences will be output.", minElements=0) val sequenceRoles: Seq[SequenceRole] = Seq.empty, - @arg(flag='d', doc="Update an existing sequence dictionary file. The primary names must match.") val existing: Option[PathToSequenceDictionary] = None, + @arg(flag='d', doc="Update an existing sequence dictionary file. The primary names must match.", mutex=Array("sortBySequencingRole")) val existing: Option[PathToSequenceDictionary] = None, @arg(flag='x', doc="Allow mismatching sequence lengths when using an existing sequence dictionary file.") val allowMismatchingLengths: Boolean = false, - @arg(doc="Skip contigs that have no alternates") val skipMissingAlternates: Boolean = true + @arg(doc="Skip contigs that have no alternates") val skipMissingAlternates: Boolean = true, + @arg(doc="Sort by the sequencing role (only when not updating an existing sequence dictionary file). Uses the order from `--sequence-roles` if provided.", mutex=Array("existing")) + val sortBySequencingRole: Boolean = false, ) extends FgBioTool with LazyLogging { import com.fulcrumgenomics.fasta.{AssemblyReportColumn => Column} @@ -213,7 +216,18 @@ class CollectAlternateContigNames // Apply to an existing sequence dictionary if necessary val dict: SequenceDictionary = existing match { - case None => SequenceDictionary(metadatas.toSeq:_*) + case None => + if (!sortBySequencingRole) SequenceDictionary(metadatas.toSeq:_*) else { + // Use the user-provided roles, otherwise use all role + val roles = (if (sequenceRoles.nonEmpty) sequenceRoles.toIndexedSeq else SequenceRole.values) + .zipWithIndex + .map { case (role, index) => role.key -> index } + .toMap + val metas = metadatas.toSeq.sortBy { metadata => + metadata.get(Column.SequenceRole.tag).map { name => roles(name) }.getOrElse(roles.size) + }.zipWithIndex.map { case (metadata, index) => metadata.copy(index=index) } + SequenceDictionary(metas:_*) + } case Some(path) => val assemblyReportMetadataMap = metadatas.map { m => (m.name, m) }.toMap val updatedMetadatas = SequenceDictionary(path).map { existingMetadata =>