Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat] Add an option to sort by sequence role in CollectAlternateContigNames #868

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,9 @@ object SequenceRole extends FgBioEnum[SequenceRole] {
SequenceRole(roleValue)
}

case object AltScaffold extends SequenceRole { val key: String = "alt-scaffold" }
// NB: the order in which these are defined is important for when `--sort-by-sequence-role` is used
case object AssembledMolecule extends SequenceRole { val key: String = "assembled-molecule" }
case object AltScaffold extends SequenceRole { val key: String = "alt-scaffold" }
case object FixPatch extends SequenceRole { val key: String = "fix-patch" }
case object NovelPatch extends SequenceRole { val key: String = "novel-patch" }
case object UnlocalizedScaffold extends SequenceRole { val key: String = "unlocalized-scaffold" }
Expand Down Expand Up @@ -131,9 +132,11 @@ class CollectAlternateContigNames
@arg(flag='a', doc="The assembly report column(s) for the alternate contig name(s)", minElements=1) val alternates: Seq[AssemblyReportColumn],
@arg(flag='s', doc="Only output sequences with the given sequence roles. If none given, all sequences will be output.", minElements=0)
val sequenceRoles: Seq[SequenceRole] = Seq.empty,
@arg(flag='d', doc="Update an existing sequence dictionary file. The primary names must match.") val existing: Option[PathToSequenceDictionary] = None,
@arg(flag='d', doc="Update an existing sequence dictionary file. The primary names must match.", mutex=Array("sortBySequencingRole")) val existing: Option[PathToSequenceDictionary] = None,
@arg(flag='x', doc="Allow mismatching sequence lengths when using an existing sequence dictionary file.") val allowMismatchingLengths: Boolean = false,
@arg(doc="Skip contigs that have no alternates") val skipMissingAlternates: Boolean = true
@arg(doc="Skip contigs that have no alternates") val skipMissingAlternates: Boolean = true,
@arg(doc="Sort by the sequencing role (only when not updating an existing sequence dictionary file). Uses the order from `--sequence-roles` if provided.", mutex=Array("existing"))
val sortBySequencingRole: Boolean = false,
) extends FgBioTool with LazyLogging {

import com.fulcrumgenomics.fasta.{AssemblyReportColumn => Column}
Expand Down Expand Up @@ -213,7 +216,18 @@ class CollectAlternateContigNames

// Apply to an existing sequence dictionary if necessary
val dict: SequenceDictionary = existing match {
case None => SequenceDictionary(metadatas.toSeq:_*)
case None =>
if (!sortBySequencingRole) SequenceDictionary(metadatas.toSeq:_*) else {
// Use the user-provided roles, otherwise use all role
val roles = (if (sequenceRoles.nonEmpty) sequenceRoles.toIndexedSeq else SequenceRole.values)
.zipWithIndex
.map { case (role, index) => role.key -> index }
.toMap
val metas = metadatas.toSeq.sortBy { metadata =>
metadata.get(Column.SequenceRole.tag).map { name => roles(name) }.getOrElse(roles.size)
}.zipWithIndex.map { case (metadata, index) => metadata.copy(index=index) }
SequenceDictionary(metas:_*)
}
case Some(path) =>
val assemblyReportMetadataMap = metadatas.map { m => (m.name, m) }.toMap
val updatedMetadatas = SequenceDictionary(path).map { existingMetadata =>
Expand Down