Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding files to do with SAM format #218

Closed
wants to merge 18 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Check SAM file problems
sophiemathias committed Sep 13, 2014
commit 3fbde7cd6872cb730734ce042f3075a052cc6bbd
2 changes: 1 addition & 1 deletion dark/alignments.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re

from dark.taxonomy import LineageFetcher
# from dark.taxonomy import LineageFetcher
from dark.filter import TitleFilter
from dark.score import HigherIsBetterScore

8 changes: 4 additions & 4 deletions dark/sam/alignments.py
Original file line number Diff line number Diff line change
@@ -24,7 +24,8 @@ class SamReadsAlignments(ReadsAlignments):
files.
"""
def __init__(self, reads, samFilename):
self.samFilename = samFilename
if checkSAMfile(samFilename):
self.samFilename = samFilename
self.reads = reads
# Prepare application parameters in order to initialize self.
self.head = self._convertSamHeaderToDict()
@@ -87,9 +88,8 @@ def _getReader(self, filename, applicationParams, scoreClass):
@param filename: The C{str} file name holding the SAM records.
@param scoreClass: A class to hold and compare scores (see scores.py).
"""
if checkSAMfile(filename):
return SAMRecordsReader(self.samFilename, self.head,
self.scoreClass)
return SAMRecordsReader(self.samFilename, self.head,
self.scoreClass)

def iter(self):
"""
2 changes: 1 addition & 1 deletion dark/sam/conversion.py
Original file line number Diff line number Diff line change
@@ -247,7 +247,7 @@ def _lineToHSP(self, line):
if 'read reverse strand' in explain_sam_flags(flag):
# What it says on the tin...
pass

# TODO: add in readStartInSubject etc.
hsp = HSP(score, readStart=readStart, readEnd=readEnd,
subjectStart=subjStart, subjectEnd=subjEnd)
return hsp
110 changes: 57 additions & 53 deletions dark/sam/hacks.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,63 @@
import subprocess as sp


def checkSAMfile(samFilename):
    """
    Check that the file given as a SAM file looks like a SAM file.

    Lines starting with '@' are counted as header lines. Every other line
    must split (on whitespace) into at least 11 fields, and the file must
    contain at least one header line.

    @param samFilename: a C{str} filename of the SAM file to check.
    @raise ValueError: if the file contains no lines at all.
    @raise AssertionError: if a non-header line has fewer than 11 fields,
        or if the file has no header lines.
    @return: C{True} if all checks pass.
    """
    headerLines = 0
    sawLine = False
    with open(samFilename) as samFile:
        for line in samFile:
            sawLine = True
            if line.startswith('@'):
                headerLines += 1
            elif len(line.split()) < 11:
                # Raised explicitly (not via an assert statement) so the
                # check is still performed when Python runs with -O.
                raise AssertionError(
                    'SAM file %s does not contain at least 11 fields.'
                    % samFilename)
    if not sawLine:
        # Iterating over a file never yields an empty string, so an empty
        # file can only be detected by the loop body never having run.
        raise ValueError('SAM file %s was empty.' % samFilename)
    if headerLines == 0:
        raise AssertionError('SAM file %s does not contain header.'
                             % samFilename)
    return True


def checkFASTAfile(fastaFilename):
    """
    Check that the file given as a FASTA file looks like a FASTA file.

    The first line must be a title (starting with '>'), and every title
    must be followed by at least one non-title line. Sequences may span
    several lines. A file with no lines at all is accepted.

    @param fastaFilename: a C{str} filename of the FASTA file to check.
    @raise AssertionError: if the first line is not a title, if two title
        lines are adjacent, or if the file ends with a title line.
    @return: C{True} if all checks pass.
    """
    # True while the most recent line was a title still awaiting sequence.
    titlePending = False
    with open(fastaFilename) as fastaFile:
        for lineNumber, line in enumerate(fastaFile):
            if line.startswith('>'):
                if titlePending:
                    # Two title lines in a row: the first has no sequence.
                    raise AssertionError('Invalid FASTA file format')
                titlePending = True
            elif lineNumber == 0:
                raise AssertionError(
                    'FASTA file does not begin with a title')
            else:
                titlePending = False
    if titlePending:
        # The file ended straight after a title line. Report this as a
        # format error instead of letting StopIteration escape.
        raise AssertionError('Invalid FASTA file format')
    return True


def checkFASTQfile(fastqFilename):
    """
    Check that the file given as a FASTQ file looks like a FASTQ file.

    The file is read as consecutive four-line entries: a header line
    starting with '@', a non-empty sequence line, a line starting with
    '+', and a non-empty quality line. A file with no lines at all is
    accepted.

    TODO: Finish. Sequence and quality lengths are not compared.

    @param fastqFilename: a C{str} filename of the FASTQ file to check.
    @raise AssertionError: if any entry is malformed or is cut short by
        the end of the file.
    @return: C{True} if all checks pass.
    """
    with open(fastqFilename) as fastqFile:
        lines = iter(fastqFile)
        for line in lines:
            # Drop the line ending so error messages do not end in one.
            header = line[1:].rstrip('\r\n')
            if not line.startswith('@'):
                raise AssertionError(
                    'Invalid header of entry: %s' % header)
            # next() is given a default so that an entry truncated by the
            # end of the file is reported as a format error rather than
            # leaking StopIteration. Lines are stripped because a line
            # read from a file always contains at least its newline.
            if not next(lines, '').strip():
                raise AssertionError(
                    'Empty raw sequence in entry: %s' % header)
            if not next(lines, '').startswith('+'):
                raise AssertionError(
                    'Invalid third line of entry: %s' % header)
            if not next(lines, '').strip():
                raise AssertionError(
                    'Empty quality score in entry: %s' % header)
    return True


def samSubtract(samFile, outFile):
"""
Takes a SAM file, makes a set of the seqids of unaligned sequences.
@@ -95,56 +152,3 @@ def findMD(samFile, fastaFile):
sp.Popen(['samtools', 'fillmd', '-S', samFile, fastaFile, '>',
samFileNew], stderr=sp.PIPE)
return samFileNew


def checkSAMfile(samFilename):
    """
    Check that the file given as a SAM file looks like a SAM file.

    Lines starting with '@' are counted as header lines. Every other line
    must split (on whitespace) into at least 11 fields, and the file must
    contain at least one header line.

    @param samFilename: a C{str} filename of the SAM file to check.
    @raise AssertionError: if a non-header line has fewer than 11 fields,
        or if the file has no header lines (which includes a file with no
        lines at all).
    @return: C{True} if all checks pass.
    """
    headerLines = 0
    with open(samFilename) as samFile:
        for line in samFile:
            if line.startswith('@'):
                headerLines += 1
            elif len(line.split()) < 11:
                # Raised explicitly (not via an assert statement) so the
                # check is still performed when Python runs with -O.
                raise AssertionError(
                    'SAM file does not contain at least 11 fields.')
    if headerLines == 0:
        raise AssertionError('SAM file does not contain header.')
    return True


def checkFASTAfile(fastaFilename):
    """
    Check that the file given as a FASTA file looks like a FASTA file.

    The first line must be a title (starting with '>'), and every title
    must be followed by at least one non-title line. Sequences may span
    several lines. A file with no lines at all is accepted.

    @param fastaFilename: a C{str} filename of the FASTA file to check.
    @raise AssertionError: if the first line is not a title, if two title
        lines are adjacent, or if the file ends with a title line.
    @return: C{True} if all checks pass.
    """
    # True while the most recent line was a title still awaiting sequence.
    titlePending = False
    with open(fastaFilename) as fastaFile:
        for lineNumber, line in enumerate(fastaFile):
            if line.startswith('>'):
                if titlePending:
                    # Two title lines in a row: the first has no sequence.
                    raise AssertionError('Invalid FASTA file format')
                titlePending = True
            elif lineNumber == 0:
                raise AssertionError(
                    'FASTA file does not begin with a title')
            else:
                titlePending = False
    if titlePending:
        # The file ended straight after a title line. Report this as a
        # format error instead of letting StopIteration escape.
        raise AssertionError('Invalid FASTA file format')
    return True


def checkFASTQfile(fastqFilename):
    """
    Check that the file given as a FASTQ file looks like a FASTQ file.

    The file is read as consecutive four-line entries: a header line
    starting with '@', a non-empty sequence line, a line starting with
    '+', and a non-empty quality line. A file with no lines at all is
    accepted.

    TODO: Finish. Sequence and quality lengths are not compared.

    @param fastqFilename: a C{str} filename of the FASTQ file to check.
    @raise AssertionError: if any entry is malformed or is cut short by
        the end of the file.
    @return: C{True} if all checks pass.
    """
    with open(fastqFilename) as fastqFile:
        lines = iter(fastqFile)
        for line in lines:
            # Drop the line ending so error messages do not end in one.
            header = line[1:].rstrip('\r\n')
            if not line.startswith('@'):
                raise AssertionError(
                    'Invalid header of entry: %s' % header)
            # next() is given a default so that an entry truncated by the
            # end of the file is reported as a format error rather than
            # leaking StopIteration. Lines are stripped because a line
            # read from a file always contains at least its newline.
            if not next(lines, '').strip():
                raise AssertionError(
                    'Empty raw sequence in entry: %s' % header)
            if not next(lines, '').startswith('+'):
                raise AssertionError(
                    'Invalid third line of entry: %s' % header)
            if not next(lines, '').strip():
                raise AssertionError(
                    'Empty quality score in entry: %s' % header)
    return True
6 changes: 3 additions & 3 deletions test/sam/test_alignments.py
Original file line number Diff line number Diff line change
@@ -47,7 +47,7 @@ def testEmptySAMInput(self):
mockOpener = mockOpen()
with patch('__builtin__.open', mockOpener, create=True):
reads = Reads()
error = "SAM file 'file.SAM' was empty."
error = "SAM file file.SAM was empty."
self.assertRaisesRegexp(
ValueError, error, SamReadsAlignments, reads, 'file.SAM')

@@ -60,9 +60,9 @@ def testNonSAMInput(self):
mockOpener = mockOpen(read_data='not SAM\n')
with patch('__builtin__.open', mockOpener, create=True):
reads = Reads()
error = ("No header lines in file.SAM")
error = "SAM file file.SAM does not contain at least 11 fields."
self.assertRaisesRegexp(
ValueError, error, SamReadsAlignments, reads, 'file.SAM')
AssertionError, error, SamReadsAlignments, reads, 'file.SAM')

def testApplicationParams(self):
"""