Skip to content

Commit

Permalink
feat: illumnia sample sheet parser
Browse files Browse the repository at this point in the history
  • Loading branch information
Ryan Routsong committed Dec 2, 2023
1 parent 8ae14bf commit b6a2d08
Showing 1 changed file with 126 additions and 0 deletions.
126 changes: 126 additions & 0 deletions scripts/ss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from csv import DictReader
from io import StringIO
from datetime import datetime
from operator import itemgetter
from dateutil import parser as dateparser


class IllumniaSampleSheet():
"""Class to parse illumnia sample sheet information according to the
sample sheet format specified by [Illumina](https://support-docs.illumina.com/SHARE/SampleSheetv2/Content/SHARE/SampleSheetv2/SampleSheetStructure.htm)
Properties:
pairedend(bool): True for paired end seqencing data, False for single end sequencing data
project(str): Title of the project for this sequencing run
run(str): Run Identifer this single sequencing run
"""

def __init__(self, samplesheet, end=None):
self.sheet = self.parse_sheet(samplesheet)
self.force_endedness = end

def parse_sheet(self, sheet):
sheet_sections = dict()
with open(sheet) as opensheet:
this_section = None
for _line in opensheet:
line = _line.strip()
first = _line.split(',')[0]
if first.startswith('[') and first.endswith(']'):
sheet_sections[first[1:-1]] = []
this_section = first[1:-1]
else:
sheet_sections[this_section].append(line)

if 'Header' in sheet_sections:
self.process_simple_section(sheet_sections['Header'])

if 'Settings' in sheet_sections:
self.process_simple_section(sheet_sections['Settings'])

if 'Reads' in sheet_sections:
self.process_simple_section(sheet_sections['Reads'])

assert 'Data' in sheet_sections, 'No sample data within this sample sheet'
self.process_csv_section(sheet_sections['Data'])


def process_simple_section(self, section):
"""Simple section processing for Illumnia sample sheet.
Objective:
Disgard rows of all commas, collect rows with single values into self attributes which
are
"""
for _line in section:
index = _line.split(',')[0]
if ' ' in index:
index = index.replace(' ', '_')
if "\n" in _line:
_line = _line.replace("\n", ' ')
rest = ','.join(_line.split(',')[1:])
if set(list(rest)) == ',':
continue
else:
second = _line.split(',')[1]
if index == 'Date':
setattr(self, index, dateparser.parse(second))
else:
setattr(self, index, second)
return

def process_csv_section(self, section):
section_io = StringIO("\n".join(section))
section_csv = DictReader(section_io, delimiter=',')
setattr(self, 'data', list(section_csv))


@property
def samples(self):
return getattr(self, 'data', None)

@property
def project(self):
sheet_projects = list(set(map(itemgetter('Sample_Project'), self.data)))
assert len(sheet_projects) == 1, 'Sample sheet containers multiple project, please limit to single project per sheet.'
return sheet_projects[0]

@property
def instrument(self):
return getattr(self, 'Instrument', None)

@property
def adapters(self):
r1 = getattr(self, 'Read01', None)
r2 = getattr(self, 'Read02', None)
return list(map(int, filter(None, [r1, r2])))

@property
def indexes(self):
i1 = getattr(self, 'Index01', None)
i2 = getattr(self, 'Index02', None)
return list(map(int, filter(None, [i1, i2])))

@property
def is_paired_end(self):
if len(self.adapters) == 1:
return False
elif len(self.adapters) == 2:
return True
else:
raise ValueError('Unknown endedness from sample sheet')

@property
def is_single_end(self):
if len(self.adapters) == 1:
return True
elif len(self.adapters) == 2:
return False
else:
raise ValueError('Unknown endedness from sample sheet')




0 comments on commit b6a2d08

Please sign in to comment.