umi-tools compatibility
Hi Rory,

We've been using umis for our single-cell RNA-seq and really like it, and we have
made some modifications to it. We'll be putting the pipeline on GitHub and wanted
to ask whether you have any preference for how to put the umis branch up, or
whether you would perhaps include our changes in the master branch. I'll briefly
describe our pipeline and the additions to umis; a rough sketch of the
corresponding commands follows the list.
1. CEL-Seq2 reads
2. umis fastqtransform
3. umis cb_filter
4. umis sb_filter (new function to correct sample barcodes within nedit 1
of the true sample barcode)
5. umis mb_filter (new function to remove any reads whose UMI contains
non-ACGT bases, N bases in our case)
6. umis add_uid (new function to add a unique identifier (UID) tag to the
read name, consisting of the concatenation of SB, CB and MB; this is
needed to allow UMI-tools dedup on BAM files with multiple cells/samples)
7. align with STAR
8. count metafeatures with featureCounts and transfer the gene ID to the
BAM file with an XF:Z: tag (htseq can also do this but is slower)
9. UMI-tools dedup on a per-gene basis using the gene ID tag in the BAM
file
10. expression matrix from the BAM file. We're currently using our own
script to get the expression matrix. If you would make umis tagcount
compatible with this pipeline, I suggest having both the sample and the
cell barcode in the column headers instead of only the cell barcode, and
allowing tagcount to count the XF:Z: tags.
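
To make the umis steps concrete, here is a rough sketch of what steps 2-6 look
like on the command line. The transform JSON, barcode lists, file names and the
exact flags below are placeholders rather than our exact invocation:

# placeholders throughout: transform JSON, barcode lists and file names
umis fastqtransform transform_celseq2.json R1.fastq R2.fastq > transformed.fq
umis cb_filter --bc1 cell_barcodes.txt --nedit 1 transformed.fq > cb_filtered.fq
umis sb_filter --bc sample_barcodes.txt --nedit 1 cb_filtered.fq > sb_filtered.fq
umis mb_filter sb_filtered.fq > mb_filtered.fq
umis add_uid mb_filtered.fq > ready_for_alignment.fq
# read names now look like ...:CELL_<CB>:UMI_<MB>:SAMPLE_<SB>:UID_<SB><CB><MB>,
# which is what lets UMI-tools dedup handle BAM files with multiple
# cells/samples after STAR alignment and featureCounts tagging (steps 7-9)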
We also changed the barcodehash in umis to include N bases so that those also
get corrected.

There seems to be interest in such a pipeline, as there are quite a few
requests in the UMI-tools comment section on how to preprocess single-cell
reads. umis is ideal for this, which is why we modified it. The bcbio pipeline
currently only works with the pseudo-aligners, which don't work so well on UMI
end-tagged libraries, so I think an option to do traditional alignment/gene
counting in the bcbio pipeline would be useful (at least, we would have used it
if it had been available).

I'm mostly running experiments, so this is my first GitHub project and I'm not
really sure how best to branch umis. I think our preferred way would be to have
our changes in the umis master branch, and maybe a link on the bcbio scRNA-seq
page to our pipeline for people who would like to do traditional gene
alignment/counting. But let us know what you prefer; we could also just keep
the branch on the umis page or on ours.

https://github.com/MarinusVL/scRNApipe

The umis.py and barcodes.py in the pull request work on our CEL-Seq2 data, but
I think there may be some bugs for data in other formats (most likely data
without sample barcodes or with multiple cell barcodes).

Thanks, Marinus
MarinusVL committed May 3, 2017
1 parent 7fa0583 commit 15b879f
Showing 2 changed files with 126 additions and 3 deletions.
63 changes: 62 additions & 1 deletion umis/barcodes.py
@@ -42,6 +42,34 @@ def correcting_barcode_filter(chunk, bc1hash, bc2hash):
kept.append(read)
return kept

def exact_sample_filter2(chunk, barcodes):
    parser_re = re.compile('(.*):CELL_(.*):UMI_(.*):SAMPLE_(?P<SB>.*)\\n(.*)\\n\\+\\n(.*)\\n')
    kept = []
    for read in chunk:
        match = parser_re.search(read).groupdict()
        sample = match['SB']
        if sample not in barcodes:
            continue
        kept.append(read)
    return kept

def correcting_sample_filter2(chunk, barcodehash):
    parser_re = re.compile('(.*):CELL_(.*):UMI_(.*):SAMPLE_(?P<SB>.*)\\n(.*)\\n\\+\\n(.*)\\n')
    kept = []
    for read in chunk:
        match = parser_re.search(read).groupdict()
        sample = match['SB']
        barcodecorrected = barcodehash[sample]
        if not barcodecorrected:
            continue
        correctbc = barcodecorrected
        if correctbc == match['SB']:
            kept.append(read)
        else:
            read = read.replace("SAMPLE_" + match['SB'], "SAMPLE_" + correctbc)
            kept.append(read)
    return kept

def exact_sample_filter(read, barcodes):
    parser_re = re.compile('(.*):CELL_(.*):UMI_(.*):SAMPLE_(?P<SB>.*)\\n(.*)\\n\\+\\n(.*)\\n')
    match = parser_re.search(read).groupdict()
@@ -50,6 +78,32 @@ def exact_sample_filter(read, barcodes):
        return None
    return read

def umi_filter(chunk):
    parser_re = re.compile('(.*):CELL_(.*):UMI_(?P<MB>.*):SAMPLE_(.*)\\n(.*)\\n\\+\\n(.*)\\n')
    kept = []
    for read in chunk:
        match = parser_re.search(read).groupdict()
        MB = match['MB']
        if not acgt_match(MB):
            continue
        else:
            kept.append(read)
    return kept

def append_uids(chunk):
    parser_re = re.compile('(.*):CELL_(?P<CB>.*):UMI_(?P<MB>.*):SAMPLE_(?P<SB>.*)\\n(.*)\\n\\+\\n(.*)\\n')
    kept = []
    for read in chunk:
        match = parser_re.search(read).groupdict()
        CB = match['CB']
        MB = match['MB']
        SB = match['SB']
        sample = "SAMPLE_" + match['SB']
        idx = read.find(sample) + len(sample)
        read = read[:idx] + ":UID_" + SB + CB + MB + read[idx:]
        kept.append(read)
    return kept

def correcting_sample_filter(read, barcodehash):
    parser_re = re.compile('(.*):CELL_(.*):UMI_(.*):SAMPLE_(?P<SB>.*)\\n(.*)\\n\\+\\n(.*)\\n')
    match = parser_re.search(read).groupdict()
@@ -114,11 +168,18 @@ def generate_idx(maxlen, nedit):
    this covers all edits < nedit as well since some of the specified
    substitutions will not change the base
    """
    ALPHABET = ["A", "C", "G", "T"]
    ALPHABET = ["A", "C", "G", "T", "N"]
    indexlists = []
    ALPHABETS = [ALPHABET for x in range(nedit)]
    return list(itertools.product(itertools.combinations(range(maxlen), nedit),
                                  *ALPHABETS))

def acgt_match(string):
    """
    returns True if string consists only of "A", "C", "G", "T"
    """
    search = re.compile(r'[^ACGT]').search
    return not bool(search(string))

def mutate_string(string, tomutate):
    strlist = list(string)
66 changes: 64 additions & 2 deletions umis/umis.py
@@ -16,8 +16,8 @@
import toolz as tz

from .barcodes import (exact_barcode_filter, correcting_barcode_filter,
exact_sample_filter, correcting_sample_filter,
MutationHash)
exact_sample_filter, correcting_sample_filter, exact_sample_filter2
, correcting_sample_filter2, umi_filter, append_uids, MutationHash)
import numpy as np
import scipy.io, scipy.sparse

@@ -738,6 +738,65 @@ def cb_filter(fastq, bc1, bc2, cores, nedit):
        for read in chunk:
            sys.stdout.write(read)

@click.command()
@click.argument('fastq', type=click.File('r'))
@click.option('--bc', type=click.File('r'))
@click.option('--cores', default=1)
@click.option('--nedit', default=0)
def sb_filter(fastq, bc, cores, nedit):
    ''' Filters reads with non-matching sample barcodes
    Expects formatted fastq files.
    '''
    barcodes = set(sb.strip() for sb in bc)
    if nedit == 0:
        filter_sb = partial(exact_sample_filter2, barcodes=barcodes)
    else:
        barcodehash = MutationHash(barcodes, nedit)
        filter_sb = partial(correcting_sample_filter2, barcodehash=barcodehash)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, stream_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_sb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)

@click.command()
@click.argument('fastq', type=click.File('r'))
@click.option('--cores', default=1)
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.
    '''
    filter_mb = partial(umi_filter)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, stream_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_mb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)

@click.command()
@click.argument('fastq', type=click.File('r'))
@click.option('--cores', default=1)
def add_uid(fastq, cores):
    ''' Adds UID:[samplebc cellbc umi] to the read name for UMI-tools deduplication
    Expects formatted fastq files with correct sample and cell barcodes.
    '''

    uids = partial(append_uids)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, stream_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(uids, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)

def write_kallisto_chunk(out_dir, cb, chunk):
    fq_fn = os.path.join(out_dir, cb + ".fq")
    umi_fn = os.path.join(out_dir, cb + ".umi")
@@ -937,6 +996,9 @@ def umis():
umis.add_command(cb_histogram)
umis.add_command(umi_histogram)
umis.add_command(cb_filter)
umis.add_command(sb_filter)
umis.add_command(mb_filter)
umis.add_command(add_uid)
umis.add_command(kallisto)
umis.add_command(bamtag)
umis.add_command(demultiplex_samples)
