Skip to content

Commit

Permalink
Lamanno intron (#19)
Browse files Browse the repository at this point in the history
* update directory structure

* update lamanno intron generation

* ignore 5' and 3' utrs
  • Loading branch information
Lioscro authored Nov 1, 2019
1 parent b8c112b commit 34e15fd
Show file tree
Hide file tree
Showing 12 changed files with 146 additions and 56 deletions.
85 changes: 64 additions & 21 deletions kb_python/fasta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import re

from .gtf import GTF
from .utils import open_as_text
Expand All @@ -12,6 +13,8 @@ class FASTA:
:param fasta_path: path to FASTA file
:type fasta_path: str
"""
PARSER = re.compile(r'^>(?P<sequence_id>\S+)(?P<group>.*)')
GROUP_PARSER = re.compile(r'(?P<key>\S+?):(?P<value>\S+)')
BASEPAIRS = {
'a': 'T',
'A': 'T',
Expand Down Expand Up @@ -48,15 +51,22 @@ def make_header(seq_id, attributes):

@staticmethod
def parse_header(line):
"""Get the sequence ID from a FASTA header.
"""Parse information from a FASTA header.
:param line: FASTA header line
:type line: str
:return: sequence ID
:rtype: str
:return: parsed information
:rtype: dict
"""
return line.strip().split(' ', 1)[0][1:]
match = FASTA.PARSER.match(line)
if match:
groupdict = match.groupdict()
groupdict['group'] = dict(
FASTA.GROUP_PARSER.findall(groupdict.get('group', ''))
)
return groupdict
return None

@staticmethod
def reverse_complement(sequence):
Expand All @@ -77,20 +87,20 @@ def entries(self):
:rtype: generator
"""
with open_as_text(self.fasta_path, 'r') as f:
sequence_id = None
info = None
sequence = ''
for line in f:
if line.startswith('>'):
if sequence_id:
yield sequence_id, sequence
if info:
yield info, sequence
sequence = ''

sequence_id = FASTA.parse_header(line)
info = FASTA.parse_header(line)
else:
sequence += line.strip()

if sequence_id:
yield sequence_id, sequence
if info:
yield info, sequence

def sort(self, out_path):
"""Sort the FASTA file by sequence ID.
Expand All @@ -105,7 +115,9 @@ def sort(self, out_path):

while line:
if line.startswith('>'):
to_sort.append([FASTA.parse_header(line), position, None])
to_sort.append([
FASTA.parse_header(line)['sequence_id'], position, None
])

position = f.tell()
line = f.readline()
Expand Down Expand Up @@ -153,7 +165,8 @@ def generate_cdna_fasta(fasta_path, gtf_path, out_path):

with open_as_text(out_path, 'w') as f:
previous_gtf_entry = None
for sequence_id, sequence in fasta.entries():
for info, sequence in fasta.entries():
sequence_id = info['sequence_id']
logger.debug(
'Generating cDNA from chromosome {}'.format(sequence_id)
)
Expand Down Expand Up @@ -235,7 +248,7 @@ def generate_cdna_fasta(fasta_path, gtf_path, out_path):
return out_path


def generate_intron_fasta(fasta_path, gtf_path, out_path):
def generate_intron_fasta(fasta_path, gtf_path, out_path, flank=30):
"""Generate an intron FASTA using the genome and GTF.
This function assumes the order in which the chromosomes appear in the
Expand All @@ -245,13 +258,17 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
1. transcript - exons
2. 5' UTR
3. 3' UTR
Additionally, add `flank`-bp flanks (30 by default, i.e. k - 1 where k = 31) to
each intron, clipped to the transcript boundaries, and combine sections that
overlap into a single FASTA entry.
:param fasta_path: path to genomic FASTA file
:type fasta_path: str
:param gtf_path: path to GTF file
:type gtf_path: str
:param out_path: path to intron FASTA to generate
:type out_path: str
:param flank: the size of intron flanks, in bases, defaults to `30`
:type flank: int, optional
:return: path to generated intron FASTA
:rtype: str
Expand All @@ -262,7 +279,8 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):

with open_as_text(out_path, 'w') as f:
previous_gtf_entry = None
for sequence_id, sequence in fasta.entries():
for info, sequence in fasta.entries():
sequence_id = info['sequence_id']
logger.debug(
'Generating introns from chromosome {}'.format(sequence_id)
)
Expand Down Expand Up @@ -294,7 +312,6 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
transcript = '{}.{}'.format(
transcript_id, transcript_version
) if transcript_version else transcript_id
transcript += '-I'

transcript_exons.setdefault(transcript,
[]).append((start, end))
Expand All @@ -306,7 +323,6 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
transcript = '{}.{}'.format(
transcript_id, transcript_version
) if transcript_version else transcript_id
transcript += '-I'

gene_id = gtf_entry['group']['gene_id']
gene_version = gtf_entry['group'].get('gene_version', None)
Expand Down Expand Up @@ -354,14 +370,40 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
else:
introns.append(transcript_interval)

intron = ''
index = 1
flank_start = None
flank_end = None
for start, end in introns:
intron += sequence[start - 1:end]

if intron:
if flank_start is None:
flank_start = max(start - flank, transcript_interval[0])
if flank_end is None or start - flank <= flank_end:
flank_end = min(end + flank, transcript_interval[1])
else:
intron = sequence[flank_start - 1:flank_end]
f.write(
'{}\n'.format(
FASTA.make_header(
'{}-I.{}'.format(transcript, index),
attributes
)
)
)
f.write(
'{}\n'.format(
intron if dict(attributes)['strand'] ==
'+' else FASTA.reverse_complement(intron)
)
)
index += 1
flank_start = max(start - flank, transcript_interval[0])
flank_end = min(end + flank, transcript_interval[1])
if flank_start is not None and flank_end is not None:
intron = sequence[flank_start - 1:flank_end]
f.write(
'{}\n'.format(
FASTA.make_header(transcript, attributes)
FASTA.make_header(
'{}-I.{}'.format(transcript, index), attributes
)
)
)
f.write(
Expand All @@ -370,6 +412,7 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
'+' else FASTA.reverse_complement(intron)
)
)
index += 1

return out_path

Expand Down
4 changes: 1 addition & 3 deletions kb_python/gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,7 @@ def sort(self, out_path):
while line:
if not line.startswith('#') and not line.isspace():
entry = GTF.parse_entry(line)
if entry['feature'] in ('transcript', 'exon',
'five_prime_utr',
'three_prime_utr'):
if entry['feature'] in ('transcript', 'exon'):
to_sort.append(
(entry['seqname'], entry['start'], position)
)
Expand Down
31 changes: 28 additions & 3 deletions kb_python/ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,30 @@ def sort_fasta(fasta_path, out_path):
return out_path


def create_t2g_from_fasta(fasta_path, t2g_path):
    """Write a transcript-to-gene mapping built from FASTA headers.

    Every FASTA entry yields one tab-separated line holding the sequence ID,
    the `gene_id` header attribute and the `gene_name` header attribute (an
    empty string when the header carries no `gene_name`).

    :param fasta_path: path to FASTA file
    :type fasta_path: str
    :param t2g_path: path to output transcript-to-gene mapping
    :type t2g_path: str

    :return: dictionary containing path to generated t2g mapping
    :rtype: dict
    """
    logger.info('Creating transcript-to-gene mapping at {}'.format(t2g_path))
    with open_as_text(t2g_path, 'w') as out:
        for header, _sequence in FASTA(fasta_path).entries():
            seq_id = header['sequence_id']
            attributes = header['group']
            # gene_id is required in the header; gene_name is optional.
            row = (
                seq_id,
                attributes['gene_id'],
                attributes.get('gene_name', ''),
            )
            out.write('{}\t{}\t{}\n'.format(*row))
    return {'t2g': t2g_path}


def create_t2g_from_gtf(gtf_path, t2g_path, intron=False):
"""Creates a transcript-to-gene mapping from a GTF file.
Expand Down Expand Up @@ -114,7 +138,8 @@ def create_t2c(fasta_path, t2c_path):
"""
fasta = FASTA(fasta_path)
with open_as_text(t2c_path, 'w') as f:
for sequence_id, _ in fasta.entries():
for info, _ in fasta.entries():
sequence_id = info['sequence_id']
f.write('{}\n'.format(sequence_id))
return {'t2c': t2c_path}

Expand Down Expand Up @@ -284,8 +309,6 @@ def ref_lamanno(
:rtype: dict
"""
results = {}
t2g_result = create_t2g_from_gtf(gtf_path, t2g_path, intron=True)
results.update(t2g_result)
if not os.path.exists(index_path) or overwrite:
sorted_fasta_path = sort_fasta(
fasta_path, os.path.join(temp_dir, SORTED_FASTA_FILENAME)
Expand Down Expand Up @@ -321,6 +344,8 @@ def ref_lamanno(
out_path=os.path.join(temp_dir, COMBINED_FILENAME),
temp_dir=temp_dir
)
t2g_result = create_t2g_from_fasta(combined_path, t2g_path)
results.update(t2g_result)
index_result = kallisto_index(combined_path, index_path)
results.update(index_result)
else:
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/fasta/cdna_split.fa
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
>TRANSCRIPT_A.1 gene_id:GENE_A.1 gene_name:GENE_A_NAME chr:1 start:1 end:2 strand:+
CA
>TRANSCRIPT_B gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
TCGATC
TGATC
>TRANSCRIPT_C gene_id:GENE_C gene_name:GENE_C_NAME chr:2 start:2 end:14 strand:-
TAATCGA
10 changes: 6 additions & 4 deletions tests/fixtures/fasta/intron_split.fa
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
>TRANSCRIPT_B-I gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
AGTG
>TRANSCRIPT_C-I gene_id:GENE_C gene_name:GENE_C_NAME chr:2 start:2 end:14 strand:-
AGATCC
>TRANSCRIPT_B-I.1 gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
ATCGG
>TRANSCRIPT_B-I.2 gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
CTG
>TRANSCRIPT_C-I.1 gene_id:GENE_C gene_name:GENE_C_NAME chr:2 start:2 end:14 strand:-
CAGATCCG
2 changes: 1 addition & 1 deletion tests/fixtures/gtf/not_sorted.gtf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
2 havana exon 2 3 . - . gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
2 havana exon 10 14 . - . gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
2 havana transcript 2 14 . - . gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C"; ; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
2 havana exon 2 3 . + . gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
2 havana exon 2 2 . + . gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
2 havana exon 5 8 . + . gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
2 havana transcript 1 10 . + . gene_id "GENE_B"; gene_version "1"; transcript_id "TRANSCRIPT_B"; gene_name "GENE_B_NAME"; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
1 havana exon 1 2 . + . gene_id "GENE_A"; gene_version "1"; transcript_id "TRANSCRIPT_A"; transcript_version "1"; exon_number "1"; gene_name "GENE_A_NAME"; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; exon_id "ENSMUSE00001343744"; exon_version "1"; tag "basic"; transcript_support_level "NA";
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/gtf/sorted.gtf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
2 havana transcript 1 10 . + . gene_id "GENE_B"; gene_version "1"; transcript_id "TRANSCRIPT_B"; gene_name "GENE_B_NAME"; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
2 havana exon 2 3 . - . gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
2 havana transcript 2 14 . - . gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C"; ; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
2 havana exon 2 3 . + . gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
2 havana exon 2 2 . + . gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
2 havana exon 5 8 . + . gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
2 havana exon 10 14 . - . gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
9 changes: 3 additions & 6 deletions tests/fixtures/gtf/t2g_intron.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
TRANSCRIPT_C GENE_C GENE_C_NAME
TRANSCRIPT_C-I GENE_C GENE_C_NAME
TRANSCRIPT_B GENE_B.1 GENE_B_NAME
TRANSCRIPT_B-I GENE_B.1 GENE_B_NAME
TRANSCRIPT_A.1 GENE_A.1 GENE_A_NAME
TRANSCRIPT_A.1-I GENE_A.1 GENE_A_NAME
TRANSCRIPT_B-I.1 GENE_B.1 GENE_B_NAME
TRANSCRIPT_B-I.2 GENE_B.1 GENE_B_NAME
TRANSCRIPT_C-I.1 GENE_C GENE_C_NAME
6 changes: 6 additions & 0 deletions tests/fixtures/gtf/t2g_intron_gtf.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
TRANSCRIPT_C GENE_C GENE_C_NAME
TRANSCRIPT_C-I GENE_C GENE_C_NAME
TRANSCRIPT_B GENE_B.1 GENE_B_NAME
TRANSCRIPT_B-I GENE_B.1 GENE_B_NAME
TRANSCRIPT_A.1 GENE_A.1 GENE_A_NAME
TRANSCRIPT_A.1-I GENE_A.1 GENE_A_NAME
5 changes: 4 additions & 1 deletion tests/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@ def setUpClass(cls):
cls.unsorted_gtf_path = os.path.join(cls.gtf_dir, 'not_sorted.gtf')
cls.sorted_gtf_path = os.path.join(cls.gtf_dir, 'sorted.gtf')
cls.gtf_t2g_path = os.path.join(cls.gtf_dir, 't2g.txt')
cls.gtf_t2g_intron_path = os.path.join(cls.gtf_dir, 't2g_intron.txt')
cls.fasta_t2g_intron_path = os.path.join(cls.gtf_dir, 't2g_intron.txt')
cls.gtf_t2g_intron_path = os.path.join(
cls.gtf_dir, 't2g_intron_gtf.txt'
)

cls.fasta_dir = os.path.join(cls.fixtures_dir, 'fasta')
cls.unsorted_fasta_path = os.path.join(cls.fasta_dir, 'not_sorted.fa')
Expand Down
13 changes: 9 additions & 4 deletions tests/test_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ def test_make_header(self):
)

def test_parse_header(self):
header = '>transcript_id TEST'
self.assertEqual('transcript_id', fasta.FASTA.parse_header(header))
header = '>transcript_id TEST:testing'
self.assertEqual({
'sequence_id': 'transcript_id',
'group': {
'TEST': 'testing'
}
}, fasta.FASTA.parse_header(header))

def test_reverse_complement(self):
sequence = 'ATCG'
Expand All @@ -34,7 +39,7 @@ def test_entries(self):

def test_sort(self):
out_path = os.path.join(
tempfile.gettempdir(), '{}.gtf'.format(uuid.uuid4())
tempfile.gettempdir(), '{}.fa'.format(uuid.uuid4())
)
fa = fasta.FASTA(self.unsorted_fasta_path)
fa.sort(out_path)
Expand All @@ -59,7 +64,7 @@ def test_generate_intron_fasta(self):
tempfile.gettempdir(), '{}.fa'.format(uuid.uuid4())
)
fasta.generate_intron_fasta(
self.sorted_fasta_path, self.sorted_gtf_path, out_path
self.sorted_fasta_path, self.sorted_gtf_path, out_path, flank=1
)
with open(out_path, 'r') as f, open(self.split_intron_fasta_path,
'r') as split:
Expand Down
Loading

0 comments on commit 34e15fd

Please sign in to comment.