Lamanno intron (#19)

* Update directory structure
* Update lamanno intron generation
* Ignore 5' and 3' UTRs
pachterlab · Nov 1, 2019 · 34e15fd · 34e15fd
1 parent b8c112b
commit 34e15fd
Show file tree

Hide file tree

Showing 12 changed files with 146 additions and 56 deletions.
diff --git a/kb_python/fasta.py b/kb_python/fasta.py
@@ -1,4 +1,5 @@
 import logging
+import re
 
 from .gtf import GTF
 from .utils import open_as_text
@@ -12,6 +13,8 @@ class FASTA:
     :param fasta_path: path to FASTA file
     :type fasta_path: str
     """
+    PARSER = re.compile(r'^>(?P<sequence_id>\S+)(?P<group>.*)')
+    GROUP_PARSER = re.compile(r'(?P<key>\S+?):(?P<value>\S+)')
     BASEPAIRS = {
         'a': 'T',
         'A': 'T',
@@ -48,15 +51,22 @@ def make_header(seq_id, attributes):
 
     @staticmethod
     def parse_header(line):
-        """Get the sequence ID from a FASTA header.
+        """Parse information from a FASTA header.
 
         :param line: FASTA header line
         :type line: str
 
-        :return: sequence ID
-        :rtype: str
+        :return: parsed information
+        :rtype: dict
         """
-        return line.strip().split(' ', 1)[0][1:]
+        match = FASTA.PARSER.match(line)
+        if match:
+            groupdict = match.groupdict()
+            groupdict['group'] = dict(
+                FASTA.GROUP_PARSER.findall(groupdict.get('group', ''))
+            )
+            return groupdict
+        return None
 
     @staticmethod
     def reverse_complement(sequence):
@@ -77,20 +87,20 @@ def entries(self):
         :rtype: generator
         """
         with open_as_text(self.fasta_path, 'r') as f:
-            sequence_id = None
+            info = None
             sequence = ''
             for line in f:
                 if line.startswith('>'):
-                    if sequence_id:
-                        yield sequence_id, sequence
+                    if info:
+                        yield info, sequence
                         sequence = ''
 
-                    sequence_id = FASTA.parse_header(line)
+                    info = FASTA.parse_header(line)
                 else:
                     sequence += line.strip()
 
-            if sequence_id:
-                yield sequence_id, sequence
+            if info:
+                yield info, sequence
 
     def sort(self, out_path):
         """Sort the FASTA file by sequence ID.
@@ -105,7 +115,9 @@ def sort(self, out_path):
 
             while line:
                 if line.startswith('>'):
-                    to_sort.append([FASTA.parse_header(line), position, None])
+                    to_sort.append([
+                        FASTA.parse_header(line)['sequence_id'], position, None
+                    ])
 
                 position = f.tell()
                 line = f.readline()
@@ -153,7 +165,8 @@ def generate_cdna_fasta(fasta_path, gtf_path, out_path):
 
     with open_as_text(out_path, 'w') as f:
         previous_gtf_entry = None
-        for sequence_id, sequence in fasta.entries():
+        for info, sequence in fasta.entries():
+            sequence_id = info['sequence_id']
             logger.debug(
                 'Generating cDNA from chromosome {}'.format(sequence_id)
             )
@@ -235,7 +248,7 @@ def generate_cdna_fasta(fasta_path, gtf_path, out_path):
     return out_path
 
 
-def generate_intron_fasta(fasta_path, gtf_path, out_path):
+def generate_intron_fasta(fasta_path, gtf_path, out_path, flank=30):
     """Generate an intron FASTA using the genome and GTF.
 
     This function assumes the order in which the chromosomes appear in the
@@ -245,13 +258,17 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
     1. transcript - exons
     2. 5' UTR
     3. 3' UTR
+    Additionally, append `flank`-bp flanks (default 30 = k - 1 for k = 31) to
+    each intron, combining overlapping sections into a single FASTA entry.
 
     :param fasta_path: path to genomic FASTA file
     :type fasta_path: str
     :param gtf_path: path to GTF file
     :type gtf_path: str
     :param out_path: path to intron FASTA to generate
     :type out_path: str
+    :param flank: the size of intron flanks, in bases, defaults to `30`
+    :type flank: int, optional
 
     :return: path to generated intron FASTA
     :rtype: str
@@ -262,7 +279,8 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
 
     with open_as_text(out_path, 'w') as f:
         previous_gtf_entry = None
-        for sequence_id, sequence in fasta.entries():
+        for info, sequence in fasta.entries():
+            sequence_id = info['sequence_id']
             logger.debug(
                 'Generating introns from chromosome {}'.format(sequence_id)
             )
@@ -294,7 +312,6 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
                     transcript = '{}.{}'.format(
                         transcript_id, transcript_version
                     ) if transcript_version else transcript_id
-                    transcript += '-I'
 
                     transcript_exons.setdefault(transcript,
                                                 []).append((start, end))
@@ -306,7 +323,6 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
                     transcript = '{}.{}'.format(
                         transcript_id, transcript_version
                     ) if transcript_version else transcript_id
-                    transcript += '-I'
 
                     gene_id = gtf_entry['group']['gene_id']
                     gene_version = gtf_entry['group'].get('gene_version', None)
@@ -354,14 +370,40 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
                 else:
                     introns.append(transcript_interval)
 
-                intron = ''
+                index = 1
+                flank_start = None
+                flank_end = None
                 for start, end in introns:
-                    intron += sequence[start - 1:end]
-
-                if intron:
+                    if flank_start is None:
+                        flank_start = max(start - flank, transcript_interval[0])
+                    if flank_end is None or start - flank <= flank_end:
+                        flank_end = min(end + flank, transcript_interval[1])
+                    else:
+                        intron = sequence[flank_start - 1:flank_end]
+                        f.write(
+                            '{}\n'.format(
+                                FASTA.make_header(
+                                    '{}-I.{}'.format(transcript, index),
+                                    attributes
+                                )
+                            )
+                        )
+                        f.write(
+                            '{}\n'.format(
+                                intron if dict(attributes)['strand'] ==
+                                '+' else FASTA.reverse_complement(intron)
+                            )
+                        )
+                        index += 1
+                        flank_start = max(start - flank, transcript_interval[0])
+                        flank_end = min(end + flank, transcript_interval[1])
+                if flank_start is not None and flank_end is not None:
+                    intron = sequence[flank_start - 1:flank_end]
                     f.write(
                         '{}\n'.format(
-                            FASTA.make_header(transcript, attributes)
+                            FASTA.make_header(
+                                '{}-I.{}'.format(transcript, index), attributes
+                            )
                         )
                     )
                     f.write(
@@ -370,6 +412,7 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
                             '+' else FASTA.reverse_complement(intron)
                         )
                     )
+                    index += 1
 
     return out_path
 

diff --git a/kb_python/gtf.py b/kb_python/gtf.py
@@ -79,9 +79,7 @@ def sort(self, out_path):
             while line:
                 if not line.startswith('#') and not line.isspace():
                     entry = GTF.parse_entry(line)
-                    if entry['feature'] in ('transcript', 'exon',
-                                            'five_prime_utr',
-                                            'three_prime_utr'):
+                    if entry['feature'] in ('transcript', 'exon'):
                         to_sort.append(
                             (entry['seqname'], entry['start'], position)
                         )

diff --git a/kb_python/ref.py b/kb_python/ref.py
@@ -54,6 +54,30 @@ def sort_fasta(fasta_path, out_path):
     return out_path
 
 
+def create_t2g_from_fasta(fasta_path, t2g_path):
+    """Parse FASTA headers to get transcript-to-gene mapping.
+
+    :param fasta_path: path to FASTA file
+    :type fasta_path: str
+    :param t2g_path: path to output transcript-to-gene mapping
+    :type t2g_path: str
+
+    :return: dictionary containing path to generated t2g mapping
+    :rtype: dict
+    """
+    logger.info('Creating transcript-to-gene mapping at {}'.format(t2g_path))
+    with open_as_text(t2g_path, 'w') as f:
+        fasta = FASTA(fasta_path)
+        for info, _ in fasta.entries():
+            f.write(
+                '{}\t{}\t{}\n'.format(
+                    info['sequence_id'], info['group']['gene_id'],
+                    info['group'].get('gene_name', '')
+                )
+            )
+    return {'t2g': t2g_path}
+
+
 def create_t2g_from_gtf(gtf_path, t2g_path, intron=False):
     """Creates a transcript-to-gene mapping from a GTF file.
 
@@ -114,7 +138,8 @@ def create_t2c(fasta_path, t2c_path):
     """
     fasta = FASTA(fasta_path)
     with open_as_text(t2c_path, 'w') as f:
-        for sequence_id, _ in fasta.entries():
+        for info, _ in fasta.entries():
+            sequence_id = info['sequence_id']
             f.write('{}\n'.format(sequence_id))
     return {'t2c': t2c_path}
 
@@ -284,8 +309,6 @@ def ref_lamanno(
     :rtype: dict
     """
     results = {}
-    t2g_result = create_t2g_from_gtf(gtf_path, t2g_path, intron=True)
-    results.update(t2g_result)
     if not os.path.exists(index_path) or overwrite:
         sorted_fasta_path = sort_fasta(
             fasta_path, os.path.join(temp_dir, SORTED_FASTA_FILENAME)
@@ -321,6 +344,8 @@ def ref_lamanno(
             out_path=os.path.join(temp_dir, COMBINED_FILENAME),
             temp_dir=temp_dir
         )
+        t2g_result = create_t2g_from_fasta(combined_path, t2g_path)
+        results.update(t2g_result)
         index_result = kallisto_index(combined_path, index_path)
         results.update(index_result)
     else:

diff --git a/tests/fixtures/fasta/cdna_split.fa b/tests/fixtures/fasta/cdna_split.fa
@@ -1,6 +1,6 @@
 >TRANSCRIPT_A.1 gene_id:GENE_A.1 gene_name:GENE_A_NAME chr:1 start:1 end:2 strand:+
 CA
 >TRANSCRIPT_B gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
-TCGATC
+TGATC
 >TRANSCRIPT_C gene_id:GENE_C gene_name:GENE_C_NAME chr:2 start:2 end:14 strand:-
 TAATCGA
diff --git a/tests/fixtures/fasta/intron_split.fa b/tests/fixtures/fasta/intron_split.fa
@@ -1,4 +1,6 @@
->TRANSCRIPT_B-I gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
-AGTG
->TRANSCRIPT_C-I gene_id:GENE_C gene_name:GENE_C_NAME chr:2 start:2 end:14 strand:-
-AGATCC
+>TRANSCRIPT_B-I.1 gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
+ATCGG
+>TRANSCRIPT_B-I.2 gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
+CTG
+>TRANSCRIPT_C-I.1 gene_id:GENE_C gene_name:GENE_C_NAME chr:2 start:2 end:14 strand:-
+CAGATCCG
diff --git a/tests/fixtures/gtf/not_sorted.gtf b/tests/fixtures/gtf/not_sorted.gtf
@@ -2,7 +2,7 @@
 2	havana	exon	2	3	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
 2	havana	exon	10	14	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
 2	havana	transcript	2	14	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C"; ; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
-2	havana	exon	2	3	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
+2	havana	exon	2	2	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
 2	havana	exon	5	8	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
 2	havana	transcript	1	10	.	+	.	gene_id "GENE_B"; gene_version "1"; transcript_id "TRANSCRIPT_B"; gene_name "GENE_B_NAME"; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
 1	havana	exon	1	2	.	+	.	gene_id "GENE_A"; gene_version "1"; transcript_id "TRANSCRIPT_A"; transcript_version "1"; exon_number "1"; gene_name "GENE_A_NAME"; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; exon_id "ENSMUSE00001343744"; exon_version "1"; tag "basic"; transcript_support_level "NA";

diff --git a/tests/fixtures/gtf/sorted.gtf b/tests/fixtures/gtf/sorted.gtf
@@ -3,6 +3,6 @@
 2	havana	transcript	1	10	.	+	.	gene_id "GENE_B"; gene_version "1"; transcript_id "TRANSCRIPT_B"; gene_name "GENE_B_NAME"; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
 2	havana	exon	2	3	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
 2	havana	transcript	2	14	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C"; ; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
-2	havana	exon	2	3	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
+2	havana	exon	2	2	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
 2	havana	exon	5	8	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
 2	havana	exon	10	14	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
diff --git a/tests/fixtures/gtf/t2g_intron.txt b/tests/fixtures/gtf/t2g_intron.txt
@@ -1,6 +1,3 @@
-TRANSCRIPT_C	GENE_C	GENE_C_NAME
-TRANSCRIPT_C-I	GENE_C	GENE_C_NAME
-TRANSCRIPT_B	GENE_B.1	GENE_B_NAME
-TRANSCRIPT_B-I	GENE_B.1	GENE_B_NAME
-TRANSCRIPT_A.1	GENE_A.1	GENE_A_NAME
-TRANSCRIPT_A.1-I	GENE_A.1	GENE_A_NAME
+TRANSCRIPT_B-I.1	GENE_B.1	GENE_B_NAME
+TRANSCRIPT_B-I.2	GENE_B.1	GENE_B_NAME
+TRANSCRIPT_C-I.1	GENE_C	GENE_C_NAME
diff --git a/tests/fixtures/gtf/t2g_intron_gtf.txt b/tests/fixtures/gtf/t2g_intron_gtf.txt
@@ -0,0 +1,6 @@
+TRANSCRIPT_C	GENE_C	GENE_C_NAME
+TRANSCRIPT_C-I	GENE_C	GENE_C_NAME
+TRANSCRIPT_B	GENE_B.1	GENE_B_NAME
+TRANSCRIPT_B-I	GENE_B.1	GENE_B_NAME
+TRANSCRIPT_A.1	GENE_A.1	GENE_A_NAME
+TRANSCRIPT_A.1-I	GENE_A.1	GENE_A_NAME
diff --git a/tests/mixins.py b/tests/mixins.py
@@ -82,7 +82,10 @@ def setUpClass(cls):
         cls.unsorted_gtf_path = os.path.join(cls.gtf_dir, 'not_sorted.gtf')
         cls.sorted_gtf_path = os.path.join(cls.gtf_dir, 'sorted.gtf')
         cls.gtf_t2g_path = os.path.join(cls.gtf_dir, 't2g.txt')
-        cls.gtf_t2g_intron_path = os.path.join(cls.gtf_dir, 't2g_intron.txt')
+        cls.fasta_t2g_intron_path = os.path.join(cls.gtf_dir, 't2g_intron.txt')
+        cls.gtf_t2g_intron_path = os.path.join(
+            cls.gtf_dir, 't2g_intron_gtf.txt'
+        )
 
         cls.fasta_dir = os.path.join(cls.fixtures_dir, 'fasta')
         cls.unsorted_fasta_path = os.path.join(cls.fasta_dir, 'not_sorted.fa')

diff --git a/tests/test_fasta.py b/tests/test_fasta.py
@@ -21,8 +21,13 @@ def test_make_header(self):
         )
 
     def test_parse_header(self):
-        header = '>transcript_id TEST'
-        self.assertEqual('transcript_id', fasta.FASTA.parse_header(header))
+        header = '>transcript_id TEST:testing'
+        self.assertEqual({
+            'sequence_id': 'transcript_id',
+            'group': {
+                'TEST': 'testing'
+            }
+        }, fasta.FASTA.parse_header(header))
 
     def test_reverse_complement(self):
         sequence = 'ATCG'
@@ -34,7 +39,7 @@ def test_entries(self):
 
     def test_sort(self):
         out_path = os.path.join(
-            tempfile.gettempdir(), '{}.gtf'.format(uuid.uuid4())
+            tempfile.gettempdir(), '{}.fa'.format(uuid.uuid4())
         )
         fa = fasta.FASTA(self.unsorted_fasta_path)
         fa.sort(out_path)
@@ -59,7 +64,7 @@ def test_generate_intron_fasta(self):
             tempfile.gettempdir(), '{}.fa'.format(uuid.uuid4())
         )
         fasta.generate_intron_fasta(
-            self.sorted_fasta_path, self.sorted_gtf_path, out_path
+            self.sorted_fasta_path, self.sorted_gtf_path, out_path, flank=1
         )
         with open(out_path, 'r') as f, open(self.split_intron_fasta_path,
                                             'r') as split: