From 34e15fdacedea3d4486c6a5e0a1b01321bbfbdc0 Mon Sep 17 00:00:00 2001
From: Joseph Min <kmin@caltech.edu>
Date: Thu, 31 Oct 2019 20:05:41 -0700
Subject: [PATCH] Lamanno intron (#19)

* update directory structure

* update lamanno intron generation

* ignore 5' and 3' utrs
---
 kb_python/fasta.py                    | 85 ++++++++++++++++++++-------
 kb_python/gtf.py                      |  4 +-
 kb_python/ref.py                      | 31 +++++++++-
 tests/fixtures/fasta/cdna_split.fa    |  2 +-
 tests/fixtures/fasta/intron_split.fa  | 10 ++--
 tests/fixtures/gtf/not_sorted.gtf     |  2 +-
 tests/fixtures/gtf/sorted.gtf         |  2 +-
 tests/fixtures/gtf/t2g_intron.txt     |  9 +--
 tests/fixtures/gtf/t2g_intron_gtf.txt |  6 ++
 tests/mixins.py                       |  5 +-
 tests/test_fasta.py                   | 13 ++--
 tests/test_ref.py                     | 33 +++++++----
 12 files changed, 146 insertions(+), 56 deletions(-)
 create mode 100644 tests/fixtures/gtf/t2g_intron_gtf.txt
diff --git a/kb_python/fasta.py b/kb_python/fasta.py
index 6d13099..33c51ff 100644
--- a/kb_python/fasta.py
+++ b/kb_python/fasta.py
@@ -1,4 +1,5 @@
 import logging
+import re
 
 from .gtf import GTF
 from .utils import open_as_text
@@ -12,6 +13,8 @@ class FASTA:
     :param fasta_path: path to FASTA file
     :type fasta_path: str
     """
+    PARSER = re.compile(r'^>(?P<sequence_id>\S+)(?P<group>.*)')
+    GROUP_PARSER = re.compile(r'(?P<key>\S+?):(?P<value>\S+)')
     BASEPAIRS = {
         'a': 'T',
         'A': 'T',
@@ -48,15 +51,22 @@ def make_header(seq_id, attributes):
 
     @staticmethod
     def parse_header(line):
-        """Get the sequence ID from a FASTA header.
+        """Parse information from a FASTA header.
 
         :param line: FASTA header line
         :type line: str
 
-        :return: sequence ID
-        :rtype: str
+        :return: parsed information
+        :rtype: dict
         """
-        return line.strip().split(' ', 1)[0][1:]
+        match = FASTA.PARSER.match(line)
+        if match:
+            groupdict = match.groupdict()
+            groupdict['group'] = dict(
+                FASTA.GROUP_PARSER.findall(groupdict.get('group', ''))
+            )
+            return groupdict
+        return None
 
     @staticmethod
     def reverse_complement(sequence):
@@ -77,20 +87,20 @@ def entries(self):
         :rtype: generator
         """
         with open_as_text(self.fasta_path, 'r') as f:
-            sequence_id = None
+            info = None
             sequence = ''
             for line in f:
                 if line.startswith('>'):
-                    if sequence_id:
-                        yield sequence_id, sequence
+                    if info:
+                        yield info, sequence
                         sequence = ''
 
-                    sequence_id = FASTA.parse_header(line)
+                    info = FASTA.parse_header(line)
                 else:
                     sequence += line.strip()
 
-            if sequence_id:
-                yield sequence_id, sequence
+            if info:
+                yield info, sequence
 
     def sort(self, out_path):
         """Sort the FASTA file by sequence ID.
@@ -105,7 +115,9 @@ def sort(self, out_path):
 
             while line:
                 if line.startswith('>'):
-                    to_sort.append([FASTA.parse_header(line), position, None])
+                    to_sort.append([
+                        FASTA.parse_header(line)['sequence_id'], position, None
+                    ])
 
                 position = f.tell()
                 line = f.readline()
@@ -153,7 +165,8 @@ def generate_cdna_fasta(fasta_path, gtf_path, out_path):
 
     with open_as_text(out_path, 'w') as f:
         previous_gtf_entry = None
-        for sequence_id, sequence in fasta.entries():
+        for info, sequence in fasta.entries():
+            sequence_id = info['sequence_id']
             logger.debug(
                 'Generating cDNA from chromosome {}'.format(sequence_id)
             )
@@ -235,7 +248,7 @@ def generate_cdna_fasta(fasta_path, gtf_path, out_path):
     return out_path
 
 
-def generate_intron_fasta(fasta_path, gtf_path, out_path):
+def generate_intron_fasta(fasta_path, gtf_path, out_path, flank=30):
     """Generate an intron FASTA using the genome and GTF.
 
     This function assumes the order in which the chromosomes appear in the
@@ -245,6 +258,8 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
     1. transcript - exons
     2. 5' UTR
     3. 3' UTR
+    Additionally, append `flank`-bp flanks (default 30, i.e. k - 1 for k = 31)
+    to each intron, combining sections that overlap into a single FASTA entry.
 
     :param fasta_path: path to genomic FASTA file
     :type fasta_path: str
@@ -252,6 +267,8 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
     :type gtf_path: str
     :param out_path: path to intron FASTA to generate
     :type out_path: str
+    :param flank: the size of intron flanks, in bases, defaults to `30`
+    :type flank: int, optional
 
     :return: path to generated intron FASTA
     :rtype: str
@@ -262,7 +279,8 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
 
     with open_as_text(out_path, 'w') as f:
         previous_gtf_entry = None
-        for sequence_id, sequence in fasta.entries():
+        for info, sequence in fasta.entries():
+            sequence_id = info['sequence_id']
             logger.debug(
                 'Generating introns from chromosome {}'.format(sequence_id)
             )
@@ -294,7 +312,6 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
                     transcript = '{}.{}'.format(
                         transcript_id, transcript_version
                     ) if transcript_version else transcript_id
-                    transcript += '-I'
 
                     transcript_exons.setdefault(transcript,
                                                 []).append((start, end))
@@ -306,7 +323,6 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
                     transcript = '{}.{}'.format(
                         transcript_id, transcript_version
                     ) if transcript_version else transcript_id
-                    transcript += '-I'
 
                     gene_id = gtf_entry['group']['gene_id']
                     gene_version = gtf_entry['group'].get('gene_version', None)
@@ -354,14 +370,40 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
                 else:
                     introns.append(transcript_interval)
 
-                intron = ''
+                index = 1
+                flank_start = None
+                flank_end = None
                 for start, end in introns:
-                    intron += sequence[start - 1:end]
-
-                if intron:
+                    if flank_start is None:
+                        flank_start = max(start - flank, transcript_interval[0])
+                    if flank_end is None or start - flank <= flank_end:
+                        flank_end = min(end + flank, transcript_interval[1])
+                    else:
+                        intron = sequence[flank_start - 1:flank_end]
+                        f.write(
+                            '{}\n'.format(
+                                FASTA.make_header(
+                                    '{}-I.{}'.format(transcript, index),
+                                    attributes
+                                )
+                            )
+                        )
+                        f.write(
+                            '{}\n'.format(
+                                intron if dict(attributes)['strand'] ==
+                                '+' else FASTA.reverse_complement(intron)
+                            )
+                        )
+                        index += 1
+                        flank_start = max(start - flank, transcript_interval[0])
+                        flank_end = min(end + flank, transcript_interval[1])
+                if flank_start is not None and flank_end is not None:
+                    intron = sequence[flank_start - 1:flank_end]
                     f.write(
                         '{}\n'.format(
-                            FASTA.make_header(transcript, attributes)
+                            FASTA.make_header(
+                                '{}-I.{}'.format(transcript, index), attributes
+                            )
                         )
                     )
                     f.write(
@@ -370,6 +412,7 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
                             '+' else FASTA.reverse_complement(intron)
                         )
                     )
+                    index += 1
 
     return out_path
 
diff --git a/kb_python/gtf.py b/kb_python/gtf.py
index f6a6ffa..7994421 100644
--- a/kb_python/gtf.py
+++ b/kb_python/gtf.py
@@ -79,9 +79,7 @@ def sort(self, out_path):
             while line:
                 if not line.startswith('#') and not line.isspace():
                     entry = GTF.parse_entry(line)
-                    if entry['feature'] in ('transcript', 'exon',
-                                            'five_prime_utr',
-                                            'three_prime_utr'):
+                    if entry['feature'] in ('transcript', 'exon'):
                         to_sort.append(
                             (entry['seqname'], entry['start'], position)
                         )
diff --git a/kb_python/ref.py b/kb_python/ref.py
index 5487b90..1b6bde4 100644
--- a/kb_python/ref.py
+++ b/kb_python/ref.py
@@ -54,6 +54,30 @@ def sort_fasta(fasta_path, out_path):
     return out_path
 
 
+def create_t2g_from_fasta(fasta_path, t2g_path):
+    """Parse FASTA headers to get transcript-to-gene mapping.
+
+    :param fasta_path: path to FASTA file
+    :type fasta_path: str
+    :param t2g_path: path to output transcript-to-gene mapping
+    :type t2g_path: str
+
+    :return: dictionary containing path to generated t2g mapping
+    :rtype: dict
+    """
+    logger.info('Creating transcript-to-gene mapping at {}'.format(t2g_path))
+    with open_as_text(t2g_path, 'w') as f:
+        fasta = FASTA(fasta_path)
+        for info, _ in fasta.entries():
+            f.write(
+                '{}\t{}\t{}\n'.format(
+                    info['sequence_id'], info['group']['gene_id'],
+                    info['group'].get('gene_name', '')
+                )
+            )
+    return {'t2g': t2g_path}
+
+
 def create_t2g_from_gtf(gtf_path, t2g_path, intron=False):
     """Creates a transcript-to-gene mapping from a GTF file.
 
@@ -114,7 +138,8 @@ def create_t2c(fasta_path, t2c_path):
     """
     fasta = FASTA(fasta_path)
     with open_as_text(t2c_path, 'w') as f:
-        for sequence_id, _ in fasta.entries():
+        for info, _ in fasta.entries():
+            sequence_id = info['sequence_id']
             f.write('{}\n'.format(sequence_id))
     return {'t2c': t2c_path}
 
@@ -284,8 +309,6 @@ def ref_lamanno(
     :rtype: dict
     """
     results = {}
-    t2g_result = create_t2g_from_gtf(gtf_path, t2g_path, intron=True)
-    results.update(t2g_result)
     if not os.path.exists(index_path) or overwrite:
         sorted_fasta_path = sort_fasta(
             fasta_path, os.path.join(temp_dir, SORTED_FASTA_FILENAME)
@@ -321,6 +344,8 @@ def ref_lamanno(
             out_path=os.path.join(temp_dir, COMBINED_FILENAME),
             temp_dir=temp_dir
         )
+        t2g_result = create_t2g_from_fasta(combined_path, t2g_path)
+        results.update(t2g_result)
         index_result = kallisto_index(combined_path, index_path)
         results.update(index_result)
     else:
diff --git a/tests/fixtures/fasta/cdna_split.fa b/tests/fixtures/fasta/cdna_split.fa
index 4bfd9b1..6d18966 100644
--- a/tests/fixtures/fasta/cdna_split.fa
+++ b/tests/fixtures/fasta/cdna_split.fa
@@ -1,6 +1,6 @@
 >TRANSCRIPT_A.1 gene_id:GENE_A.1 gene_name:GENE_A_NAME chr:1 start:1 end:2 strand:+
 CA
 >TRANSCRIPT_B gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
-TCGATC
+TGATC
 >TRANSCRIPT_C gene_id:GENE_C gene_name:GENE_C_NAME chr:2 start:2 end:14 strand:-
 TAATCGA
diff --git a/tests/fixtures/fasta/intron_split.fa b/tests/fixtures/fasta/intron_split.fa
index a62b3ce..88a1954 100644
--- a/tests/fixtures/fasta/intron_split.fa
+++ b/tests/fixtures/fasta/intron_split.fa
@@ -1,4 +1,6 @@
->TRANSCRIPT_B-I gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
-AGTG
->TRANSCRIPT_C-I gene_id:GENE_C gene_name:GENE_C_NAME chr:2 start:2 end:14 strand:-
-AGATCC
+>TRANSCRIPT_B-I.1 gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
+ATCGG
+>TRANSCRIPT_B-I.2 gene_id:GENE_B.1 gene_name:GENE_B_NAME chr:2 start:1 end:10 strand:+
+CTG
+>TRANSCRIPT_C-I.1 gene_id:GENE_C gene_name:GENE_C_NAME chr:2 start:2 end:14 strand:-
+CAGATCCG
diff --git a/tests/fixtures/gtf/not_sorted.gtf b/tests/fixtures/gtf/not_sorted.gtf
index fd8cbc7..74fafef 100644
--- a/tests/fixtures/gtf/not_sorted.gtf
+++ b/tests/fixtures/gtf/not_sorted.gtf
@@ -2,7 +2,7 @@
 2	havana	exon	2	3	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
 2	havana	exon	10	14	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
 2	havana	transcript	2	14	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C"; ; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
-2	havana	exon	2	3	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
+2	havana	exon	2	2	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
 2	havana	exon	5	8	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
 2	havana	transcript	1	10	.	+	.	gene_id "GENE_B"; gene_version "1"; transcript_id "TRANSCRIPT_B"; gene_name "GENE_B_NAME"; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
 1	havana	exon	1	2	.	+	.	gene_id "GENE_A"; gene_version "1"; transcript_id "TRANSCRIPT_A"; transcript_version "1"; exon_number "1"; gene_name "GENE_A_NAME"; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; exon_id "ENSMUSE00001343744"; exon_version "1"; tag "basic"; transcript_support_level "NA";
diff --git a/tests/fixtures/gtf/sorted.gtf b/tests/fixtures/gtf/sorted.gtf
index 245393b..b7ffa53 100644
--- a/tests/fixtures/gtf/sorted.gtf
+++ b/tests/fixtures/gtf/sorted.gtf
@@ -3,6 +3,6 @@
 2	havana	transcript	1	10	.	+	.	gene_id "GENE_B"; gene_version "1"; transcript_id "TRANSCRIPT_B"; gene_name "GENE_B_NAME"; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
 2	havana	exon	2	3	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
 2	havana	transcript	2	14	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C"; ; gene_source "havana"; gene_biotype "TEC"; transcript_name "4933401J01Rik-201"; transcript_source "havana"; transcript_biotype "TEC"; tag "basic"; transcript_support_level "NA";
-2	havana	exon	2	3	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
+2	havana	exon	2	2	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
 2	havana	exon	5	8	.	+	.	gene_id "GENE_B"; gene_version "1"; gene_name "GENE_B_NAME"; transcript_id "TRANSCRIPT_B", gene_source "havana"; gene_biotype "TEC";
 2	havana	exon	10	14	.	-	.	gene_id "GENE_C"; gene_name "GENE_C_NAME"; transcript_id "TRANSCRIPT_C", gene_source "havana"; gene_biotype "TEC";
diff --git a/tests/fixtures/gtf/t2g_intron.txt b/tests/fixtures/gtf/t2g_intron.txt
index 0e8fd14..564f681 100644
--- a/tests/fixtures/gtf/t2g_intron.txt
+++ b/tests/fixtures/gtf/t2g_intron.txt
@@ -1,6 +1,3 @@
-TRANSCRIPT_C	GENE_C	GENE_C_NAME
-TRANSCRIPT_C-I	GENE_C	GENE_C_NAME
-TRANSCRIPT_B	GENE_B.1	GENE_B_NAME
-TRANSCRIPT_B-I	GENE_B.1	GENE_B_NAME
-TRANSCRIPT_A.1	GENE_A.1	GENE_A_NAME
-TRANSCRIPT_A.1-I	GENE_A.1	GENE_A_NAME
+TRANSCRIPT_B-I.1	GENE_B.1	GENE_B_NAME
+TRANSCRIPT_B-I.2	GENE_B.1	GENE_B_NAME
+TRANSCRIPT_C-I.1	GENE_C	GENE_C_NAME
diff --git a/tests/fixtures/gtf/t2g_intron_gtf.txt b/tests/fixtures/gtf/t2g_intron_gtf.txt
new file mode 100644
index 0000000..0e8fd14
--- /dev/null
+++ b/tests/fixtures/gtf/t2g_intron_gtf.txt
@@ -0,0 +1,6 @@
+TRANSCRIPT_C	GENE_C	GENE_C_NAME
+TRANSCRIPT_C-I	GENE_C	GENE_C_NAME
+TRANSCRIPT_B	GENE_B.1	GENE_B_NAME
+TRANSCRIPT_B-I	GENE_B.1	GENE_B_NAME
+TRANSCRIPT_A.1	GENE_A.1	GENE_A_NAME
+TRANSCRIPT_A.1-I	GENE_A.1	GENE_A_NAME
diff --git a/tests/mixins.py b/tests/mixins.py
index ed3f092..3604dbe 100644
--- a/tests/mixins.py
+++ b/tests/mixins.py
@@ -82,7 +82,10 @@ def setUpClass(cls):
         cls.unsorted_gtf_path = os.path.join(cls.gtf_dir, 'not_sorted.gtf')
         cls.sorted_gtf_path = os.path.join(cls.gtf_dir, 'sorted.gtf')
         cls.gtf_t2g_path = os.path.join(cls.gtf_dir, 't2g.txt')
-        cls.gtf_t2g_intron_path = os.path.join(cls.gtf_dir, 't2g_intron.txt')
+        cls.fasta_t2g_intron_path = os.path.join(cls.gtf_dir, 't2g_intron.txt')
+        cls.gtf_t2g_intron_path = os.path.join(
+            cls.gtf_dir, 't2g_intron_gtf.txt'
+        )
 
         cls.fasta_dir = os.path.join(cls.fixtures_dir, 'fasta')
         cls.unsorted_fasta_path = os.path.join(cls.fasta_dir, 'not_sorted.fa')
diff --git a/tests/test_fasta.py b/tests/test_fasta.py
index d6684ad..1ff30be 100644
--- a/tests/test_fasta.py
+++ b/tests/test_fasta.py
@@ -21,8 +21,13 @@ def test_make_header(self):
         )
 
     def test_parse_header(self):
-        header = '>transcript_id TEST'
-        self.assertEqual('transcript_id', fasta.FASTA.parse_header(header))
+        header = '>transcript_id TEST:testing'
+        self.assertEqual({
+            'sequence_id': 'transcript_id',
+            'group': {
+                'TEST': 'testing'
+            }
+        }, fasta.FASTA.parse_header(header))
 
     def test_reverse_complement(self):
         sequence = 'ATCG'
@@ -34,7 +39,7 @@ def test_entries(self):
 
     def test_sort(self):
         out_path = os.path.join(
-            tempfile.gettempdir(), '{}.gtf'.format(uuid.uuid4())
+            tempfile.gettempdir(), '{}.fa'.format(uuid.uuid4())
         )
         fa = fasta.FASTA(self.unsorted_fasta_path)
         fa.sort(out_path)
@@ -59,7 +64,7 @@ def test_generate_intron_fasta(self):
             tempfile.gettempdir(), '{}.fa'.format(uuid.uuid4())
         )
         fasta.generate_intron_fasta(
-            self.sorted_fasta_path, self.sorted_gtf_path, out_path
+            self.sorted_fasta_path, self.sorted_gtf_path, out_path, flank=1
         )
         with open(out_path, 'r') as f, open(self.split_intron_fasta_path,
                                             'r') as split:
diff --git a/tests/test_ref.py b/tests/test_ref.py
index 69251e1..7ac6a4f 100644
--- a/tests/test_ref.py
+++ b/tests/test_ref.py
@@ -37,6 +37,17 @@ def test_kallisto_index(self):
         for key, path in result.items():
             self.assertTrue(os.path.exists(path))
 
+    def test_create_t2g_from_fasta(self):
+        t2g_path = os.path.join(
+            tempfile.gettempdir(), '{}.txt'.format(uuid.uuid4())
+        )
+        result = ref.create_t2g_from_fasta(
+            self.split_intron_fasta_path, t2g_path
+        )
+        with open(result['t2g'], 'r') as f, open(self.fasta_t2g_intron_path,
+                                                 'r') as t2g:
+            self.assertEqual(f.read(), t2g.read())
+
     def test_create_t2g_from_gtf(self):
         t2g_path = os.path.join(
             tempfile.gettempdir(), '{}.txt'.format(uuid.uuid4())
@@ -282,7 +293,7 @@ def test_ref_overwrite(self):
             kallisto_index.assert_called_once_with(cdna_fasta_path, index_path)
 
     def test_ref_lamanno(self):
-        with mock.patch('kb_python.ref.create_t2g_from_gtf') as create_t2g_from_gtf,\
+        with mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\
             mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\
             mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\
             mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\
@@ -307,7 +318,7 @@ def test_ref_lamanno(self):
             generate_cdna_fasta.return_value = cdna_fasta_path
             generate_intron_fasta.return_value = intron_fasta_path
             kallisto_index.return_value = {'index': index_path}
-            create_t2g_from_gtf.return_value = {'t2g': t2g_path}
+            create_t2g_from_fasta.return_value = {'t2g': t2g_path}
             create_t2c.side_effect = [{
                 't2c': cdna_t2c_path
             }, {
@@ -333,8 +344,8 @@ def test_ref_lamanno(self):
                                  intron_t2c_path,
                                  temp_dir=temp_dir
                              ))
-            create_t2g_from_gtf.assert_called_once_with(
-                self.gtf_path, t2g_path, intron=True
+            create_t2g_from_fasta.assert_called_once_with(
+                combined_path, t2g_path
             )
             sort_fasta.assert_called_once_with(
                 self.fasta_path, os.path.join(temp_dir, SORTED_FASTA_FILENAME)
@@ -356,7 +367,7 @@ def test_ref_lamanno(self):
             kallisto_index.assert_called_once_with(combined_path, index_path)
 
     def test_ref_lamanno_exists(self):
-        with mock.patch('kb_python.ref.create_t2g_from_gtf') as create_t2g_from_gtf,\
+        with mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\
             mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\
             mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\
             mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\
@@ -370,13 +381,13 @@ def test_ref_lamanno_exists(self):
             index_path = mock.MagicMock()
             t2g_path = mock.MagicMock()
             kallisto_index.return_value = {'index': index_path}
-            create_t2g_from_gtf.return_value = {'t2g': t2g_path}
+            create_t2g_from_fasta.return_value = {'t2g': t2g_path}
             self.assertEqual({'t2g': t2g_path},
                              ref.ref(
                                  self.fasta_path, self.gtf_path,
                                  cdna_fasta_path, index_path, t2g_path
                              ))
-            create_t2g_from_gtf.assert_called_once_with(self.gtf_path, t2g_path)
+            create_t2g_from_fasta.assert_not_called()
             sort_fasta.assert_not_called()
             sort_gtf.assert_not_called()
             generate_cdna_fasta.assert_not_called()
@@ -386,7 +397,7 @@ def test_ref_lamanno_exists(self):
             kallisto_index.assert_not_called()
 
     def test_ref_lamanno_overwrite(self):
-        with mock.patch('kb_python.ref.create_t2g_from_gtf') as create_t2g_from_gtf,\
+        with mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\
             mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\
             mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\
             mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\
@@ -411,7 +422,7 @@ def test_ref_lamanno_overwrite(self):
             generate_cdna_fasta.return_value = cdna_fasta_path
             generate_intron_fasta.return_value = intron_fasta_path
             kallisto_index.return_value = {'index': index_path}
-            create_t2g_from_gtf.return_value = {'t2g': t2g_path}
+            create_t2g_from_fasta.return_value = {'t2g': t2g_path}
             create_t2c.side_effect = [{
                 't2c': cdna_t2c_path
             }, {
@@ -438,8 +449,8 @@ def test_ref_lamanno_overwrite(self):
                                  temp_dir=temp_dir,
                                  overwrite=True
                              ))
-            create_t2g_from_gtf.assert_called_once_with(
-                self.gtf_path, t2g_path, intron=True
+            create_t2g_from_fasta.assert_called_once_with(
+                combined_path, t2g_path
             )
             sort_fasta.assert_called_once_with(
                 self.fasta_path, os.path.join(temp_dir, SORTED_FASTA_FILENAME)