From 1f270fb351afda15c7c2c8b9ba5191d7237c2197 Mon Sep 17 00:00:00 2001
From: peterjc
Date: Thu, 10 Nov 2016 17:40:48 +0000
Subject: [PATCH 1/2] Include XX line between features (FT) and sequence (SQ)
---
gff3toembl/EMBLContig.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/gff3toembl/EMBLContig.py b/gff3toembl/EMBLContig.py
index 94d2648..3ff143e 100644
--- a/gff3toembl/EMBLContig.py
+++ b/gff3toembl/EMBLContig.py
@@ -410,8 +410,8 @@ def calculate_nucleotide_counts(self, sequence):
return counts
def format_header(self, nucleotide_counts):
- # This line can exceed 80 characters
- template = "SQ Sequence {total} BP; {a} A; {c} C; {g} G; {t} T; {other} other;"
+ # The SQ line can exceed 80 characters
+ template = "XX\nSQ Sequence {total} BP; {a} A; {c} C; {g} G; {t} T; {other} other;"
total_counts = sum(nucleotide_counts.values())
nucleotide_counts['total'] = total_counts
return template.format(**nucleotide_counts)
From 9286488eb7935939b4eb4ef7f734f72ed2f3c701 Mon Sep 17 00:00:00 2001
From: peterjc
Date: Thu, 10 Nov 2016 17:45:11 +0000
Subject: [PATCH 2/2] Test that an XX line is emitted between features (FT) and sequence (SQ)
---
gff3toembl/tests/EMBLContig_test.py | 13 +++++++------
.../tests/data/expected_duplicate_coords.embl | 1 +
.../tests/data/expected_large_annotation.embl | 10 ++++++++++
gff3toembl/tests/data/expected_single_feature.embl | 1 +
.../data/expected_single_feature_new_locus_tag.embl | 1 +
.../expected_single_feature_translation_table.embl | 1 +
6 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/gff3toembl/tests/EMBLContig_test.py b/gff3toembl/tests/EMBLContig_test.py
index 05098a2..94c6eea 100644
--- a/gff3toembl/tests/EMBLContig_test.py
+++ b/gff3toembl/tests/EMBLContig_test.py
@@ -852,7 +852,7 @@ def create_uninitialized_sequence(self):
def test_init(self):
sequence = EMBLSequence('AAAACCCGGTNN')
- expected_header = "SQ Sequence 12 BP; 4 A; 3 C; 2 G; 1 T; 2 other;"
+ expected_header = "XX\nSQ Sequence 12 BP; 4 A; 3 C; 2 G; 1 T; 2 other;"
expected_body = ' aaaacccggt nn 12\n'
self.assertEqual(sequence.header, expected_header)
self.assertEqual(sequence.body, expected_body)
@@ -860,10 +860,11 @@ def test_init(self):
def test_format(self):
sequence = self.create_uninitialized_sequence()
- sequence.header = "SQ Sequence 12 BP; 4 A; 3 C; 2 G; 1 T; 2 other;"
+ sequence.header = "XX\nSQ Sequence 12 BP; 4 A; 3 C; 2 G; 1 T; 2 other;"
sequence.body = ' aaaacccggt nn 12\n'
calculated_string = sequence.format()
expected_string = """\
+XX
SQ Sequence 12 BP; 4 A; 3 C; 2 G; 1 T; 2 other;
aaaacccggt nn 12
"""
@@ -891,19 +892,19 @@ def test_format_header(self):
sequence = self.create_uninitialized_sequence()
neucleotide_counts = {'a': 4, 'c': 3, 'g': 2, 't': 1, 'other': 2}
calculated_header = sequence.format_header(neucleotide_counts)
- expected_header = "SQ Sequence 12 BP; 4 A; 3 C; 2 G; 1 T; 2 other;"
+ expected_header = "XX\nSQ Sequence 12 BP; 4 A; 3 C; 2 G; 1 T; 2 other;"
neucleotide_counts = {'a': 12, 'c': 0, 'g': 0, 't': 0, 'other': 0}
calculated_header = sequence.format_header(neucleotide_counts)
- expected_header = "SQ Sequence 12 BP; 12 A; 0 C; 0 G; 0 T; 0 other;"
+ expected_header = "XX\nSQ Sequence 12 BP; 12 A; 0 C; 0 G; 0 T; 0 other;"
neucleotide_counts = {'a': 0, 'c': 0, 'g': 0, 't': 0, 'other': 12}
calculated_header = sequence.format_header(neucleotide_counts)
- expected_header = "SQ Sequence 12 BP; 0 A; 0 C; 0 G; 0 T; 12 other;"
+ expected_header = "XX\nSQ Sequence 12 BP; 0 A; 0 C; 0 G; 0 T; 12 other;"
neucleotide_counts = {'a': 2, 'c': 2, 'g': 2, 't': 6, 'other': 0}
calculated_header = sequence.format_header(neucleotide_counts)
- expected_header = "SQ Sequence 12 BP; 2 A; 2 C; 2 G; 6 T; 0 other;"
+ expected_header = "XX\nSQ Sequence 12 BP; 2 A; 2 C; 2 G; 6 T; 0 other;"
def test_split_line_of_sequence(self):
sequence = self.create_uninitialized_sequence()
diff --git a/gff3toembl/tests/data/expected_duplicate_coords.embl b/gff3toembl/tests/data/expected_duplicate_coords.embl
index aa48351..0d7f37b 100644
--- a/gff3toembl/tests/data/expected_duplicate_coords.embl
+++ b/gff3toembl/tests/data/expected_duplicate_coords.embl
@@ -32,6 +32,7 @@ FT rRNA 6029..6142
FT /product="5S ribosomal RNA"
FT /inference="COORDINATES:profile:RNAmmer:1.2"
FT /locus_tag="6730_5#19_02228"
+XX
SQ Sequence 6181 BP; 1767 A; 1220 C; 1744 G; 1449 T; 1 other;
gttgaagaaa tgaacattga aaactgaatg acaatatgtc aacgttaatt ccaataaagt 60
aacttaaatg ttacaaacac tatttagtat tatgagctaa tcaaacatca taaattttta 120
diff --git a/gff3toembl/tests/data/expected_large_annotation.embl b/gff3toembl/tests/data/expected_large_annotation.embl
index c9a1cee..95b3905 100644
--- a/gff3toembl/tests/data/expected_large_annotation.embl
+++ b/gff3toembl/tests/data/expected_large_annotation.embl
@@ -1716,6 +1716,7 @@ FT tRNA complement(174883..174959)
FT /product="tRNA-Ile(gat)"
FT /inference="COORDINATES:profile:Aragorn:1.2.34"
FT /locus_tag="8233_4#93_02312"
+XX
SQ Sequence 175120 BP; 55731 A; 31266 C; 25419 G; 62704 T; 0 other;
tctgacaatc gctttcttta aaaagaaact attgtcgaga atttgcatta gcaatatcac 60
tttgtcaaaa agatgtttga atgttaaata aacattcaaa actgaataca atatgtcacg 120
@@ -5768,6 +5769,7 @@ FT tRNA 108243..108316
FT /product="tRNA-Gly(tcc)"
FT /inference="COORDINATES:profile:Aragorn:1.2.34"
FT /locus_tag="8233_4#93_02421"
+XX
SQ Sequence 108420 BP; 39263 A; 15735 C; 20016 G; 33406 T; 0 other;
ctatagcaag gaggtcacac ctgttcccat gccgaacaca gaagttaagc tccttagcgt 60
cgatggtagt tggacttacg ttccgctaga gtagaacgtt gccaggctat atattattcc 120
@@ -8172,6 +8174,7 @@ FT /inference="ab initio prediction:Prodigal:2.60"
FT /inference="similar to AA sequence:RefSeq:YP_005742519.1"
FT /locus_tag="8233_4#93_02500"
FT /transl_table=11
+XX
SQ Sequence 52142 BP; 18748 A; 7364 C; 8937 G; 17093 T; 0 other;
cacctgttcc catgccgaac acagaagtta agctccttag cgtcgatggt agttggactt 60
acgttccgct agagtagaac gttgccaggc aagatatatt tggagaatta gctcagctgg 120
@@ -9502,6 +9505,7 @@ FT /db_xref="CDD:PRK09737"
FT /db_xref="PFAM:PF01420.13"
FT /locus_tag="8233_4#93_02548"
FT /transl_table=11
+XX
SQ Sequence 51716 BP; 16183 A; 10270 C; 6898 G; 18365 T; 0 other;
tttttggaat taacgttgac atattgtcat tcagttttca atgttcattt ttcttaccga 60
caagaattaa ttatacattt tatcaacatt taaatcaaca actttttgaa attaattttt 120
@@ -10864,6 +10868,7 @@ FT tRNA 39165..39240
FT /product="tRNA-Ala(tgc)"
FT /inference="COORDINATES:profile:Aragorn:1.2.34"
FT /locus_tag="8233_4#93_02595"
+XX
SQ Sequence 39433 BP; 14273 A; 5313 C; 7870 G; 11977 T; 0 other;
gacgaatact aatcgatcga agacttaatc aaaataaatg ttttgcgaag caaaatcact 60
tttacttact atctagtttt gaatgtataa tctacattcg tatgtctggt gactatagca 120
@@ -11760,6 +11765,7 @@ FT /inference="ab initio prediction:Prodigal:2.60"
FT /inference="similar to AA sequence:RefSeq:YP_302592.1"
FT /locus_tag="8233_4#93_02624"
FT /transl_table=11
+XX
SQ Sequence 21936 BP; 8100 A; 2662 C; 3389 G; 7784 T; 1 other;
gactatataa aaagaaccgc agatctcttc agatctacgg gttttcgcca tgccgtgtaa 60
ttagcatcat gctagctagt taatacgaag tattatttta aacataaggt tagacactta 120
@@ -12199,6 +12205,7 @@ FT /inference="ab initio prediction:Prodigal:2.60"
FT /inference="similar to AA sequence:RefSeq:YP_005759108.1"
FT /locus_tag="8233_4#93_02631"
FT /transl_table=11
+XX
SQ Sequence 4804 BP; 1694 A; 684 C; 780 G; 1645 T; 1 other;
tgtaattatc atttgaggtt tgccaaattg tttaataaga cgtttaataa acgtatatgc 60
tgagtgatta tctcgtttct tacgtaacga aatatctagt gtatggccat ctgcatcaat 120
@@ -12315,6 +12322,7 @@ FT /inference="ab initio prediction:Prodigal:2.60"
FT /db_xref="TIGRFAM:TIGR01295"
FT /locus_tag="8233_4#93_02633"
FT /transl_table=11
+XX
SQ Sequence 969 BP; 336 A; 157 C; 116 G; 360 T; 0 other;
gactatataa aaagaaccgc agatctcttc agatctacgg gttttcgcca tgccgtgtaa 60
ttagcatcat gctagctagt taatacgaag tattatttta aacataaggt tagacactta 120
@@ -12360,6 +12368,7 @@ FT tRNA complement(370..446)
FT /product="tRNA-Ile(gat)"
FT /inference="COORDINATES:profile:Aragorn:1.2.34"
FT /locus_tag="8233_4#93_02634"
+XX
SQ Sequence 656 BP; 184 A; 154 C; 100 G; 218 T; 0 other;
aagcatatcg tcgttagtaa cgtccttcat cggcttctag tgccaaggca tccaccgtgc 60
gcccttaata acttaatcta tgtttccacc atttttacaa gtcaaacgct cacatactgt 120
@@ -12395,6 +12404,7 @@ FT /organism="Organism"
FT /mol_type="genomic DNA"
FT /db_xref="taxon:1234"
FT /note="ERS154949|SC|contig000012"
+XX
SQ Sequence 465 BP; 139 A; 103 C; 64 G; 159 T; 0 other;
ccccaaagca tatcgtcgtt agtaacgtcc ttcatcggct tctagtgcca aggcatccac 60
cgtgcgccct taataactta atctatgttt ccaccatttt tataaatcaa acgttaacac 120
diff --git a/gff3toembl/tests/data/expected_single_feature.embl b/gff3toembl/tests/data/expected_single_feature.embl
index fb3045a..2fa4dda 100644
--- a/gff3toembl/tests/data/expected_single_feature.embl
+++ b/gff3toembl/tests/data/expected_single_feature.embl
@@ -30,6 +30,7 @@ FT /db_xref="PFAM:PF01475.13"
FT /locus_tag="8233_4#93_02128"
FT /gene="perR"
FT /transl_table=11
+XX
SQ Sequence 240 BP; 76 A; 54 C; 36 G; 74 T; 0 other;
tctgacaatc gctttcttta aaaagaaact attgtcgaga atttgcatta gcaatatcac 60
tttgtcaaaa agatgtttga atgttaaata aacattcaaa actgaataca atatgtcacg 120
diff --git a/gff3toembl/tests/data/expected_single_feature_new_locus_tag.embl b/gff3toembl/tests/data/expected_single_feature_new_locus_tag.embl
index 91e2933..2dde810 100644
--- a/gff3toembl/tests/data/expected_single_feature_new_locus_tag.embl
+++ b/gff3toembl/tests/data/expected_single_feature_new_locus_tag.embl
@@ -30,6 +30,7 @@ FT /db_xref="PFAM:PF01475.13"
FT /locus_tag="new_locus_tag_02128"
FT /gene="perR"
FT /transl_table=11
+XX
SQ Sequence 240 BP; 76 A; 54 C; 36 G; 74 T; 0 other;
tctgacaatc gctttcttta aaaagaaact attgtcgaga atttgcatta gcaatatcac 60
tttgtcaaaa agatgtttga atgttaaata aacattcaaa actgaataca atatgtcacg 120
diff --git a/gff3toembl/tests/data/expected_single_feature_translation_table.embl b/gff3toembl/tests/data/expected_single_feature_translation_table.embl
index 33e1cbd..8322573 100644
--- a/gff3toembl/tests/data/expected_single_feature_translation_table.embl
+++ b/gff3toembl/tests/data/expected_single_feature_translation_table.embl
@@ -30,6 +30,7 @@ FT /db_xref="PFAM:PF01475.13"
FT /locus_tag="8233_4#93_02128"
FT /gene="perR"
FT /transl_table=1
+XX
SQ Sequence 240 BP; 76 A; 54 C; 36 G; 74 T; 0 other;
tctgacaatc gctttcttta aaaagaaact attgtcgaga atttgcatta gcaatatcac 60
tttgtcaaaa agatgtttga atgttaaata aacattcaaa actgaataca atatgtcacg 120