forked from 4ureliek/TEanalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
TE-analysis_pipeline.pl
executable file
·2662 lines (2467 loc) · 125 KB
/
TE-analysis_pipeline.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl -w
#######################################################
# Author : Aurelie Kapusta
# version : see below, $version
# email : [email protected]
# Purpose : Pipeline to analyse TE composition in features (exons of transcripts, coding or non coding, transcription factor binding sites, ChIP-seq data, etc)
# See documentation for more details
######################################################
BEGIN{
    # Install the debugging signal handlers as early as possible.
    # Carp is loaded right here: this block is compiled before the file's
    # 'use Carp' and before 'use strict', so a bare Carp::longmess (no
    # parentheses) would be compiled as the plain string "Carp::longmess"
    # and no stack trace would ever be printed.
    require Carp;
    #what to do on: kill -s ALRM <pid> so I can check where it is if it stalls
    #(prints a stack trace on STDERR and lets the script continue)
    $SIG{ALRM} = sub {
        print STDERR "SIGALRM received\n";
        print STDERR Carp::longmess();
        print "\n";
    };
    #what to do on ^C (prints a stack trace on STDERR, then exits)
    $SIG{INT} = sub {
        print STDERR "SIGINT received\n";
        print STDERR "\n\n".Carp::longmess();
        exit;
    };
    #add a folder in INC
    #unshift(@INC, "~/bin/BioPerl-1.6.901");
}
#load modules
use warnings;
use strict;
use Carp;
use Getopt::Long;
use Data::Dumper;
#switch both standard streams to autoflush, so that logs show up immediately;
#the previously selected handle is put back after each change
foreach my $stream (\*STDERR, \*STDOUT) {
    my $previous = select($stream);
    $| = 1;
    select($previous);
}
#version of the pipeline + change log text (the latter is filled by set_changelog)
my ($version, $changelog) = ("4.16");
set_changelog();
#----------------------------------------------------------------------------
# set_changelog
# Stores the version history text in the file-level variable $changelog
# (the main script dies with this text when -chlog is given).
# The text is one double-quoted string, hence the backslashes in front of
# the literal dollar signs and double quotes it contains.
# Takes no argument; returns 1.
#----------------------------------------------------------------------------
sub set_changelog {
$changelog = "
# v1.0 = Mar 2013
# [...]
# v4.0 = Nov 2014
# - put all in one big pipeline script (use of subroutines), instead of 4 different scripts
# - correct few bugs that still remained
# - more flexibility in the input file + in required files => more dynamic pipeline depending on what users want to do
# v4.1 = 5 Jan 2015
# - bug fix
# - up and dw: wrong regions were kept after subtraction (if statement changed to unless)
# and corrections for the cuts was wrong as well (a cc stupid error)
# - Now --int added in family name when family = ERV and name of TE has I or int at the end.
# v4.2 = 11 Feb 2015
# - bug fix
# = -nonTE option was not taking argument
# = some file names stuff
# = -dir option fixed
# - added number of different chromosomes/scaffolds in the TE-ratios output
# Indeed, when are on 2 different sequences, less likely to be 2 pieces of the same TE
# v4.3 = 12 Feb 2015
# - bug fix
# = -addcol (were added in ExSt file but not in TrInfos files)
# v4.4 = 17 Feb 2015
# - bug fix
# = -parse on added columns, was doing the opposite (excluding)...
# v4.5 = 02 Mar 2015
# - bug fix
# -f bed => did not have the TEov filtering
# TEov: if there is a % in the argument then it will filter on % of the feature coverage and not nt amount
# v4.6 = 10 Feb 2016
# - bug fix
# for -addcol with no -myf (when input = gtf file)
# v4.7 = 20 Sep 2016
# - update to read properly recent Gencode gff3
# v4.8 = 04 Oct 2016
# - update usage and help
# v4.9 = 09 Jan 2017
# - bug fix in the intersection calculation. Fix it, but also change to use the -wo instead
# v4.10 = 25 Jan 2017
# - Add count of features from the bed file in summary file
# v4.11 = 06 Feb 2017
# - Check on \$add to avoid error \"Use of uninitialized value in concatenation (.) or string....\"
# v4.12 = 27 Mar 2017
# - Bug fix \$add, still errors
# - Bug fix \$listtojoin, was erased when no subtract
# - Bug fix amounts
# v4.13 = 08 Jun 2018
# - Bug fix die errors if -RMparsed file not provided or if a repeat is missing from it
# - Few cosmetic changes
# v4.14 = 20 Jul 2018
# - deal with differences between old and new parseRM.pl outputs
# - Few cosmetic changes
# v4.15 = 24 & 26 Jul 2018
# - small bug fix to avoid dying at the TEratio printing step when a repeat is not in the parsedRM file
# v4.16 = 15 Aug 2018
# - fixed that -parse option thing, since the format is more strict now (8 columns input file).
# - bug fix introduced with v4.14 for parsedRM file loading, if old parsedRM format the masked length was 0
# TO DO:
# - check what is used in TEinfoRMP, remove useless stuff
# - global vars in uc, and no need to pass them to subs / cleaning writing!
# - Do a utils script to integrate data from several runs as summary or TEratios, like the Coverage one, but with mosaic plots
# - When -parse, previous files have to be deleted or it won't actually filter, it's annoying. Solve that.
# - Fix intron TrInfos - use exon coordinates to check and not introns, to see if SPL overlap. However, it's not really more informative than the exon output.
\n";
return 1;
}
#usage text (the main script dies with it on -h or when mandatory arguments are missing);
#filled right away by set_usage(), which interpolates $version in it
my $usage;
set_usage();
#----------------------------------------------------------------------------
# set_usage
# Stores the usage text in the file-level variable $usage; $version is
# interpolated in the first line of the text.
# The text is one double-quoted string: double quotes inside it are escaped,
# and any literal dollar or at sign added later has to be escaped as well.
# The synopsis lists -TEov, like every other option read by GetOptions.
# Takes no argument; returns 1.
#----------------------------------------------------------------------------
sub set_usage {
$usage = "\nUsage [$version]:
perl TE-analysis_pipeline_v4+.pl -i <inputfile> [-dir] -f <format> [-myf <col_details.txt>]
-RMout <RepeatMasker.out> [-TEov <X>] [-RMparsed RM.out.parsed] [-base <RM base>] [-TE <TE.tab>] [-TEage] [-nonTE <X>]
[-fa <genome.fa>] [-subtract <what-to-subtract>] [-subid <name>] [-noselfsub] [-bedtools <path/to/bins>]
[-addcol <col1,col2,etc>] [-filter <col,filter>] [-parse <col_nb,filter>] [-cut <X,X,X>]
[-v] [-clean] [-chlog] [-h] [-help]
SYNOPSIS
Type -help for detailed explanations + on how to read the outputs.
Pipeline to analyse TE composition in features (exons of transcripts, coding or non coding,
transcription factor binding sites, ChIP-seq data, etc)
REQUIREMENTS
BEDtools is required for this pipeline to run (Quinlan AR and Hall IM, 2010. Bioinformatics)
Available at: https://github.com/arq5x/bedtools2
CITATION - please put the GitHub link in Methods, and cite:
- For this pipeline using -f gtf, cite Kapusta et al. (2013) PLoS Genetics
- For this pipeline using -f bed, cite Lynch et al. (2015) Cell Reports
- For the use of BEDtools, Quinlan AR and Hall IM (2010) Bioinformatics
DEBUGGING
First thing to do = check that your input files are not encoded as Classic Mac (CR).
Also, double check the usage and the doc, just in case.
Then, if you really think your input files are OK, shoot me an email or open an issue on GitHub with the errors,
your command line and sample files reproducing the error if possible.
DETAILS OF OPTIONS (MD = mandatory and OPT = optional):
-i => MD - (STRING) input file, see more details below (-f usage)
in .bed format for -f bed; in gtf, gff3, or any tabulated file for -f gtf
-dir => OPT - (BOOL) add this if -i corresponds to a folder containing files to work with (need to contain ONLY these files!!)
-f => MD - (STRING) This sets the type of analysis (kind of related to the format of the input file)
chose between -f gtf (complex) or -f bed (simple)
Check -help for more information.
gtf (default) = complex analysis (for transcripts - TSS, exons, splicing sites, etc)
gtf and gff files will work, but only if all the info is in it (see -help)
bed = simple analysis (for TF binding sites, ChIP-seq data etc)
5 columns bedfile is required: chr start end unique_ID score/. strand
-myf => OPT - (STRING) if input file is not formated as a gtf/gff3 or bed: use this option with a text file to set the column numbers.
To generate an example of the text file to provide, type this option alone (with the path/name of the file to create)
When you run the pipeline, you still need to provide -f to determine the type of analysis.
-RMout => MD - (STRING) repeat masker output file .out
Even if you already have the .out.bed file, put .out in command line
Obviously requires to be for the same assembly file/version than the input file (ex. mm9 or mm10, hg19 or hg38, etc)
-TEov => OPT - (INT) minimal length (in nt) of intersection in order to consider the TE included in the feature.
Default = 10
(STRING) put a % after a number (FLOAT) to filter out when less than X% of the feature overlaps with the TE
Ex: -TEov 80% will skip the line if less than 80% of the feature overlaps with the TE
-RMparsed => OPT - (STRING) repeat masker output parsed with parseRM.pl script (with or without the -lib option)
Typically: <RMout.out>.parseRM.all-repeats.tab
See documentation for more details on how to get this file.
If not provided, over represented TE families won't be determined
-base => OPT - (INT) 0 or 1. Typically, if the file is from Repeat Masker website chose 1, if from UCSC chose 0.
If base 0 is chosen, 1 will be added to start of TE coordinates
Default = 1
-TE => OPT - (STRING) file with TE information, tab delimited. Minimum columns = Rname Rclass Rfam Rclass/Rfam
Class and family info are required to parse correctly the files, and for each element
they will be extracted from files set in options, with this priority: -TE > -RMparsed > -RMout
This file here is mostly to allow providing TE age info, or to modify (some) TE classes.
If the -TEage flag is set, then 7 columns are needed: Rname Rclass Rfam Rclass/Rfam %div AGE Ancient/LineageSpe
(6th will be used for age but 7th will be added in the output as well)
-TEage => OPT - (BOOL) add this flag if the file in -TE contains age info
If this is not set, no age parsing will be available (\"na\" will be put in some of the corresponding columns)
-nonTE => OPT - (STRING) to set the behavior regarding non TE sequences: all, no_low, no_nonTE, none
Class info will be extracted from files set in options, with this priority: -TE > -RMparsed > -RMout
all = keep all non TE sequences (no filtering)
no_low = keep all besides low_complexity and simple_repeat
no_nonTE = keep all except when class = nonTE
none (default) = everything is filtered out (nonTE, low_complexity, simple_repeat, snRNA, srpRNA, rRNA, tRNA/tRNA, satellite)
-fa => OPT - (STRING) corresponding genome file (fasta).
Required only when -f gtf, to make sure that surrounding intergenic sequences won't be outside of sequences
If not provided, intergenic regions won't be looked at
-subtract => OPT - (STRING) gtf or bed file to subtract from introns and intergenetic regions. Typically, Ensembl gene annotation.
Only relevant if -f gtf is used
-subid => OPT - (STRING) \"ID\" to show in output files after subtraction of file set in -subtract (for introns and intergenic regions)
Default = sub
-noselfsub => OPT - (BOOL) chose this option to avoid subtracting the input file from itself (from introns and intergenetic regions)
-bedtools => OPT - (STRING) if BEDtools are not in your path, provide path to BEDtools bin directory
-addcol => OPT - (STRING) add columns (no limit) of the input file to the outputs, separated by a coma and NO SPACE (ex: -addcol 10,11)
Note that first column of the file = 0
These added columns can be used for -parse
-filter => OPT - (STRING) to filter on a specific column, to keep lines where <filter> is found in the column <col>
Can't be used on added columns. Lines not matching it won't be printed at all in any of the files.
You can:
(i) use a number corresponding to the -myf file column numbers, to the column number of a bed file,
or to columns 0 to 7 of a gtf file
For example, use -filter 0,chr1 to analyze only chromosome 1
(ii) use the identifier of the feature if you want to filter a gtf file or a custom file
For example, use -filter gene_type,lincRNA or -filter transcript_type,lincRNA to keep only lincRNAs.
If input file is a gtf file (with \"transcript\" lines), then you can also use a special filter
= gene_type,intragenic or transcript_type,intragenic to look at everything that is not lincRNA.
This will exclude all lincRNAs but keep all the non coding stuff, unless they are < 200nt [should exclude all small RNAs]
-parse => OPT - (STRING) to filter on a specific column before joining with TEs, to keep lines where <filter> is found in the column <col>
Can be used ONLY on added columns. Lines not matching it won't be printed at all in any of the files.
Typically, this allows to quickly check if some subsets differ (ex: -addcol 9,10,11 -parse 10,liver)
if column 10 has the tissue of maximum expression in the original file.
-cut => OPT - (STRING) to set size of intergenic regions analyzed (downstream and upstream), in nt
Default = 10000,5000,1000
-v => OPT - (BOOL) chose this to make the script talk to you
print the version if only option
-clean => OPT - (BOOL) to use alone with -i, to delete any previous outputs generated by this script for this input file
If -i was a directory, add the -dir flag
(RMout.bed won't be deleted -> delete it manually if needed)
-chlog => OPT - (BOOL) to print the change log between versions
-h => OPT - (BOOL) to print this usage
-help => OPT - (BOOL) to print a more detailed doc for this pipeline
";
return 1;
}
#detailed documentation text (the main script dies with it on -help);
#filled right away by set_longhelp(), which interpolates $version in it
my $longhelp;
set_longhelp();
#----------------------------------------------------------------------------
# set_longhelp
# Stores the detailed documentation text in the file-level variable
# $longhelp; $version is interpolated in the first line of the text.
# The text is one double-quoted string, so every literal dollar sign and
# double quote in it must be escaped. This matters for the sed example:
# an unescaped dollar followed by a slash is the input record separator
# variable, and a newline would be printed in the middle of the command.
# Takes no argument; returns 1.
#----------------------------------------------------------------------------
sub set_longhelp {
$longhelp = "\nSome documentation for TE-analysis_pipeline.pl [$version]
Author : Aurelie Kapusta
Last update : 04 Oct 2016
--------------------------------
INPUT FILES
--------------------------------
With -f bed (typically for TF binding sites, ChIPseq data, etc)
-i input file needs to be in the 5 columns bed format:
chr start end unique_ID score/. strand
If you only have a 3 columns format, just type:
sed 's/\$/ . ./' peaks.bed > peaks.mod.bed
(white spaces are tabs, in the command line you can get them by pressing ctrl+v and then the tab key)
With -f gtf (typically for gene or transcripts annotations, coding or not):
-i input file needs to be in gtf, gff, gff3 or any tabulated format containing the required information
If a non-standard file is used (e.g. tabulated file from transcript assemblies), type:
perl TE-analysis_pipeline_v4+.pl -myf myfile.conf
This will create a .conf file that should be edited to set the columns of the various required info (such as gene_ID, transcript_ID, etc)
Note that the same column can refer to different features (such as id or name, for genes or transcripts). No need to make 4 columns with the same info in it in the input file!
TE info:
-RMout should be the repeat masker output of the SAME assembly, in .out format
The pipeline will create a bed file from it (quite long step if the file is big),
with nonTE annotations filtered out (or not, depends on your choice of -nonTE)
During that conversion, TE class and family are updated using the files from 1) -TE and 2) -RMparsed if provided.
Many pre-masked assemblies can be found at http://www.repeatmasker.org/genomicDatasets/RMGenomicDatasets.html
-RMparsed is the repeat masker output file of -RMout but parsed with the parseRM_simple.pl script (with or without the -lib option)
This script can be found at: https://github.com/4ureliek/Parsing-RepeatMasker-Outputs
(you can run it with or without -lib option will work, column numbers will be corrected based on column headers)
Typically, the file is <RMout.out>.parseRM.all-repeats.tab
If not provided, over represented TE families won't be determined
Some pre-parsed RM outputs can be found in the Data directory of this pipeline
NOTE THAT: This pipeline expects that all repeat names are different, there will be issues if they are not
(this may happen for user's de novo libraries used for RM annotation; also, in older libraries there were
repeats with same name but different class/fam - if this happens then the first occurence will be the one in the hash with TE info;
you can also correct that by using -TE). Note that repeat names will be matched in lower cases, so
if some repeat names are different just thanks to the case that will be a problem.
When the ERV can be identified as internal (-int or -I associated) then family of the TE is renamed with --int
--------------------------------
FILTERING
--------------------------------
-filter
-addcol and -parse
This pipeline expects that info in the added columns are by transcripts.
If they correspond to exons, then the Exon Structure (ExSt) file and TrInfos output files will have it wrong.
However the -parse will work correctly if you use -clean beforehand (or manually delete the ExSt file)
Note that by default, overlaps < 9nt are filtered out (-TEov option)
--------------------------------
FILES CREATED DURING THE RUN
--------------------------------
- creates \"Exon Structure\" file, with a line per exon with some info added like exon type (FIRST, LAST, etc), transcript coordinates and mature transcript length
- extract features:
exons (split non coding and coding -> split in CDS and UTRs)
introns
XXkb up and down (depending what is set - ex. 10, 5 and 1)
- for introns and up+dw, <what-to-subtract.gtf or bed> will be subtracted from the sets => outputs are <file>.subtract.<name>.bed
- then all resulting files are intersected with TEs (RMout.out)
This script relies on Transcript information, so gene info is not mandatory.
However, it is better for analysis - note that GTF from the UCSC table browser use the same ID for gene and transcript
When CDS, in ExSt file, exon nb will be for CDS exons only
ExSt also contains features that would be filtered out.
It is basically generated once per input file, so you should delete it if the input file has been modified.
UTRs won't be in exonSt file, just as the original \"exon\"
UTR output - ALL UTRs => some can be undetermined for 5' or 3' so it might be relevant to check that output
--------------------------------
READ THE OUTPUTS
--------------------------------
Typically, outputs can be used to generate figures and supp tables as in Kapusta et al. (2013) PLoS Genetics.
Supp tables (~TrInfos files) are a bit different because were generated by an older version of this pipeline.
See also: piRNA paper, Vinny's paper [for the bed format]
Relevant outputs (and not intermediary files) are as follow:
## ExSt file
----------------
[ONLY for -f gtf => transcript analysis]
File used internally, summarizing all info about each exons (but can be useful, to get the whole transcript structure)
## TrInfos files
----------------
[ONLY for -f gtf => transcript analysis]
Very useful table with all info of TE content per transcripts. One line per TE overlapping with a feature (exon, up and down...)
Note: Forget the intron TrInfos file... \"SPL\" may reveal overlap with a boundary of an intron piece after subtraction.
Need to fix that, but it's not major. Not used to count features in the _Summary file anyway
## _Summary.tab
----------------
[ONLY for -f gtf => transcript analysis]
Summarizes number of features (TSS, polyA etc) overlapping with TEs
## CAT.tab
----------------
Contains info of amount and class of different TE super families. Good quick way to look at global TE composition
% are to get proportions. They are not % regarding total amount of nt in the data (divide by the value at #total_length(nt) to get that)
When -f gtf (complex analysis) then individual files are concatenated in *concat.CAT.tab
## CAT-class.tab
----------------
Same as CAT.tab but by TE classes
This file also contains the total % of TEs in the data.
When -f gtf (complex analysis) then individual files are concatenated in *concat.CAT-class.tab
## AGE.tab
----------------
Contains info of amounts, but by age and not by class
When -f gtf (complex analysis) then individual files are concatenated in *concat.AGE.tab
## TE-ratios.tab
----------------
# To check if there is any over representation of TEs
# These do not have stats in them; significance should be tested using the \"nrCounts\" columns
# Column details are as follow:
Rname = repeat name from the repeat masker output
Rclass = class
Rfam = family
IN SET:
Len_masked = total length in nt covered by this repeat in the set
%_masked = (Len_masked) / (total amount of TE in the set in nt) *100
Counts = count of all fragments
nrCounts = number of fragments corrected using the Repeat Masker Interrupted Repeats track (eg if there is one deletion or one insertion in a TE, 2 fragments but 1 corrected fragment)
%nrCounts = (nrCounts) / (total amount of nrCounts in the set) *100
NbChrs = number of different chromosomes / scaffolds where these counts are located.
Indeed, when 2 fragments are on 2 different sequences, less likely to be 2 pieces of the same TE...
(nrCount is not always accurate and needs to be double checked mostly for low numbers, by looking at the TEjoin file.
Indeed, lowering numbers may affect significance)
IN GENOME:
Len_masked = total length in nt covered by this repeat in the genome
%_masked = (Len_masked) / (total amount, in the genome, that is masked by all TEs that are also overlapping the peaks) *100
Counts = count of all fragments
nrCounts = number of fragments corrected using the Repeat Masker Interrupted Repeats track
%nrCounts = (nrCounts) / (total amount of nrCounts in the genome for these TEs) *100
AGE_INFORMATION: [if provided]
Lineage = lineage, mostly from http://www.repeatmasker.org/cgi-bin/ViewRepeat?id=XXX (where XXX = repeat name), but also some personal checks
Ancient/LineageSpe = Eutherian shared / lineage specific after the split
avg_%div = pondered average of all % divergence from repeat masker (all fragments)
RATIOS:
Len = ratio that tells you if a TE is potentially over represented
nb = ratio based on nrCounts = used for stats but better to relie on RATION Len for figures if any
";
return 1;
}
################################################################################
# Get arguments/options, check some of them
################################################################################
#Default values; "na" is the value the checks below compare against to know
#that an option was not provided
my ($ft,$myf,$addcol,$fa,$bedtools) = ("gtf","na","na","na","na");
#-noselfsub is a flag: when given, "yes" is overwritten by 1
my ($subtract,$subid,$selfsub) = ("na","sub","yes");
#-TEage is a flag: when given, "na" is overwritten by 1
my ($RMbase,$TEage,$nonTE,$TEov) = (1,"na","none",10);
#"all,all" = no filtering
my ($filter,$parse) = ("all,all","all,all");
my $cut = "10000,5000,1000";
my ($INPUT,$dir,$RMout,$RMPARSED,$TE,$clean,$chlog,$h,$help,$v);
#GetOptions warns on STDERR and returns false when an option is unknown or
#lacks its value; stop there with the usage instead of silently running the
#analysis with default settings
GetOptions (
    'i=s'        => \$INPUT,
    'dir'        => \$dir,
    'f=s'        => \$ft,
    'myf=s'      => \$myf,
    'RMout=s'    => \$RMout,
    'TEov=s'     => \$TEov,
    'base=s'     => \$RMbase,
    'RMparsed=s' => \$RMPARSED,
    'TE=s'       => \$TE,
    'TEage'      => \$TEage,
    'nonTE=s'    => \$nonTE,
    'fa=s'       => \$fa,
    'subtract=s' => \$subtract,
    'subid=s'    => \$subid,
    'noselfsub'  => \$selfsub,
    'bedtools=s' => \$bedtools,
    'filter=s'   => \$filter,
    'parse=s'    => \$parse,
    'addcol=s'   => \$addcol,
    'cut=s'      => \$cut,
    'clean'      => \$clean,
    'chlog'      => \$chlog,
    'h'          => \$h,
    'help'       => \$help,
    'v'          => \$v,
) or die $usage;
#check step to see if mandatory arguments are provided + if help/changelog
die "\n version $version\n\n" if (! $INPUT && ! $RMout && ! $h && ! $help && ! $chlog && $v);
die $changelog if ($chlog);
die $longhelp if ($help);
die $usage if ($myf eq "na" && ! $clean && (! $INPUT || ! $RMout || $h));
#print sample file for column informations for custom format, if -myf set without mandatory arguments
myformat($myf) if (! $INPUT && ! $RMout && $myf ne "na");
#-i is needed from here on (also for -clean); test it before -e to avoid
#"uninitialized value" warnings
die "\n ERROR (main): -i is mandatory (use -h if you need to see the usage)\n\n" unless (defined $INPUT && $INPUT ne "");
die "\n ERROR (main): input file $INPUT does not exist?\n\n" unless (-e $INPUT);
#-clean is meant to be used alone with -i: in that case there is no -RMout to
#check, and nothing left to do once the cleaning is done
my $clean_only = ($clean && ! defined $RMout) ? 1 : 0;
unless ($clean_only) {
    die "\n ERROR (main): -RMout is mandatory (use -h if you need to see the usage)\n\n" unless (defined $RMout && $RMout ne "");
    die "\n ERROR (main): RMout $RMout does not exist?\n\n" unless (-e $RMout);
}
#get the list of input files + clean if needed
my @listin = ();
my %realins = ();
if ($dir) {
    $INPUT = $1 if ($INPUT =~ /^(.*)\/$/); #avoid the / at the end of path
    #read the folder directly instead of going through a shell "ls", so that
    #paths with spaces or shell special characters are handled properly;
    #hidden files are skipped and names are sorted, as ls would do
    opendir(my $dh, $INPUT) or die "\n ERROR (main): can't open directory $INPUT (-dir is set): $!\n\n";
    my @list = sort grep { !/^\./ } readdir($dh);
    closedir($dh);
    foreach my $in (@list) {
        push(@listin,$INPUT."/".$in);
        #need to get core names in case cleaning required; only files that
        #do contain .TEjoin. give a core name (no undefined or stale value)
        next unless ($clean);
        $realins{$INPUT."/".$1}=1 if ($in =~ /^(.*)\.TEjoin\./);
    }
} else {
    push(@listin,$INPUT);
    $realins{$INPUT}=1 if ($clean);
}
clean_out(\%realins) if ($clean);
exit 0 if ($clean_only);
#check some more options
#-filter and -parse are both <col,value>; only the 2 first comma separated values are used
my ($fcol,$fname) = split (",",$filter);
my ($pcol,$pname) = split (",",$parse);
#check -f option values
die "\n ERROR (main): check -f option (use -h if you need to see the usage)\n\n" if ($ft ne "gtf" && $ft ne "bed"); #even if not set it will be "gtf"
#check filter and parse options
#both halves have to be there: a value such as "3," would leave the name
#undefined, and it is used below and later on to build file names
die "\n ERROR (main): -filter requires 2 values separated by a coma (-filter <col,filter>; use -h if you need to see the usage)\n\n"
    unless (defined $fcol && $fcol ne "" && defined $fname && $fname ne "");
#with -myf the column has to be a number, and nothing else than a number
die "\n ERROR (main): -filter col should be numeric if -myf is chosen (use -h if you need to see the usage)\n\n"
    if ($filter ne "all,all" && $fcol !~ /^\d+$/ && $myf ne "na");
#NOTE(review): this test is kept as it was, but it looks inconsistent with its
#message and with the usage - it only applies when -myf is set, and then rejects
#columns made only of 8s and 9s (8, 9, 88...) while accepting 10; to be confirmed
die "\n ERROR (main): -filter col should be numeric < 7 or non digit (use -h if you need to see the usage)\n\n"
    if ($filter ne "all,all" && ($fcol !~ /[0-7]/ || $fcol =~ /\D/) && $myf ne "na");
die "\n ERROR (main): -filter intragenic can't be chosen if -f bed is chosen (use -h if you need to see the usage)\n\n"
    if (($fname eq "intragenic") && ($myf ne "na") && ($ft ne "gtf"));
die "\n ERROR (main): -parse requires 2 values separated by a coma (use -h if you need to see the usage)\n\n"
    unless (defined $pcol && $pcol ne "" && defined $pname && $pname ne "");
die "\n ERROR (main): use of -parse ($parse) without any columns added with -addcol (use -h if you need to see the usage)\n\n"
    if (($addcol eq "na") && ($parse ne "all,all"));
#if relevant, check that $addcol and $cut are numerical and , only
#(-addcol = column numbers separated by comas and no space, as stated in the usage)
die "\n ERROR (main): check -addcol option (use -h if you need to see the usage)\n\n" if ($addcol ne "na" && $addcol !~ /^\d+(?:,\d+)*$/);
die "\n ERROR (main): check -cut option (use -h if you need to see the usage)\n\n" if ($cut !~ /^[0-9,]+$/);
#if relevant, check extension of the file to subtract
die "\n ERROR (main): check -subtract, file is not .gtf or .bed - if it is, please add the correct extension\n\n"
    if ($subtract ne "na" && $subtract !~ /.*\.bed$/ && $subtract !~ /.*\.gtf$/);
#Now start everything => log in STDERR if -v basically
#avoid the / at the end of the BEDtools path; done here, before the log, so
#that the value of $bedtools is the same with and without -v
$bedtools = $1 if (($bedtools ne "na") && ($bedtools =~ /^(.*)\/$/));
if ($v) {
    print STDERR "\n ------------------------------------------------------------------------------------------------\n";
    print STDERR " --- Script for TE analysis started (v$version)\n";
    #input and how it will be read / filtered
    print STDERR " - Input file = $INPUT\n" unless ($dir);
    print STDERR " - Input files are located in $INPUT\n" if ($dir);
    print STDERR " -> Format / Analyze type = $ft\n";
    print STDERR " -> Custom columns will be extracted from $myf\n" if ($myf ne "na");
    print STDERR " -> Filtering of input file = $filter\n" if ($filter ne "all,all");
    print STDERR " -> These columns will be added in the output files = $addcol\n" if ($addcol ne "na");
    print STDERR " -> Filtering on these added columns = $parse\n" if ($parse ne "all,all");
    #TE annotations; -TEov is a length in nt unless it ends with %
    print STDERR " - Repeat Masker output file .out = $RMout\n";
    print STDERR " -> base = $RMbase\n";
    print STDERR " -> minimal length of intersection = $TEov nt\n" if (substr($TEov,-1) ne "%");
    print STDERR " -> minimal overlap of feature and TE = $TEov of the feature\n" if (substr($TEov,-1) eq "%");
    print STDERR " -> parsed file = $RMPARSED (over/under represention of TE families will be determined)\n" if ($RMPARSED);
    print STDERR " -> TE file = $TE\n" if ($TE);
    print STDERR " -> lineage/age info will be added in outputs + TE amounts will be parsed by age\n" if ($TEage ne "na");
    print STDERR " -> nonTE repeats will be filtered as -nonTE $nonTE (see usage or full documentation)\n";
    #options that are logged only for the transcript analysis
    if ($ft eq "gtf") {
        if ($fa ne "na") {
            print STDERR " - Genome file (fasta) = $fa\n";
            print STDERR " -> up and downstream regions will be analyzed\n";
            print STDERR " -> of lengths = $cut nt\n";
        } else {
            print STDERR " - No genome file (-fa option) provided\n";
            print STDERR " -> up and downstream regions won't be analyzed\n";
        }
        print STDERR " - $subtract will be subtracted from introns and intergenic regions\n" if ($subtract ne "na");
        #$selfsub is still "yes" unless the -noselfsub flag was given
        if ($selfsub eq "yes") {
            print STDERR " -> option -noselfsub not chosen => $INPUT will be subtracted from introns and surrounding regions as well\n";
        } else {
            print STDERR " -> option -noselfsub chosen => $INPUT won't be subtracted from introns and surrounding regions\n";
        }
        print STDERR " -> subtract ID = $subid (will be in file names)\n" if ($subtract ne "na");
    }
    print STDERR " -> BEDtools path = $bedtools\n" if ($bedtools ne "na");
    print STDERR " ------------------------------------------------------------------------------------------------\n";
}
#Get TE infos if relevant
#when a file is not provided, the hash gets a single "na" => "na" entry
my ($TEinfo,$TEinfoRMP,$TE_RMP);
if ($TE) {
    #NOTE(review): called in scalar context here, but in list context (2 values)
    #for -RMparsed below; if get_TEs_infos returns a 2 elements list, this gets
    #the last element - to be checked against get_TEs_infos
    $TEinfo = get_TEs_infos($TE,$v);
} else {
    $TEinfo->{"na"} = "na";
}
if ($RMPARSED) {
    ($TEinfoRMP,$TE_RMP) = get_TEs_infos($RMPARSED,$v);
} else {
    $TEinfoRMP->{"na"} = "na";
    $TE_RMP->{"na"} = "na";
}
#Then write TEs in bed format if needed => do some filtering out (nonTE stuff)
#($RMout is replaced by what RMtobed returns)
$RMout = RMtobed($RMout,$RMbase,$TEinfoRMP,$TEinfo,$nonTE,$v);
#Convert to bed the file to be subtracted unless it was given in bed format
$subtract = gtftobed($subtract,$v) if ($subtract ne "na");
#Get lengths from genome file if relevant
#declared on its own line: "my $x = ... if (...)" has undefined behavior in Perl;
#$lengths stays undef when -fa is not provided
my $lengths;
$lengths = get_lengths_noBio($fa,$v) if ($fa ne "na");
#Get what column will be what - variable depending of $ft value
my $set = set($ft,$myf,$v);
#extract all relevant info from the input file; if -f gtf => make Exon Structure file + get up/dw etc
#For each input file of @listin:
#  1) if -f gtf: load transcripts, print/load the Exon Structure file, get UTRs,
#     get up/downstream regions (only with a genome file) and subtract if requested
#     if bed: the file is joined as is
#  2) join the resulting files with the TEs (TE_join)
#  3) get the amounts, parse the joined files and print the summary
#####################################################
foreach my $in (@listin) {
	print "\n --- Dealing with $in " if (($v) && ($dir));
	my $addn = "$fname.$pname";
	my $listtojoin = ();
	my $listoffiles = ();
	#Base name = input file name without its annotation extension.
	#The two tests are kept separate on purpose: a single /(.*)\.gtf|gff|gff3$/ is parsed as
	#three alternatives, and only the first one captures => $name ended up undefined for gff files.
	my $name = $in;
	if ($in =~ /(.*)\.gtf/) {
		$name = $1;
	} elsif ($in =~ /(.*)\.gff3?$/) {
		$name = $1;
	}
	my $ExSt = "$name.ExSt.$addn.tab";
	my $TrInfos = ();
	my $feat = ();
	my $ExInfo = ();
	if ($ft eq "gtf") {
		my ($pc,$big);
		#Load input file and get all info + filtering value
		my $TrMatLen = ();
		$TrMatLen->{"na"} = "na"; #initialize a value to avoid messing with arguments for get_tr_infos
		$TrMatLen = get_TrMatLen($in,$v) if ($fname eq "intragenic"); #only possible if $myf ne "na" (option checked at the beginning of the script)
		($pc,$big,$TrInfos,$listtojoin) = get_tr_infos($in,$set,$name,$filter,$addcol,$parse,$addn,$TrMatLen,$v);
		#intron files are only listed if they were actually written
		push(@{$listoffiles},"$name.nc.$addn.introns") if (-e "$name.nc.$addn.introns.bed");
		push(@{$listoffiles},"$name.pc.$addn.introns") if (-e "$name.pc.$addn.introns.bed");
		push(@{$listoffiles},"$name.pc.$addn.CDS.introns") if (-e "$name.pc.$addn.CDS.introns.bed");
		#print/load exon structure file => -filter and -parse will be done there
		unless (-e $ExSt) {
			#BIG TABLE STRUCTURE: ($chr,$type,$feat,$currstart,$currend,$strand,$gene_id,$tr_id,$gene_name,$tr_name,$gene_type,$tr_type,$colstoadd);
			my @big = @{$big};
			@big = sort {
				# by chr
				($a -> [0] cmp $b -> [0])
				# by gene
				|| ($a -> [6] cmp $b -> [6])
				# by transcript
				|| ($a -> [7] cmp $b -> [7])
				# by coordinates
				|| ($a -> [3] <=> $b -> [3])
				|| ($a -> [4] <=> $b -> [4])
			} @big;
			#Now load the big table and print ExSt file
			($feat,$ExInfo,$TrInfos) = print_ExSt($ExSt,$TrInfos,\@big,$pc,$addcol,$v);
			#Done with @big => undef it since it's big, no need to use the memory
			undef @big;
		} else {
			print STDERR "\n --- Exon Structure file seems to exist\n" if ($v);
			($feat,$ExInfo,$TrInfos) = load_ExSt($ExSt,$TrInfos,$v);
		}
		#Get UTRs, only if there were coding stuff
		if (-e "$name.pc.$addn.CDS.bed") {
			get_UTRs($bedtools,$name,$pc,$TrInfos,$addn,$v);
			push(@{$listtojoin},("$name.pc.$addn.UTRs","$name.pc.$addn.5UTRs","$name.pc.$addn.3UTRs"));
		}
		#Get up and down regions, unless no genome file
		if ($fa ne "na") {
			my @cut = split(",",$cut);
			my $len = $cut[0]; #only the first value of -cut is given to get_up_dw
			get_up_dw($name,$len,$TrInfos,$lengths,$fa,$addn,$v); #unless ((-e "$name.nc.$len.up.bed") && (-e "$name.nc.$len.dw.bed"));
			push(@{$listoffiles},("$name.nc.$addn.up","$name.nc.$addn.dw"));
			push(@{$listoffiles},("$name.pc.$addn.up","$name.pc.$addn.dw")) if (-e "$name.pc.$addn.CDS.bed");
		}
		#Now subtract stuff if relevant
		if (($subtract ne "na") || ($selfsub eq "yes")) {
			$listtojoin = subtract($listtojoin,$listoffiles,$subtract,$subid,$name,$in,$selfsub,$bedtools,$TrInfos,$addn,$cut,$v);
		} else {
			push(@{$listtojoin},@{$listoffiles});
		}
	} else {
		#Bed file - just join it
		$name = $1 if ($in =~ /(.*)\.bed/);
		push(@{$listtojoin},$name);
	}
	#Join files with TEs [longest step]
	#####################################################
	my $listtoanalyse = TE_join($listtojoin,$RMout,$bedtools,$v); #first list that has the extension of the files in it
	#Parse the joined files with TEs
	#####################################################
	#number of lines of the input file, as reported by wc
	my $feat_nb = `wc -l $in`;
	$feat_nb =~ s/^\s*([0-9]+?)\s+.*$/$1/;
	$feat->{'all'}{'all'}{$in}=$feat_nb; #save number of features by file for bed ft
	my $totlen = get_amounts($listtojoin,$name,$addn,$TrInfos,$cut,$ft,$v); #using files before joining
	my $countTE = parse_join($listtoanalyse,$TrInfos,$ExInfo,$TEinfoRMP,$TE_RMP,$TEinfo,$TEage,$addcol,$cut,$totlen,$TEov,$ft,$v);
	#Summary file
	summary($in,$countTE,$feat,$addn,$ft,$v);
}
#30 subs later - Done!
print STDERR "\n --- Script for TE analysis is done\n" if ($v);
print STDERR " ------------------------------------------------------------------------------------------------\n\n" if ($v);
exit;
##########################################################################################################
# SUBROUTINES
##########################################################################################################
#----------------------------------------------------------------------------
# Delete the outputs generated by previous runs of this script, then exit
# clean_out($realins);
#   $realins = hash ref whose keys are the input file names
# For each key, the .gtf / .bed extension is removed and every
# "<base>.<pattern>" listed below is removed with rm -Rf
# (each pattern is reported on STDERR before being deleted).
# Never returns: the script exits when the cleaning is done.
#----------------------------------------------------------------------------
sub clean_out {
	my ($reals) = @_;
	#shell patterns of the files to delete, appended to "<base>."
	my @patterns = qw(
		*TEjoin.*
		*.AGE.tab
		*.CAT.tab
		*.CAT-class.tab
		*.TEs-ratios.tab
		*.amounts.txt
		*.concat.*.tab
		*_Summary.tab
		pc.*
		nc.*
		ExSt.*.tab
		nc-pc*
	);
	print STDERR "\n --- Deleting previous outputfiles (except RMout.bed and <fa>.fa.lengths):\n";
	for my $file (keys %{$reals}) {
		print STDERR " For inputfile = $file\n";
		#get the base name (same two steps, in the same order, for gtf and bed)
		my $base = $file;
		$base = $1 if ($base =~ /(.*)\.gtf/);
		$base = $1 if ($base =~ /(.*)\.bed/);
		for my $pattern (@patterns) {
			print STDERR " - $base.$pattern\n";
			`rm -Rf $base.$pattern`;
		}
		print STDERR "\n";
	}
	exit;
}
#----------------------------------------------------------------------------
# subroutine to print a sample file with the column details and exit
# myformat($myf) if ((! $in) && (! $RMout) && ($myf);
#   $myf = name of the configuration file to write (overwritten if it exists)
# The file lists, for each expected field, the 0-based column number to read
# it from, followed by a # comment.
# Never returns: the script exits once the file is written.
# Dies (confess) if no file name is given or if the file can't be written.
# called by main
#----------------------------------------------------------------------------
sub myformat {
	my $myf = shift;
	#3 arguments open would create an anonymous temporary file for an undefined name => check first
	confess "\n ERROR (sub myformat): no file name provided\n" unless (defined $myf && length $myf);
	#3 arguments open: the file name is taken literally, whatever characters it contains
	open(my $myf_fh, ">", $myf) or confess "\n ERROR (sub myformat): could not open to write $myf $!\n";
	print $myf_fh "###################################
# Configuration file for the script TE-analysis_gtf_pipeline_ak.pl
# Allow use of a custom input file -> set columns
###################################
# Numbers correspond to column numbers; note that first column = 0
# If -bed is chosen: type, feat, gene_id, gene_name and tr_name won't be considered, and tr_id will correspond to the unique ID.
# Spaces don't matter, they will be removed anyway
# Don't remove the # between numbers and comments
# If you run on a server, you can edit this file with emacs, vim, etc
###################################
chr = 0 #chromosome or scaffold name
type = 1 #havana, cufflink...
feat = 2 #feature, e.g exon, CDS, start,stop, mRNA or gene... Note that mRNA and gene lines will be ignored necause not required for the pipeline to run
start = 3 #start of the feature
end = 4 #end of the feature
strand = 5 #strand of the feature
gene_id = 6 #gene ID
gene_name = 6 #gene name; if none just use same as gene_id
transcript_id = 7 #transcript_ID (use this as the unique ID if -f bed is chosen)
transcript_name = 7 #transcript name; if none, just use same as tr_id
gene_type = 8 #gene biotype (e.g. protein_coding, lincRNA, snRNA, processed_transcript, etc) - required to split coding and non coding elements.
transcript_type = 8 #can be same as gene biotype
\n\n";
	#buffered write errors only show up when closing
	close ($myf_fh) or confess "\n ERROR (sub myformat): could not close $myf $!\n";
	exit;
}
#----------------------------------------------------------------------------
# get TE infos; for -TE or for -RMparsed
# ($TE)?($TEinfo = get_TEs_infos($TE,$v)):($TEinfo->{"na"} = "na");
# ($RMPARSED)?(($TEinfoRMP,$TE_RMP) = get_TEs_infos($RMPARSED,$v)):($TEinfoRMP->{"na"} = "na",$TE_RMP->{"na"} = "na");
#   $in = tabulated file; first 3 columns = repeat name, class, family
#         lines without word character, starting with "Rname" or with # are skipped
#   $v  = verbose (prints a progress line to STDERR when true)
# The format is decided on the FILE NAME:
#   - name matching /.parseRM.*.tab$/ => parseRM.pl output; returns the list (\%TEs,\%TE_RMP)
#       %TEs{lc(Rname)}    = [class, fam, class/fam, frg, nr_frg, avg%div, len_masked, %genome, len_overlap, %genome_overlap]
#       %TE_RMP{'l'}{..}   = len_masked - len_overlap
#       %TE_RMP{'p'}{..}   = %genome - %genome_overlap
#       %TE_RMP{'cnr'}{..} = nr_frg ; %TE_RMP{'ctot'}{..} = frg   (all summed per lower case Rname)
#   - any other name => returns \%TEs only, with %TEs{lc(Rname)} = the columns of the line, unchanged
# Exits if the % of genome column of a parseRM file contains "nd".
# Dies (confess) if $in can't be opened.
# called by main
#----------------------------------------------------------------------------
sub get_TEs_infos {
	my ($in,$v) = @_;
	print STDERR " --- Loading TE info from $in\n" if ($v);
	my %TEs = ();
	my %TE_RMP = ();
	#the type of file only depends on its name => check once
	my $is_rmp = ($in =~ /.parseRM.*.tab$/) ? 1 : 0;
	open(my $in_fh, "<", $in) or confess "\n ERROR (sub get_TEs_infos): could not open to read $in $!\n";
	my $i = 0;     #line counter, to look at the header (first line) only
	my $r = 0;     #column offset for the old parseRM.pl format when there is a Rlen column
	my $ifn = "y"; #"y" = columns of the new parseRM.pl, "n" = old parseRM.pl format
	LINE: while(<$in_fh>) {
		chomp (my $line = $_);
		#old format is recognized on its header (Rfullname + MED_LEN_MASKED columns)
		if ($i == 0 && $in =~ /.out.parseRM.*.tab$/ && $line =~ /Rfullname/ && $line =~ /MED_LEN_MASKED/) {
			$r = 1 if ($line =~ /Rlen/);
			$ifn = "n";
		}
		$i++;
		next LINE if ($line !~ /\w/ || substr($line,0,5) eq "Rname" || substr($line,0,1) eq "#");
		my @TEs = split('\t', $line);
		#Deal with name, class, fam:
		my ($Rn,$Rc,$Rf) = ($TEs[0],$TEs[1],$TEs[2]);
		#make sure no / in family (all of them, since / is the class/family separator)
		$Rf =~ s/\//_/g;
		#add the --int if not there
		$Rf = $Rf."--int" if ($Rf =~ /ERV/ && ($Rn =~ /[-_][iI]nt/ || $Rn =~ /[-_]I$/));
		my $Rcf = $Rc."/".$Rf;
		if ($is_rmp) {
			#Now the rest of the columns will differ:
			#columns in new parseRM.pl:
			# 3 4 5 6 7 8
			# Rlen FRG_NB_all FRG_NB_Reconstructed_repeats LEN_MASKED_NR AVG_%DIV MED_%DIV
			# 9 10 11 12 13 14
			# AVG_%DEL MED_%DEL AVG_%INS MED_%INS AVG_LEN_MASKED %_GENOME
			# 15 16 17
			# LEN_OVERLAP %_OVERLAP_(GENOME) %_OVERLAP_(LEN_MASKED)
			#columns in old parseRM.pl (Rlen not necessarily there):
			# 5 6 7 8 9
			# FRG_NB FRG_NB_StartToEnd NR_FRG_NB AVG_%DIV MED_%DIV
			# 10 11 12 13 14 15 16 17
			# AVG_%DEL MED_%DEL AVG_%INS MED_%INS LEN_MASKED AVG_LEN_MASKED MED_LEN_MASKED %_GENOME
			# 18 19 20
			# LEN_OVERLAP %_OVERLAP_(GENOME) %_OVERLAP_(LEN_MASKED)
			my ($frg,$frgnr,$ad,$len,$pgm,$leno,$pgmo);
			if ($ifn eq "y") {
				($frg,$frgnr,$ad) = ($TEs[4],$TEs[5],$TEs[7]);
				($len,$pgm) = ($TEs[6],$TEs[14]);
				#overlap columns are only used if both are there (and not 0)
				if ($TEs[15] && $TEs[16]) {
					($leno,$pgmo) = ($TEs[15],$TEs[16]);
				} else {
					($leno,$pgmo) = (0,0);
				}
			} else {
				($frg,$frgnr,$ad) = ($TEs[5+$r],$TEs[7+$r],$TEs[8+$r]);
				($len,$pgm) = ($TEs[14+$r],$TEs[17+$r]);
				if ($TEs[18+$r] && $TEs[19+$r]) {
					($leno,$pgmo) = ($TEs[18+$r],$TEs[19+$r]);
				} else {
					($leno,$pgmo) = (0,0);
				}
			}
			if ($pgm eq "nd") {
				print STDERR " /!\\ The % of the genome covered by each repeat is missing in $in\n";
				print STDERR " Please rerun parseRM.pl with the -f option\n";
				print STDERR " ... exiting\n\n";
				exit;
			}
			#edit the list
			#class famm class/fam frg nr_frg avg%div len_masked %genome_masked len_overlap %genome_overlap(for this repeat)
			@TEs = ($Rc,$Rf,$Rcf,$frg,$frgnr,$ad,$len,$pgm,$leno,$pgmo);
			#load TE_RMP hash
			$TE_RMP{'l'}{lc($Rn)}+=($len-$leno);
			$TE_RMP{'p'}{lc($Rn)}+=($pgm-$pgmo);
			$TE_RMP{'cnr'}{lc($Rn)}+=$frgnr;
			$TE_RMP{'ctot'}{lc($Rn)}+=$frg;
		}
		#now load TE list
		$TEs{lc($Rn)} = \@TEs; #??? The only thing I use from this is the %div?? Double check, and store that only if yes!
	}
	close $in_fh;
	if ($is_rmp) {
		return (\%TEs,\%TE_RMP);
	} else {
		return \%TEs;
	}
}
#----------------------------------------------------------------------------
# Convert RMoutput .out file to bed + filter nonTE + update class / fam if relevant
# ($RMout) = RMtobed($RMout,$RMbase,$TEinfoRMP,$TEinfo,$nonTE,$v);
#   $RMout     = Repeat Masker output file (.out)
#   $base      = if 0, 1 is added to the start coordinate
#   $TEinfoRMP = hash ref, lc(Rname) => [class, fam, class/fam, ...] ; {"na"} set if none provided
#   $TEinfo    = same structure; has priority over $TEinfoRMP
#   $nonTE     = none | no_nonTE | no_low (what to filter out, on the repeat class);
#                any other value => nothing is filtered
#   $v         = verbose
# Returns the name of the bed file:
#   <RMout without .out>.class.nonTE-$nonTE.bed if TE info was provided,
#   <RMout without .out>.nonTE-$nonTE.bed otherwise.
# The file is not rewritten if it already exists.
# Bed columns: chr, start, end, ID, ".", strand
#   with ID = all the columns of the .out line joined with ";" (class/fam possibly updated).
# Dies (confess) if a file can't be opened.
# called by main
#----------------------------------------------------------------------------
sub RMtobed {
	my ($RMout,$base,$TEinfoRMP,$TEinfo,$nonTE,$v) = @_;
	my $bed = $RMout;
	$bed =~ s/(.*)\.out$/$1/;
	if (! $TEinfo->{"na"} || ! $TEinfoRMP->{"na"}) {
		$bed = $bed.".class.nonTE-$nonTE.bed";
	} else {
		$bed = $bed.".nonTE-$nonTE.bed";
	}
	print STDERR " --- Converting $RMout to bed\n" if ($v);
	unless (-e $bed) {
		print STDERR " Filtering nonTE repeats based on -nonTE: $nonTE\n" if ($v);
		print STDERR " Updating TE class and family based on -TE\n" if ($v && ! $TEinfo->{"na"});
		print STDERR " Updating TE class and family (if relevant) based on -RMparsed\n" if (($v) && (! $TEinfoRMP->{"na"}) && ($TEinfo->{"na"}));
		print STDERR " Updating TE class and family (if relevant) based on 1) -TE and 2) -RMparsed\n" if (($v) && (! $TEinfo->{"na"}) && (! $TEinfoRMP->{"na"}));
		#3 arguments open: file names are taken literally
		open(my $fh, "<", $RMout) or confess "\n ERROR (sub RMtobed): could not open to read $RMout!\n";
		open(my $bed_fh, ">", $bed) or confess "\n ERROR (sub RMtobed): could not open to write $bed!\n";
		LINE: while(<$fh>) {
			chomp(my $l = $_);
			$l =~ s/^\s+//;
			#skip the header lines, comments and empty lines
			next LINE if (($l =~ /^[Ss]core|^SW|^#/) || ($l !~ /\w/));
			$l = $1 if ($l =~ /^(.*)\*$/); #remove the star
			my @l = split('\s+',$l);
			$l[8] =~ s/C/-/; #correct strand
			#now get the TE info hash, unless already defined for this element.
			#in older libraries there were repeats with same name but different classfam - well if it happens then the first occurence will be the one in the hash
			my ($Rclass,$Rfam,$Rclassfam) = get_Rclass_Rfam($l[10],$l[9]);
			my $rname = lc($l[9]);
			#update class and family if relevant (will have the additional --int)
			if ($TEinfo->{$rname}) {
				($Rclass,$Rfam,$Rclassfam) = ($TEinfo->{$rname}[0],$TEinfo->{$rname}[1],$TEinfo->{$rname}[2]);
			} elsif ($TEinfoRMP->{$rname}) { #use this only if not in $TEinfo
				($Rclass,$Rfam,$Rclassfam) = ($TEinfoRMP->{$rname}[0],$TEinfoRMP->{$rname}[1],$TEinfoRMP->{$rname}[2]);
			}
			#now filter non TE or not, based on -nonTE value
			next LINE if (($nonTE eq "none") && ($Rclass =~ /nonTE|snRNA|rRNA|tRNA|snoRNA|scRNA|srpRNA|[Ll]ow_complexity|[Ss]imple_repeat|[Ss]atellite|ARTEFACT/));
			next LINE if (($nonTE eq "no_nonTE") && ($Rclass =~ /nonTE/));
			next LINE if (($nonTE eq "no_low") && ($Rclass =~ /[Ll]ow_complexity|[Ss]imple_repeat/));
			#now create unique ID + Rclassfam will be updated (in cased changed)
			my ($chr,$start,$end,$strand) = ($l[4],$l[5],$l[6],$l[8]);
			#ID = columns 0 to 9, then class/fam instead of column 10, then the remaining columns
			my $ID = join(";", @l[0..9], $Rclassfam, @l[11..$#l]);
			#remove ALL spaces: should not need this since it would come from error in TEinfo input file for ex, but somebody could still have the issue
			$ID =~ s/\s//g;
			#correct the start if base is 0
			$start=$start+1 if ($base == 0);
			#now print
			# $chr =~ s/gi\|.+\|gb/gb/ if ($gb == 1); #For Rachel
			print $bed_fh "$chr\t$start\t$end\t$ID\t.\t$strand\n"; #with ID being the whole line => easy to acces to RMoutput
		}
		close ($fh);
		close ($bed_fh);
		print STDERR " => $bed\n" if ($v);
	} else {
		print STDERR " $bed exists, skipping (delete if -TE or -parsedRM were changed)\n" if ($v);
	}
	return ($bed);
}
#----------------------------------------------------------------------------
# Split the class/family column of a Repeat Masker output into class and family
# my ($Rclass,$Rfam,$Rclassfam) = get_Rclass_Rfam($classfam,$Rname);
#   $classfam = "class/family" ; or, without any /, a single word where what is
#               before the last dot is the family and what is after is the class
#               (no dot => class = family = the word)
#   $Rname    = repeat name, only used to flag the internal parts of ERVs
# Returns (class, family, "class/family"); "--int" is appended to the family
# when it contains ERV and the name looks like an internal part (-int, _int, -I, _I)
# called by RMtobed
#----------------------------------------------------------------------------
sub get_Rclass_Rfam {
	my ($classfam,$Rname) = @_;
	my $Rclass = $classfam;
	my $Rfam = $classfam;
	if ($classfam !~ /\//) {
		#no separator: family.class, or one word used for both
		$Rfam =~ s/^(.*)\..*$/$1/;
		$Rclass =~ s/^.*\.(.*)$/$1/;
	} else {
		#a third piece should not happen, but in case there was a / in family => glue it back with _
		my ($class,$fam,$extra) = split(/\//, $classfam);
		($Rclass,$Rfam) = ($class,$fam);
		$Rfam = $Rfam."_".$extra if ($extra);
	}
	my $is_internal = (($Rname =~ /[-_][iI]nt/) || ($Rname =~ /[-_]I$/));
	if (($Rfam =~ /ERV/) && $is_internal) {
		$Rfam = $Rfam."--int";
	}
	return ($Rclass,$Rfam,"$Rclass/$Rfam");
}
#----------------------------------------------------------------------------
# Convert gtf file to bed
# gtftobed($subtract,$v) if (($subtract) && ($subtract !~ /.*\.bed$/));
#   $gtf = file to convert; returned unchanged if its name contains .bed
#   $v   = verbose
# Returns the name of the bed file (<name before .gtf>.bed).
# The file is not rewritten if it already exists.
# Only the exon lines are converted; lines starting with # or MT are skipped,
# as well as exons for which end - start < 1.
# Bed columns: chr, start, end, <transcript_id>#chr:start-end, ".", strand
# Dies (confess) if the name has no .gtf / .bed in it, or if a file can't be opened.
# called by main
#----------------------------------------------------------------------------
sub gtftobed {
	my ($gtf,$v) = @_;
	my $bed;
	if ($gtf =~ /(.*)\.gtf/) {
		$bed = $1.".bed";
	} elsif ($gtf =~ /.*\.bed/){
		#already in bed format => nothing to do
		$bed = $gtf;
		return ($bed);
	} else {
		#no output name can be built; needs to be checked here, because a
		#3 arguments open on an undefined name would silently create a temporary file
		confess "\n ERROR (sub gtftobed): $gtf is not a .gtf or a .bed file!\n";
	}
	print STDERR "\n --- Converting $gtf to bed\n" if ($v);
	unless (-e $bed) {
		open(my $gtf_fh, "<", $gtf) or confess "\n ERROR (sub gtftobed): could not open to read $gtf!\n";
		open(my $bed_fh, ">", $bed) or confess "\n ERROR (sub gtftobed): could not open to write $bed!\n";
		LINE: while(<$gtf_fh>) {
			chomp(my $line = $_);
			#remove the spaces inside the attributes (before a " and after a ";)
			#so that the whole attribute column stays together when splitting on spaces
			$line =~ s/\s+"/"/g;
			$line =~ s/";\s+/";/g;
			next LINE if (substr($line,0,1) eq "#");
			next LINE if (substr($line,0,2) eq "MT"); #get rid of mitochondrial DNA
			my @line = split (/\s+/,$line);
			my ($chr,$feat,$start,$end,$strand) = ($line[0],$line[2],$line[3],$line[4],$line[6]);
			next LINE unless ($feat eq "exon");
			#correct for the fact that there are only numbers for chromosomes in ensembl gtf (not gencode for ex)
			$chr ="chr".$chr if ($chr =~ /^\d+$/);
			#Get strand +/- (Note some stuff might be a * => no strand info, e.g. single exons lncRNAs...)
			$strand = "+" if ($strand eq "1");
			$strand = "-" if ($strand eq "-1");
			#now, get the transcript id
			#ALL attributes are checked, including the last one (transcript_id can be the last)
			my ($tr_id);
			my @info = split(';',$line[8]);
			foreach my $attr (@info) {
				my ($id,$value);
				#attribute is either id"value" or id value
				if ($attr =~ /"/) {
					($id,$value) = split(/"/,$attr);
				} else {
					($id,$value) = split(/\s+/,$attr);
				}
				$tr_id = $value if (defined $id && $id eq "transcript_id");
			}
			my $ID = $tr_id."#".$chr.":".$start."-".$end;
			print $bed_fh "$chr\t$start\t$end\t$ID\t.\t$strand\n" unless ($end - $start <1);
		}
		close ($gtf_fh);
		close ($bed_fh);
	} else {
		print STDERR " $bed exists, skipping\n" if ($v);
	}
	return ($bed);
}
#----------------------------------------------------------------------------
# Get lengths of all sequences and store that by sequence ID. Note that if some are not unique, it just replaces by last length.
# This sub does not use BioPerl - avoid having to index the genome
# my $lengths = get_lengths_noBio($fa,$v) if ($fa);
#----------------------------------------------------------------------------
sub get_lengths_noBio {
my ($fa,$v) = @_;
print STDERR "\n --- Getting sequences lengths for $fa\n" if ($v);
my %len = ();
my $lengthfile = "$fa.lengths";
if (-e $lengthfile) {
print STDERR " -> lengths have been previously calculated ($lengthfile exists) => extracting\n" if ($v);
#extract lengths now
open (my $lengths_fh, "<", $lengthfile) or confess " \nERROR (sub get_lengths): could not open $lengthfile $!\n";
while (<$lengths_fh>) {
chomp (my $line = $_);
my ($id,$len) = split(/\s+/,$line);
$len{$id}=$len;
}
close ($lengths_fh);
} else {
#looping through fasta file
my $id = "";
my $l = 0;
my $c = 0;
open (my $fa_fh, "<", $fa) or confess " \nERROR (sub get_lengths): could not open $fa $!\n";
open (my $len_fh, ">", $lengthfile) or warn " \nERROR (sub get_lengths): could not create $lengthfile, but lengths will be calculated $!\n";
while (<$fa_fh>) {
chomp (my $line = $_);
if (substr($line,0,1) eq ">") {
#first get and print unless first header
unless ($c == 0) {
print $len_fh "$id\t$l\n";
$len{$id}=$l;
}
$c=1;
#store header and reinitialize length
my @id = split (/\s+/,$line);
$id = $id[0];
$id =~ s/>//;
$l = 0;
} else {
#get length; could be more than one line so increment
$l+=length($line);
}
}
#get and print len last sequence