-
Notifications
You must be signed in to change notification settings - Fork 112
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
compare_masking.pl, merge_masking.pl: scripts to assess and merge the (soft-)masking for repeats that is produced by different pipelines, e.g. RepeatScout and CARP.
- Loading branch information
1 parent
2c6223c
commit 40beddc
Showing
2 changed files
with
172 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#!/usr/bin/perl

# Katharina J. Hoff
# November 20th 2018
#
# compare_masking.pl
#
# Compares the soft-masking (lowercase bases) of two FASTA files that hold
# the same assembly and reports how many positions are masked in both
# files, only in the first file, or only in the second file.
#
# Sequences are paired by their complete header line. Positions or whole
# sequences that exist in only one of the files are treated as "not masked"
# in the other file, and a warning is written to STDERR. The three result
# lines are the only output on STDOUT.

use strict;
use warnings;
use Getopt::Long;

my $usage = <<'ENDUSAGE';
compare_masking compare the repeat masking content of differently masked (same) assemblies
SYNOPSIS
compare_masking file1.fa file2.fa
file1.fa softmasked fasta file
file2.fa softmasked second fasta file
OPTIONS
--help output this help message
WARNING: This script keeps two assemblies in memory, i.e. it is not suitable for large genomes!
ENDUSAGE

my ($help);

GetOptions('help' => \$help);

if ($help) {
    print $usage;
    exit(1);
}

# Both input files are mandatory; fail early with the usage text instead of
# running into "uninitialized value" warnings inside open().
if (@ARGV < 2) {
    print STDERR $usage;
    exit(1);
}

my ($file1, $file2) = @ARGV;
my ($order1, $masking1) = read_fasta($file1);
my ($order2, $masking2) = read_fasta($file2);

my $only1 = 0;
my $only2 = 0;
my $both  = 0;

# Walk the sequences of file 1 in file order so that warnings are emitted
# deterministically.
foreach my $header (@{$order1}) {
    my $mask1 = lowercase_mask($masking1->{$header});
    my $mask2 = '';
    if (exists $masking2->{$header}) {
        $mask2 = lowercase_mask($masking2->{$header});
        if (length($mask1) != length($mask2)) {
            warn("Sequence '$header' has different lengths in $file1 and $file2!\n");
        }
    }
    else {
        warn("Sequence '$header' was not found in $file2!\n");
    }

    # String-wise AND of the two masks leaves \x01 exactly where both
    # sequences are masked; the result is as long as the shorter mask, so
    # overhanging positions can never count as "both".
    my $shared = $mask1 & $mask2;
    my $shared_count = ($shared =~ tr/\x01//);
    $both  += $shared_count;
    $only1 += ($mask1 =~ tr/\x01//) - $shared_count;
    $only2 += ($mask2 =~ tr/\x01//) - $shared_count;
}

# Sequences that occur only in file 2 can only contribute to $only2.
foreach my $header (@{$order2}) {
    next if exists $masking1->{$header};
    warn("Sequence '$header' was not found in $file1!\n");
    my $mask2 = lowercase_mask($masking2->{$header});
    $only2 += ($mask2 =~ tr/\x01//);
}

print("Masked in both files: $both\n");
print("Masked in File1, only: $only1\n");
print("Masked in File2, only: $only2\n");

# Read a FASTA file completely into memory.
#
# Parameter: $file - path of the FASTA file.
# Returns:   a list of two references:
#              - array of header lines in order of first appearance,
#              - hash mapping each header line (including the leading '>')
#                to its concatenated sequence. If a header occurs more
#                than once, the last record wins.
# Dies if the file cannot be opened or closed, or if sequence data appears
# before the first header line.
sub read_fasta {
    my ($file) = @_;
    my @order;
    my %sequence_of;
    my $header;

    open(my $fh, "<", $file) or die("Could not open file $file: $!\n");
    while (my $line = <$fh>) {
        chomp($line);
        if ($line =~ m/^>/) {
            push(@order, $line) unless exists $sequence_of{$line};
            $sequence_of{$line} = "";
            $header = $line;
        }
        elsif (defined $header) {
            $sequence_of{$header} .= $line;
        }
        elsif ($line =~ m/\S/) {
            # Blank lines before the first header are tolerated, data is not.
            die("File $file contains sequence data before the first header line!\n");
        }
    }
    close($fh) or die("Could not close file $file!\n");

    return (\@order, \%sequence_of);
}

# Turn a sequence into a mask string of the same length that holds \x01 at
# every lowercase (soft-masked) position and \x00 everywhere else. Working
# on mask strings avoids building one Perl scalar per base.
#
# Parameter: $sequence - sequence string.
# Returns:   the mask string.
sub lowercase_mask {
    my ($sequence) = @_;
    my $mask = $sequence;
    # Order matters: \x00 itself is not lowercase, so the second
    # substitution only touches the original lowercase characters.
    $mask =~ s/\P{Lowercase}/\x00/g;
    $mask =~ s/\p{Lowercase}/\x01/g;
    return $mask;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/usr/bin/perl

# Katharina J. Hoff
# November 20th 2018
#
# merge_masking.pl
#
# Takes two soft-masked FASTA files of the same assembly and writes a third
# FASTA file in which a base is lowercase if it is lowercase in at least one
# of the two inputs.
#
# Sequences are paired by their complete header line. The output contains
# the sequences of the first file, in the order of the first file, each
# sequence on a single line. Sequences missing from the second file are
# written with the masking of the first file only; sequences that exist
# only in the second file are not written. Both cases are reported on
# STDERR.

use strict;
use warnings;
use Getopt::Long;

my $usage = <<'ENDUSAGE';
merge_masking.pl take two softmasked (same) genome assemblies and mask all bases that are
masked in one of the two files (softmasking).
SYNOPSIS
merge_masking.pl file1.fa file2.fa file3.fa
file1.fa softmasked fasta file
file2.fa softmasked second fasta file
file3.fa output softmasked third fasta file
OPTIONS
--help output this help message
WARNING: this script keeps two assemblies in memory, i.e. it is not suitable for large genomes!
ENDUSAGE

my ($help);

GetOptions('help' => \$help);

if ($help) {
    print $usage;
    exit(1);
}

# Two inputs and one output file are mandatory; fail early with the usage
# text instead of running into "uninitialized value" warnings inside open().
if (@ARGV < 3) {
    print STDERR $usage;
    exit(1);
}

my ($file1, $file2, $file3) = @ARGV;
my ($order1, $masking1) = read_fasta($file1);
my ($order2, $masking2) = read_fasta($file2);

foreach my $header (@{$order2}) {
    next if exists $masking1->{$header};
    warn("Sequence '$header' was not found in $file1 and is not written to $file3!\n");
}

open(my $out, ">", $file3) or die("Could not open file $file3: $!\n");
# Iterate in the order of file 1 so that the output is reproducible and
# keeps the sequence order of the input assembly.
foreach my $header (@{$order1}) {
    my $merged = $masking1->{$header};
    if (exists $masking2->{$header}) {
        my $other = $masking2->{$header};
        if (length($merged) != length($other)) {
            warn("Sequence '$header' has different lengths in $file1 and $file2!\n");
        }
        apply_masking(\$merged, $other);
    }
    else {
        warn("Sequence '$header' was not found in $file2!\n");
    }
    print {$out} $header, "\n", $merged, "\n"
        or die("Could not write to file $file3: $!\n");
}
close($out) or die("Could not close file $file3!\n");

# Read a FASTA file completely into memory.
#
# Parameter: $file - path of the FASTA file.
# Returns:   a list of two references:
#              - array of header lines in order of first appearance,
#              - hash mapping each header line (including the leading '>')
#                to its concatenated sequence. If a header occurs more
#                than once, the last record wins.
# Dies if the file cannot be opened or closed, or if sequence data appears
# before the first header line.
sub read_fasta {
    my ($file) = @_;
    my @order;
    my %sequence_of;
    my $header;

    open(my $fh, "<", $file) or die("Could not open file $file: $!\n");
    while (my $line = <$fh>) {
        chomp($line);
        if ($line =~ m/^>/) {
            push(@order, $line) unless exists $sequence_of{$line};
            $sequence_of{$line} = "";
            $header = $line;
        }
        elsif (defined $header) {
            $sequence_of{$header} .= $line;
        }
        elsif ($line =~ m/\S/) {
            # Blank lines before the first header are tolerated, data is not.
            die("File $file contains sequence data before the first header line!\n");
        }
    }
    close($fh) or die("Could not close file $file!\n");

    return (\@order, \%sequence_of);
}

# Lowercase every position of the target sequence at which the mask
# sequence is lowercase. Positions that are already lowercase in the target
# stay as they are; positions of the mask sequence beyond the end of the
# target are ignored.
#
# Parameters: $target_ref - reference to the sequence string to modify
#             $mask_seq   - sequence whose lowercase runs are applied
# Returns:    nothing; the target is modified in place.
sub apply_masking {
    my ($target_ref, $mask_seq) = @_;
    my $limit = length(${$target_ref});

    # Work on whole runs of lowercase characters instead of single bases.
    while ($mask_seq =~ m/(\p{Lowercase}+)/g) {
        my $start = $-[1];
        last if $start >= $limit;
        my $run = length($1);
        $run = $limit - $start if $start + $run > $limit;
        substr(${$target_ref}, $start, $run,
            lc(substr(${$target_ref}, $start, $run)));
    }
    return;
}