Skip to content

Commit

Permalink
compare_masking.pl, merge_masking.pl : scripts to assess and merge th…
Browse files Browse the repository at this point in the history
…e (soft-)masking for repeats that is produced by different pipelines, e.g. RepeatScout and CARP.
  • Loading branch information
KatharinaHoff committed Sep 9, 2019
1 parent 2c6223c commit 40beddc
Show file tree
Hide file tree
Showing 2 changed files with 172 additions and 0 deletions.
88 changes: 88 additions & 0 deletions scripts/compare_masking.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/perl

# Katharina J. Hoff
# November 20th 2018


use strict;
use warnings;
use Getopt::Long;

my $usage = << 'ENDUSAGE';
compare_masking compare the repeat masking content of differently masked (same) assemblies
SYNOPSIS
compare_masking file1.fa file2.fa
file1.fa softmasked fasta file
file2.fa softmasked second fasta file
OPTIONS
--help output this help message
WARNING: This script keeps two assemblies in memory, i.e. it is not suitable for large genomes!
ENDUSAGE

# read_fasta($path)
#
# Reads a FASTA file and returns a hash reference that maps every complete
# header line (including the leading '>') to the concatenated sequence lines
# that follow it. If a header occurs more than once, the last record wins.
# Dies if the file cannot be opened or closed, or if sequence data shows up
# before the first header line.
sub read_fasta {
    my ($path) = @_;

    my %seq_of;
    my $header;
    open(my $fh, "<", $path) or die("Could not open file $path!\n");
    while (my $line = <$fh>) {
        chomp($line);
        $line =~ s/\r\z//;    # tolerate files with Windows line endings
        next if $line eq "";
        if ($line =~ m/^>/) {
            $header = $line;
            $seq_of{$header} = "";
        }
        elsif (defined $header) {
            $seq_of{$header} .= $line;
        }
        else {
            # Without a header there is no record this line could belong to.
            die("Sequence data before the first header line in file $path!\n");
        }
    }
    close($fh) or die("Could not close file $path!\n");

    return \%seq_of;
}

# count_masking($seq1, $seq2)
#
# Compares two sequences position by position and returns the list
# ($both, $only1, $only2): the number of positions that are lowercase
# (softmasked) in both sequences, only in $seq1, and only in $seq2.
# Only positions that exist in $seq1 are inspected; positions that are
# missing in a shorter $seq2 count as not masked.
sub count_masking {
    my ($seq1, $seq2) = @_;

    my ($both, $only1, $only2) = (0, 0, 0);
    my $len2 = length($seq2);
    for my $pos (0 .. length($seq1) - 1) {
        my $masked1 = substr($seq1, $pos, 1) =~ m/\p{Lowercase}/;
        my $masked2 = $pos < $len2 && substr($seq2, $pos, 1) =~ m/\p{Lowercase}/;
        if ($masked1 and $masked2) {
            $both++;
        }
        elsif ($masked1) {
            $only1++;
        }
        elsif ($masked2) {
            $only2++;
        }
    }

    return ($both, $only1, $only2);
}

# main()
#
# Parses the command line, loads both assemblies and prints the three
# masking counts to STDOUT. Sequences are paired by their header line;
# sequences that only occur in the second file are not counted.
sub main {
    my $help;
    GetOptions('help' => \$help);

    if ($help) {
        print $usage;
        exit(1);    # exit status unchanged from the original script
    }

    # Fail early with the usage text instead of trying to open an
    # undefined file name.
    if (@ARGV < 2) {
        die("Error: two softmasked fasta files are required!\n" . $usage);
    }
    my ($file1, $file2) = @ARGV;

    my $masking1 = read_fasta($file1);
    my $masking2 = read_fasta($file2);

    my $only1 = 0;
    my $only2 = 0;
    my $both  = 0;
    for my $header (sort keys %{$masking1}) {
        my $seq1 = $masking1->{$header};
        my $seq2 = $masking2->{$header};
        if (!defined $seq2) {
            warn("Warning: sequence '$header' is missing in $file2, "
                . "it is treated as unmasked!\n");
            $seq2 = "";
        }
        elsif (length($seq1) != length($seq2)) {
            warn("Warning: sequence '$header' differs in length between "
                . "$file1 and $file2!\n");
        }
        my ($n_both, $n_only1, $n_only2) = count_masking($seq1, $seq2);
        $both  += $n_both;
        $only1 += $n_only1;
        $only2 += $n_only2;
    }

    print("Masked in both files: $both\n");
    print("Masked in File1, only: $only1\n");
    print("Masked in File2, only: $only2\n");
    return;
}

# Run main() only when this file is executed as a script, so the helper
# subroutines can be loaded with require without side effects.
main() unless caller();
84 changes: 84 additions & 0 deletions scripts/merge_masking.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/perl

# Katharina J. Hoff
# November 20th 2018

use strict;
use warnings;
use Getopt::Long;

my $usage = << 'ENDUSAGE';
merge_masking.pl take two softmasked (same) genome assemblies and mask all bases that are
masked in one of the two files (softmasking).
SYNOPSIS
merge_masking.pl file1.fa file2.fa file3.fa
file1.fa softmasked fasta file
file2.fa softmasked second fasta file
file3.fa output softmasked third fasta file
OPTIONS
--help output this help message
WARNING: this script keeps two assemblies in memory, i.e. it is not suitable for large genomes!
ENDUSAGE

# read_fasta($path)
#
# Reads a FASTA file and returns the list (\@headers, \%seq_of):
#   \@headers  the complete header lines (including the leading '>') in the
#              order of their first appearance in the file
#   \%seq_of   maps each header line to the concatenated sequence lines that
#              follow it; if a header occurs more than once, the last record
#              wins
# Dies if the file cannot be opened or closed, or if sequence data shows up
# before the first header line.
sub read_fasta {
    my ($path) = @_;

    my @headers;
    my %seq_of;
    my $header;
    open(my $fh, "<", $path) or die("Could not open file $path!\n");
    while (my $line = <$fh>) {
        chomp($line);
        $line =~ s/\r\z//;    # tolerate files with Windows line endings
        next if $line eq "";
        if ($line =~ m/^>/) {
            $header = $line;
            push(@headers, $header) unless exists $seq_of{$header};
            $seq_of{$header} = "";
        }
        elsif (defined $header) {
            $seq_of{$header} .= $line;
        }
        else {
            # Without a header there is no record this line could belong to.
            die("Sequence data before the first header line in file $path!\n");
        }
    }
    close($fh) or die("Could not close file $path!\n");

    return (\@headers, \%seq_of);
}

# merge_masking($seq1, $seq2)
#
# Returns a copy of $seq1 in which every position that is lowercase
# (softmasked) in $seq2 has been lowercased as well. Positions that are
# already lowercase in $seq1 stay as they are. The result always has the
# length of $seq1; a longer $seq2 is ignored beyond that length and a
# shorter $seq2 contributes no masking for the missing positions.
sub merge_masking {
    my ($seq1, $seq2) = @_;

    my $merged  = $seq1;
    my $max_len = length($merged);
    # Walk over the masked runs of $seq2 and lowercase the matching span of
    # the result in one step each, instead of handling single bases.
    while ($seq2 =~ m/\p{Lowercase}+/g) {
        my ($start, $end) = ($-[0], $+[0]);
        last if $start >= $max_len;
        $end = $max_len if $end > $max_len;
        my $len = $end - $start;
        substr($merged, $start, $len, lc(substr($merged, $start, $len)));
    }

    return $merged;
}

# main()
#
# Parses the command line, loads both assemblies and writes the merged
# softmasking to the output file. Records are written in the order in which
# they appear in the first input file, each sequence on a single line.
# Sequences are paired by their header line; sequences that only occur in the
# second file are not written.
sub main {
    my $help;
    GetOptions('help' => \$help);

    if ($help) {
        print $usage;
        exit(1);    # exit status unchanged from the original script
    }

    # Fail early with the usage text instead of trying to open an
    # undefined file name.
    if (@ARGV < 3) {
        die("Error: two input fasta files and one output file are required!\n"
            . $usage);
    }
    my ($file1, $file2, $file3) = @ARGV;

    my ($headers1, $masking1) = read_fasta($file1);
    my (undef,     $masking2) = read_fasta($file2);

    open(my $out, ">", $file3) or die("Could not open file $file3!\n");
    for my $header (@{$headers1}) {
        my $seq1 = $masking1->{$header};
        my $seq2 = $masking2->{$header};
        if (!defined $seq2) {
            warn("Warning: sequence '$header' is missing in $file2, "
                . "masking of $file1 is kept unchanged!\n");
            $seq2 = "";
        }
        elsif (length($seq1) != length($seq2)) {
            warn("Warning: sequence '$header' differs in length between "
                . "$file1 and $file2!\n");
        }
        print {$out} $header . "\n";
        print {$out} merge_masking($seq1, $seq2) . "\n";
    }
    close($out) or die("Could not close file $file3!\n");
    return;
}

# Run main() only when this file is executed as a script, so the helper
# subroutines can be loaded with require without side effects.
main() unless caller();

0 comments on commit 40beddc

Please sign in to comment.