ExtractSCOs.sh

#!/bin/bash
# This is a bash script that extracts all ortholog groups that have a single-copy gene in every species.
# It takes the output generated by "CopyNumberGen.sh" and prints the single-copy orthologs.
# By default it prints to stdout.

# Arun Seetharam <aseetharam@purdue.edu>

scriptName="${0##*/}"
declare -i DEFAULT_COPY=1
declare -i copynum=DEFAULT_COPY

function printUsage() {
    cat <<EOF

Synopsis

    $scriptName [-h | --help] [-c number] input_file

Description

    Reads the count table and prints only lines having one copy gene in every genome.
    Output is printed to STDOUT
    
    input_file 
        Input file should contain count table and ortholog group IDs
        This file has to be generated by "CopyNumberGen.sh" script.
    
    -c number
        By default, all orthologs groups that have single copy gene are printed, but you can specify
        desired number of genes per ortholog group. Using 2 will print all ortholog groups having 
        two copy genes per genome.

    -h, --help 
        Brings up this help page
 
Author

    Arun Seetharam, Bioinformatics Core, Purdue University.
    aseetharam@purdue.edu


EOF
}

# Guard: with no command-line arguments there is nothing to process, so show
# the help page and stop with a failure status.
(( $# >= 1 )) || {
    printUsage
    exit 1
}

# Parse the command-line options of this script.
#
# Recognised options:
#   -c number     store NUMBER (a non-negative decimal integer) in "copynum"
#   -h, --help    print the help page via printUsage and exit with status 0
#
# Globals:   copynum (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   diagnostics on stderr for invalid options
# Returns:   0 when parsing succeeds; exits the script with status 1 on an
#            unknown option, a missing option argument or a bad -c value.
#
# The positional parameters of the caller are left untouched, so the input
# file can still be taken from the end of the argument list afterwards.
parseOptions() {
  # A local OPTIND makes every call start scanning at the first argument.
  local OPTIND=1 option

  # Leading ':' selects silent error reporting (we print our own messages).
  # '-:' makes getopts return "--name" as option '-' with OPTARG "name",
  # which is how the long option --help is recognised.
  while getopts ':c:h-:' option; do
    case "$option" in
      c)
        # Validate before assigning: copynum is an integer variable, so a
        # non-numeric value would otherwise be evaluated as an expression.
        if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
          printf '%s: option -c expects a non-negative integer, got "%s"\n' \
            "${0##*/}" "$OPTARG" >&2
          exit 1
        fi
        # Force base 10 so that values such as "08" are not read as octal.
        copynum=$((10#$OPTARG))
        ;;
      h)
        printUsage
        exit 0
        ;;
      -)
        if [[ $OPTARG == help ]]; then
          printUsage
          exit 0
        fi
        printf '%s: unknown option --%s (try -h for help)\n' \
          "${0##*/}" "$OPTARG" >&2
        exit 1
        ;;
      :)
        printf '%s: option -%s requires an argument (try -h for help)\n' \
          "${0##*/}" "$OPTARG" >&2
        exit 1
        ;;
      \?)
        printf '%s: unknown option -%s (try -h for help)\n' \
          "${0##*/}" "$OPTARG" >&2
        exit 1
        ;;
    esac
  done
}

parseOptions "$@"
# Print the header of a count table followed by every ortholog group whose
# gene count is the same in all genomes and equal to the requested number.
#
# Arguments:
#   $1 - path of the count table; the first line is treated as the header,
#        every other line as "group_id count count ..." separated by blanks
#        or tabs
#   $2 - copy number every count column has to match
# Outputs:
#   the header line unchanged, then the matching rows with their fields
#   joined by tabs, on stdout; a diagnostic on stderr if the file is unusable
# Returns:
#   1 when the table is missing, a directory or unreadable; otherwise the
#   exit status of awk
extractSCOs() {
  local table=$1
  local wanted=$2

  if [[ -z $table || -d $table || ! -r $table ]]; then
    printf '%s: cannot read input file "%s"\n' "${0##*/}" "$table" >&2
    return 1
  fi

  # One awk pass replaces the per-line echo/cut/tr/uniq/wc/sed/grep chain.
  # The table is fed through stdin so that an unusual file name can never be
  # mistaken for an awk option or a var=value assignment.
  awk -v wanted="$wanted" '
    BEGIN {
      OFS = "\t"
      wanted = wanted ""          # force string comparison below
    }
    NR == 1 { print; next }       # header: printed verbatim, never filtered
    NF < 2  { next }              # no count columns at all
    {
      first = $2 ""
      # Compare the count column itself, so a number that merely appears
      # inside the group ID (e.g. "OG-1") can no longer select the row.
      if (first != wanted) next
      for (i = 3; i <= NF; i++)
        if (($i "") != first) next
      $1 = $1                     # rebuild the record with tab separators
      print
    }
  ' < "$table"
}

# The input table is the last command-line argument.
if (( $# > 1 )); then
  shift $(( $# - 1 ))
fi
FILE="${1-}"
extractSCOs "$FILE" "${copynum:-1}"