Skip to content

Commit

Permalink
Merge pull request #346 from pangenome/pansn
Browse files Browse the repository at this point in the history
Minimize the minimal `pggb` command line
  • Loading branch information
AndreaGuarracino authored Oct 26, 2023
2 parents 1213e2e + dae26d6 commit 44ab2a0
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 99 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build_and_test_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ jobs:
- name: Build the Docker image
run: docker build . --file Dockerfile --target binary --tag pggb
- name: Run a test on the DRB1-3123 dataset (SPOA)
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -p 70 -s 3000 -G 800,900,1100 -n 10 -t 2 -Z -V 'gi|568815561,gi|29124352:10000' -M -m -o drib1_spoa && ls drib1_spoa/* && head drib1_spoa/*.log -n 63"
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -p 70 -s 3000 -G 800,900,1100 -n 10 -t 2 -Z -M -m -o drib1_spoa --no-pansn && ls drib1_spoa/* && head drib1_spoa/*.log -n 63"
- name: Run a test on the DRB1-3123 dataset (abPOA)
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -p 70 -s 3000 -G 800,900,1100 -n 10 -t 2 -Z -V 'gi|568815561,gi|29124352:10000' -M -m -b -o drib1_abpoa && ls drib1_abpoa/* && head drib1_abpoa/*.log -n 63"
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -p 70 -s 3000 -G 800,900,1100 -n 10 -t 2 -Z -M -m -b -o drib1_abpoa --no-pansn && ls drib1_abpoa/* && head drib1_abpoa/*.log -n 63"
- name: Run a test on the DRB1-3123 dataset (paf)
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -a data/paf/DRB1-3123.fa.15a1009.wfmash.paf -p 70 -s 3000 -G 2000 -n 10 -t 2 -Z -V 'gi|568815561,gi|29124352' -M -m -o drib1_paf && ls drib1_paf/* && head drib1_paf/*.log -n 63"
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -a data/paf/DRB1-3123.fa.15a1009.wfmash.paf -p 70 -s 3000 -G 2000 -n 10 -t 2 -Z -M -m -o drib1_paf --no-pansn && ls drib1_paf/* && head drib1_paf/*.log -n 63"
- name: Run a test on the LPA dataset (SPOA global)
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/LPA/LPA.fa.gz -p 95 -s 20000 -G 800,900 -n 90 -k 79 -t 2 -Z -O 0.001 -m -z -o lpa && ls lpa/* && head lpa/*.log -n 63"
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/LPA/LPA.fa.gz -p 95 -s 20000 -G 800,900 -k 79 -t 2 -Z -O 0.001 -m -z -o lpa -V 'chm13,chm1:1000' && ls lpa/* && head lpa/*.log -n 63"
- name: Run a test for the gfa2evaluation script on a mini HPRC chrMT dataset
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "gfa2evaluation.sh data/chrM.pan.4.gfa chm13 data/test_eval 2"
Binary file modified data/LPA/LPA.fa.gz
Binary file not shown.
28 changes: 14 additions & 14 deletions data/LPA/LPA.fa.gz.fai
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
chm13__LPA__tig00000001 330243 25 60 61
chm1__LPA__tig00000003 227072 335797 60 61
HG002__LPA__tig00000001 329347 566679 60 61
HG002__LPA__tig00000005 274138 901541 60 61
HG00733__LPA__tig00000001 295824 1180275 60 61
HG00733__LPA__tig00000008 283680 1481057 60 61
HG01358__LPA__tig00000002 337324 1769492 60 61
HG01358__LPA__tig00000010 240282 2112466 60 61
HG02572__LPA__tig00000005 311357 2356780 60 61
HG02572__LPA__tig00000001 309189 2673354 60 61
NA19239__LPA__tig00000002 264003 2987724 60 61
NA19239__LPA__tig00000006 236974 3256155 60 61
NA19240__LPA__tig00000001 285146 3497106 60 61
NA19240__LPA__tig00000012 260090 3787032 60 61
chm13#0#tig00000001 330243 21 60 61
chm1#0#tig00000003 227072 335789 60 61
HG002#0#tig00000001 329347 566667 60 61
HG002#1#tig00000005 274138 901525 60 61
HG00733#0#tig00000001 295824 1180255 60 61
HG00733#1#tig00000008 283680 1481033 60 61
HG01358#0#tig00000002 337324 1769464 60 61
HG01358#1#tig00000010 240282 2112434 60 61
HG02572#0#tig00000005 311357 2356744 60 61
HG02572#1#tig00000001 309189 2673314 60 61
NA19239#0#tig00000002 264003 2987680 60 61
NA19239#1#tig00000006 236974 3256107 60 61
NA19240#0#tig00000001 285146 3497054 60 61
NA19240#1#tig00000012 260090 3786976 60 61
Binary file modified data/LPA/LPA.fa.gz.gzi
Binary file not shown.
28 changes: 14 additions & 14 deletions data/LPA/LPA.sample.list
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
chm13__LPA__tig00000001
chm1__LPA__tig00000003
HG002__LPA__tig00000001
HG002__LPA__tig00000005
HG00733__LPA__tig00000001
HG00733__LPA__tig00000008
HG01358__LPA__tig00000002
HG01358__LPA__tig00000010
HG02572__LPA__tig00000005
HG02572__LPA__tig00000001
NA19239__LPA__tig00000002
NA19239__LPA__tig00000006
NA19240__LPA__tig00000001
NA19240__LPA__tig00000012
chm13#0#tig00000001
chm1#0#tig00000003
HG002#0#tig00000001
HG002#1#tig00000005
HG00733#0#tig00000001
HG00733#1#tig00000008
HG01358#0#tig00000002
HG01358#1#tig00000010
HG02572#0#tig00000005
HG02572#1#tig00000001
NA19239#0#tig00000002
NA19239#1#tig00000006
NA19240#0#tig00000001
NA19240#1#tig00000012
75 changes: 41 additions & 34 deletions partition-before-pggb
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ threads=$THREADS
poa_threads=0
keep_intermediate_files=false
compress=false
respect_pansn=true
show_version=false
show_help=false

Expand Down Expand Up @@ -117,7 +118,7 @@ fi

# read the options
cmd=$0" "$@
TEMP=`getopt -o i:o:D:a:p:c:s:l:K:F:k:x:f:B:Xn:j:P:O:Me:t:T:vhASY:G:Q:d:I:R:NbrmZzV: --long input-fasta:,output-dir:,temp-dir:,input-paf:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,mash-kmer-thres:,min-match-length:,sparse-map:,sparse-factor:,transclose-batch:,skip-normalization,n-haplotypes:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,skip-viz,do-layout,help,no-merge-segments,stats,exclude-delim:,poa-length-target:,poa-params:,poa-padding:,run-abpoa,global-poa,write-maf,consensus-spec:,consensus-prefix:,pad-max-depth:,block-id-min:,block-ratio-min:,no-splits,resume,keep-temp-files,multiqc,compress,vcf-spec:,version -n 'pggb' -- "$@"`
TEMP=`getopt -o i:o:D:a:p:c:s:l:K:F:k:x:f:B:Xn:j:P:O:Me:t:T:vhASY:G:Q:d:I:R:NbrmZzV: --long input-fasta:,output-dir:,temp-dir:,input-paf:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,mash-kmer-thres:,min-match-length:,sparse-map:,sparse-factor:,transclose-batch:,skip-normalization,n-haplotypes:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,skip-viz,do-layout,help,no-merge-segments,stats,exclude-delim:,poa-length-target:,poa-params:,poa-padding:,run-abpoa,global-poa,write-maf,consensus-spec:,consensus-prefix:,pad-max-depth:,block-id-min:,block-ratio-min:,no-splits,resume,keep-temp-files,multiqc,compress,vcf-spec:,no-pansn,version -n 'pggb' -- "$@"`
eval set -- "$TEMP"

# extract options and their arguments into variables.
Expand Down Expand Up @@ -161,6 +162,7 @@ while true ; do
-T|--poa-threads) poa_threads=$2 ; shift 2 ;;
-A|--keep-temp-files) keep_intermediate_files=true ; shift ;;
-Z|--compress) compress=true ; shift ;;
--no-pansn) respect_pansn=false ; shift ;;
--version) show_version=true ; shift ;;
-h|--help) show_help=true ; shift ;;
--) shift ; break ;;
Expand All @@ -177,25 +179,10 @@ if [ "$show_version" == true ]; then
exit
fi

# Mandatory parameters
if [[ ("$input_fasta" == false || $n_haps == false) && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: mandatory arguments -i/--input-fasta and -n/--n-haplotypes"
exit
fi

if [[ "$n_haps" < 1 && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: -n/--n-haplotypes must be greater than or equal to 1"
exit
fi

if [[ "$n_mappings" < 1 && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: -c/--n-mappings must be greater than or equal to 1"
exit
fi

if [ $show_help == true ]; then
padding=`printf %${#0}s` # prints as many spaces as the length of $0
echo "usage: $0 -i <input-fasta> -n <n-haplotypes> [options]"
# Check input parameters
if [ "$show_help" == true ]; then
padding=$(printf %${#0}s) # prints as many spaces as the length of $0
echo "usage: $0 -i <input-fasta> [options]"
echo "options:"
echo " [wfmash]"
echo " -i, --input-fasta FILE input FASTA/FASTQ file"
Expand Down Expand Up @@ -261,13 +248,45 @@ if [ $show_help == true ]; then
echo " -A, --keep-temp-files keep intermediate graphs"
echo " -Z, --compress compress alignment (.paf), graph (.gfa, .og), and MSA (.maf) outputs with pigz,"
echo " and variant (.vcf) outputs with bgzip"
echo " --no-pansn Pangenome Sequence Naming (PanSN) not required for sequence names"
echo " --version display the version of pggb"
echo " -h, --help this text"
echo
echo "Use wfmash, seqwish, smoothxg, odgi, gfaffix, and vg to build, project and display a pangenome graph."
exit
elif [ "$input_fasta" = "false" ]; then
>&2 echo "[pggb] ERROR: mandatory argument: -i/--input-fasta"
exit
elif [ ! -f "${input_fasta}.fai" ]; then
echo "[pggb] ERROR: index for $input_fasta does not exist. Please create it using 'samtools faidx $input_fasta'."
exit 1
elif [ "$respect_pansn" = "false" ] && { [ "$n_haps" = "false" ] || [ "$n_haps" -lt 1 ]; }; then
>&2 echo "[pggb] ERROR: when no-pansn is set, -n/--n-haplotypes must be greater than or equal to 1"
exit
elif [ "$respect_pansn" = "false" -a "$vcf_spec" != "false" ]; then
>&2 echo "[pggb] ERROR: -V/--vcf-spec and --no-pansn are incompatible"
exit
elif [ "$n_mappings" -lt 1 ]; then
>&2 echo "[pggb] ERROR: -c/--n-mappings must be greater than or equal to 1"
exit
fi

# Check Pangenome Sequence Naming (PanSN)
#
# check_pansn FAI RESPECT_PANSN
#   Reads the first tab-separated column of FAI (the sequence names) and stops
#   at the first name that does not look like '<field>#<field>[#<field>...]'.
#   FAI           path to the FASTA index to inspect
#   RESPECT_PANSN "false" to only warn about an offending name; any other
#                 value makes an offending name a fatal error
# Globals:
#   pansn_not_respected (written): the first offending name, or "false" if
#   every name matched.
# Outputs: warning/error text on stderr.
# Returns: 0 when all names match or when only a warning was issued; exits the
#   script with status 1 when PanSN is required and a name does not match.
check_pansn() {
    local fai=$1
    local respect=$2
    # Kept in a variable so the regex is passed to =~ unquoted and unparsed.
    local -r pansn_re='^([^#]+#)+[^#]+$'
    local name
    # Separate flag: a sequence literally named "false" must still be reported.
    local found=false

    pansn_not_respected=false
    while IFS= read -r name; do
        if [[ ! $name =~ $pansn_re ]]; then
            pansn_not_respected=$name
            found=true
            break
        fi
    done < <(cut -f 1 "$fai")

    if [[ "$found" == false ]]; then
        return 0
    fi

    if [[ "$respect" == "false" ]]; then
        >&2 echo "[pggb] warning: there are sequence names (like '$pansn_not_respected') that do not match the Pangenome Sequence Naming (PanSN)."
        return 0
    fi

    # The flag that relaxes this requirement is --no-pansn; exit non-zero so
    # callers (and '&&' chains) can tell that the run was aborted.
    >&2 echo "[pggb] ERROR: there are sequence names (like '$pansn_not_respected') that do not match the Pangenome Sequence Naming (PanSN). Set --no-pansn to disable this requirement."
    exit 1
}
check_pansn "${input_fasta}.fai" "$respect_pansn"

# Alignment
if [[ $input_paf == false ]]; then
Expand Down Expand Up @@ -295,7 +314,7 @@ fi

# Normalization ($n_haps is checked in this part of the script because it is also used for the 'auto' mapping sparsification)
if [[ $n_haps == false ]]; then
n_haps=$n_mappings
n_haps=$(cut -f 1 "${input_fasta}.fai" | cut -f 1,2 -d '#' | sort | uniq | wc -l)
fi

sparse_map_cmd=""
Expand Down Expand Up @@ -408,6 +427,7 @@ general:
compress: $compress
threads: $threads
poa_threads: $poa_threads
respect_pansn: $respect_pansn
$mapper:
version: $mapper_version
segment-length: $segment_length
Expand Down Expand Up @@ -458,19 +478,6 @@ reporting:
multiqc: $multiqc
EOT

# Check Pangenome Sequence Naming (PanSN)
if [ ! -f "${input_fasta}.fai" ]; then
echo "[pggb] ERROR: Index for $input_fasta does not exist. Please create it using 'samtools faidx $input_fasta'."
exit 1
fi
warning_emitted=0
cut -f 1 "${input_fasta}.fai" | while read -r line; do
if [[ ! $line =~ ^([^#]+#)+[^#]+$ ]] && [[ $warning_emitted -eq 0 ]]; then
echo "[pggb] Warning: there are sequence names (like '$line') that do not match the Pangenome Sequence Naming (PanSN)."
warning_emitted=1
fi
done

#-------------------------------------------------------------------------------
echo -e "\nRunning partitioning\n" >> "$log_file"
if [[ "$input_paf" == false ]]; then
Expand Down
74 changes: 41 additions & 33 deletions pggb
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ threads=$THREADS
poa_threads=0
keep_intermediate_files=false
compress=false
respect_pansn=true
show_version=false
show_help=false

Expand Down Expand Up @@ -117,7 +118,7 @@ fi

# read the options
cmd=$0" "$@
TEMP=`getopt -o i:o:D:a:p:c:s:l:K:F:k:x:f:B:Xn:j:P:O:Me:t:T:vhASY:G:Q:d:I:R:NbrmZzV: --long input-fasta:,output-dir:,temp-dir:,input-paf:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,mash-kmer-thres:,min-match-length:,sparse-map:,sparse-factor:,transclose-batch:,skip-normalization,n-haplotypes:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,skip-viz,do-layout,help,no-merge-segments,stats,exclude-delim:,poa-length-target:,poa-params:,poa-padding:,run-abpoa,global-poa,write-maf,consensus-spec:,consensus-prefix:,pad-max-depth:,block-id-min:,block-ratio-min:,no-splits,resume,keep-temp-files,multiqc,compress,vcf-spec:,version -n 'pggb' -- "$@"`
TEMP=`getopt -o i:o:D:a:p:c:s:l:K:F:k:x:f:B:Xn:j:P:O:Me:t:T:vhASY:G:Q:d:I:R:NbrmZzV: --long input-fasta:,output-dir:,temp-dir:,input-paf:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,mash-kmer-thres:,min-match-length:,sparse-map:,sparse-factor:,transclose-batch:,skip-normalization,n-haplotypes:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,skip-viz,do-layout,help,no-merge-segments,stats,exclude-delim:,poa-length-target:,poa-params:,poa-padding:,run-abpoa,global-poa,write-maf,consensus-spec:,consensus-prefix:,pad-max-depth:,block-id-min:,block-ratio-min:,no-splits,resume,keep-temp-files,multiqc,compress,vcf-spec:,no-pansn,version -n 'pggb' -- "$@"`
eval set -- "$TEMP"

# extract options and their arguments into variables.
Expand Down Expand Up @@ -161,6 +162,7 @@ while true ; do
-T|--poa-threads) poa_threads=$2 ; shift 2 ;;
-A|--keep-temp-files) keep_intermediate_files=true ; shift ;;
-Z|--compress) compress=true ; shift ;;
--no-pansn) respect_pansn=false ; shift ;;
--version) show_version=true ; shift ;;
-h|--help) show_help=true ; shift ;;
--) shift ; break ;;
Expand All @@ -177,25 +179,10 @@ if [ "$show_version" == true ]; then
exit
fi

# Mandatory parameters
if [[ ("$input_fasta" == false || $n_haps == false) && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: mandatory arguments -i/--input-fasta and -n/--n-haplotypes"
exit
fi

if [[ "$n_haps" < 1 && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: -n/--n-haplotypes must be greater than or equal to 1"
exit
fi

if [[ "$n_mappings" < 1 && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: -c/--n-mappings must be greater than or equal to 1"
exit
fi

if [ $show_help == true ]; then
padding=`printf %${#0}s` # prints as many spaces as the length of $0
echo "usage: $0 -i <input-fasta> -n <n-haplotypes> [options]"
# Check input parameters
if [ "$show_help" == true ]; then
padding=$(printf %${#0}s) # prints as many spaces as the length of $0
echo "usage: $0 -i <input-fasta> [options]"
echo "options:"
echo " [wfmash]"
echo " -i, --input-fasta FILE input FASTA/FASTQ file"
Expand Down Expand Up @@ -261,13 +248,45 @@ if [ $show_help == true ]; then
echo " -A, --keep-temp-files keep intermediate graphs"
echo " -Z, --compress compress alignment (.paf), graph (.gfa, .og), and MSA (.maf) outputs with pigz,"
echo " and variant (.vcf) outputs with bgzip"
echo " --no-pansn Pangenome Sequence Naming (PanSN) not required for sequence names"
echo " --version display the version of pggb"
echo " -h, --help this text"
echo
echo "Use wfmash, seqwish, smoothxg, odgi, gfaffix, and vg to build, project and display a pangenome graph."
exit
elif [ "$input_fasta" = "false" ]; then
>&2 echo "[pggb] ERROR: mandatory argument: -i/--input-fasta"
exit
elif [ ! -f "${input_fasta}.fai" ]; then
echo "[pggb] ERROR: index for $input_fasta does not exist. Please create it using 'samtools faidx $input_fasta'."
exit 1
elif [ "$respect_pansn" = "false" ] && { [ "$n_haps" = "false" ] || [ "$n_haps" -lt 1 ]; }; then
>&2 echo "[pggb] ERROR: when no-pansn is set, -n/--n-haplotypes must be greater than or equal to 1"
exit
elif [ "$respect_pansn" = "false" -a "$vcf_spec" != "false" ]; then
>&2 echo "[pggb] ERROR: -V/--vcf-spec and --no-pansn are incompatible"
exit
elif [ "$n_mappings" -lt 1 ]; then
>&2 echo "[pggb] ERROR: -c/--n-mappings must be greater than or equal to 1"
exit
fi

# Check Pangenome Sequence Naming (PanSN)
#
# check_pansn FAI RESPECT_PANSN
#   Reads the first tab-separated column of FAI (the sequence names) and stops
#   at the first name that does not look like '<field>#<field>[#<field>...]'.
#   FAI           path to the FASTA index to inspect
#   RESPECT_PANSN "false" to only warn about an offending name; any other
#                 value makes an offending name a fatal error
# Globals:
#   pansn_not_respected (written): the first offending name, or "false" if
#   every name matched.
# Outputs: warning/error text on stderr.
# Returns: 0 when all names match or when only a warning was issued; exits the
#   script with status 1 when PanSN is required and a name does not match.
check_pansn() {
    local fai=$1
    local respect=$2
    # Kept in a variable so the regex is passed to =~ unquoted and unparsed.
    local -r pansn_re='^([^#]+#)+[^#]+$'
    local name
    # Separate flag: a sequence literally named "false" must still be reported.
    local found=false

    pansn_not_respected=false
    while IFS= read -r name; do
        if [[ ! $name =~ $pansn_re ]]; then
            pansn_not_respected=$name
            found=true
            break
        fi
    done < <(cut -f 1 "$fai")

    if [[ "$found" == false ]]; then
        return 0
    fi

    if [[ "$respect" == "false" ]]; then
        >&2 echo "[pggb] warning: there are sequence names (like '$pansn_not_respected') that do not match the Pangenome Sequence Naming (PanSN)."
        return 0
    fi

    # The flag that relaxes this requirement is --no-pansn; exit non-zero so
    # callers (and '&&' chains) can tell that the run was aborted.
    >&2 echo "[pggb] ERROR: there are sequence names (like '$pansn_not_respected') that do not match the Pangenome Sequence Naming (PanSN). Set --no-pansn to disable this requirement."
    exit 1
}
check_pansn "${input_fasta}.fai" "$respect_pansn"

# Alignment
if [[ $input_paf == false ]]; then
Expand Down Expand Up @@ -295,7 +314,7 @@ fi

# Normalization ($n_haps is checked in this part of the script because it is also used for the 'auto' mapping sparsification)
if [[ $n_haps == false ]]; then
n_haps=$n_mappings
n_haps=$(cut -f 1 "${input_fasta}.fai" | cut -f 1,2 -d '#' | sort | uniq | wc -l)
fi

sparse_map_cmd=""
Expand Down Expand Up @@ -408,6 +427,7 @@ general:
compress: $compress
threads: $threads
poa_threads: $poa_threads
respect_pansn: $respect_pansn
$mapper:
version: $mapper_version
segment-length: $segment_length
Expand Down Expand Up @@ -458,18 +478,6 @@ reporting:
multiqc: $multiqc
EOT

# Check Pangenome Sequence Naming (PanSN)
if [ ! -f "${input_fasta}.fai" ]; then
echo "[pggb] ERROR: Index for $input_fasta does not exist. Please create it using 'samtools faidx $input_fasta'."
exit 1
fi
warning_emitted=0
cut -f 1 "${input_fasta}.fai" | while read -r line; do
if [[ ! $line =~ ^([^#]+#)+[^#]+$ ]] && [[ $warning_emitted -eq 0 ]]; then
echo "[pggb] Warning: there are sequence names (like '$line') that do not match the Pangenome Sequence Naming (PanSN)."
warning_emitted=1
fi
done

echo -e "\nRunning pggb\n" >> "$log_file"

Expand Down

0 comments on commit 44ab2a0

Please sign in to comment.