Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minimize the minimal pggb command line #346

Merged
merged 5 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/build_and_test_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ jobs:
- name: Build the Docker image
run: docker build . --file Dockerfile --target binary --tag pggb
- name: Run a test on the DRB1-3123 dataset (SPOA)
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -p 70 -s 3000 -G 800,900,1100 -n 10 -t 2 -Z -V 'gi|568815561,gi|29124352:10000' -M -m -o drib1_spoa && ls drib1_spoa/* && head drib1_spoa/*.log -n 63"
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -p 70 -s 3000 -G 800,900,1100 -n 10 -t 2 -Z -M -m -o drib1_spoa --no-pansn && ls drib1_spoa/* && head drib1_spoa/*.log -n 63"
- name: Run a test on the DRB1-3123 dataset (abPOA)
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -p 70 -s 3000 -G 800,900,1100 -n 10 -t 2 -Z -V 'gi|568815561,gi|29124352:10000' -M -m -b -o drib1_abpoa && ls drib1_abpoa/* && head drib1_abpoa/*.log -n 63"
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -p 70 -s 3000 -G 800,900,1100 -n 10 -t 2 -Z -M -m -b -o drib1_abpoa --no-pansn && ls drib1_abpoa/* && head drib1_abpoa/*.log -n 63"
- name: Run a test on the DRB1-3123 dataset (paf)
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -a data/paf/DRB1-3123.fa.15a1009.wfmash.paf -p 70 -s 3000 -G 2000 -n 10 -t 2 -Z -V 'gi|568815561,gi|29124352' -M -m -o drib1_paf && ls drib1_paf/* && head drib1_paf/*.log -n 63"
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/HLA/DRB1-3123.fa.gz -a data/paf/DRB1-3123.fa.15a1009.wfmash.paf -p 70 -s 3000 -G 2000 -n 10 -t 2 -Z -M -m -o drib1_paf --no-pansn && ls drib1_paf/* && head drib1_paf/*.log -n 63"
- name: Run a test on the LPA dataset (SPOA global)
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/LPA/LPA.fa.gz -p 95 -s 20000 -G 800,900 -n 90 -k 79 -t 2 -Z -O 0.001 -m -z -o lpa && ls lpa/* && head lpa/*.log -n 63"
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "pggb -i data/LPA/LPA.fa.gz -p 95 -s 20000 -G 800,900 -k 79 -t 2 -Z -O 0.001 -m -z -o lpa -V 'chm13,chm1:1000' && ls lpa/* && head lpa/*.log -n 63"
- name: Run a test for the gfa2evaluation script on a mini HPRC chrMT dataset
run: docker run -v ${PWD}/data/:/data pggb /bin/bash -c "gfa2evaluation.sh data/chrM.pan.4.gfa chm13 data/test_eval 2"
Binary file modified data/LPA/LPA.fa.gz
Binary file not shown.
28 changes: 14 additions & 14 deletions data/LPA/LPA.fa.gz.fai
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
chm13__LPA__tig00000001 330243 25 60 61
chm1__LPA__tig00000003 227072 335797 60 61
HG002__LPA__tig00000001 329347 566679 60 61
HG002__LPA__tig00000005 274138 901541 60 61
HG00733__LPA__tig00000001 295824 1180275 60 61
HG00733__LPA__tig00000008 283680 1481057 60 61
HG01358__LPA__tig00000002 337324 1769492 60 61
HG01358__LPA__tig00000010 240282 2112466 60 61
HG02572__LPA__tig00000005 311357 2356780 60 61
HG02572__LPA__tig00000001 309189 2673354 60 61
NA19239__LPA__tig00000002 264003 2987724 60 61
NA19239__LPA__tig00000006 236974 3256155 60 61
NA19240__LPA__tig00000001 285146 3497106 60 61
NA19240__LPA__tig00000012 260090 3787032 60 61
chm13#0#tig00000001 330243 21 60 61
chm1#0#tig00000003 227072 335789 60 61
HG002#0#tig00000001 329347 566667 60 61
HG002#1#tig00000005 274138 901525 60 61
HG00733#0#tig00000001 295824 1180255 60 61
HG00733#1#tig00000008 283680 1481033 60 61
HG01358#0#tig00000002 337324 1769464 60 61
HG01358#1#tig00000010 240282 2112434 60 61
HG02572#0#tig00000005 311357 2356744 60 61
HG02572#1#tig00000001 309189 2673314 60 61
NA19239#0#tig00000002 264003 2987680 60 61
NA19239#1#tig00000006 236974 3256107 60 61
NA19240#0#tig00000001 285146 3497054 60 61
NA19240#1#tig00000012 260090 3786976 60 61
Binary file modified data/LPA/LPA.fa.gz.gzi
Binary file not shown.
28 changes: 14 additions & 14 deletions data/LPA/LPA.sample.list
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
chm13__LPA__tig00000001
chm1__LPA__tig00000003
HG002__LPA__tig00000001
HG002__LPA__tig00000005
HG00733__LPA__tig00000001
HG00733__LPA__tig00000008
HG01358__LPA__tig00000002
HG01358__LPA__tig00000010
HG02572__LPA__tig00000005
HG02572__LPA__tig00000001
NA19239__LPA__tig00000002
NA19239__LPA__tig00000006
NA19240__LPA__tig00000001
NA19240__LPA__tig00000012
chm13#0#tig00000001
chm1#0#tig00000003
HG002#0#tig00000001
HG002#1#tig00000005
HG00733#0#tig00000001
HG00733#1#tig00000008
HG01358#0#tig00000002
HG01358#1#tig00000010
HG02572#0#tig00000005
HG02572#1#tig00000001
NA19239#0#tig00000002
NA19239#1#tig00000006
NA19240#0#tig00000001
NA19240#1#tig00000012
75 changes: 41 additions & 34 deletions partition-before-pggb
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ threads=$THREADS
poa_threads=0
keep_intermediate_files=false
compress=false
respect_pansn=true
show_version=false
show_help=false

Expand Down Expand Up @@ -117,7 +118,7 @@ fi

# read the options
cmd=$0" "$@
TEMP=`getopt -o i:o:D:a:p:c:s:l:K:F:k:x:f:B:Xn:j:P:O:Me:t:T:vhASY:G:Q:d:I:R:NbrmZzV: --long input-fasta:,output-dir:,temp-dir:,input-paf:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,mash-kmer-thres:,min-match-length:,sparse-map:,sparse-factor:,transclose-batch:,skip-normalization,n-haplotypes:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,skip-viz,do-layout,help,no-merge-segments,stats,exclude-delim:,poa-length-target:,poa-params:,poa-padding:,run-abpoa,global-poa,write-maf,consensus-spec:,consensus-prefix:,pad-max-depth:,block-id-min:,block-ratio-min:,no-splits,resume,keep-temp-files,multiqc,compress,vcf-spec:,version -n 'pggb' -- "$@"`
TEMP=`getopt -o i:o:D:a:p:c:s:l:K:F:k:x:f:B:Xn:j:P:O:Me:t:T:vhASY:G:Q:d:I:R:NbrmZzV: --long input-fasta:,output-dir:,temp-dir:,input-paf:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,mash-kmer-thres:,min-match-length:,sparse-map:,sparse-factor:,transclose-batch:,skip-normalization,n-haplotypes:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,skip-viz,do-layout,help,no-merge-segments,stats,exclude-delim:,poa-length-target:,poa-params:,poa-padding:,run-abpoa,global-poa,write-maf,consensus-spec:,consensus-prefix:,pad-max-depth:,block-id-min:,block-ratio-min:,no-splits,resume,keep-temp-files,multiqc,compress,vcf-spec:,no-pansn,version -n 'pggb' -- "$@"`
eval set -- "$TEMP"

# extract options and their arguments into variables.
Expand Down Expand Up @@ -161,6 +162,7 @@ while true ; do
-T|--poa-threads) poa_threads=$2 ; shift 2 ;;
-A|--keep-temp-files) keep_intermediate_files=true ; shift ;;
-Z|--compress) compress=true ; shift ;;
--no-pansn) respect_pansn=false ; shift ;;
--version) show_version=true ; shift ;;
-h|--help) show_help=true ; shift ;;
--) shift ; break ;;
Expand All @@ -177,25 +179,10 @@ if [ "$show_version" == true ]; then
exit
fi

# Mandatory parameters
if [[ ("$input_fasta" == false || $n_haps == false) && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: mandatory arguments -i/--input-fasta and -n/--n-haplotypes"
exit
fi

if [[ "$n_haps" < 1 && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: -n/--n-haplotypes must be greater than or equal to 1"
exit
fi

if [[ "$n_mappings" < 1 && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: -c/--n-mappings must be greater than or equal to 1"
exit
fi

if [ $show_help == true ]; then
padding=`printf %${#0}s` # prints as many spaces as the length of $0
echo "usage: $0 -i <input-fasta> -n <n-haplotypes> [options]"
# Check input parameters
if [ "$show_help" == true ]; then
padding=$(printf %${#0}s) # prints as many spaces as the length of $0
echo "usage: $0 -i <input-fasta> [options]"
echo "options:"
echo " [wfmash]"
echo " -i, --input-fasta FILE input FASTA/FASTQ file"
Expand Down Expand Up @@ -261,13 +248,45 @@ if [ $show_help == true ]; then
echo " -A, --keep-temp-files keep intermediate graphs"
echo " -Z, --compress compress alignment (.paf), graph (.gfa, .og), and MSA (.maf) outputs with pigz,"
echo " and variant (.vcf) outputs with bgzip"
echo " --no-pansn Pangenome Sequence Naming (PanSN) not required for sequence names"
echo " --version display the version of pggb"
echo " -h, --help this text"
echo
echo "Use wfmash, seqwish, smoothxg, odgi, gfaffix, and vg to build, project and display a pangenome graph."
exit
elif [ "$input_fasta" = "false" ]; then
>&2 echo "[pggb] ERROR: mandatory argument: -i/--input-fasta"
exit
elif [ ! -f "${input_fasta}.fai" ]; then
echo "[pggb] ERROR: index for $input_fasta does not exist. Please create it using 'samtools faidx $input_fasta'."
exit 1
elif [ "$respect_pansn" = "false" ] && { [ "$n_haps" = "false" ] || [ "$n_haps" -lt 1 ]; }; then
>&2 echo "[pggb] ERROR: when no-pansn is set, -n/--n-haplotypes must be greater than or equal to 1"
exit
elif [ "$respect_pansn" = "false" -a "$vcf_spec" != "false" ]; then
>&2 echo "[pggb] ERROR: -V/--vcf-spec and --no-pansn are incompatible"
exit
elif [ "$n_mappings" -lt 1 ]; then
>&2 echo "[pggb] ERROR: -c/--n-mappings must be greater than or equal to 1"
exit
fi

# Check Pangenome Sequence Naming (PanSN)

# Print the first sequence name (column 1) of the FASTA index given as $1 that
# does not follow PanSN, i.e. that is not made of at least two non-empty fields
# separated by '#'.
# Returns 0 when such a name was found and printed, 1 when every name conforms.
first_non_pansn_name() {
    local name
    # Kept in a variable so the '#' characters are never interpreted by the shell.
    local pansn_regex='^([^#]+#)+[^#]+$'
    while IFS= read -r name; do
        if [[ ! $name =~ $pansn_regex ]]; then
            printf '%s\n' "$name"
            return 0
        fi
    done < <(cut -f 1 "$1")
    return 1
}

# The helper's exit status (not a magic string) tells whether an offending name
# exists, so a sequence literally called "false" is still detected.
if pansn_not_respected=$(first_non_pansn_name "${input_fasta}.fai"); then
    if [ "$respect_pansn" = "false" ]; then
        # --no-pansn was given: non-PanSN names are tolerated, only warn.
        >&2 echo "[pggb] warning: there are sequence names (like '$pansn_not_respected') that do not match the Pangenome Sequence Naming (PanSN)."
    else
        # The option that lifts this requirement is --no-pansn; abort with a
        # non-zero status so callers and CI can detect the failure.
        >&2 echo "[pggb] ERROR: there are sequence names (like '$pansn_not_respected') that do not match the Pangenome Sequence Naming (PanSN). Set --no-pansn to disable this requirement."
        exit 1
    fi
else
    # Keep the sentinel value this check has always used for "all names conform".
    pansn_not_respected=false
fi

# Alignment
if [[ $input_paf == false ]]; then
Expand Down Expand Up @@ -295,7 +314,7 @@ fi

# Normalization ($n_haps is checked in this part of the script because it is also used for the 'auto' mapping sparsification)
if [[ $n_haps == false ]]; then
n_haps=$n_mappings
n_haps=$(cut -f 1 "${input_fasta}.fai" | cut -f 1,2 -d '#' | sort | uniq | wc -l)
fi

sparse_map_cmd=""
Expand Down Expand Up @@ -408,6 +427,7 @@ general:
compress: $compress
threads: $threads
poa_threads: $poa_threads
respect_pansn: $respect_pansn
$mapper:
version: $mapper_version
segment-length: $segment_length
Expand Down Expand Up @@ -458,19 +478,6 @@ reporting:
multiqc: $multiqc
EOT

# Check Pangenome Sequence Naming (PanSN)
if [ ! -f "${input_fasta}.fai" ]; then
echo "[pggb] ERROR: Index for $input_fasta does not exist. Please create it using 'samtools faidx $input_fasta'."
exit 1
fi
warning_emitted=0
cut -f 1 "${input_fasta}.fai" | while read -r line; do
if [[ ! $line =~ ^([^#]+#)+[^#]+$ ]] && [[ $warning_emitted -eq 0 ]]; then
echo "[pggb] Warning: there are sequence names (like '$line') that do not match the Pangenome Sequence Naming (PanSN)."
warning_emitted=1
fi
done

#-------------------------------------------------------------------------------
echo -e "\nRunning partitioning\n" >> "$log_file"
if [[ "$input_paf" == false ]]; then
Expand Down
74 changes: 41 additions & 33 deletions pggb
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ threads=$THREADS
poa_threads=0
keep_intermediate_files=false
compress=false
respect_pansn=true
show_version=false
show_help=false

Expand Down Expand Up @@ -117,7 +118,7 @@ fi

# read the options
cmd=$0" "$@
TEMP=`getopt -o i:o:D:a:p:c:s:l:K:F:k:x:f:B:Xn:j:P:O:Me:t:T:vhASY:G:Q:d:I:R:NbrmZzV: --long input-fasta:,output-dir:,temp-dir:,input-paf:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,mash-kmer-thres:,min-match-length:,sparse-map:,sparse-factor:,transclose-batch:,skip-normalization,n-haplotypes:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,skip-viz,do-layout,help,no-merge-segments,stats,exclude-delim:,poa-length-target:,poa-params:,poa-padding:,run-abpoa,global-poa,write-maf,consensus-spec:,consensus-prefix:,pad-max-depth:,block-id-min:,block-ratio-min:,no-splits,resume,keep-temp-files,multiqc,compress,vcf-spec:,version -n 'pggb' -- "$@"`
TEMP=`getopt -o i:o:D:a:p:c:s:l:K:F:k:x:f:B:Xn:j:P:O:Me:t:T:vhASY:G:Q:d:I:R:NbrmZzV: --long input-fasta:,output-dir:,temp-dir:,input-paf:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,mash-kmer-thres:,min-match-length:,sparse-map:,sparse-factor:,transclose-batch:,skip-normalization,n-haplotypes:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,skip-viz,do-layout,help,no-merge-segments,stats,exclude-delim:,poa-length-target:,poa-params:,poa-padding:,run-abpoa,global-poa,write-maf,consensus-spec:,consensus-prefix:,pad-max-depth:,block-id-min:,block-ratio-min:,no-splits,resume,keep-temp-files,multiqc,compress,vcf-spec:,no-pansn,version -n 'pggb' -- "$@"`
eval set -- "$TEMP"

# extract options and their arguments into variables.
Expand Down Expand Up @@ -161,6 +162,7 @@ while true ; do
-T|--poa-threads) poa_threads=$2 ; shift 2 ;;
-A|--keep-temp-files) keep_intermediate_files=true ; shift ;;
-Z|--compress) compress=true ; shift ;;
--no-pansn) respect_pansn=false ; shift ;;
--version) show_version=true ; shift ;;
-h|--help) show_help=true ; shift ;;
--) shift ; break ;;
Expand All @@ -177,25 +179,10 @@ if [ "$show_version" == true ]; then
exit
fi

# Mandatory parameters
if [[ ("$input_fasta" == false || $n_haps == false) && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: mandatory arguments -i/--input-fasta and -n/--n-haplotypes"
exit
fi

if [[ "$n_haps" < 1 && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: -n/--n-haplotypes must be greater than or equal to 1"
exit
fi

if [[ "$n_mappings" < 1 && "$show_help" == false ]]; then
>&2 echo "[pggb] ERROR: -c/--n-mappings must be greater than or equal to 1"
exit
fi

if [ $show_help == true ]; then
padding=`printf %${#0}s` # prints as many spaces as the length of $0
echo "usage: $0 -i <input-fasta> -n <n-haplotypes> [options]"
# Check input parameters
if [ "$show_help" == true ]; then
padding=$(printf %${#0}s) # prints as many spaces as the length of $0
echo "usage: $0 -i <input-fasta> [options]"
echo "options:"
echo " [wfmash]"
echo " -i, --input-fasta FILE input FASTA/FASTQ file"
Expand Down Expand Up @@ -261,13 +248,45 @@ if [ $show_help == true ]; then
echo " -A, --keep-temp-files keep intermediate graphs"
echo " -Z, --compress compress alignment (.paf), graph (.gfa, .og), and MSA (.maf) outputs with pigz,"
echo " and variant (.vcf) outputs with bgzip"
echo " --no-pansn Pangenome Sequence Naming (PanSN) not required for sequence names"
echo " --version display the version of pggb"
echo " -h, --help this text"
echo
echo "Use wfmash, seqwish, smoothxg, odgi, gfaffix, and vg to build, project and display a pangenome graph."
exit
elif [ "$input_fasta" = "false" ]; then
>&2 echo "[pggb] ERROR: mandatory argument: -i/--input-fasta"
exit
elif [ ! -f "${input_fasta}.fai" ]; then
echo "[pggb] ERROR: index for $input_fasta does not exist. Please create it using 'samtools faidx $input_fasta'."
exit 1
elif [ "$respect_pansn" = "false" ] && { [ "$n_haps" = "false" ] || [ "$n_haps" -lt 1 ]; }; then
>&2 echo "[pggb] ERROR: when no-pansn is set, -n/--n-haplotypes must be greater than or equal to 1"
exit
elif [ "$respect_pansn" = "false" -a "$vcf_spec" != "false" ]; then
>&2 echo "[pggb] ERROR: -V/--vcf-spec and --no-pansn are incompatible"
exit
elif [ "$n_mappings" -lt 1 ]; then
>&2 echo "[pggb] ERROR: -c/--n-mappings must be greater than or equal to 1"
exit
fi

# Check Pangenome Sequence Naming (PanSN)

# Print the first sequence name (column 1) of the FASTA index given as $1 that
# does not follow PanSN, i.e. that is not made of at least two non-empty fields
# separated by '#'.
# Returns 0 when such a name was found and printed, 1 when every name conforms.
first_non_pansn_name() {
    local name
    # Kept in a variable so the '#' characters are never interpreted by the shell.
    local pansn_regex='^([^#]+#)+[^#]+$'
    while IFS= read -r name; do
        if [[ ! $name =~ $pansn_regex ]]; then
            printf '%s\n' "$name"
            return 0
        fi
    done < <(cut -f 1 "$1")
    return 1
}

# The helper's exit status (not a magic string) tells whether an offending name
# exists, so a sequence literally called "false" is still detected.
if pansn_not_respected=$(first_non_pansn_name "${input_fasta}.fai"); then
    if [ "$respect_pansn" = "false" ]; then
        # --no-pansn was given: non-PanSN names are tolerated, only warn.
        >&2 echo "[pggb] warning: there are sequence names (like '$pansn_not_respected') that do not match the Pangenome Sequence Naming (PanSN)."
    else
        # The option that lifts this requirement is --no-pansn; abort with a
        # non-zero status so callers and CI can detect the failure.
        >&2 echo "[pggb] ERROR: there are sequence names (like '$pansn_not_respected') that do not match the Pangenome Sequence Naming (PanSN). Set --no-pansn to disable this requirement."
        exit 1
    fi
else
    # Keep the sentinel value this check has always used for "all names conform".
    pansn_not_respected=false
fi

# Alignment
if [[ $input_paf == false ]]; then
Expand Down Expand Up @@ -295,7 +314,7 @@ fi

# Normalization ($n_haps is checked in this part of the script because it is also used for the 'auto' mapping sparsification)
if [[ $n_haps == false ]]; then
n_haps=$n_mappings
n_haps=$(cut -f 1 "${input_fasta}.fai" | cut -f 1,2 -d '#' | sort | uniq | wc -l)
fi

sparse_map_cmd=""
Expand Down Expand Up @@ -408,6 +427,7 @@ general:
compress: $compress
threads: $threads
poa_threads: $poa_threads
respect_pansn: $respect_pansn
$mapper:
version: $mapper_version
segment-length: $segment_length
Expand Down Expand Up @@ -458,18 +478,6 @@ reporting:
multiqc: $multiqc
EOT

# Check Pangenome Sequence Naming (PanSN)
if [ ! -f "${input_fasta}.fai" ]; then
echo "[pggb] ERROR: Index for $input_fasta does not exist. Please create it using 'samtools faidx $input_fasta'."
exit 1
fi
warning_emitted=0
cut -f 1 "${input_fasta}.fai" | while read -r line; do
if [[ ! $line =~ ^([^#]+#)+[^#]+$ ]] && [[ $warning_emitted -eq 0 ]]; then
echo "[pggb] Warning: there are sequence names (like '$line') that do not match the Pangenome Sequence Naming (PanSN)."
warning_emitted=1
fi
done

echo -e "\nRunning pggb\n" >> "$log_file"

Expand Down
Loading