-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: calculate_cpos.sh
executable file
·114 lines (99 loc) · 3.31 KB
/
calculate_cpos.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/bin/bash
# Script: calculate_methylated_c_positions.sh
# Author: Ruining Dong
# Date: 2024-06-26
# Summary: This script processes an input BED file to calculate the positions of methylated 'C's
# based on the MM tag, converts them to CpG index values using wgbstools, and handles invalid genomic regions.
# input bed is generated by another script: patformm
#
# Environment setup: activates a site-specific conda env and loads samtools.
# NOTE(review): these are hard-coded cluster paths (/g/data/pq08/...) — the
# script only runs on hosts where that filesystem and the `module` command
# exist; there is no fallback if sourcing/activation fails (no `set -e`).
# mamba activate methyl_env
source /g/data/pq08/software/mambaforge/etc/profile.d/conda.sh
conda activate /g/data/pq08/software/mambaforge/envs/methyl_env
module load samtools
# Location of the companion patformm scripts (process_chunk.sh is invoked below).
patformm_path=/g/data/pq08/projects/biomodal/patformm
# Print invocation help on stdout, then terminate with a non-zero status.
usage() {
  printf 'Usage: %s [--threads <threads>] [--chunk-size <chunk_size>] [-o <output_file>] <input_bed>\n' "$0"
  exit 1
}
# Default values
THREADS=1
OUTPUT_FILE=""
CHUNK_SIZE="" # Empty by default
INPUT_BED=""  # Initialised so the emptiness checks below are well-defined
# Parse the command-line arguments.
# Each value-taking flag now verifies its argument is present; previously
# e.g. a trailing `--threads` silently set THREADS to the empty string.
while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --threads)
      [[ -n "${2:-}" ]] || usage
      THREADS=$2
      shift
      ;;
    --chunk-size)
      [[ -n "${2:-}" ]] || usage
      CHUNK_SIZE=$2
      shift
      ;;
    -o)
      [[ -n "${2:-}" ]] || usage
      OUTPUT_FILE=$2
      shift
      ;;
    *)
      # First bare argument is the input BED; a second bare argument is an error.
      if [[ -z "$INPUT_BED" ]]; then
        INPUT_BED=$1
      else
        usage
      fi
      ;;
  esac
  shift
done
# Check if the input BED file is provided
if [[ -z "$INPUT_BED" ]]; then
  usage
fi
# Create a temporary directory for split files.
export TMPDIR=/scratch/pq08/rd6078/tmp
# Abort immediately if mktemp fails; otherwise the splits below would be
# written to "/bed_chunk_*" at the filesystem root.
tmp_dir=$(mktemp -d) || { echo "ERROR: mktemp -d failed under $TMPDIR" >&2; exit 1; }
# tmp_dir=/scratch/pq08/rd6078/tmp/tmp.smPmXC0Ndp
# echo "WARNING: resuming previously failed run for testing purposes: $tmp_dir"
# Split the input BED file based on chunk size or thread count.
echo "split BED to $tmp_dir"
if [[ -n "$CHUNK_SIZE" ]]; then
  split -l "$CHUNK_SIZE" "$INPUT_BED" "$tmp_dir/bed_chunk_"
else
  # No explicit chunk size: divide lines evenly across threads; the +1
  # guards against producing an extra zero-length chunk on exact division.
  split -l $(( $(wc -l < "$INPUT_BED") / THREADS + 1 )) "$INPUT_BED" "$tmp_dir/bed_chunk_"
fi
# Process each chunk in parallel; process_chunk.sh writes a *.out per chunk.
# ($patformm_path quoted so a path with spaces does not word-split.)
echo "process temp bed chunks"
find "$tmp_dir" -maxdepth 1 -name 'bed_chunk_*' | xargs -n 1 -P "$THREADS" -I {} "$patformm_path/process_chunk.sh" {} # &&
# cat "$tmp_dir"/*.out > $tmp_dir/tmp_final_output.bed
# Combine results and recalculate counts.
echo "Combining results..."
# Aggregate duplicate (field1, field2, field3) keys across all chunk outputs
# by summing their counts (field 4). NOTE: awk's `for (key in count)` emits
# keys in arbitrary order, so the pre-aggregation sort that used to sit in
# this pipeline had no effect on the final result — ordering is established
# solely by the sort below. The redundant sort has been removed.
cat "$tmp_dir"/*.out | \
awk '
    BEGIN {OFS="\t"}
    {
        key = $1 "\t" $2 "\t" $3
        count[key] += $4
    }
    END {
        for (key in count) {
            print key, count[key]
        }
    }
' > "$tmp_dir/final_combined.bed"
# Sort numerically on field 2 then lexically on field 3.
# NOTE(review): field 1 (chromosome) is deliberately excluded from the sort
# keys — presumably field 2 is a genome-wide CpG index as in wgbstools .pat
# files; confirm against process_chunk.sh output before changing.
sort -k2,2n -k3,3 "$tmp_dir/final_combined.bed" > "$tmp_dir/sorted_final_combined.bed"
# Compress the sorted BED file using bgzip
# bgzip sorted_final_combined.bed
# Index the compressed BED file using tabix
# tabix -p bed sorted_final_combined.bed.gz
# Output results.
if [[ -z "$OUTPUT_FILE" ]]; then
  # No -o given: stream the final table to stdout.
  cat "$tmp_dir/sorted_final_combined.bed"
else
  mv "$tmp_dir/sorted_final_combined.bed" "$OUTPUT_FILE"
  # Compress and index. $OUTPUT_FILE is quoted so paths containing spaces or
  # glob characters do not word-split/expand (was unquoted before).
  # tabix: -C builds a CSI index, -b 2 -e 2 index on column 2, -m 12 sets the
  # minimal interval width (2^12) for the CSI index.
  bgzip "$OUTPUT_FILE" && tabix -C -b 2 -e 2 -m 12 "$OUTPUT_FILE.gz"
fi
# rm -r "$tmp_dir"