-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: calculate_cpos.sh
executable file
·114 lines (99 loc) · 3.31 KB
/
calculate_cpos.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/bin/bash
# Script: calculate_methylated_c_positions.sh
# Author: Ruining Dong
# Date: 2024-06-26
# Summary: This script processes an input BED file to calculate the positions of methylated 'C's
# based on the MM tag, converts them to CpG index values using wgbstools, and handles invalid genomic regions.
# input bed is generated by another script: patformm
#
# Environment setup: activates a site-specific conda env and loads samtools.
# NOTE(review): these are hard-coded cluster paths (/g/data/pq08/...) — the
# script only runs on hosts where that filesystem and the `module` command
# exist; there is no fallback if sourcing/activation fails (no `set -e`).
# mamba activate methyl_env
source /g/data/pq08/software/mambaforge/etc/profile.d/conda.sh
conda activate /g/data/pq08/software/mambaforge/envs/methyl_env
module load samtools
# Location of the companion patformm scripts (process_chunk.sh is invoked below).
patformm_path=/g/data/pq08/projects/biomodal/patformm
# Print invocation help on stdout, then terminate with a non-zero status.
usage() {
  printf 'Usage: %s [--threads <threads>] [--chunk-size <chunk_size>] [-o <output_file>] <input_bed>\n' "$0"
  exit 1
}
# Default values
THREADS=1
OUTPUT_FILE=""
CHUNK_SIZE="" # Empty by default
INPUT_BED=""  # Initialised so the emptiness checks below are well-defined
# Parse the command-line arguments.
# Each value-taking flag now verifies its argument is present; previously
# e.g. a trailing `--threads` silently set THREADS to the empty string.
while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --threads)
      [[ -n "${2:-}" ]] || usage
      THREADS=$2
      shift
      ;;
    --chunk-size)
      [[ -n "${2:-}" ]] || usage
      CHUNK_SIZE=$2
      shift
      ;;
    -o)
      [[ -n "${2:-}" ]] || usage
      OUTPUT_FILE=$2
      shift
      ;;
    *)
      # First bare argument is the input BED; a second bare argument is an error.
      if [[ -z "$INPUT_BED" ]]; then
        INPUT_BED=$1
      else
        usage
      fi
      ;;
  esac
  shift
done
# Check if the input BED file is provided
if [[ -z "$INPUT_BED" ]]; then
  usage
fi
# Create a temporary directory for split files.
export TMPDIR=/scratch/pq08/rd6078/tmp
# Abort immediately if mktemp fails; otherwise the splits below would be
# written to "/bed_chunk_*" at the filesystem root.
tmp_dir=$(mktemp -d) || { echo "ERROR: mktemp -d failed under $TMPDIR" >&2; exit 1; }
# tmp_dir=/scratch/pq08/rd6078/tmp/tmp.smPmXC0Ndp
# echo "WARNING: resuming previously failed run for testing purposes: $tmp_dir"
# Split the input BED file based on chunk size or thread count.
echo "split BED to $tmp_dir"
if [[ -n "$CHUNK_SIZE" ]]; then
  split -l "$CHUNK_SIZE" "$INPUT_BED" "$tmp_dir/bed_chunk_"
else
  # No explicit chunk size: divide lines evenly across threads; the +1
  # guards against producing an extra zero-length chunk on exact division.
  split -l $(( $(wc -l < "$INPUT_BED") / THREADS + 1 )) "$INPUT_BED" "$tmp_dir/bed_chunk_"
fi
# Process each chunk in parallel; process_chunk.sh writes a *.out per chunk.
# ($patformm_path quoted so a path with spaces does not word-split.)
echo "process temp bed chunks"
find "$tmp_dir" -maxdepth 1 -name 'bed_chunk_*' | xargs -n 1 -P "$THREADS" -I {} "$patformm_path/process_chunk.sh" {} # &&
# cat "$tmp_dir"/*.out > $tmp_dir/tmp_final_output.bed
# Combine results and recalculate counts.
echo "Combining results..."
# Aggregate duplicate (field1, field2, field3) keys across all chunk outputs
# by summing their counts (field 4). NOTE: awk's `for (key in count)` emits
# keys in arbitrary order, so the pre-aggregation sort that used to sit in
# this pipeline had no effect on the final result — ordering is established
# solely by the sort below. The redundant sort has been removed.
cat "$tmp_dir"/*.out | \
awk '
    BEGIN {OFS="\t"}
    {
        key = $1 "\t" $2 "\t" $3
        count[key] += $4
    }
    END {
        for (key in count) {
            print key, count[key]
        }
    }
' > "$tmp_dir/final_combined.bed"
# Sort numerically on field 2 then lexically on field 3.
# NOTE(review): field 1 (chromosome) is deliberately excluded from the sort
# keys — presumably field 2 is a genome-wide CpG index as in wgbstools .pat
# files; confirm against process_chunk.sh output before changing.
sort -k2,2n -k3,3 "$tmp_dir/final_combined.bed" > "$tmp_dir/sorted_final_combined.bed"
# Compress the sorted BED file using bgzip
# bgzip sorted_final_combined.bed
# Index the compressed BED file using tabix
# tabix -p bed sorted_final_combined.bed.gz
# Output results.
if [[ -z "$OUTPUT_FILE" ]]; then
  # No -o given: stream the final table to stdout.
  cat "$tmp_dir/sorted_final_combined.bed"
else
  mv "$tmp_dir/sorted_final_combined.bed" "$OUTPUT_FILE"
  # Compress and index. $OUTPUT_FILE is quoted so paths containing spaces or
  # glob characters do not word-split/expand (was unquoted before).
  # tabix: -C builds a CSI index, -b 2 -e 2 index on column 2, -m 12 sets the
  # minimal interval width (2^12) for the CSI index.
  bgzip "$OUTPUT_FILE" && tabix -C -b 2 -e 2 -m 12 "$OUTPUT_FILE.gz"
fi
# rm -r "$tmp_dir"