Sequence Annotations

A. Prokka

  1. Predict genes based on the reference genome annotation (GenBank reference supplied to Prokka via --proteins)
#!/usr/bin/bash -l
#SBATCH -p batch
#SBATCH -J Emb_Prk_Mpox
#SBATCH -n 3
#SBATCH --mem=8G

# Exit with error reporting
set -e
set -x

# Load modules
module purge
module load clustalw/2.1
module load emboss/6.6.0
module load samtools/1.15.1
module load prokka/1.14.6

# Define directories
WORK_DIR="/var/scratch/${USER}/viral_genomes/mpox_data"
FASTA_DIR="${WORK_DIR}/mpox_fasta"
OUT_DIR="${WORK_DIR}/mpox_results/mpox_prokka"

# Make the output directory if it doesn't exist
if [ ! -e "${OUT_DIR}" ]; then
  mkdir -p "${OUT_DIR}"
fi

# Reference GenBank file supplying trusted protein annotations (used with --proteins)
protein_db="${WORK_DIR}/mpox_refseqs/Mpox_ref_NC_063383.1.gb"

echo "Running Prokka"
# Loop through all fasta.gz files in the input directory
for gz_file in "${FASTA_DIR}"/*.fasta.gz; do
    if [ -f "$gz_file" ]; then
        sample=$(basename "$gz_file" .fasta.gz)
        temp_fasta="${FASTA_DIR}/${sample}.temp.fasta"
        final_fasta="${FASTA_DIR}/${sample}.fasta"
        compressed_fasta="${OUT_DIR}/${sample}.fasta.gz"
        output_path="${OUT_DIR}/${sample}"

        # Decompress the gz file to a temporary fasta file
        echo "Decompressing ${gz_file} to ${temp_fasta}"
        gunzip -c "$gz_file" > "$temp_fasta"

        # Check if the temporary file is in FASTA format
        if grep -q "^>" "$temp_fasta"; then
            echo "File ${temp_fasta} is in FASTA format"
            fasta_file="$temp_fasta"
        else
            # Convert to FASTA format if it's not
            fasta_file="${FASTA_DIR}/${sample}.converted.fasta"
            echo "Converting ${temp_fasta} to FASTA format"
            seqret -sequence "$temp_fasta" -outseq "$fasta_file"
        fi

        # Clean the FASTA headers and save to the final fasta file
        echo "Cleaning headers in ${fasta_file}"
        awk '/^>/ {header=$0; gsub(/[|\/\-]/, "_", header); print header} !/^>/ {print}' "$fasta_file" > "$final_fasta"

        # Ensure the final fasta file has a proper FASTA header
        if ! grep -q "^>" "$final_fasta"; then
        echo "ERROR!!! The file ${final_fasta} does not have a proper FASTA header. Adding default headers."

        # Prepend a single default header (the file contains no ">" lines,
        # so sequence boundaries cannot be recovered; treat it as one record)
        awk -v s="${sample}" 'NR==1 {print ">" s} {print}' "$final_fasta" > "${final_fasta}.tmp"
        mv "${final_fasta}.tmp" "$final_fasta"
        echo "Default headers added to ${final_fasta}"
        else
            echo "The file ${final_fasta} has been cleaned and is in proper FASTA format"

        fi

        # Compress the final fasta file
        echo "Compressing ${final_fasta} to ${compressed_fasta}"
        bgzip -c "$final_fasta" > "$compressed_fasta"

        # Index the compressed fasta file
        echo "Indexing ${compressed_fasta}"
        samtools faidx "$compressed_fasta"

        # Echo fasta file path for the current sample
        echo "contigs_fasta for ${sample}: ${final_fasta}"

        # Run the Prokka command on the current file
        echo "Processing Prokka for ${sample}: ${final_fasta}"
        prokka "$final_fasta" \
            --outdir "$output_path" \
            --cpus 3 \
            --mincontiglen 200 \
            --kingdom Viruses \
            --centre WHO \
            --addgenes \
            --addmrna \
            --locustag MPXV \
            --genus "Mpox virus" \
            --proteins "$protein_db" \
            --usegenus \
            --compliant \
            --rfam \
            --force \
            --debug

        # Clean up all temp files
        rm -f "$temp_fasta"

    else
        echo "ERROR!!! fasta.gz file not found: ${gz_file}"
    fi
done
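As a sanity check, the same Prokka call can be tried on a single cleaned FASTA before running the full loop; a minimal sketch, with hypothetical file and directory names:

# Hypothetical single-sample test of the same Prokka call
prokka sample01.fasta \
    --outdir prokka_test/sample01 \
    --kingdom Viruses \
    --proteins Mpox_ref_NC_063383.1.gb \
    --force

# Prokka writes one fixed set of outputs per run; the annotation lives in the
# .gff/.gbk files, with feature counts summarised in the .txt file
ls prokka_test/sample01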
  2. Collect & group the Prokka outputs of interest (gffs, gbks, gbfs & faas) per sample, each in its own folder
#!/bin/bash

# Define directories
WORK_DIR="/var/scratch/${USER}/viral_genomes/mpox_data"
PROKKA_DIR="${WORK_DIR}/mpox_results/mpox_prokka"
OUTPUT_DIR="${WORK_DIR}/mpox_results/prokka_data"

# Make the output directory if it doesn't exist
if [ ! -e "${OUTPUT_DIR}" ]; then
  mkdir -p "${OUTPUT_DIR}"
fi

# Add a debug statement to check PROKKA_DIR
echo "PROKKA_DIR: ${PROKKA_DIR}"

# Iterate over subdirectories in PROKKA_DIR
for sample_dir in "${PROKKA_DIR}"/*; do
  if [ ! -d "${sample_dir}" ]; then
    continue
  fi

  # Extract the sample name from the directory name
  sample_name=$(basename "${sample_dir}")
  # Prokka names its outputs PROKKA_<MMDDYYYY> by default; adjust the date to match your run
  gff_file="${sample_dir}/PROKKA_07132024.gff"
  gbk_file="${sample_dir}/PROKKA_07132024.gbk"
  gbf_file="${sample_dir}/PROKKA_07132024.gbf"
  faa_file="${sample_dir}/PROKKA_07132024.faa"

  # Check each file of interest (gff, gbk, gbf, faa)
  check_file() {
    local file=$1
    if [ ! -f "${file}" ]; then
      echo "Error: ${file} not found!"
    elif [ ! -s "${file}" ]; then
      echo "Error: ${file} is empty!"
    elif [ ! -r "${file}" ]; then
      echo "Error: ${file} is unreadable!"
    else
      echo "${file} is present and readable."
    fi
  }

  check_file "${gff_file}"
  check_file "${gbk_file}"
  check_file "${gbf_file}"
  check_file "${faa_file}"

  OUT="${OUTPUT_DIR}/${sample_name}"
  mkdir -p "${OUT}"

  cat_gff="${OUT}/${sample_name}.gff"
  cat_gbk="${OUT}/${sample_name}.gbk"
  cat_gbf="${OUT}/${sample_name}.gbf"
  cat_faa="${OUT}/${sample_name}.faa"

  echo "Concatenating files for ${sample_name}:"
  echo "Gff file: ${gff_file}"
  echo "Gbk file: ${gbk_file}"
  echo "Gbf file: ${gbf_file}"
  echo "Faa file: ${faa_file}"

  cat "${gff_file}" > "${cat_gff}"
  cat "${gbk_file}" > "${cat_gbk}"
  cat "${gbf_file}" > "${cat_gbf}"
  cat "${faa_file}" > "${cat_faa}"
  echo "Concatenating is complete for ${sample_name}"
  echo "Concatenated assembly: ${cat_gff}"
  echo "Concatenated assembly: ${cat_gbk}"
  echo "Concatenated assembly: ${cat_gbf}"
  echo "Concatenated assembly: ${cat_faa}"
done

echo "Concatenation for all samples is complete!"
  3. Group the Prokka outputs by file type (gffs, gbks, gbfs & faas), e.g. collecting all gbks from all samples into one prokka_gbk folder
#!/bin/bash

# Define directories
WORK_DIR="/var/scratch/${USER}/viral_genomes/mpox_data"
PROKKA_DIR="${WORK_DIR}/mpox_results/prokka_data"

# Add a debug statement to check PROKKA_DIR
echo "PROKKA_DIR: ${PROKKA_DIR}"

# Iterate over subdirectories in PROKKA_DIR
for sample_dir in "${PROKKA_DIR}"/*; do
  if [ ! -d "${sample_dir}" ]; then
    continue
  fi

  # Extract the sample name from the directory name
  sample_name=$(basename "${sample_dir}")

  # Skip the per-type output folders created by earlier runs of this script
  case "${sample_name}" in
    prokka_gff|prokka_gbk|prokka_gbf|prokka_faa) continue ;;
  esac
  gff_file="${sample_dir}/${sample_name}.gff"
  gbk_file="${sample_dir}/${sample_name}.gbk"
  gbf_file="${sample_dir}/${sample_name}.gbf"
  faa_file="${sample_dir}/${sample_name}.faa"

  # Check each file of interest (gff, gbk, gbf, faa)
  check_file() {
    local file=$1
    if [ ! -f "${file}" ]; then
      echo "Error: ${file} not found!"
    elif [ ! -s "${file}" ]; then
      echo "Error: ${file} is empty!"
    elif [ ! -r "${file}" ]; then
      echo "Error: ${file} is unreadable!"
    else
      echo "${file} is present and readable."
    fi
  }

  check_file "${gff_file}"
  check_file "${gbk_file}"
  check_file "${gbf_file}"
  check_file "${faa_file}"

  GFF_OUT="${PROKKA_DIR}/prokka_gff"
  GBK_OUT="${PROKKA_DIR}/prokka_gbk"
  GBF_OUT="${PROKKA_DIR}/prokka_gbf"
  FAA_OUT="${PROKKA_DIR}/prokka_faa"
  mkdir -p "${GFF_OUT}" "${GBK_OUT}" "${GBF_OUT}" "${FAA_OUT}"

  cat_gff="${GFF_OUT}/${sample_name}.gff"
  cat_gbk="${GBK_OUT}/${sample_name}.gbk"
  cat_gbf="${GBF_OUT}/${sample_name}.gbf"
  cat_faa="${FAA_OUT}/${sample_name}.faa"

  echo "Concatenating files for ${sample_name}:"
  echo "Gff file: ${gff_file}"
  echo "Gbk file: ${gbk_file}"
  echo "Gbf file: ${gbf_file}"
  echo "Faa file: ${faa_file}"

  cat "${gff_file}" > "${cat_gff}"
  cat "${gbk_file}" > "${cat_gbk}"
  cat "${gbf_file}" > "${cat_gbf}"
  cat "${faa_file}" > "${cat_faa}"
  echo "Concatenating is complete for ${sample_name}"
  echo "Concatenated assembly: ${cat_gff}"
  echo "Concatenated assembly: ${cat_gbk}"
  echo "Concatenated assembly: ${cat_gbf}"
  echo "Concatenated assembly: ${cat_faa}"
done

echo "Concatenation for all samples is complete!"
  4. Rewrite the sequence identifiers inside the gff, gbk and gbf files to match each sample name
#!/bin/bash

# Define directories
WORK_DIR="/var/scratch/${USER}/viral_genomes/mpox_data"
GFF_DIR="${WORK_DIR}/mpox_results/prokka_data/prokka_gff"
GBK_DIR="${WORK_DIR}/mpox_results/prokka_data/prokka_gbk"
GBF_DIR="${WORK_DIR}/mpox_results/prokka_data/prokka_gbf"

# Loop through each sample in the GFF_DIR
for gff_file in "${GFF_DIR}"/*.gff; do
  # Extract the sample name from the file name
  sample_name=$(basename "${gff_file}" .gff) 
  gbk_file="${GBK_DIR}/${sample_name}.gbk"
  gbf_file="${GBF_DIR}/${sample_name}.gbf"
  
  # Function to check files
  check_file() {
    local file=$1
    if [ ! -f "${file}" ]; then
      echo "Error: ${file} not found!"
    elif [ ! -s "${file}" ]; then
      echo "Error: ${file} is empty!"
    elif [ ! -r "${file}" ]; then
      echo "Error: ${file} is unreadable!"
    else
      echo "${file} is present and readable."
      return 0
    fi
    return 1
  }

  # Function to process GFF file
  process_gff() {
    local file=$1
    local tmp_file=$(mktemp)
    while IFS= read -r line; do
      if [[ $line == "##sequence-region"* ]]; then
        # Swap in the sample name as seqid but keep the original start/end coordinates
        set -- $line
        echo "##sequence-region ${sample_name} ${3:-} ${4:-}" >> "${tmp_file}"
      elif [[ $line == *"gnl|WHO|MPXV_1"* ]]; then
        echo "${line//gnl|WHO|MPXV_1/gnl|WHO|${sample_name}}" >> "${tmp_file}"
      else
        echo "${line}" >> "${tmp_file}"
      fi
    done < "${file}"
    mv "${tmp_file}" "${file}"
    echo "Updated ${file} with sample name ${sample_name}."
  }

  # Function to process GBK and GBF files
  process_gbk_gbf() {
    local file=$1
    local tmp_file=$(mktemp)
    while IFS= read -r line; do
      if [[ $line == LOCUS* ]]; then
        echo "${line/MPXV_1/${sample_name}}" >> "${tmp_file}"
      else
        echo "${line}" >> "${tmp_file}"
      fi
    done < "${file}"
    mv "${tmp_file}" "${file}"
    echo "Updated ${file} with sample name ${sample_name}."
  }

  # Check and process GFF file
  if check_file "${gff_file}"; then
    process_gff "${gff_file}"
  fi

  # Check and process GBK file
  if check_file "${gbk_file}"; then
    process_gbk_gbf "${gbk_file}"
  fi

  # Check and process GBF file
  if check_file "${gbf_file}"; then
    process_gbk_gbf "${gbf_file}"
  fi
done
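To spot-check the rewrite, the sample names should now appear in the LOCUS lines and ##sequence-region pragmas; for example:

# Each LOCUS line should carry the sample name instead of MPXV_1
grep -H "^LOCUS" "${GBK_DIR}"/*.gbk | head
# Likewise for the GFF sequence-region pragmas
grep -H "^##sequence-region" "${GFF_DIR}"/*.gff | head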

B. Variant calling

Variant calling with snippy, followed by variant annotation and effect prediction with snpEff, and variant extraction with SnpSift and R.
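For orientation, the three stages chain together roughly as follows for a single sample (a simplified sketch; the file paths and snpEff database name mirror the full scripts below):

# 1) Call variants against the reference from an assembled genome
snippy --ref Mpox_ref_NC_063383.1.fasta --ctgs sample01.fasta --outdir snippy/sample01 --prefix sample01
# 2) Annotate the filtered variants with the custom snpEff database
java -jar snpEff.jar NC063383 snippy/sample01/sample01.filt.vcf > sample01.snpEff.vcf
# 3) Pull selected ANN fields into a table for downstream analysis in R
java -jar SnpSift.jar extractFields sample01.snpEff.vcf "ANN[*].GENE" "ANN[*].IMPACT" > sample01.snpsift.txt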

  1. Variant calling using snippy
#!/usr/bin/bash -l
#SBATCH -p batch
#SBATCH -J Snippy_Mpox
#SBATCH -n 4
#SBATCH --mem=8G

# Exit with error reporting
set -e
set -x

# load required modules
module purge
module load perl/5.22.3
module load snippy/4.6.0
module load python/3.9

# Directories path
WORK_DIR="/var/scratch/${USER}/viral_genomes/mpox_data"
DATA_DIR="${WORK_DIR}/mpox_fasta"
REF_DIR="${WORK_DIR}/mpox_refseqs"
OUT_DIR="${WORK_DIR}/mpox_results/mpox_snippy"

# Define ref
reference="${REF_DIR}/Mpox_ref_NC_063383.1.fasta"

# Make directories
if [ ! -e "${OUT_DIR}" ]; then
  mkdir -p "${OUT_DIR}"
fi

# Step 1: Run snippy
for fasta_file in "${DATA_DIR}"/*.fasta; do
    sample=$(basename "${fasta_file}" .fasta)

    # create output directory for every sample
    OUT="${OUT_DIR}/${sample}"

    if [ ! -e "${OUT}" ]; then
        mkdir -p "${OUT}"
    fi

    vcf="${OUT}/${sample}.vcf"
    bed="${OUT}/${sample}.bed"
    gff="${OUT}/${sample}.gff"
    csv="${OUT}/${sample}.csv"
    html="${OUT}/${sample}.html"

    # Only run snippy if none of the expected outputs exist yet (allows resuming)
    if [ ! -f "${vcf}" ] && [ ! -f "${bed}" ] && [ ! -f "${gff}" ] && [ ! -f "${csv}" ] && [ ! -f "${html}" ]; then
        echo -e "Calling variants on sample:\t${sample}; Fasta file: ${fasta_file}"
        snippy \
            --cpus 4 \
            --ram 8 \
            --prefix "${sample}" \
            --cleanup \
            --mapqual 60 \
            --basequal 15 \
            --mincov 10 \
            --force \
            --outdir "${OUT}" \
            --ref "${reference}" \
            --ctgs "${fasta_file}"
    fi
done

# Step 2: Run snippy-core
dirs=()
for d in "${OUT_DIR}"/*; do
    # Compare against the directory name, not the full path, and skip snippy-core on re-runs
    dn=$(basename "${d}")
    if [ -d "${d}" ] && [ "${dn}" != "reference" ] && [ "${dn}" != "snippy-core" ]; then
        dirs+=("${d}")
    fi
done

echo "Snippy output directories: ${dirs[*]}"

CORE_DIR="${OUT_DIR}/snippy-core"

if [ ! -e "${CORE_DIR}" ]; then
    mkdir -p "${CORE_DIR}"
fi

cd "${CORE_DIR}"
echo -e "Running snippy-core"
snippy-core --ref "${reference}" --prefix core "${dirs[@]}"
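If snippy-core finishes cleanly it writes its outputs with the chosen prefix into CORE_DIR; the file names below follow snippy-core's documented defaults:

# core.aln      - core-genome SNP alignment (input for phylogenetics)
# core.full.aln - whole-genome alignment including invariant sites
# core.tab      - per-site SNP table across all samples
# core.vcf      - multi-sample VCF of core-genome variants
ls -lh "${CORE_DIR}"/core.*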
  2. Variant annotation, effect prediction and extraction using snpEff and SnpSift
#!/usr/bin/bash -l
#SBATCH -p batch
#SBATCH -J mpox_snp_eff_sift
#SBATCH -n 4

# Exit immediately if a command exits with a non-zero status
set -e

# Load necessary modules
module purge
module load samtools/1.15.1
module load bcftools/1.15.1
module load snpeff/4.1g
module load java/17

# Directories path
WORK_DIR="/var/scratch/${USER}/viral_genomes/mpox_data"
REF_DIR="${WORK_DIR}/mpox_refseqs"
SNIPPY_DIR="${WORK_DIR}/mpox_results/mpox_snippy"

# Step 1: Prepare the snpEff config and database files for the Mpox reference
echo "Processing snpEff config for mpox"
mkdir -p "./database/snpEff/data/NC063383/"
cp -rf "${REF_DIR}/Mpox_ref_NC_063383.1.gff3" ./database/snpEff/data/NC063383/genes.gff
cp -rf "${REF_DIR}/Mpox_ref_NC_063383.1.fasta" ./database/snpEff/data/NC063383/sequences.fa
echo -e "# Mpox viral genome, version MpoxNC063383\nNC063383.genome: NC063383" > ./database/snpEff/data/NC063383/snpEff.config

# Step 2: Build the snpEff database
java -Xmx4g -jar /export/apps/snpeff/4.1g/snpEff.jar build \
        -config ./database/snpEff/data/NC063383/snpEff.config \
        -dataDir ./../ \
        -gff3 \
        -v NC063383

# Step 3: Compress the filtered VCF files (bgzip -c file.vcf > file.vcf.gz)
echo "Compressing filtered VCF files..."
for sample_dir in "${SNIPPY_DIR}"/*; do
    sample_name=$(basename "${sample_dir}")
    filtered_vcf="${sample_dir}/${sample_name}.filt.vcf"
    # Skip anything that is not a per-sample snippy directory
    [ -f "${filtered_vcf}" ] || continue
    echo "Compressing: ${filtered_vcf}"
    bgzip "${filtered_vcf}"
done

# Step 4: Index the filtered .vcf.gz files
echo "Indexing filtered VCF files..."
for sample_dir in "${SNIPPY_DIR}"/*; do
    sample_name=$(basename "${sample_dir}")
    filtered_vcf_gz="${sample_dir}/${sample_name}.filt.vcf.gz"
    [ -f "${filtered_vcf_gz}" ] || continue
    echo "Indexing: ${filtered_vcf_gz}"
    bcftools index -c --threads 4 "${filtered_vcf_gz}"
done

# Step 5: Variant annotation with the custom snpEff database
# Define input files and directories
vcf_dir="${SNIPPY_DIR}"
annotated_dir="./mpox_results/annotated_mpox_variants_snippy"
reference_file="./database/snpEff/data/NC063383/sequences.fa"
gff_file="./database/snpEff/data/NC063383/genes.gff"
snpeff_jar="/export/apps/snpeff/4.1g/snpEff.jar"

# Create output directory for annotated variants
mkdir -p "$annotated_dir"

# Run SnpEff for variant annotation
echo "Performing variant annotation..."
for sample_dir in "${SNIPPY_DIR}"/*; do
    sample_name=$(basename "${sample_dir}")
    vcf_file="${sample_dir}/${sample_name}.filt.vcf.gz"
    [ -f "${vcf_file}" ] || continue
    echo "Processing ${sample_name}: ${vcf_file}"
    # snpEff reads the bgzipped VCF directly, so no bcftools pipe is needed
    java -Xmx4g -jar "$snpeff_jar" \
           -config ./database/snpEff/data/NC063383/snpEff.config \
           -dataDir ./../ \
           -v NC063383 "${vcf_file}" > "${annotated_dir}/${sample_name}.snpEff.vcf"

    # Rename summary.html and genes.txt, then compress and index the VCF
    mv ./snpEff_summary.html "${annotated_dir}/${sample_name}.snpEff.summary.html"
    mv ./snpEff_genes.txt "${annotated_dir}/${sample_name}.snpEff.genes.txt"

    # Compress vcf
    bgzip -c "${annotated_dir}/${sample_name}.snpEff.vcf" > "${annotated_dir}/${sample_name}.snpEff.vcf.gz"

    # Create tabix index (ships with samtools/htslib)
    tabix -p vcf -f "${annotated_dir}/${sample_name}.snpEff.vcf.gz"

    # Generate per-sample VCF stats
    bcftools stats "${annotated_dir}/${sample_name}.snpEff.vcf.gz" > "${annotated_dir}/${sample_name}.snpEff.stats.txt"

    echo "Renaming, compression and indexing complete for ${sample_name}."
done

echo "Variant annotation complete."

# Step 6: Variant extraction with SnpSift
# Define input files and directories
snpeff_dir="${annotated_dir}"
extracted_dir="./mpox_results/extracted_variants_snippy"

# Create output directory for extracted variants
mkdir -p "$extracted_dir"

# Run SnpSift extractFields to pull annotation fields from each annotated VCF
echo "Performing variant extraction..."
for snpeff_file in "$snpeff_dir"/*.snpEff.vcf.gz; do
    sample_name=$(basename "${snpeff_file}" .snpEff.vcf.gz)
    echo "Processing ${sample_name}: ${snpeff_file}"
    bcftools view --threads 4 "$snpeff_file" |
    java -Xmx4g -jar "/export/apps/snpeff/4.1g/SnpSift.jar" \
        extractFields \
        -s "," \
        -e "." \
        /dev/stdin \
        "ANN[*].GENE" "ANN[*].GENEID" \
        "ANN[*].IMPACT" "ANN[*].EFFECT" \
        "ANN[*].FEATURE" "ANN[*].FEATUREID" \
        "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \
        "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \
        "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \
        "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \
        "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \
        "LOF[*].GENE" "LOF[*].GENEID" "LOF[*].NUMTR" "LOF[*].PERC" \
        "NMD[*].GENE" "NMD[*].GENEID" "NMD[*].NUMTR" "NMD[*].PERC" \
        > "${extracted_dir}/${sample_name}.snpsift.txt"
done

echo "SnpSift variant extraction complete"
  3. Extract the variants for analysis and visualization in R
#!/usr/bin/env Rscript

# In the shell, load R and start a session before running the code below:
#   module purge
#   module load R/4.3
#   R

# Remove all objects saved in the R workspace
rm(list = ls())

# Set the input directory where the sample directories are located
input_dir <- "./mpox_results/extracted_variants_snippy"

# Get a list of all sample directories in the input directory
file_list <- list.files(input_dir, pattern = "\\.snpsift\\.txt", full.names = TRUE)

# Create an empty data frame to store the aggregated data
agg_df <- data.frame()

# Loop over each annotated result file
for (file in file_list) {
  # Read the annotated data from the file
  annotated_data <- read.delim(file, sep = "\t", header = TRUE)

  # Skip sample if no annotated data was found
  if (nrow(annotated_data) == 0) {
    next
  }

  # Extract the sample name from the file path
  sample_name <- sub("\\.snpsift\\.txt$", "", basename(file))

  # Add the sample name as a column in the data frame
  annotated_data$sample_name <- sample_name

  # Append the data to the aggregated data frame
  agg_df <- rbind(agg_df, annotated_data)
}

# Set the output file name
output_file <- "combined_annotated_data.csv"

# Check if the aggregated data frame is empty
if (nrow(agg_df) > 0) {
  # Write the aggregated data to a CSV file
  write.csv(agg_df, file = output_file, row.names = FALSE)
  cat("Variant extraction complete. Data saved to:", output_file, "\n")
} else {
  cat("No data extracted. Output file not created.\n")
}