Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
bdolmo committed Nov 18, 2024
0 parents commit 5a27eff
Show file tree
Hide file tree
Showing 1,015 changed files with 195,698 additions and 0 deletions.
1 change: 1 addition & 0 deletions libdeflate
Submodule libdeflate added at d5070b
27 changes: 27 additions & 0 deletions src/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# fragment_extractor

### Synopsis
Tool to extract read counts from cfDNA samples.

Features:
- Count total read count per window (default 1Mb)
- Count individual short fragments (< 150bp) and long fragments (> 150bp)
- fragment size ratio (short/long)

Outputs:
- wig file (can be fed to ichorCNA!)
- bed file with the following structure:
```
chr pos end read_count ultra_short_fragments short_fragmentslong_fragments fragment_size_ratio
chr1 0 1000000 20105 290 3974 15841 0.0729743
chr1 1000000 2000000 73446 1322 16566 55558 0.079802
chr1 2000000 3000000 58489 1032 13101 44356 0.0787726
```
### Usage
```
./fragment_extractor <BAM file> <BED file> <output file> <WIG file>
```

### Todo:
- Include a parameter to control windo size.
- Define short, long fragment thresholds.
161 changes: 161 additions & 0 deletions src/cfDNAcounter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#include <htslib/sam.h>
#include <htslib/hts.h>
#include <string>
#include <vector>
#include <tuple>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <sstream>

struct FragmentStats {
int read_count;
int ultra_short_fragments;
int short_fragments;
int long_fragments;
double fragment_size_ratio;
};

// Calculate fragment statistics for each region
FragmentStats CalculateFragmentStats(htsFile* bamFile, bam_hdr_t* header, hts_idx_t* idx, const std::string& region) {
FragmentStats stats = {0, 0, 0, 0, 0.0};

hts_itr_t* iter = sam_itr_querys(idx, header, region.c_str());
if (!iter) {
throw std::runtime_error(" ERROR: Failed to set iterator for region: " + region);
}
bam1_t* b = bam_init1();
while (sam_itr_next(bamFile, iter, b) >= 0) {

if (b->core.qual < 20) {
continue;
}

++stats.read_count;

const uint32_t* cigar = bam_get_cigar(b);
int sequence_length = b->core.l_qseq;

if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
sequence_length -= bam_cigar_oplen(cigar[0]);
}
if (bam_cigar_op(cigar[b->core.n_cigar - 1]) == BAM_CSOFT_CLIP) {
sequence_length -= bam_cigar_oplen(cigar[b->core.n_cigar - 1]);
}

if (sequence_length < 100) {
++stats.ultra_short_fragments;
} else if (sequence_length >= 100 && sequence_length <= 150) {
++stats.short_fragments;
} else {
++stats.long_fragments;
}
}
bam_destroy1(b);
hts_itr_destroy(iter);

// Calculate fragment size ratio
if (stats.short_fragments > 0) {
stats.fragment_size_ratio = static_cast<double>(stats.ultra_short_fragments) / stats.short_fragments;
}

return stats;
}

// Parse regions from BED file
std::vector<std::tuple<std::string, int, int>> ParseRegionsFromBed(const std::string& bedFile) {
std::vector<std::tuple<std::string, int, int>> regions;
std::ifstream infile(bedFile);

if (!infile) {
throw std::runtime_error(" ERROR: Failed to open BED file: " + bedFile);
}

std::string line;
while (std::getline(infile, line)) {
std::istringstream ss(line);
std::string chrom;
int start, end;
if (ss >> chrom >> start >> end) {
regions.emplace_back(chrom, start, end);
}
}

return regions;
}

int main(int argc, char* argv[]) {
if (argc < 5) {
std::cerr << "Usage: " << argv[0] << " <BAM file> <BED file> <output file> <WIG file>" << std::endl;
return 1;
}

std::string bamFilePath = argv[1];
std::string bedFile = argv[2];
std::string outputFile = argv[3];
std::string wigFile = argv[4];

try {
htsFile* bamFile = sam_open(bamFilePath.c_str(), "r");
if (!bamFile) throw std::runtime_error(" ERROR: Failed to open BAM file: " + bamFilePath);

bam_hdr_t* header = sam_hdr_read(bamFile);
if (!header) throw std::runtime_error(" ERROR: Failed to read BAM header from: " + bamFilePath);

hts_idx_t* idx = sam_index_load(bamFile, bamFilePath.c_str());
if (!idx) throw std::runtime_error(" ERROR: Failed to load BAM index for: " + bamFilePath);

std::vector<std::tuple<std::string, int, int>> regions = ParseRegionsFromBed(bedFile);

std::ofstream outFile(outputFile);
if (!outFile) throw std::runtime_error(" ERROR: Failed to open output file: " + outputFile);

// Open the WIG output file
std::ofstream wigOut(wigFile);
if (!wigOut) throw std::runtime_error(" ERROR: Failed to open WIG file: " + wigFile);

// Write headers for the main output
outFile << "chr\tpos\tend\tread_count\tultra_short_fragments\tshort_fragments\tlong_fragments\tfragment_size_ratio\n";

std::string current_chrom = "";
int step = 1000000;

// Process each region
for (const auto& [chrom, start, end] : regions) {
std::string region = chrom + ":" + std::to_string(start) + "-" + std::to_string(end);
FragmentStats stats = CalculateFragmentStats(bamFile, header, idx, region);

// Write data to the main output file
outFile << chrom << "\t" << start << "\t" << end << "\t"
<< stats.read_count << "\t"
<< stats.ultra_short_fragments << "\t"
<< stats.short_fragments << "\t"
<< stats.long_fragments << "\t"
<< stats.fragment_size_ratio << "\n";

// Write short_fragments data to the WIG file in fixedStep format
if (chrom != current_chrom) {
// Start a new fixedStep section for a new chromosome
wigOut << "fixedStep chrom=" << chrom << " start=1" << " step=" << step << " span=" << step << "\n";
current_chrom = chrom;
}
wigOut << stats.short_fragments << "\n";
}

outFile.close();
wigOut.close();

// Clean up
hts_idx_destroy(idx);
bam_hdr_destroy(header);
sam_close(bamFile);

} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}

return 0;
}


Binary file added src/cfDNAcounter.o
Binary file not shown.
Binary file added src/fragment_extractor
Binary file not shown.
Loading

0 comments on commit 5a27eff

Please sign in to comment.