Skip to content

Commit

Permalink
Merge pull request #9 from Illumina/read-caching
Browse files Browse the repository at this point in the history
Read caching
  • Loading branch information
egor-dolzhenko authored Jul 19, 2017
2 parents b0c3f5d + 515e3e1 commit 367f93c
Show file tree
Hide file tree
Showing 11 changed files with 235 additions and 148 deletions.
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <ostream>
#include <string>


enum class ReadType { kSpanning, kFlanking, kInrepeat, kOther };
const std::map<ReadType, std::string> kReadTypeToString = {
{ReadType::kInrepeat, "INREPEAT"},
Expand Down
37 changes: 19 additions & 18 deletions common/repeat_spec.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,27 @@
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//

#include <string>
#include <fstream>
#include <cassert>
#include <set>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include <map>

#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/json_parser.hpp>
#include <boost/algorithm/string/join.hpp>
#include <boost/filesystem.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/optional/optional.hpp>
#include <boost/tokenizer.hpp>
#include <boost/filesystem.hpp>
#include <boost/property_tree/json_parser.hpp>
#include <boost/property_tree/ptree.hpp>
#include <boost/regex.hpp>
#include <boost/algorithm/string/join.hpp>
#include <boost/tokenizer.hpp>

#include "common/ref_genome.h"
#include "common/repeat_spec.h"
#include "common/timestamp.h"
#include "purity/purity.h"

using std::string;
Expand All @@ -62,7 +63,7 @@ const char RepeatSpec::LeftFlankBase() const {
return left_flank[left_flank.size() - 1];
}

RepeatSpec::RepeatSpec(const string& json_path) {
RepeatSpec::RepeatSpec(const string &json_path) {
std::ifstream istrm(json_path.c_str());

if (!istrm.is_open()) {
Expand Down Expand Up @@ -104,21 +105,21 @@ RepeatSpec::RepeatSpec(const string& json_path) {

if (confusion_node) {
offtarget_regions.clear();
for (const ptree::value_type& region_node : *confusion_node) {
assert(region_node.first.empty()); // array elements have no names
for (const ptree::value_type &region_node : *confusion_node) {
assert(region_node.first.empty()); // array elements have no names
offtarget_regions.push_back(Region(region_node.second.data()));
}
}
}

// Fill out prefix and suffix sequences.
bool LoadFlanks(const string& genome_path, double min_wp,
RepeatSpec* repeat_spec) {
bool LoadFlanks(const string &genome_path, double min_wp,
RepeatSpec *repeat_spec) {
RefGenome ref_genome(genome_path);
// Reference repeat flanks should be at least as long as reads.
const int kFlankLen = 250;

const Region& repeat_region = repeat_spec->target_region;
const Region &repeat_region = repeat_spec->target_region;

const int64_t left_flank_begin = repeat_region.start() - kFlankLen;
const int64_t left_flank_end = repeat_region.start() - 1;
Expand Down Expand Up @@ -155,8 +156,8 @@ bool LoadFlanks(const string& genome_path, double min_wp,
return true;
}

bool LoadRepeatSpecs(const string& specs_path, const string& genome_path,
double min_wp, map<string, RepeatSpec>* repeat_specs) {
bool LoadRepeatSpecs(const string &specs_path, const string &genome_path,
double min_wp, map<string, RepeatSpec> *repeat_specs) {
assert(!specs_path.empty());

const boost::regex regex_json(".*\\.json$");
Expand All @@ -169,7 +170,7 @@ bool LoadRepeatSpecs(const string& specs_path, const string& genome_path,
boost::smatch what;

if (boost::regex_match(fname, what, regex_json)) {
cerr << "[Loading " << fname << "]" << endl;
cerr << TimeStamp() << ",[Loading " << fname << "]" << endl;

const string json_path = itr->path().string();
RepeatSpec repeat_spec(json_path);
Expand Down
25 changes: 25 additions & 0 deletions common/timestamp.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Concept: Michael Eberle <[email protected]>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//

#include "common/timestamp.h"

#include <ctime>

// Returns the current local time formatted with the strftime pattern "%FT%T",
// i.e. YYYY-MM-DDTHH:MM:SS. Returns an empty string if the local time cannot
// be obtained or formatted.
//
// NOTE: std::localtime may hand back a pointer to storage shared between
// calls, so concurrent callers need external synchronization.
std::string TimeStamp() {
  const std::time_t now = std::time(nullptr);

  // std::localtime signals failure with a null pointer; passing that on to
  // std::strftime would be undefined behavior.
  const std::tm *local_time = std::localtime(&now);
  if (local_time == nullptr) {
    return std::string();
  }

  const std::size_t timestamp_size = 80;
  char timestamp_buf[timestamp_size];

  // strftime returns the number of characters written (terminator excluded),
  // or 0 if the result does not fit; in the latter case the buffer contents
  // are indeterminate, so only the reported number of characters is copied.
  const std::size_t num_written =
      std::strftime(timestamp_buf, timestamp_size, "%FT%T", local_time);
  return std::string(timestamp_buf, num_written);
}
28 changes: 28 additions & 0 deletions common/timestamp.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//
// Expansion Hunter
// Copyright (c) 2016 Illumina, Inc.
//
// Author: Egor Dolzhenko <[email protected]>,
// Mitch Bekritsky <[email protected]>, Richard Shaw
// Concept: Michael Eberle <[email protected]>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//

#pragma once

#include <string>
#include <chrono>

// Returns the current local time as a string formatted with the strftime
// pattern "%FT%T", i.e. YYYY-MM-DDTHH:MM:SS. Used as a prefix for log
// messages written to stderr.
std::string TimeStamp();
2 changes: 1 addition & 1 deletion genotyping/short_repeat_genotyper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include "genotyping/short_repeat_genotyper.h"

#include <algorithm>
#import <cassert>
#include <cassert>
#include <cctype>
#include <cmath>
#include <iostream>
Expand Down
42 changes: 22 additions & 20 deletions include/irr_counting.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@

#pragma once

#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <string>
#include <vector>

#include "common/genomic_region.h"
Expand All @@ -46,31 +46,33 @@ enum WhatToCache { kCacheAll, kCacheIrr };

// Extract alignment pairs from a specified region.
void CacheReadsFromRegion(
const Region& region, const WhatToCache whatToCache,
const std::vector<std::vector<std::string>>& units_shifts,
double min_wp_score, BamFile* bam_file, AlignPairs* align_pairs);
const Region &region, const WhatToCache whatToCache,
const std::vector<std::vector<std::string>> &units_shifts,
double min_wp_score, BamFile *bam_file, AlignPairs *align_pairs);

void CountAnchoredIrrs(
const BamFile& bam_file, const Parameters& parameters,
const Region& target_neighborhood, const RepeatSpec& repeat_spec,
const std::unordered_set<std::string>& ontarget_frag_names,
AlignPairs& align_pairs, int& num_anchored_irrs,
const std::vector<std::vector<std::string>>& units_shifts,
std::vector<RepeatAlign>* anchored_irrs);
const BamFile &bam_file, const Parameters &parameters,
const Region &target_neighborhood, const RepeatSpec &repeat_spec,
const std::unordered_set<std::string> &ontarget_frag_names,
AlignPairs &align_pairs, int &num_anchored_irrs,
const std::vector<std::vector<std::string>> &units_shifts,
std::vector<RepeatAlign> *anchored_irrs);

void FillinMates(BamFile& bam_file, AlignPairs& align_pairs);
void FillinMates(BamFile &bam_file, AlignPairs &align_pairs,
const std::vector<std::vector<std::string>> &units_shifts,
double min_wp_score,
const std::unordered_set<std::string> &ontarget_frag_names);

// Count the number of in-repeat reads stored in an AlignPairs object.
// A fragment is in-repeat if both of the reads fuzzy match to the repeat
// sequence.
bool CountUnalignedIrrs(
BamFile& bam_file, const Parameters& parameters, int& numInRepeatReads,
const std::vector<std::vector<std::string>>& units_shifts,
std::vector<RepeatAlign>* irr_rep_aligns);
BamFile &bam_file, const Parameters &parameters, int &numInRepeatReads,
const std::vector<std::vector<std::string>> &units_shifts,
std::vector<RepeatAlign> *irr_rep_aligns);

int CountAlignedIrr(
const BamFile& bam_file, const Parameters& parameters,
const AlignPairs& align_pairs,
std::map<std::string, int>& num_irrs_per_offtarget_region,
const std::vector<std::vector<std::string>>& units_shifts,
std::vector<RepeatAlign>* irr_rep_aligns);
int CountAlignedIrr(const BamFile &bam_file, const Parameters &parameters,
const AlignPairs &align_pairs,
std::map<std::string, int> &num_irrs_per_offtarget_region,
const std::vector<std::vector<std::string>> &units_shifts,
std::vector<RepeatAlign> *irr_rep_aligns);
2 changes: 1 addition & 1 deletion include/version.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@

#include <string>

const std::string kProgramVersion = "Expansion Hunter v2.5.1";
const std::string kProgramVersion = "Expansion Hunter v2.5.2";
12 changes: 10 additions & 2 deletions src/bam_file.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

#include "common/parameters.h"
#include "common/ref_genome.h"
#include "common/timestamp.h"
#include "include/bam_index.h"

typedef boost::tokenizer<boost::char_separator<char>> Tokenizer;
Expand Down Expand Up @@ -94,7 +95,14 @@ void BamFile::Init(const string &path, const string &reference) {
}
}

cerr << "[Input format: " << format_ << "]" << endl;
string input_format = "Unknown";
if (format_ == kBamFile) {
input_format = "BAM";
} else if (format_ == kCramFile) {
input_format = "CRAM";
}

cerr << TimeStamp() << ",[Input format: " << input_format << "]" << endl;

// Read hdr and set up ref_vec_
hts_bam_hdr_ptr_ = sam_hdr_read(hts_file_ptr_);
Expand Down Expand Up @@ -412,7 +420,7 @@ double BamFile::CalcMedianDepth(Parameters &parameters, size_t read_len) {
}

if (skip_cur_chrom) {
cerr << "[Skipping " << chrom_name << " during depth calculation]"
cerr << TimeStamp() << ",[Skipping " << chrom_name << " during depth calculation]"
<< endl;
continue;
}
Expand Down
Loading

0 comments on commit 367f93c

Please sign in to comment.