diff --git a/annotation/annotation.go b/annotation/annotation.go index 0d206cd..2011661 100644 --- a/annotation/annotation.go +++ b/annotation/annotation.go @@ -22,15 +22,15 @@ type chunk struct { feats chan rtreego.Spatial } -// RtreeMap is a map of pointers to Rtree with string keys. -// type RtreeMap map[string]*rtreego.Rtree -type RtreeMap map[string]*rtreego.Rtree - type tree struct { chr string tree *rtreego.Rtree } +// RtreeMap is a map of pointers to Rtree with string keys. +// type RtreeMap map[string]*rtreego.Rtree +type RtreeMap map[string]*rtreego.Rtree + // Get returns the pointer to an Rtree for the specified chromosome and create a new Rtree if not present. func (t RtreeMap) Get(chr string) *rtreego.Rtree { v, ok := t[chr] @@ -45,7 +45,7 @@ func (t RtreeMap) Len() int { return len(t) } -func scan(scanner *Scanner, regions chan chunk, elems chan string) { +func scan(scanner *Scanner, regions chan chunk) { regMap := make(map[string]chan rtreego.Spatial) var chr, lastChr string for scanner.Next() { @@ -53,9 +53,6 @@ func scan(scanner *Scanner, regions chan chunk, elems chan string) { if feature == nil { continue } - if logrus.GetLevel() == logrus.DebugLevel { - elems <- feature.Out() - } if len(chr) == 0 { lastChr = feature.Chr() } @@ -73,22 +70,24 @@ func scan(scanner *Scanner, regions chan chunk, elems chan string) { } close(regMap[lastChr]) close(regions) - close(elems) if scanner.Error() != nil { logrus.Panic(scanner.Error()) } } -func writeElements(items <-chan string) { +func writeElements(items <-chan rtreego.Spatial, done chan<- struct{}) { var w *bufio.Writer out, _ := os.Create(debugElementsFile) w = bufio.NewWriter(out) logrus.Debugf("Writing index elements to %s", out.Name()) - for item := range items { - w.WriteString(item) + itemSlice := NewFeatureSlice(chan2slice(items)) + sort.Sort(itemSlice) + for _, item := range itemSlice { + w.WriteString(item.Out()) w.WriteRune('\n') } w.Flush() + done <- struct{}{} } func mergeIntervals(intervals []rtreego.Spatial) []*Feature { @@ -123,24 +122,23 @@ func mergeIntervals(intervals []rtreego.Spatial) []*Feature { return out } -func interleaveFeatures(tree *rtreego.Rtree, start, end float64, element string, updated []byte, extremes bool) []*Feature { - features := QueryIndexByElement(tree, start, end, element) - merged := mergeIntervals(features) +func interleaveFeatures(features []*Feature, start, end float64, element string, updated []byte, extremes bool) []*Feature { var fs []*Feature - for i, f := range merged { + + for i, f := range features { fs = append(fs, f) if extremes { if i == 0 { n, _ := parseFeature(f.chr, updated, start, f.Start()) fs = append(fs, n) } - if i == len(merged)-1 { + if i == len(features)-1 { n, _ := parseFeature(f.chr, updated, f.End(), end) fs = append(fs, n) } } if i > 0 { - g := merged[i-1] + g := features[i-1] n, _ := parseFeature(f.chr, updated, g.End(), f.Start()) fs = append(fs, n) } @@ -148,22 +146,45 @@ func interleaveFeatures(tree *rtreego.Rtree, start, end float64, element string, return fs } -func updateIndex(index *rtreego.Rtree, start, end float64, feature, updated string, extremes bool) *rtreego.Rtree { +func updateIndex(index *rtreego.Rtree, start, end float64, feature, updated string, extremes bool, elems chan rtreego.Spatial) *rtreego.Rtree { if end-start <= 0 { return index } var features []rtreego.Spatial - for _, f := range interleaveFeatures(index, start, end, feature, []byte(updated), extremes) { + + genes := QueryIndexByElement(index, start, end, feature) + for _, i := range genes { + f := i.(*Feature) features = append(features, f) - for _, g := range interleaveFeatures(index, f.Start(), f.End(), "exon", []byte("intron"), false) { - features = append(features, g) + } + mergedGenes := mergeIntervals(genes) + for _, f := range interleaveFeatures(mergedGenes, start, end, feature, []byte(updated), extremes) { + if f.Element() == updated { + features = append(features, f) + if logrus.GetLevel() == logrus.DebugLevel { + elems <- f + } + } + exons := QueryIndexByElement(index, f.Start(), f.End(), "exon") + for _, i := range exons { + f := i.(*Feature) + features = append(features, f) + } + mergedExons := mergeIntervals(exons) + for _, g := range interleaveFeatures(mergedExons, f.Start(), f.End(), "exon", []byte("intron"), false) { + if g.Element() == "intron" { + features = append(features, g) + if logrus.GetLevel() == logrus.DebugLevel { + elems <- g + } + } } } return rtreego.NewTree(1, 25, 50, features...) } -func chan2slice(c chan rtreego.Spatial) []rtreego.Spatial { +func chan2slice(c <-chan rtreego.Spatial) []rtreego.Spatial { var s []rtreego.Spatial for item := range c { s = append(s, item) @@ -171,10 +192,17 @@ func chan2slice(c chan rtreego.Spatial) []rtreego.Spatial { return s } -func createTree(trees chan *tree, chr string, length float64, feats chan rtreego.Spatial, wg *sync.WaitGroup) { +func createTree(trees chan *tree, chr string, length float64, feats chan rtreego.Spatial, wg *sync.WaitGroup, elems chan rtreego.Spatial) { wg.Add(1) - tmpIndex := rtreego.NewTree(1, 25, 50, chan2slice(feats)...) - trees <- &tree{chr, updateIndex(tmpIndex, 0, length, "gene", "intergenic", true)} + featSlice := chan2slice(feats) + tmpIndex := rtreego.NewTree(1, 25, 50, featSlice...) + t := tree{chr, updateIndex(tmpIndex, 0, length, "gene", "intergenic", true, elems)} + trees <- &t + if logrus.GetLevel() == logrus.DebugLevel && length == 0 { + for _, f := range featSlice { + elems <- f.(*Feature) + } + } wg.Done() } @@ -192,30 +220,37 @@ func createIndex(scanner *Scanner) *RtreeMap { trees := make(RtreeMap) regions := make(chan chunk) treeChan := make(chan *tree) - debugElements := make(chan string) + debugElements := make(chan rtreego.Spatial) + debugElemsDone := make(chan struct{}) if logrus.GetLevel() == logrus.DebugLevel { - go writeElements(debugElements) + go writeElements(debugElements, debugElemsDone) } - go scan(scanner, regions, debugElements) + go scan(scanner, regions) var wg sync.WaitGroup for chunk := range regions { chr := chunk.chr feats := chunk.feats length := float64(scanner.r.chrLens[chr]) - go createTree(treeChan, chr, length, feats, &wg) + go createTree(treeChan, chr, length, feats, &wg, debugElements) } go func() { wg.Wait() close(treeChan) + close(debugElements) }() for t := range treeChan { trees[t.chr] = t.tree } + + if logrus.GetLevel() == logrus.DebugLevel { + <-debugElemsDone + } + return &trees } diff --git a/annotation/annotation_test.go b/annotation/annotation_test.go index 02b582c..279eb1b 100644 --- a/annotation/annotation_test.go +++ b/annotation/annotation_test.go @@ -6,6 +6,7 @@ import ( "io/ioutil" "os" "regexp" + "sort" "testing" "github.com/Sirupsen/logrus" @@ -13,10 +14,6 @@ import ( "github.com/dhconnelly/rtreego" ) -var ( - annotationFiles = []string{"../data/coverage-test.bed", "../data/coverage-test.gtf.gz", "../data/coverage-test-shuffled.bed", "../data/coverage-test-shuffled.gtf.gz"} -) - func TestParseFeature(t *testing.T) { e := bytes.Split([]byte(`chr1 11868 12227 exon`), []byte("\t")) chr := e[0] @@ -329,21 +326,47 @@ func TestReadFeatures(t *testing.T) { } elems := map[string]int{ + "exon": 106742, + "gene": 5397, + "intergenic": 3202, + "intron": 26230, + } + mergedElems := map[string]int{ "exon": 29431, "gene": 3201, "intergenic": 3202, "intron": 26230, } - for _, f := range annotationFiles { - m := CreateIndex(f, chrLens) + for _, i := range []struct { + f string + expected map[string]int + }{ + { + "../data/coverage-test.bed", + mergedElems, + }, + { + "../data/coverage-test-shuffled.bed", + mergedElems, + }, + { + "../data/coverage-test.gtf.gz", + elems, + }, + { + "../data/coverage-test-shuffled.gtf.gz", + elems, + }, + } { + m := CreateIndex(i.f, chrLens) index := m.Get("chr1") res := make(map[string]int) for _, s := range QueryIndex(index, 0, 248956422) { f := s.(*Feature) res[f.Element()]++ } - for k, v := range elems { + for k, v := range i.expected { if v != res[k] { t.Errorf("(%s) Different number of %s features. Expected: %d, got %d", t.Name(), k, v, res[k]) } diff --git a/annotation/location.go b/annotation/location.go index 3912975..c9e0991 100644 --- a/annotation/location.go +++ b/annotation/location.go @@ -38,16 +38,21 @@ func (loc *Location) String() string { } // GetElements returns all elements overlapping with buf -func (loc *Location) GetElements(buf *[]rtreego.Spatial, elems map[string]uint8) { - for _, feature := range *buf { +func (loc *Location) GetElements(buf []rtreego.Spatial, elems map[string]uint8, tags ...string) { + for _, feature := range buf { if feature, ok := feature.(*Feature); ok { start := math.Max(loc.Start(), feature.Start()) end := math.Min(loc.End(), feature.End()) if end <= start { continue } + if feature.Element() != "gene" { elems[feature.Element()]++ + } else { + for _, t := range tags { + elems[feature.Tag(t)]++ + } } } } diff --git a/stats/ihec.go b/stats/ihec.go index 455a38c..8f877b6 100644 --- a/stats/ihec.go +++ b/stats/ihec.go @@ -47,21 +47,8 @@ func (s *IHECstats) Collect(record *sam.Record) { } results := annotation.QueryIndex(rtree, mappingLocation.Start(), mappingLocation.End()) - var filteredResults []rtreego.Spatial - for _, r := range results { - if r, ok := r.(*annotation.Feature); ok { - // if r.Element() == "intergenic" { - // if r.End()-r.Start() <= 1000 { - // continue - // } - // if !(mappingLocation.End() >= r.Start()+500 && mappingLocation.Start() <= r.End()-500) { - // continue - // } - // } - filteredResults = append(filteredResults, r) - } - } - mappingLocation.GetElements(&filteredResults, elements) + + mappingLocation.GetElements(filterElements(results, mappingLocation.Start(), mappingLocation.End(), 500), elements, "gene_type") // if _, isIntergenic := elements["intergenic"]; isIntergenic && len(elements) > 1 { // fmt.Println(elements) @@ -77,19 +64,47 @@ func NewIHECstats(index *annotation.RtreeMap) *IHECstats { } } +func filterElements(elements []rtreego.Spatial, start, end, offset float64) []rtreego.Spatial { + var filteredElements []rtreego.Spatial + for _, r := range elements { + if r, ok := r.(*annotation.Feature); ok { + if r.Element() == "intergenic" { + if r.End()-r.Start() < 2*offset { + continue + } + if end <= r.Start()+offset || start > r.End()-offset { + continue + } + } + filteredElements = append(filteredElements, r) + } + } + return filteredElements +} + func updateIHECcount(elems map[string]uint8, st *IHECstats) { if len(elems) == 0 { return } - if _, isRRNA := elems["rRNA"]; isRRNA { + rRNAs := []string{ + "rRNA", + "Mt_rRNA", + } + + for _, gt := range rRNAs { + if _, isRRNA := elems[gt]; isRRNA { + st.RRNA++ + } + } + + if _, isRRNA := elems["Mt_rRNA"]; isRRNA { st.RRNA++ - return } if _, isIntergenic := elems["intergenic"]; isIntergenic { st.Intergenic++ - return } + }