Skip to content

Commit

Permalink
Fix annotation code for ihec and update stats
Browse files Browse the repository at this point in the history
  • Loading branch information
emi80 committed Jun 19, 2018
1 parent 8187e14 commit 40643e5
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 57 deletions.
95 changes: 65 additions & 30 deletions annotation/annotation.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@ type chunk struct {
feats chan rtreego.Spatial
}

// RtreeMap is a map of pointers to Rtree with string keys.
// type RtreeMap map[string]*rtreego.Rtree
type RtreeMap map[string]*rtreego.Rtree

// tree pairs a chromosome name with the Rtree built for that chromosome.
type tree struct {
// chr is the chromosome name; createIndex uses it as the RtreeMap key.
chr string
// tree is the Rtree stored in the RtreeMap under chr.
tree *rtreego.Rtree
}

// RtreeMap is a map of pointers to Rtree with string keys.
// The Get method looks entries up by chromosome name.
type RtreeMap map[string]*rtreego.Rtree

// Get returns the pointer to the Rtree for the specified chromosome, creating a new Rtree if one is not present.
func (t RtreeMap) Get(chr string) *rtreego.Rtree {
v, ok := t[chr]
Expand All @@ -45,17 +45,14 @@ func (t RtreeMap) Len() int {
return len(t)
}

func scan(scanner *Scanner, regions chan chunk, elems chan string) {
func scan(scanner *Scanner, regions chan chunk) {
regMap := make(map[string]chan rtreego.Spatial)
var chr, lastChr string
for scanner.Next() {
feature := scanner.Feat()
if feature == nil {
continue
}
if logrus.GetLevel() == logrus.DebugLevel {
elems <- feature.Out()
}
if len(chr) == 0 {
lastChr = feature.Chr()
}
Expand All @@ -73,22 +70,24 @@ func scan(scanner *Scanner, regions chan chunk, elems chan string) {
}
close(regMap[lastChr])
close(regions)
close(elems)
if scanner.Error() != nil {
logrus.Panic(scanner.Error())
}
}

func writeElements(items <-chan string) {
func writeElements(items <-chan rtreego.Spatial, done chan<- struct{}) {
var w *bufio.Writer
out, _ := os.Create(debugElementsFile)
w = bufio.NewWriter(out)
logrus.Debugf("Writing index elements to %s", out.Name())
for item := range items {
w.WriteString(item)
itemSlice := NewFeatureSlice(chan2slice(items))
sort.Sort(itemSlice)
for _, item := range itemSlice {
w.WriteString(item.Out())
w.WriteRune('\n')
}
w.Flush()
done <- struct{}{}
}

func mergeIntervals(intervals []rtreego.Spatial) []*Feature {
Expand Down Expand Up @@ -123,58 +122,87 @@ func mergeIntervals(intervals []rtreego.Spatial) []*Feature {
return out
}

func interleaveFeatures(tree *rtreego.Rtree, start, end float64, element string, updated []byte, extremes bool) []*Feature {
features := QueryIndexByElement(tree, start, end, element)
merged := mergeIntervals(features)
func interleaveFeatures(features []*Feature, start, end float64, element string, updated []byte, extremes bool) []*Feature {
var fs []*Feature
for i, f := range merged {

for i, f := range features {
fs = append(fs, f)
if extremes {
if i == 0 {
n, _ := parseFeature(f.chr, updated, start, f.Start())
fs = append(fs, n)
}
if i == len(merged)-1 {
if i == len(features)-1 {
n, _ := parseFeature(f.chr, updated, f.End(), end)
fs = append(fs, n)
}
}
if i > 0 {
g := merged[i-1]
g := features[i-1]
n, _ := parseFeature(f.chr, updated, g.End(), f.Start())
fs = append(fs, n)
}
}
return fs
}

func updateIndex(index *rtreego.Rtree, start, end float64, feature, updated string, extremes bool) *rtreego.Rtree {
func updateIndex(index *rtreego.Rtree, start, end float64, feature, updated string, extremes bool, elems chan rtreego.Spatial) *rtreego.Rtree {
if end-start <= 0 {
return index
}

var features []rtreego.Spatial
for _, f := range interleaveFeatures(index, start, end, feature, []byte(updated), extremes) {

genes := QueryIndexByElement(index, start, end, feature)
for _, i := range genes {
f := i.(*Feature)
features = append(features, f)
for _, g := range interleaveFeatures(index, f.Start(), f.End(), "exon", []byte("intron"), false) {
features = append(features, g)
}
mergedGenes := mergeIntervals(genes)
for _, f := range interleaveFeatures(mergedGenes, start, end, feature, []byte(updated), extremes) {
if f.Element() == updated {
features = append(features, f)
if logrus.GetLevel() == logrus.DebugLevel {
elems <- f
}
}
exons := QueryIndexByElement(index, f.Start(), f.End(), "exon")
for _, i := range exons {
f := i.(*Feature)
features = append(features, f)
}
mergedExons := mergeIntervals(exons)
for _, g := range interleaveFeatures(mergedExons, f.Start(), f.End(), "exon", []byte("intron"), false) {
if g.Element() == "intron" {
features = append(features, g)
if logrus.GetLevel() == logrus.DebugLevel {
elems <- g
}
}
}
}
return rtreego.NewTree(1, 25, 50, features...)
}

func chan2slice(c chan rtreego.Spatial) []rtreego.Spatial {
func chan2slice(c <-chan rtreego.Spatial) []rtreego.Spatial {
var s []rtreego.Spatial
for item := range c {
s = append(s, item)
}
return s
}

func createTree(trees chan *tree, chr string, length float64, feats chan rtreego.Spatial, wg *sync.WaitGroup) {
func createTree(trees chan *tree, chr string, length float64, feats chan rtreego.Spatial, wg *sync.WaitGroup, elems chan rtreego.Spatial) {
wg.Add(1)
tmpIndex := rtreego.NewTree(1, 25, 50, chan2slice(feats)...)
trees <- &tree{chr, updateIndex(tmpIndex, 0, length, "gene", "intergenic", true)}
featSlice := chan2slice(feats)
tmpIndex := rtreego.NewTree(1, 25, 50, featSlice...)
t := tree{chr, updateIndex(tmpIndex, 0, length, "gene", "intergenic", true, elems)}
trees <- &t
if logrus.GetLevel() == logrus.DebugLevel && length == 0 {
for _, f := range featSlice {
elems <- f.(*Feature)
}
}
wg.Done()
}

Expand All @@ -192,30 +220,37 @@ func createIndex(scanner *Scanner) *RtreeMap {
trees := make(RtreeMap)
regions := make(chan chunk)
treeChan := make(chan *tree)
debugElements := make(chan string)
debugElements := make(chan rtreego.Spatial)
debugElemsDone := make(chan struct{})

if logrus.GetLevel() == logrus.DebugLevel {
go writeElements(debugElements)
go writeElements(debugElements, debugElemsDone)
}

go scan(scanner, regions, debugElements)
go scan(scanner, regions)

var wg sync.WaitGroup
for chunk := range regions {
chr := chunk.chr
feats := chunk.feats
length := float64(scanner.r.chrLens[chr])
go createTree(treeChan, chr, length, feats, &wg)
go createTree(treeChan, chr, length, feats, &wg, debugElements)
}

go func() {
wg.Wait()
close(treeChan)
close(debugElements)
}()

for t := range treeChan {
trees[t.chr] = t.tree
}

if logrus.GetLevel() == logrus.DebugLevel {
<-debugElemsDone
}

return &trees
}

Expand Down
37 changes: 30 additions & 7 deletions annotation/annotation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,14 @@ import (
"io/ioutil"
"os"
"regexp"
"sort"
"testing"

"github.com/Sirupsen/logrus"

"github.com/dhconnelly/rtreego"
)

var (
annotationFiles = []string{"../data/coverage-test.bed", "../data/coverage-test.gtf.gz", "../data/coverage-test-shuffled.bed", "../data/coverage-test-shuffled.gtf.gz"}
)

func TestParseFeature(t *testing.T) {
e := bytes.Split([]byte(`chr1 11868 12227 exon`), []byte("\t"))
chr := e[0]
Expand Down Expand Up @@ -329,21 +326,47 @@ func TestReadFeatures(t *testing.T) {
}

elems := map[string]int{
"exon": 106742,
"gene": 5397,
"intergenic": 3202,
"intron": 26230,
}
mergedElems := map[string]int{
"exon": 29431,
"gene": 3201,
"intergenic": 3202,
"intron": 26230,
}

for _, f := range annotationFiles {
m := CreateIndex(f, chrLens)
for _, i := range []struct {
f string
expected map[string]int
}{
{
"../data/coverage-test.bed",
mergedElems,
},
{
"../data/coverage-test-shuffled.bed",
mergedElems,
},
{
"../data/coverage-test.gtf.gz",
elems,
},
{
"../data/coverage-test-shuffled.gtf.gz",
elems,
},
} {
m := CreateIndex(i.f, chrLens)
index := m.Get("chr1")
res := make(map[string]int)
for _, s := range QueryIndex(index, 0, 248956422) {
f := s.(*Feature)
res[f.Element()]++
}
for k, v := range elems {
for k, v := range i.expected {
if v != res[k] {
t.Errorf("(%s) Different number of %s features. Expected: %d, got %d", t.Name(), k, v, res[k])
}
Expand Down
9 changes: 7 additions & 2 deletions annotation/location.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,21 @@ func (loc *Location) String() string {
}

// GetElements counts in elems the elements of the features in buf that overlap loc.
func (loc *Location) GetElements(buf *[]rtreego.Spatial, elems map[string]uint8) {
for _, feature := range *buf {
func (loc *Location) GetElements(buf []rtreego.Spatial, elems map[string]uint8, tags ...string) {
for _, feature := range buf {
if feature, ok := feature.(*Feature); ok {
start := math.Max(loc.Start(), feature.Start())
end := math.Min(loc.End(), feature.End())
if end <= start {
continue
}

if feature.Element() != "gene" {
elems[feature.Element()]++
} else {
for _, t := range tags {
elems[feature.Tag(t)]++
}
}
}
}
Expand Down
51 changes: 33 additions & 18 deletions stats/ihec.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,21 +47,8 @@ func (s *IHECstats) Collect(record *sam.Record) {
}

results := annotation.QueryIndex(rtree, mappingLocation.Start(), mappingLocation.End())
var filteredResults []rtreego.Spatial
for _, r := range results {
if r, ok := r.(*annotation.Feature); ok {
// if r.Element() == "intergenic" {
// if r.End()-r.Start() <= 1000 {
// continue
// }
// if !(mappingLocation.End() >= r.Start()+500 && mappingLocation.Start() <= r.End()-500) {
// continue
// }
// }
filteredResults = append(filteredResults, r)
}
}
mappingLocation.GetElements(&filteredResults, elements)

mappingLocation.GetElements(filterElements(results, mappingLocation.Start(), mappingLocation.End(), 500), elements, "gene_type")

// if _, isIntergenic := elements["intergenic"]; isIntergenic && len(elements) > 1 {
// fmt.Println(elements)
Expand All @@ -77,19 +64,47 @@ func NewIHECstats(index *annotation.RtreeMap) *IHECstats {
}
}

// filterElements selects the features from elements that are relevant for a
// mapping spanning start..end.
//
// Values that are not *annotation.Feature are dropped. Features whose element
// is anything other than "intergenic" are always kept. An "intergenic" feature
// is kept only when it is at least 2*offset long and the mapping extends past
// the first offset positions of the feature without starting beyond the point
// offset positions before its end.
//
// NOTE(review): the two boundary tests are asymmetric (`<=` on the left side,
// `>` on the right side) — confirm this matches the coordinate convention.
func filterElements(elements []rtreego.Spatial, start, end, offset float64) []rtreego.Spatial {
	var kept []rtreego.Spatial
	for _, e := range elements {
		feat, isFeature := e.(*annotation.Feature)
		if !isFeature {
			continue
		}
		if feat.Element() == "intergenic" {
			// Skip intergenic regions too short to have an interior once
			// offset is trimmed from both ends.
			tooShort := feat.End()-feat.Start() < 2*offset
			if tooShort || end <= feat.Start()+offset || start > feat.End()-offset {
				continue
			}
		}
		kept = append(kept, feat)
	}
	return kept
}

func updateIHECcount(elems map[string]uint8, st *IHECstats) {

if len(elems) == 0 {
return
}

if _, isRRNA := elems["rRNA"]; isRRNA {
rRNAs := []string{
"rRNA",
"Mt_rRNA",
}

for _, gt := range rRNAs {
if _, isRRNA := elems[gt]; isRRNA {
st.RRNA++
}
}

if _, isRRNA := elems["Mt_rRNA"]; isRRNA {
st.RRNA++
return
}

if _, isIntergenic := elems["intergenic"]; isIntergenic {
st.Intergenic++
return
}

}

0 comments on commit 40643e5

Please sign in to comment.