Skip to content

Commit e9ca778

Browse files
committed
Guessing V/J annotation from CDR3 sequence for cases when it is missing
1 parent 6c7d864 commit e9ca778

File tree

7 files changed

+4402
-25
lines changed

7 files changed

+4402
-25
lines changed

Diff for: .gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ tmp/
22
database/
33
releases/
44
out/
5+
misc/export
56

67
## Groovy / Java
78

Diff for: misc/misc.Rmd

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
---
2+
title: "misc"
3+
output: html_document
4+
---
5+
6+
```{r setup, include=FALSE}
7+
knitr::opts_chunk$set(echo = TRUE)
8+
```
9+
10+
```{r}
11+
source("utils.R")
12+
```
13+
14+
Generate a dump of annotated V/J segment sequences. CDRFixer uses this dump to annotate records for which the V/J segment is not supplied.
15+
16+
```{r}
17+
# Read the slim VDJdb table, convert it to a plain data.frame, reduce it with
# get_segment_parts() and write the result as a tab-separated file.
fread("../database/vdjdb.slim.txt") %>%
18+
as.data.frame %>%
19+
get_segment_parts %>%
20+
fwrite("../res/segments.aaparts.txt", sep = "\t")
21+
```

Diff for: summary/pub_diff.sh renamed to misc/pub_diff.sh

File renamed without changes.

Diff for: misc/utils.R

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
library(dplyr)
2+
library(stringr)
3+
library(data.table)
4+
5+
### Segment processing
6+
# Build a lookup of CDR3 fragments and the segment most often seen with them.
#
# .df must provide the columns species, gene, v.segm, v.end, j.segm, j.start
# and cdr3.
#
# V side: rows with v.end > 0 are kept and cdr3 is cut to its first v.end
# characters. J side: rows with j.start > 0 are kept and cdr3 is cut to the
# characters from j.start to its end. Segment names are truncated at the first
# "*" or ",". For every (species, gene, fragment) the segment with the highest
# row count is reported; ties resolve to the first one in group order.
#
# Returns the V rows followed by the J rows, with columns species, gene, cdr3,
# type ("V" or "J") and segm.
get_segment_parts <- function(.df) {
  # --- V segments: count rows per (segment, fragment) ---
  v_counts <- .df %>%
    select(species, gene, v.segm, v.end, cdr3) %>%
    filter(v.end > 0) %>%
    mutate(
      v.segm = str_split_fixed(v.segm, "[*,]", 2)[, 1],
      cdr3 = substr(cdr3, 1, v.end)
    ) %>%
    group_by(species, gene, v.segm, cdr3) %>%
    summarise(count = n())

  # Keep the most frequent segment for each fragment.
  v_best <- v_counts %>%
    group_by(species, gene, cdr3, type = "V") %>%
    summarise(segm = v.segm[which.max(count)])

  # --- J segments: same procedure on the tail of the CDR3 ---
  j_counts <- .df %>%
    select(species, gene, j.segm, j.start, cdr3) %>%
    filter(j.start > 0) %>%
    mutate(
      j.segm = str_split_fixed(j.segm, "[*,]", 2)[, 1],
      cdr3 = substr(cdr3, j.start, nchar(cdr3))
    ) %>%
    group_by(species, gene, j.segm, cdr3) %>%
    summarise(count = n())

  j_best <- j_counts %>%
    group_by(species, gene, cdr3, type = "J") %>%
    summarise(segm = j.segm[which.max(count)])

  rbind(v_best, j_best)
}
29+
30+
31+
32+
### VDJtools export
33+
34+
# Mock codon table: one fixed codon for each of the 20 single-letter amino
# acid codes, stored as a named character vector (names are the amino acids).
mock_codons <- c(
  A = "GCT", C = "TGT", D = "GAT", E = "GAA", F = "TTT",
  G = "GGT", I = "ATT", H = "CAT", K = "AAA", L = "TTA",
  M = "ATG", N = "AAT", P = "CCT", Q = "CAA", R = "CGT",
  S = "TCT", T = "ACT", V = "GTT", W = "TGG", Y = "TAT"
)
43+
44+
# Turn a character vector of single-letter amino acid codes into one
# nucleotide string by looking each letter up in mock_codons and concatenating
# the codons without a separator.
mock_back_translate <- function(x) {
  codons <- mock_codons[x]
  paste0(codons, collapse = "")
}
47+
48+
# "CASS" %>% strsplit('') %>% lapply(mock_back_translate)
49+
50+
# Convert a paired-chain table into a VDJtools-style table for one chain.
#
# .df     data frame with the columns cdr3.<chain>, v.<chain> and j.<chain>
#         for the requested chain.
# .chain  which chain to export: "beta" (default) or "alpha". Resolved with
#         match.arg(), so omitting the argument selects "beta".
#
# Returns .df reduced to the columns count, freq, cdr3nt, cdr3aa, v, d, j,
# vend, dstart, dend, jstart. Every row gets count = 1 and freq = 1 / nrow;
# d is empty and all segment boundaries are set to -1. cdr3nt is a character
# column obtained by mock back-translation of cdr3aa.
as.vdjtools.df <- function(.df, .chain = c("beta", "alpha")) {
  # Collapse the two-element default to a single value; comparing the raw
  # default inside `if` would be a length-2 condition.
  .chain <- match.arg(.chain)

  .df$cdr3aa <- .df[[paste0("cdr3.", .chain)]]
  .df$v <- .df[[paste0("v.", .chain)]]
  .df$j <- .df[[paste0("j.", .chain)]]

  # Back-translate the selected chain's CDR3 (not always the beta one) and
  # keep the result as a plain character vector rather than a list column.
  .df$cdr3nt <- vapply(
    strsplit(as.character(.df$cdr3aa), ""),
    mock_back_translate,
    character(1),
    USE.NAMES = FALSE
  )

  .df %>%
    mutate(count = 1, freq = 1 / n(), d = "",
           vend = -1, dstart = -1, dend = -1, jstart = -1) %>%
    select(count, freq, cdr3nt, cdr3aa, v, d, j, vend, dstart, dend, jstart)
}

0 commit comments

Comments
 (0)