Skip to content

Commit

Permalink
preprocesamiento dataset problema
Browse files Browse the repository at this point in the history
  • Loading branch information
Erickcufe committed Aug 7, 2023
1 parent 1b60142 commit 3c648e1
Showing 1 changed file with 204 additions and 1 deletion.
205 changes: 204 additions & 1 deletion 02_sesion5.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Erick Cuevas
[

```{r,echo=FALSE}
knitr::include_url("https://comunidadbioinfo.github.io/cdsb2023/brett_talk.pdf", height = "380px")
knitr::include_url("https://erickcufe.github.io/anotacion_scRNA/anotacion_celulas.html", height = "380px")
```

](<https://comunidadbioinfo.github.io/cdsb2023/brett_talk.pdf>)
Expand Down Expand Up @@ -72,6 +72,7 @@ Dentro de este contexto, la anotación de clusters en scRNAseq es esencial. Una
## Paqueterías de R mas "famosas" para anotar

- [SingleR](https://bioconductor.org/packages/release/bioc/vignettes/SingleR/inst/doc/SingleR.html)
- [Seurat](https://satijalab.org/seurat/)
- [scCATCH](https://github.com/ZJUFanLab/scCATCH)
- [cellassign](https://github.com/Irrationone/cellassign)
- [SCINA](https://github.com/jcao89757/SCINA)
Expand All @@ -90,3 +91,205 @@ Otros sitios interesantes para obtener conjuntos de datos de referencia:
- [GLIASEQ](https://www.liddelowlab.com/gliaseq)


## Preparación del dataset

El dataset proviene de GEO, con el ID [GSE84465](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE84465)

```{r, eval=FALSE}
library(cowplot)
library(ggplot2)
library(scater)
library(scds)
library(SingleCellExperiment)
# Load the raw counts and reformat them.
# Remember to change the PATH to the location of the data.
fastq_dirs <- list.dirs("../datos_atherosclerosis/tom_alsaigh_2022/", recursive = FALSE, full.names = TRUE)
names(fastq_dirs) <- basename(fastq_dirs)
sce <- DropletUtils::read10xCounts(fastq_dirs)
# Rename row/colData column names and SCE dimnames.
names(rowData(sce)) <- c("ENSEMBL", "SYMBOL")
names(colData(sce)) <- c("sample_id", "barcode")
sce$sample_id <- factor(basename(sce$sample_id))
# FIX: the original `paste(SYMBOL, sep = ".")` was a no-op (a single vector
# with only `sep` pastes nothing), leaving potentially duplicated gene-symbol
# rownames. make.unique() keeps symbol-based names but disambiguates repeats.
dimnames(sce) <- list(
make.unique(rowData(sce)$SYMBOL),
with(colData(sce), paste(barcode, sample_id, sep = ".")))
# Attach the sample-level metadata.
md <- data.frame(
Sample_ID = c("Patient1_AC", "Patient1_PA",
"Patient2_AC", "Patient2_PA",
"Patient3_AC", "Patient3_PA"),
Age = c(82, 82, 87, 87, 65, 65),
AHA_Classification = c("Type_VII_Calcified", "Type_VII_Calcified",
"Type_VII_Calcified", "Type_VII_Calcified",
"Type_VII_Calcified", "Type_VII_Calcified"),
Region = c("Atherosclerotic_Core", "Proximal_Adjacent",
"Atherosclerotic_Core", "Proximal_Adjacent",
"Atherosclerotic_Core", "Proximal_Adjacent")
)
# Map each cell's sample_id to its metadata row, then copy the fields over.
m <- match(sce$sample_id, md$Sample_ID)
sce$region <- md$Region[m]
sce$AHA_Classification <- md$AHA_Classification[m]
sce$sample_id <- md$Sample_ID[m]
sce$Age <- md$Age[m]
```

Hacemos ahora un pequeño proceso de limpieza y normalización

```{r, eval=FALSE}
# Remove undetected genes (zero counts across all cells).
sce <- sce[Matrix::rowSums(counts(sce) > 0) > 0, ]
dim(sce)
# Remove doublets.
# Split the SCE by sample (NOTE: this could also be done per batch).
cs_by_s <- split(colnames(sce), sce$sample_id)
sce_by_s_2 <- lapply(cs_by_s, function(cs) sce[, cs])
# Run 'scds' (co-expression + binary-classification hybrid doublet score).
sce_by_s_2 <- lapply(sce_by_s_2, function(u)
scds::cxds_bcds_hybrid(scds::bcds(scds::cxds(u))))
# Drop doublets.
sce_by_s <- lapply(sce_by_s_2, function(u) {
# Expected number of doublets (10x rule of thumb: ~1% per 1,000 cells).
n_dbl <- ceiling(0.01 * ncol(u)^2 / 1e3)
# FIX: guard the empty case — `u[, -integer(0)]` would drop ALL columns.
if (n_dbl == 0) return(u)
# Remove the 'n_dbl' cells with the highest doublet score.
o <- order(u$hybrid_score, decreasing = TRUE)
u[, -o[seq_len(n_dbl)]]
})
# Merge back into a single SCE object.
sce <- do.call(cbind, sce_by_s)
# Compute QC metrics.
# FIX: anchor the pattern with '^' so only genes whose symbol STARTS with
# "MT-" (mitochondrial genes) are selected, not any gene containing "MT-".
(mito <- grep("^MT-", rownames(sce), value = TRUE))
sce <- addPerCellQCMetrics(sce, subsets = list(Mito = mito))
sce <- addPerFeatureQCMetrics(sce)
```

Filtración de células

```{r, eval=FALSE}
# Identify outlier cells on three QC metrics.
cols <- c("total", "detected", "subsets_Mito_percent")
log <- c(TRUE, TRUE, FALSE)
type <- c("both", "both", "higher")
# Flag the barcodes to discard (one logical column per metric).
drop_cols <- paste0(cols, "_drop")
for (i in seq_along(cols)) {
colData(sce)[[drop_cols[i]]] <- isOutlier(sce[[cols[i]]],
nmads = 2.5, type = type[i],
log = log[i], batch = sce$sample_id)
}
# Summary of the barcodes to be removed (pairwise overlap between criteria).
sapply(drop_cols, function(i)
sapply(drop_cols, function(j)
sum(sce[[i]] & sce[[j]])))
cd <- data.frame(colData(sce))
ps <- lapply(seq_along(cols), function(i) {
# FIX: aes_string() is deprecated in ggplot2; use the .data pronoun to
# map columns by their string names instead.
p <- ggplot(cd, aes(x = .data[[cols[i]]], alpha = .data[[drop_cols[i]]])) +
geom_histogram(bins = 100, show.legend = FALSE) +
scale_alpha_manual(values = c("FALSE" = 1, "TRUE" = 0.4)) +
facet_wrap(~sample_id, ncol = 1, scales = "free") +
theme_classic() + theme(strip.background = element_blank())
if (log[i]) {
p <- p + scale_x_log10()
}
return(p)
})
plot_grid(plotlist = ps, ncol = 3)
# Side-by-side density scatter before/after filtering (base graphics).
layout(matrix(1:2, nrow = 1))
ol <- Matrix::rowSums(as.matrix(colData(sce)[drop_cols])) != 0
x <- sce$total
y <- sce$detected
LSD::heatscatter(x, y, log = "xy", main = "unfiltered",
xlab = "Total counts", ylab = "Non-zero features")
LSD::heatscatter(x[!ol], y[!ol], log = "xy", main = "filtered",
xlab = "Total counts", ylab = "Non-zero features")
# Summary of cells kept per sample.
ns <- table(sce$sample_id)
ns_fil <- table(sce$sample_id[!ol])
print(rbind(
unfiltered = ns, filtered = ns_fil,
"%" = ns_fil / ns * 100))
# Drop outlier cells.
sce <- sce[, !ol]
dim(sce)
# Keep genes with count > 1 in at least 20 cells.
sce <- sce[Matrix::rowSums(counts(sce) > 1) >= 20, ]
dim(sce)
```

Agrupamiento, toma en cuenta que esta parte podría demorar bastante.

```{r, eval=FALSE}
# Load packages.
library(cowplot)
library(Seurat)
library(SingleCellExperiment)
library(ggplot2)
# INTEGRATION
# Build a SeuratObject from the SCE counts and cell metadata.
so <- CreateSeuratObject(
counts = counts(sce),
meta.data = data.frame(colData(sce)),
project = "Alsaigh_10x_data")
# Split by sample.
cells_by_sample <- split(colnames(sce), sce$sample_id)
so <- lapply(cells_by_sample, function(cells)
subset(so, cells = cells))
# Normalize, find variable genes, and scale — per sample.
# FIX: `do.plot` was removed from FindVariableFeatures in Seurat v3.
so <- lapply(so, NormalizeData, verbose = FALSE)
so <- lapply(so, FindVariableFeatures, nfeatures = 2e3,
selection.method = "vst", verbose = FALSE)
so <- lapply(so, ScaleData, verbose = FALSE)
# Find integration anchors.
# These anchors are then used by IntegrateData to merge all samples.
as <- FindIntegrationAnchors(so, verbose = TRUE)
so <- IntegrateData(anchorset = as, dims = seq_len(30), verbose = TRUE)
# Scale the integrated data.
DefaultAssay(so) <- "integrated"
# FIX: `display.progress` was removed in Seurat v3; use `verbose` instead.
so <- ScaleData(so, verbose = FALSE)
# Dimensionality reduction.
so <- RunPCA(so, npcs = 100, verbose = FALSE)
# Change the number of PCs used here if needed.
# FIX: `do.fast` was removed from RunTSNE in Seurat v3 (fast t-SNE is default).
so <- RunTSNE(so, reduction = "pca", dims = seq_len(20),
seed.use = 1, verbose = FALSE)
so <- RunUMAP(so, reduction = "pca", dims = seq_len(20),
seed.use = 1, verbose = FALSE)
# CLUSTERING
# Compute the SNN graph once, then cluster at several resolutions; each
# resolution adds its own cluster-assignment column to the object metadata.
so <- FindNeighbors(so, reduction = "pca", dims = seq_len(20), verbose = FALSE)
for (res in c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)) {
so <- FindClusters(so, resolution = res, random.seed = 1, verbose = FALSE)
}
```

0 comments on commit 3c648e1

Please sign in to comment.