Expression and Copy Number Correlation

In this example, we will look at the correlation between mRNAseq-based gene expression and copy number data. We will do this using two data tables from the isb-cgc:tcga_201510_alpha dataset and a genome table from the isb-cgc:genome_reference dataset.

library(dplyr)
library(bigrquery)
library(ggplot2)
library(stringr)
library(ISBCGCExamples)

# The directory in which the files containing SQL reside.
#sqlDir = file.path("/PATH/TO/GIT/CLONE/OF/examples-R/inst/",
sqlDir = file.path(system.file(package = "ISBCGCExamples"), "sql")

######################[ TIP ]########################################
## Set the Google Cloud Platform project id under which these queries will run.
##
## If you are using the workshop docker image, this is already
## set for you in your .Rprofile and you can skip this step.

# project = "YOUR-PROJECT-ID"
#####################################################################

Getting gene information

We're going to use the genome data set to query some information about a given gene.

q <- "select
         seqname,
         MIN(start) as start,
         MAX(end) as end,
         gene_name
      from
         [isb-cgc:genome_reference.GENCODE_v19]
      where
         gene_name = 'TP53'
      group by
         seqname,
         gene_name"

gene_info <- query_exec(q, project)
gene_info

##   seqname   start     end gene_name
## 1   chr17 7565097 7590856      TP53

Pearson Correlation in BigQuery

This sql example builds a table where for each sample barcode, we have the RNA-seq level and the mean of copy number changes for segments that overlap the gene region. Pearson correlation is computed over expr and copy number columns.

# Set the desired tables to query.
expressionTable = "isb-cgc:tcga_201510_alpha.mRNA_UNC_HiSeq_RSEM"
copyNumberTable = "isb-cgc:tcga_201510_alpha.Copy_Number_segments"

# Now we are ready to run the query.
result = DisplayAndDispatchQuery(
     file.path(sqlDir, "copy_number_and_expr_corr.sql"),
               project=project,
               replacements=list("_STUDY_"="BRCA",
                                 "_EXPRESSION_TABLE_"=expressionTable,
                                 "_COPY_NUMBER_TABLE_"=copyNumberTable,
                                 "_GENE_"=gene_info$gene_name,
                                 "_TP_"="TP",
                                 "_CHR_"=str_sub(gene_info$seqname, start=4),
                                 "_START_"=gene_info$start,
                                 "_END_"=gene_info$end))


SELECT
  expr.Study as Study,
  expr.HGNC_gene_symbol as Gene,
  CORR(log2(expr.normalized_count+1), cn.mean_segment_mean) as correlation
FROM
  [isb-cgc:tcga_201510_alpha.mRNA_UNC_HiSeq_RSEM] AS expr
JOIN (
  SELECT
    Study,
    SampleBarcode,
    Num_Probes,
    AVG(Segment_Mean) AS mean_segment_mean,
  FROM
    [isb-cgc:tcga_201510_alpha.Copy_Number_segments]
  WHERE
    SampleTypeLetterCode = 'TP'
    AND Chromosome = '17'
    AND ((start <= 7565097 AND END >= 7590856)
      OR (start >= 7565097 AND start <= 7590856)
      OR (END >= 7565097   AND END <= 7590856)
      OR (start >= 7565097 AND END <= 7590856))
  GROUP BY
    Study,
    SampleBarcode,
    Num_Probes ) AS cn
ON
  expr.SampleBarcode = cn.SampleBarcode
  AND expr.Study = cn.Study
WHERE
  expr.HGNC_gene_symbol = 'TP53'
  AND expr.Study = 'BRCA'
GROUP BY
  Study,
  Gene
ORDER BY
  correlation

result

  Study Gene correlation
1  BRCA TP53   0.3053237

The result is a table with the study, gene, and pearson correlation for expression with mean of copy number segments overlapping the gene of interest.

But now we can run it over all studies.

# Now we are ready to run the query.
result = DisplayAndDispatchQuery(
     file.path(sqlDir, "copy_number_and_expr_corr_all_studies.sql"),
               project=project,
               replacements=list("_EXPRESSION_TABLE_"=expressionTable,
                                 "_COPY_NUMBER_TABLE_"=copyNumberTable,
                                 "_GENE_"=gene_info$gene_name,
                                 "_TP_"="TP",
                                 "_CHR_"=str_sub(gene_info$seqname, start=4),
                                 "_START_"=gene_info$start,
                                 "_END_"=gene_info$end))

## SELECT
##   expr.Study as Study,
##   expr.HGNC_gene_symbol as Gene,
##   CORR(log2(expr.normalized_count+1), cn.mean_segment_mean) as correlation
## FROM
##   [isb-cgc:tcga_201510_alpha.mRNA_UNC_HiSeq_RSEM] AS expr
## JOIN (
##   SELECT
##     Study,
##     SampleBarcode,
##     Num_Probes,
##     AVG(Segment_Mean) AS mean_segment_mean,
##   FROM
##     [isb-cgc:tcga_201510_alpha.Copy_Number_segments]
##   WHERE
##     SampleTypeLetterCode = 'TP'
##     AND Chromosome = '17'
##     AND ((start <= 7565097 AND END >= 7590856)
##       OR (start >= 7565097 AND start <= 7590856)
##       OR (END >= 7565097   AND END <= 7590856)
##       OR (start >= 7565097 AND END <= 7590856))
##   GROUP BY
##     Study,
##     SampleBarcode,
##     Num_Probes ) AS cn
## ON
##   expr.SampleBarcode = cn.SampleBarcode
##   AND expr.Study = cn.Study
## WHERE
##   expr.HGNC_gene_symbol = 'TP53'
## GROUP BY
##   Study,
##   Gene
## ORDER BY
##   correlation

result

##    Study Gene correlation
## 1   LUSC TP53   0.1193299
## 2   HNSC TP53   0.1483263
## 3   THCA TP53   0.1842087
## 4   KICH TP53   0.1985945
## 5   ESCA TP53   0.2064320
## 6     OV TP53   0.2237812
## 7    LGG TP53   0.2259375
## 8    GBM TP53   0.2762577
## 9   STAD TP53   0.2876563
## 10  BRCA TP53   0.3053237
## 11  BLCA TP53   0.3203657
## 12   ACC TP53   0.3295563
## 13  LUAD TP53   0.3373264
## 14  READ TP53   0.3508715
## 15  KIRP TP53   0.3597593
## 16  COAD TP53   0.3642031
## 17  CHOL TP53   0.3722740
## 18  KIRC TP53   0.3746402
## 19  UCEC TP53   0.3857859
## 20   UVM TP53   0.3910619
## 21  PAAD TP53   0.4013773
## 22  DLBC TP53   0.4417422
## 23  CESC TP53   0.4424818
## 24  THYM TP53   0.4928354
## 25   UCS TP53   0.5222810
## 26  SKCM TP53   0.5291675
## 27  TGCT TP53   0.5385247
## 28  SARC TP53   0.5608798
## 29  PRAD TP53   0.5819391
## 30  LIHC TP53   0.5838598
## 31  MESO TP53   0.5931288
## 32  PCPG TP53   0.6024819

# Plot comparing the CN ~ Expr correlation across studies.
ggplot(result, aes(x=factor(Study, ordered=T, levels=Study), y=correlation)) +
geom_point() +
theme(axis.text.x=element_text(angle=90, size=8)) +
xlab("Study") + ylab("Pearson correlation of gene expr and copy number data") +
ggtitle(gene_info$gene_name)

Looks like the MESO has one of the strongest correlations. Let's check the data.

q <- "
SELECT
  expr.Study as Study,
  expr.HGNC_gene_symbol as Gene,
  expr.SampleBarcode as SampleBarcode,
  log2(expr.normalized_count+1) as expr,
  cn.mean_segment_mean as mean_cn
FROM
  [isb-cgc:tcga_201510_alpha.mRNA_UNC_HiSeq_RSEM] AS expr
JOIN (
  SELECT
    Study,
    SampleBarcode,
    Num_Probes,
    AVG(Segment_Mean) AS mean_segment_mean,
  FROM
    [isb-cgc:tcga_201510_alpha.Copy_Number_segments]
  WHERE
    SampleTypeLetterCode = 'TP'
    AND Study = 'LIHC'
    AND Chromosome = '17'
    AND ((start <= 7565097 AND END >= 7590856)
      OR (start >= 7565097 AND start <= 7590856)
      OR (END >= 7565097   AND END <= 7590856)
      OR (start >= 7565097 AND END <= 7590856))
  GROUP BY
    Study,
    SampleBarcode,
    Num_Probes ) AS cn
ON
  expr.SampleBarcode = cn.SampleBarcode
  AND expr.Study = cn.Study
WHERE
  expr.HGNC_gene_symbol = 'TP53'
  AND expr.Study = 'LIHC'
GROUP BY
  Study,
  Gene,
  SampleBarcode,
  expr,
  mean_cn
"

data <- query_exec(q, project)
head(data)

##   Study Gene    SampleBarcode      expr mean_cn
## 1  LIHC TP53 TCGA-DD-A4NS-01A 10.039070  0.0137
## 2  LIHC TP53 TCGA-DD-A4NK-01A  9.546030 -0.0762
## 3  LIHC TP53 TCGA-GJ-A3OU-01A 10.024997 -0.2271
## 4  LIHC TP53 TCGA-CC-A8HT-01A  9.693517 -0.3713
## 5  LIHC TP53 TCGA-EP-A2KB-01A 11.535877  0.2011
## 6  LIHC TP53 TCGA-2Y-A9H0-01A  9.368493 -0.9768

# Plot comparing the CN ~ Expr correlation across studies.
ggplot(data, aes(x=expr, y=mean_cn)) +
geom_point() +
geom_smooth(method="lm") +
xlab("Log2 RNA-seq expression level") + ylab("Mean copy number change") +
ggtitle(gene_info$gene_name)

Provenance

sessionInfo()

R version 3.2.4 (2016-03-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.5 (El Capitan)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] stringr_1.0.0        ggplot2_2.1.0        bigrquery_0.2.0     
[4] dplyr_0.4.3          knitr_1.13           ISBCGCExamples_0.1.1

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.5      assertthat_0.1   plyr_1.8.3       grid_3.2.4      
 [5] R6_2.1.2         jsonlite_0.9.21  gtable_0.2.0     DBI_0.4-1       
 [9] formatR_1.4      magrittr_1.5     scales_0.4.0     evaluate_0.9    
[13] httr_1.1.0       stringi_1.1.1    curl_0.9.7       labeling_0.3    
[17] tools_3.2.4      munsell_0.4.3    parallel_3.2.4   colorspace_1.2-6
[21] openssl_0.9.4

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

ExpressionAndCopyNumberCorrelation.md

ExpressionAndCopyNumberCorrelation.md

Expression and Copy Number Correlation

Getting gene information

Pearson Correlation in BigQuery

Provenance

Files

ExpressionAndCopyNumberCorrelation.md

Latest commit

History

ExpressionAndCopyNumberCorrelation.md

File metadata and controls

Expression and Copy Number Correlation

Getting gene information

Pearson Correlation in BigQuery

Provenance