R01_deseq2.R

library(DESeq2)
library(tidyverse)
load("wsbim2122_data/deseq2/counts.rda")
load("wsbim2122_data/deseq2/coldata.rda")

# Create dds object
dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = coldata,
                              design = ~ Condition)

# Inspect counts distribution
as_tibble(assay(dds)) %>%
  gather(sample, value = counts) %>% 
  ggplot(aes(x = log2(counts + 1), fill = sample)) +
  geom_histogram(bins = 20) +
  facet_wrap(~ sample)

# Run DESeq2
dds <- DESeq(dds)

#PCA with plotPCA
rld <- rlogTransformation(dds)


# effect of the rlog transformation 

# picked a very highly expressed gene
assay(dds["ENSG00000198804"])
log2(assay(dds["ENSG00000198804"]))
assay(rld["ENSG00000198804"])

# picked a lowly expressed gene
assay(dds["ENSG00000248671"])
log2(assay(dds["ENSG00000248671"]))
assay(rld["ENSG00000248671"])

plotPCA(rld, intgroup = "Condition")

# Values can be extracted to customise the plot
x <- plotPCA(rld, intgroup = "Condition", return = TRUE)
ggplot(x, aes(x = PC1, y = PC2, color = Condition, label = name)) +
  geom_point() +
  geom_text()


# Size Factors
sizeFactors(dds)

# Compare with sequencing depths
SF <- enframe(sizeFactors(dds), name = "sample", value = "Size_Factor") %>%
  ggplot(aes(x = sample, y = Size_Factor)) +
  geom_bar(stat = "identity")

SD <- enframe(colSums(assay(dds, "counts")), name = "sample", value = "n_reads") %>%
  ggplot(aes(x = sample, y = n_reads)) +
  geom_bar(stat = "identity")

library("patchwork")
SF / SD


# Available results
resultsNames(dds)
dds$Condition <- relevel(dds$Condition, ref = "mock")

dds <- DESeq(dds)

res <- results(dds,
               name = "Condition_KD_vs_mock")

res_tbl <- as_tibble(res, rownames = "ENSEMBL")

## Exercices

# 1. Inspect the results table and identify the 5 “best genes” showing the lowest padjusted value.
res_tbl %>%
  arrange(padj) %>%
  head(5)

# 2. Calculate the mean expression level of these 5 "best genes" using 
# the function `count()`. Compare with baseMean values.

best_genes <- res_tbl %>%
  arrange(padj) %>%
  head(5) %>% pull(ENSEMBL)
best_genes
counts(dds[best_genes], normalize = TRUE)
rowMeans(counts(dds[best_genes], normalize = TRUE))


# 3. Extract the ß coefficient of these 5 "best genes" from the GLM 
# using the function `coefficient()`. Compare with log2FoldChange values.

coefficients(dds[best_genes])

# 4. using the function `count()`, calculate the expression levels (in log2) 
# of these 5 "best genes" in mock cells. Compare with ß coefficients.

log2(rowMeans(counts(dds[best_genes, 1:3], normalize = TRUE)))


# 5. Calculate the expression levels (in log2) 
#of these 5 "best genes" in KD cells. Compare with ß coefficients.

log2(rowMeans(counts(dds[best_genes, 4:6], normalize = TRUE)))

# log2 expression levels in KD cells evaluated from ß coefficients
coefficients(dds[best_genes])[,1] + coefficients(dds[best_genes])[,2]

# 6. How many genes have no padjusted value? Why?
summary(res$padj)

# filter genes with no padjusted values
res_tbl %>%
  filter(is.na(padj))


# Independant filtering threshold used
metadata(res)$filterThreshold

quantile(res$baseMean, 0.7169)

head(metadata(res)$filterNumRej)
as_tibble(metadata(res)$filterNumRej) %>%
  ggplot(aes(x = theta, y = numRej)) +
  geom_point() +
  geom_vline(xintercept = 0.7169,
             color = 'red')


# Evaluate how many genes were really filtered by the independent
# filtering procedure.

# Number of genes with basemean == 0
res_tbl %>%
  filter(baseMean == 0) %>%
  nrow()

# Number of genes filtered by the independent filtering procedure
res_tbl %>%
  filter(baseMean > 0 & baseMean < metadata(res)$filterThreshold) %>%
  nrow()

# Re-run the results() function on the same dds object,
#but set the independent filtering parameter to FALSE.
# Check how many genes have no padj?
res_no_IF <- results(dds,
                     name = "Condition_KD_vs_mock",
                     independentFiltering = FALSE)
as_tibble(res_no_IF, rownames = "ENSEMBL") %>%
  filter(is.na(padj)) %>% nrow()


# Imagine another way of filtering genes with very low counts

# filter the data to remove genes with few counts
filtering_thr <- 5
# keep genes with counts > 5 in 3 or more samples
keep <- rowSums(counts(dds, normalized = TRUE) >= filtering_thr) >=3
dds_bis <- DESeq(dds[keep, ])
res_bis <- results(dds_bis,
                   name = "Condition_KD_vs_mock",
                   independentFiltering = FALSE)

as_tibble(res_bis, rownames = "ENSEMBL") %>%
  filter(is.na(padj)) %>% nrow()

# => Number of genes kept
as_tibble(res_bis, rownames = "ENSEMBL") %>%
  filter(!is.na(padj)) %>% nrow()

# Compare with previous anaylsis with the independant filtering thr fixed by deseq2
as_tibble(res, rownames = "ENSEMBL") %>%
  filter(!is.na(padj)) %>% nrow()


# Histogram of pvalues
hist(res_tbl$pvalue)

res_tbl %>%
  filter(pvalue > 0.8 & pvalue < 0.85) %>%
  head(10)

res_tbl %>%
  filter(baseMean > metadata(res)$filterThreshold) %>% 
  pull(pvalue) %>% 
  hist()
            
# plot MA
plotMA(res)

# Volcano plot
res_tbl %>%
  filter(!is.na(padj)) %>%
  ggplot(aes(x = log2FoldChange, y = -log10(padj), 
             color = padj < 0.05 & abs(log2FoldChange) > 1)) +
  scale_colour_manual(values = c("gray", "red")) +
  geom_point(size = 0.5) + 
  geom_hline(yintercept = -log10(0.05)) +
  geom_vline(xintercept = 1) +
  geom_vline(xintercept = -1)


# plot counts of 6 best genes
best_genes <- res_tbl %>%
  arrange(padj)  %>%
  head(6)

as_tibble(counts(dds[best_genes$ENSEMBL, ], normalize = T),
          rownames = 'ENSEMBL') %>% 
  gather(sample, counts, -ENSEMBL) %>%
  left_join(as_tibble(coldata, rownames = "sample")) %>%
  ggplot(aes(x = sample, y = counts, fill = Condition)) +
  geom_bar(stat = 'identity', color = "gray30") +
  facet_wrap( ~ ENSEMBL, scales = "free", ncol = 3) +
  theme(axis.text.x = element_text(size = 7, angle = 90),
        axis.title.x = element_blank(),
        legend.position = "right",
        legend.text = element_text(size = 7),
        legend.title = element_text(size = 7))


#Identify and inspect counts of the genes plotted in red in the volcano-plot. 
#These genes have a very large log2FC (|log2FC| > 5) but are far from bearing 
#the lowest padjusted value (their padjusted value is between 0.05 and 1e-5).
selected_genes <- res_tbl %>%
  filter(padj < 0.05 & padj > 1e-5 & abs(log2FoldChange) > 5)

as_tibble(counts(dds[selected_genes$ENSEMBL, ], normalize = T),
          rownames = 'ENSEMBL') %>%
  gather(sample, counts, -ENSEMBL) %>%
  left_join(as_tibble(coldata, rownames = "sample")) %>%
  ggplot(aes(x = sample, y = counts, fill = Condition)) +
  geom_bar(stat = 'identity', color = "gray30") +
  facet_wrap( ~ ENSEMBL, scales = "free", ncol = 3) +
  theme(axis.text.x = element_text(size = 7, angle = 90),
        axis.title.x = element_blank(),
        legend.position = "right",
        legend.text = element_text(size = 7),
        legend.title = element_text(size = 7))

# Using dispersion() function, compare dispersion values for both group 
#of genes
dispersions(dds[best_genes$ENSEMBL,])
dispersions(dds[selected_genes$ENSEMBL,])


# Add genes names
library("biomaRt")
library("org.Hs.eg.db")

# Load homo sapiens ensembl dataset
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))

#Attributes define the values we are interested to retrieve.
listAttributes(mart)
ensembl_to_geneName <- getBM(
  attributes = c("ensembl_gene_id", "external_gene_name", "entrezgene_id"), 
  mart = mart)
ensembl_to_geneName
names(ensembl_to_geneName) <- c("ENSEMBL", "gene", "ENTREZID")

res_tbl <- res_tbl %>%
  left_join(ensembl_to_geneName) %>%
  arrange(padj)


saveRDS(dds, file = "./data/dds.rds")
saveRDS(res_tbl, file = "./data/res_tbl.rds")
saveRDS(ensembl_to_geneName, file = "./data/ensembl_to_geneName.rds")
readRDS(file = "./data/en")