test.R

library(DESeq2)
library(tidyverse)
load("wsbim2122_data/deseq2/counts.rda")
load("wsbim2122_data/deseq2/coldata.rda")
coldata
head(counts)

# Create dds object
dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = coldata,
                              design = ~ Condition)


head(assay(dds))
colData(dds)
rowData(dds)

# Inspect counts distribution
as_tibble(assay(dds)) %>%
  gather(sample, value = counts) %>%
  ggplot(aes(x = log2(counts + 1), fill = sample)) +
  geom_histogram(bins = 20) +
  facet_wrap(~ sample)

# Run DESeq2
dds <- DESeq(dds)

#PCA with plotPCA
dim(rld)
rld <- rlogTransformation(dds)
plotPCA(rld, intgroup = "Condition")

# Values can be extracted to customise the plot
x <- plotPCA(rld, intgroup = "Condition", return = TRUE)
ggplot(x, aes(x = PC1, y = PC2, color = Condition, label = name)) +
  geom_point() +
  geom_text()

# PCA with prcomp()
pca <- prcomp(t(assay(rld)))
summary(pca)
var <- pca$sdev^2
pve <- var/sum(var) # % var explained
as_tibble(pca$x, rownames = "Sample") %>% 
  select(Sample, PC1, PC2) %>% 
  left_join(as_tibble(coldata, rownames = "Sample")) %>% 
  ggplot(aes(x = PC1, y = PC2, color = Condition)) +
  geom_point() +
  xlab(paste0("PC1:", round(pve[1]*100), " % variance")) +
  ylab(paste0("PC2:", round(pve[2]*100), " % variance"))

# PCA with prcomp() taking 500 genes with higher variance
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
pca <- prcomp(t(assay(rld)[select, ]))
pve <- pca$sdev^2/sum(pca$sdev^2)
as_tibble(pca$x, rownames = "Sample") %>% 
  select(Sample, PC1, PC2) %>% 
  left_join(as_tibble(coldata, rownames = "Sample")) %>% 
  ggplot(aes(x = PC1, y = PC2, color = Condition)) +
  geom_point() +
  xlab(paste0("PC1:", round(pve[1]*100), " % variance")) +
  ylab(paste0("PC2:", round(pve[2]*100), " % variance"))


# Size Factors
sizeFactors(dds)

# Compare with sequencing depths
SF <- enframe(sizeFactors(dds), name = "sample", value = "Size_Factor") %>%
  ggplot(aes(x = sample, y = Size_Factor)) +
  geom_bar(stat = "identity")

SD <- enframe(colSums(assay(dds, "counts")), name = "sample", value = "n_reads") %>%
  ggplot(aes(x = sample, y = n_reads)) +
  geom_bar(stat = "identity")

library("patchwork")
SF / SD


# Available results
resultsNames(dds)
dds$Condition

dds$Condition <- relevel(dds$Condition, ref = "mock")
dds$Condition
dds <- DESeq(dds)
resultsNames(dds)

res <- results(dds,
               name = "Condition_KD_vs_mock")
res_tbl <- as_tibble(res, rownames = "ENSEMBL")

## Exercices

# 1. Inspect the results table and identify the 5 “best genes” showing the lowest padjusted value.
res_tbl %>%
  arrange(padj) %>%
  head(5)

# 2. Calculate the mean expression level of these 5 "best genes" using 
# the function `count()`. Compare with baseMean values.

best_genes <- res_tbl %>%
  arrange(padj) %>%
  head(5) %>% pull(ENSEMBL)

rowMeans(counts(dds[best_genes], normalize = TRUE))


rowMeans(counts(dds[best_genes], normalize = TRUE))

# 3. Extract the ß coefficient of these 5 "best genes" from the GLM 
# using the function `coefficient()`. Compare with log2FoldChange values.

coefficients(dds[best_genes])

# 4. using the function `count()`, calculate the expression levels (in log2) 
# of these 5 "best genes" in mock cells. Compare with ß coefficients.

log2(rowMeans(counts(dds[best_genes, 1:3], normalize = TRUE)))


# 5. Calculate the expression levels (in log2) 
#of these 5 "best genes" in KD cells. Compare with ß coefficients.

log2(rowMeans(counts(dds[best_genes, 4:6], normalize = TRUE)))

# log2 expression levels in KD cells evaluated from ß coefficients
coefficients(dds[best_genes])[,1] + coefficients(dds[best_genes])[,2]

# 6. How many genes have no padjusted value? Why?
summary(res$padj)

# filter genes with no padjusted values
res_tbl %>%
  filter(is.na(padj))

head(metadata(res)$filterNumRej)
as_tibble(metadata(res)$filterNumRej) %>%
  ggplot(aes(x = theta, y = numRej)) +
  geom_point() +
  geom_vline(xintercept = 0.7169,
             color = 'red')


# Evaluate how many genes were really filtered by the independent
# filtering procedure.

# Number of genes with basemean == 0
res_tbl %>%
  filter(baseMean == 0) %>%
  nrow()

# Number of genes filtered by the independent filtering procedure
res_tbl %>%
  filter(baseMean > 0 & baseMean < metadata(res)$filterThreshold) %>%
  nrow()

# Re-run the results() function on the same dds object,
#but set the independent filtering parameter to FALSE.
# Check how many genes have no padj?
res_no_IF <- results(dds,
                     name = "Condition_KD_vs_mock",
                     independentFiltering = FALSE)
as_tibble(res_no_IF, rownames = "ENSEMBL") %>%
  filter(is.na(padj)) %>% nrow()


# Imagine another way of filtering genes with very low counts

# filter the data to remove genes with few counts
filtering_thr <- 5
# keep genes with counts > 5 in 3 or more samples
keep <- rowSums(counts(dds, normalized = TRUE) >= filtering_thr) >=3
dds_bis <- DESeq(dds[keep, ])
res_bis <- results(dds_bis,
                   name = "Condition_KD_vs_mock",
                   independentFiltering = FALSE)

as_tibble(res_bis, rownames = "ENSEMBL") %>%
  filter(is.na(padj)) %>% nrow()

as_tibble(res_bis, rownames = "ENSEMBL") %>%
  filter(!is.na(padj)) %>% nrow()

# Histogram of pvalues
hist(res_tbl$pvalue)

res_tbl %>%
  filter(pvalue > 0.8 & pvalue < 0.85) %>%
  head(10)

res_tbl %>%
  filter(baseMean > metadata(res)$filterThreshold) %>% 
  pull(pvalue) %>% 
  hist()
            
# plot MA
plotMA(res, alpha = 0.05)

# Volcano plot
res_tbl %>%
  filter(!is.na(padj)) %>%
  ggplot(aes(x= log2FoldChange, y = -log10(padj))) +
  geom_point(size = 0.5)