Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
library(DESeq2)
library(tidyverse)
load("wsbim2122_data/deseq2/counts.rda")
load("wsbim2122_data/deseq2/coldata.rda")
# Create dds object
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = coldata,
design = ~ Condition)
# Inspect counts distribution
as_tibble(assay(dds)) %>%
gather(sample, value = counts) %>%
ggplot(aes(x = log2(counts + 1), fill = sample)) +
geom_histogram(bins = 20) +
facet_wrap(~ sample)
# Run DESeq2
dds <- DESeq(dds)
#PCA with plotPCA
rld <- rlogTransformation(dds)
# effect of the rlog transformation
# picked a very highly expressed gene
assay(dds["ENSG00000198804"])
log2(assay(dds["ENSG00000198804"]))
assay(rld["ENSG00000198804"])
# picked a lowly expressed gene
assay(dds["ENSG00000248671"])
log2(assay(dds["ENSG00000248671"]))
assay(rld["ENSG00000248671"])
plotPCA(rld, intgroup = "Condition")
# Values can be extracted to customise the plot
x <- plotPCA(rld, intgroup = "Condition", return = TRUE)
ggplot(x, aes(x = PC1, y = PC2, color = Condition, label = name)) +
geom_point() +
geom_text()
# Size Factors
sizeFactors(dds)
# Compare with sequencing depths
SF <- enframe(sizeFactors(dds), name = "sample", value = "Size_Factor") %>%
ggplot(aes(x = sample, y = Size_Factor)) +
geom_bar(stat = "identity")
SD <- enframe(colSums(assay(dds, "counts")), name = "sample", value = "n_reads") %>%
ggplot(aes(x = sample, y = n_reads)) +
geom_bar(stat = "identity")
library("patchwork")
SF / SD
# Available results
resultsNames(dds)
dds$Condition <- relevel(dds$Condition, ref = "mock")
dds <- DESeq(dds)
res <- results(dds,
name = "Condition_KD_vs_mock")
res_tbl <- as_tibble(res, rownames = "ENSEMBL")
## Exercices
# 1. Inspect the results table and identify the 5 “best genes” showing the lowest padjusted value.
res_tbl %>%
arrange(padj) %>%
head(5)
# 2. Calculate the mean expression level of these 5 "best genes" using
# the function `count()`. Compare with baseMean values.
best_genes <- res_tbl %>%
arrange(padj) %>%
head(5) %>% pull(ENSEMBL)
best_genes
counts(dds[best_genes], normalize = TRUE)
rowMeans(counts(dds[best_genes], normalize = TRUE))
# 3. Extract the ß coefficient of these 5 "best genes" from the GLM
# using the function `coefficient()`. Compare with log2FoldChange values.
coefficients(dds[best_genes])
# 4. using the function `count()`, calculate the expression levels (in log2)
# of these 5 "best genes" in mock cells. Compare with ß coefficients.
log2(rowMeans(counts(dds[best_genes, 1:3], normalize = TRUE)))
# 5. Calculate the expression levels (in log2)
#of these 5 "best genes" in KD cells. Compare with ß coefficients.
log2(rowMeans(counts(dds[best_genes, 4:6], normalize = TRUE)))
# log2 expression levels in KD cells evaluated from ß coefficients
coefficients(dds[best_genes])[,1] + coefficients(dds[best_genes])[,2]
# 6. How many genes have no padjusted value? Why?
summary(res$padj)
# filter genes with no padjusted values
res_tbl %>%
filter(is.na(padj))
# Independant filtering threshold used
metadata(res)$filterThreshold
quantile(res$baseMean, 0.7169)
head(metadata(res)$filterNumRej)
as_tibble(metadata(res)$filterNumRej) %>%
ggplot(aes(x = theta, y = numRej)) +
geom_point() +
geom_vline(xintercept = 0.7169,
color = 'red')
# Evaluate how many genes were really filtered by the independent
# filtering procedure.
# Number of genes with basemean == 0
res_tbl %>%
filter(baseMean == 0) %>%
nrow()
# Number of genes filtered by the independent filtering procedure
res_tbl %>%
filter(baseMean > 0 & baseMean < metadata(res)$filterThreshold) %>%
nrow()
# Re-run the results() function on the same dds object,
#but set the independent filtering parameter to FALSE.
# Check how many genes have no padj?
res_no_IF <- results(dds,
name = "Condition_KD_vs_mock",
independentFiltering = FALSE)
as_tibble(res_no_IF, rownames = "ENSEMBL") %>%
filter(is.na(padj)) %>% nrow()
# Imagine another way of filtering genes with very low counts
# filter the data to remove genes with few counts
filtering_thr <- 5
# keep genes with counts > 5 in 3 or more samples
keep <- rowSums(counts(dds, normalized = TRUE) >= filtering_thr) >=3
dds_bis <- DESeq(dds[keep, ])
res_bis <- results(dds_bis,
name = "Condition_KD_vs_mock",
independentFiltering = FALSE)
as_tibble(res_bis, rownames = "ENSEMBL") %>%
filter(is.na(padj)) %>% nrow()
# => Number of genes kept
as_tibble(res_bis, rownames = "ENSEMBL") %>%
filter(!is.na(padj)) %>% nrow()
# Compare with previous anaylsis with the independant filtering thr fixed by deseq2
as_tibble(res, rownames = "ENSEMBL") %>%
filter(!is.na(padj)) %>% nrow()
# Histogram of pvalues
hist(res_tbl$pvalue)
res_tbl %>%
filter(pvalue > 0.8 & pvalue < 0.85) %>%
head(10)
res_tbl %>%
filter(baseMean > metadata(res)$filterThreshold) %>%
pull(pvalue) %>%
hist()
# plot MA
plotMA(res)
# Volcano plot
res_tbl %>%
filter(!is.na(padj)) %>%
ggplot(aes(x = log2FoldChange, y = -log10(padj),
color = padj < 0.05 & abs(log2FoldChange) > 1)) +
scale_colour_manual(values = c("gray", "red")) +
geom_point(size = 0.5) +
geom_hline(yintercept = -log10(0.05)) +
geom_vline(xintercept = 1) +
geom_vline(xintercept = -1)
# plot counts of 6 best genes
best_genes <- res_tbl %>%
arrange(padj) %>%
head(6)
as_tibble(counts(dds[best_genes$ENSEMBL, ], normalize = T),
rownames = 'ENSEMBL') %>%
gather(sample, counts, -ENSEMBL) %>%
left_join(as_tibble(coldata, rownames = "sample")) %>%
ggplot(aes(x = sample, y = counts, fill = Condition)) +
geom_bar(stat = 'identity', color = "gray30") +
facet_wrap( ~ ENSEMBL, scales = "free", ncol = 3) +
theme(axis.text.x = element_text(size = 7, angle = 90),
axis.title.x = element_blank(),
legend.position = "right",
legend.text = element_text(size = 7),
legend.title = element_text(size = 7))
#Identify and inspect counts of the genes plotted in red in the volcano-plot.
#These genes have a very large log2FC (|log2FC| > 5) but are far from bearing
#the lowest padjusted value (their padjusted value is between 0.05 and 1e-5).
selected_genes <- res_tbl %>%
filter(padj < 0.05 & padj > 1e-5 & abs(log2FoldChange) > 5)
as_tibble(counts(dds[selected_genes$ENSEMBL, ], normalize = T),
rownames = 'ENSEMBL') %>%
gather(sample, counts, -ENSEMBL) %>%
left_join(as_tibble(coldata, rownames = "sample")) %>%
ggplot(aes(x = sample, y = counts, fill = Condition)) +
geom_bar(stat = 'identity', color = "gray30") +
facet_wrap( ~ ENSEMBL, scales = "free", ncol = 3) +
theme(axis.text.x = element_text(size = 7, angle = 90),
axis.title.x = element_blank(),
legend.position = "right",
legend.text = element_text(size = 7),
legend.title = element_text(size = 7))
# Using dispersion() function, compare dispersion values for both group
#of genes
dispersions(dds[best_genes$ENSEMBL,])
dispersions(dds[selected_genes$ENSEMBL,])
# Add genes names
library("biomaRt")
library("org.Hs.eg.db")
# Load homo sapiens ensembl dataset
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
#Attributes define the values we are interested to retrieve.
listAttributes(mart)
ensembl_to_geneName <- getBM(
attributes = c("ensembl_gene_id", "external_gene_name", "entrezgene_id"),
mart = mart)
ensembl_to_geneName
names(ensembl_to_geneName) <- c("ENSEMBL", "gene", "ENTREZID")
res_tbl <- res_tbl %>%
left_join(ensembl_to_geneName) %>%
arrange(padj)
saveRDS(dds, file = "./data/dds.rds")
saveRDS(res_tbl, file = "./data/res_tbl.rds")
saveRDS(ensembl_to_geneName, file = "./data/ensembl_to_geneName.rds")
readRDS(file = "./data/en")