Commit a0f69495 authored by Lucia Gomez Teijeiro

co of course directory and rename to adv_scrnaseq_2020_Lucia

parent 01af6b8b
# History files
.Rhistory
.Rapp.history
# Session Data files
.RData
# User-specific files
.Ruserdata
# Example code in package build process
*-Ex.R
# Output files from R CMD build
/*.tar.gz
# Output files from R CMD check
/*.Rcheck/
# RStudio files
.Rproj.user/
# produced vignettes
vignettes/*.html
vignettes/*.pdf
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth
# knitr and R markdown default cache directories
*_cache/
/cache/
# Temporary files created by R markdown
*.utf8.md
*.knit.md
# R Environment Variables
.Renviron
# RStudio project files
*.Rproj
combining_python_and_R/pythonlib
library(reticulate)
## Pick ONE of the following environments.
## For setup on the renku environment:
#reticulate::use_virtualenv("/opt/conda")
## Only for the xenon6 setup:
reticulate::use_virtualenv("/tungstenfs/groups/gbioinfo/sharedSoft/virtualenvs/r-reticulate-keras-2.3.0-tensorflow-2.0.0-gpu") # TF 2.0
#reticulate::use_virtualenv("/tungstenfs/groups/gbioinfo/sharedSoft/virtualenvs/r-reticulate-keras-2.2.5-tensorflow-1.14.0-gpu") # TF 1.14
Sys.setenv("CUDA_VISIBLE_DEVICES" = "1") # define visible GPU devices; make certain the selected device is not in use by checking the output of nvidia-smi
library(keras)
use_implementation("tensorflow")
library(tensorflow)
library(Matrix)
library(SingleCellExperiment)
K <- backend() # keras backend handle for low-level tensor operations (manual add-on)
tf$version$VERSION # confirm the active TensorFlow version
sce <- readRDS(gzcon(url("https://github.com/fmicompbio/adv_scrnaseq_2020/blob/master/DGNs/data/SCE_MammaryGland.rds?raw=true")))
assays(sce)[["lognorm"]] <- log2(sweep(counts(sce), 2, sce$library_size, FUN = "/") * 1e4 + 1) # library-size normalize to counts per 10k, then log2(x + 1)
combined.df.filtered <- as.matrix(assays(sce)[["lognorm"]])
combined.annot.study <- colData(sce)$study
combined.annot.ct <- colData(sce)$cell.class
combined.annot <- paste(colData(sce)$study, colData(sce)$cell.class, sep = "_")
####### Splitting into training and validation data, converting to array
set.seed(1)
holdback.fraction <- 0.2
holdback.samples <- sample(1:ncol(sce), round(holdback.fraction * ncol(sce)))
##### Training Data:
study_annot.train <- combined.annot.study[-holdback.samples]
ct_annot.train <- combined.annot.ct[-holdback.samples]
M <- combined.df.filtered[, -holdback.samples]
sc_train_x <- array(M, dim = c(dim(M)[1], prod(dim(M)[-1]))) # convert to an array
sc_train_x <- t(sc_train_x) # need to transpose before passing to the model (samples in rows)
rm(M)
##### Validation Data:
study_annot.test <- combined.annot.study[holdback.samples]
ct_annot.test <- combined.annot.ct[holdback.samples]
M <- combined.df.filtered[, holdback.samples]
sc_test_x <- array(M, dim = c(dim(M)[1], prod(dim(M)[-1]))) # convert to an array
sc_test_x <- t(sc_test_x) # need to transpose before passing to the model (samples in rows)
rm(M)
###################################################################
##### Sparse variational autoencoder with one-hot encoding for auxiliary input fed after the latent layer
## Ensure compatibility with both TF2 and TF1:
if (tensorflow::tf$executing_eagerly())
  tensorflow::tf$compat$v1$disable_eager_execution()
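## Note: eager execution is disabled because the custom loss defined below
## closes over symbolic tensors (z_mean, z_log_var), which requires graph mode.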
# Parameters --------------------------------------------------------------
neck <- 64L # width of the bottleneck (latent) layer; alternative: 256L
drop_rate <- 0.2 # dropout rate; use 0.2 or less
gene_dim <- ncol(sc_train_x) # number of features (genes) in the dataset
latent_dim <- neck
intermediate_dim <- 512L
epsilon_std <- 0.8 # standard deviation of the prior latent distribution (default = 1)
var_prior <- epsilon_std**2
log_var_prior <- log(var_prior)
kl_weight <- 0.1 # weight for the Kullback-Leibler divergence loss (default = 1)
# Encoder definition --------------------------------------------------------
x <- layer_input(shape = c(gene_dim), name = "gene_input")
h <- layer_dense(x, intermediate_dim, activation = "elu") # alternative activations: softsign, linear
h <- layer_dropout(h, rate = drop_rate)
h <- layer_dense(h, 256, activation = "elu")
h <- layer_dropout(h, rate = drop_rate)
h <- layer_dense(h, 128, activation = "elu")
h <- layer_dropout(h, rate = drop_rate)
z_mean <- layer_dense(h, latent_dim)
z_log_var <- layer_dense(h, latent_dim)
#### Sampling from the latent space:
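## Reparameterization trick: draw epsilon ~ N(0, epsilon_std^2) and shift/scale
## it by the encoder outputs, so that gradients can flow through z_mean and
## z_log_var while z itself remains stochastic.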
sampling <- function(arg) {
  z_mean <- arg[, 1:latent_dim]
  z_log_var <- arg[, (latent_dim + 1):(2 * latent_dim)]
  epsilon <- K$random_normal(
    shape = c(K$shape(z_mean)[[1]]),
    mean = 0.,
    stddev = epsilon_std
  )
  z_mean + K$exp(z_log_var / 2) * epsilon
}
# note that "output_shape" isn't necessary with the TensorFlow backend
z <- layer_concatenate(list(z_mean, z_log_var)) %>%
layer_lambda(sampling)
# we instantiate the decoder separately so as to reuse it later
decoder_h <- keras_model_sequential()
decoder_h %>%
  layer_dense(units = 128, activation = "elu") %>% # intermediate_dim/4 when the extra optional layer is used, otherwise intermediate_dim/2
  layer_dropout(rate = drop_rate) %>%
  layer_dense(units = 256, activation = "elu") %>%
  layer_dropout(rate = drop_rate) %>%
  layer_dense(intermediate_dim, activation = "elu") %>%
  layer_dropout(rate = drop_rate)
decoder_mean <- layer_dense(units = gene_dim, activation = "relu") # relu keeps reconstructed (log-normalized) expression values non-negative
h_decoded <- decoder_h(z)
x_decoded_mean <- decoder_mean(h_decoded)
# end-to-end autoencoder
vae <- keras_model(x, x_decoded_mean)
# encoder, from inputs to latent space
encoder <- keras_model(x, z_mean)
# generator, from latent space to reconstructed inputs
decoder_input <- layer_input(shape = latent_dim)
h_decoded_2 <- decoder_h(decoder_input)
x_decoded_mean_2 <- decoder_mean(h_decoded_2)
generator <- keras_model(decoder_input, x_decoded_mean_2)
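## Total loss = reconstruction error (MSE) + kl_weight * KL divergence.
## The kl_loss term below is the closed form of
## KL( N(z_mean, exp(z_log_var)) || N(0, var_prior) ), averaged over the latent dimensions.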
vae_loss <- function(x, x_decoded_mean) {
  reconstruction_loss <- loss_mean_squared_error(x, x_decoded_mean)
  kl_loss <- -kl_weight * 0.5 * K$mean(1 + z_log_var - log_var_prior - K$square(z_mean) / var_prior - K$exp(z_log_var) / var_prior, axis = -1L) # more general formula (allows var_prior != 1)
  reconstruction_loss + kl_loss
}
# Compile the defined model with the adam optimizer:
opt <- optimizer_adam(lr = 0.001, amsgrad = TRUE)
vae %>% compile(
  loss = vae_loss,
  optimizer = opt
  # , experimental_run_tf_function = FALSE
  # , run_eagerly = TRUE
)
########## Fitting the model:
batch_size <- 512
nepochs <- 20 # run 150-250 epochs with a low lr
burn_in_lr <- 0.00005 # 0.000001 x 50 for GTEx-TCGA
###### Training:
k_set_value(vae$optimizer$lr, burn_in_lr) # override the compiled learning rate for the burn-in phase
history2 <- vae %>% fit(
  x = sc_train_x,
  y = sc_train_x,
  shuffle = TRUE,
  epochs = nepochs, # when resuming a previous run, also set initial_epoch (it must stay below epochs)
  batch_size = batch_size,
  validation_data = list(sc_test_x, sc_test_x)
)
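## A possible follow-up (sketch, not part of the original exercise): use the
## trained sub-models to embed cells and to generate reconstructions.
latent <- predict(encoder, sc_test_x)       # cells -> latent coordinates
reconstructed <- predict(generator, latent) # latent coordinates -> expression space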
.bold-large {
font-size: 1.6em;
font-weight: bold;
}
.bullet-level1 {
font-size: 1.6em;
font-weight: bold;
color: steelblue;
}
.bullet-level2 {
font-size: 1.2em;
font-weight: bold;
color: steelblue;
}
.green-large {
font-size: 1.6em;
font-weight: bold;
color: rgb(77, 175, 74);
}
.red-large {
font-size: 1.6em;
font-weight: bold;
color: rgb(228, 26, 28);
}
.verylarge {
font-size: 3.6em;
font-weight: bold;
}
.inline-code {
color: rgb(199, 37, 78);
background-color: rgb(236, 236, 236);
padding: 2px;
border: 1px solid rgb(192, 192, 192);
border-radius: 5px;
font-family: "Lucida Console", Monaco, monospace;
}
.pythonchunk {
background-color: #faf2dc !important;
}
.rchunk {
background-color: #f5f5f5 !important;
}
.bashchunk {
background-color: #e3eefa !important;
}
.hvcentered {
text-align: center;
vertical-align: middle;
}
.larger {
font-size: 1.2em;
}
.bold {
font-weight: bold;
}
.footer {
color: black; background: #E8E8E8;
position: fixed; top: 90%;
text-align:center; width:100%;
}
# Course Material for "Advanced topics in single-cell transcriptomics"
This repository hosts the course material for the SIB course
[Advanced topics in single-cell transcriptomics](https://www.sib.swiss/training/course/2020-05-adv-scrna)
## Teachers:
- Alma Andersson
- Panagiotis Papasaikas
- Rok Roskar
- Mike Smith
- Charlotte Soneson
- Avi Srivastava
- Michael Stadler
## Program
| Wednesday, May 27 | |
|------------------:|:-------------------------------------------------------|
| 9:00 - 9:30 | Welcome and setup of working environment (*C. Soneson*)<ul><li>Zoom</li><li>slack workspace</li><li>HackMD document</li><li>Introducing participants (ice breaker)</li></ul> |
| 9:30 - 10:00 | Introduction to Renku (*R. Roskar*) |
| 10:00 - 10:30 | Introduction to the course (*M. Stadler*)<ul><li>Course overview, expected background</li><li>Prepare renku workspace</li></ul> |
| 10:30 - 10:45 | *Break* |
| 10:45 - 11:15 | Combining the best of two worlds: Python + R (introduction, *M. Stadler*) |
| 11:15 - 11:30 | *Break* |
| 11:30 - 12:30 | Combining the best of two worlds: Python + R (interactive exercise, *M. Stadler*) |
| 12:30 - 13:30 | *Lunch* |
| 13:30 - 14:30 | Python + R (continued) |
| 14:30 - 14:45 | *Break* |
| 14:45 - 15:30 | Beyond cellranger: Quantification of gene expression (presentation, *A. Srivastava*) |
| 15:30 - 15:45 | *Break* |
| 15:45 - 16:45 | Quantification of gene expression (exercises, *A. Srivastava*) |
| 16:45 - 17:00 | *Break* |
| 17:00 - 18:00 | Quantification of gene expression (continued) |
| Thursday, May 28 | |
| -----------------:|:-------------------------------------------------------|
| 9:00 - 10:00 | RNA velocity (presentation, *C. Soneson*) |
| 10:00 - 10:15 | *Break* |
| 10:15 - 11:15 | RNA velocity (exercises, *C. Soneson*) |
| 11:15 - 11:30 | *Break* |
| 11:30 - 12:30 | RNA velocity (continued) |
| 12:30 - 14:00 | *Lunch* |
| 14:00 - 15:00 | Spatial transcriptomics (presentation, *A. Andersson*) |
| 15:00 - 15:15 | *Break* |
| 15:15 - 16:15 | Spatial transcriptomics (exercises, *A. Andersson*) |
| 16:15 - 16:30 | *Break* |
| 16:30 - 17:30 | Spatial transcriptomics (continued) |
| Friday, May 29 | |
| -----------------:|:-------------------------------------------------------|
| 9:00 - 10:00 | Working with on-disk data formats (presentation, *M. Smith*) |
| 10:00 - 10:15 | *Break* |
| 10:15 - 11:15 | Working with on-disk data formats (exercises, *M. Smith*) |
| 11:15 - 11:30 | *Break* |
| 11:30 - 12:30 | Working with on-disk data formats (continued) |
| 12:30 - 13:30 | *Lunch* |
| 13:30 - 14:30 | Deep Generative Networks (presentation and exercises, *P. Papasaikas*) |
| 14:30 - 14:45 | *Break* |
| 14:45 - 15:45 | Deep Generative Networks (continued) |
| 15:45 - 16:00 | *Break* |
| 16:00 - 17:00 | Deep Generative Networks (continued) |
| 17:00 - 17:30 | Wrap-up (feedback form, slack workspace, all) |
---
title: 'Combining the best of two worlds: Python & R'
author: "Michael Stadler"
date: "May 27, 2020"
output:
ioslides_presentation:
css: styles.css
widescreen: true
---
<!-- formatting help: -->
<!-- https://garrettgman.github.io/rmarkdown/ioslides_presentation_format.html -->
<!-- https://bookdown.org/yihui/rmarkdown/ioslides-presentation.html -->
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
# python lib for required packages not available on the system
# install e.g. using:
# pip3 install numpy scipy rpy2 matplotlib sinfo pandas --target ./combining_python_and_R/pythonlib/ --upgrade
Sys.setenv(PYTHONPATH = "/tungstenfs/groups/gbioinfo/stadler/documents/teaching/SIB_advanced-scRNAseq-course_2020/adv_scrnaseq_2020/pythonlib")
Sys.setenv(RETICULATE_PYTHON = "/usr/bin/python3.6") # use system python 3.6
library(reticulate)
py_config()
```
## Outline
- <div class="bullet-level1">Motivation for combining R and Python</div>
- <div class="bullet-level1">Levels of integration</div>
* Break into homogeneous chunks
* Use a "bridge"
* Truly integrated workflow
## Why Python or R for data science? {.smaller}
- <div class="bullet-level1">Both</div>
* open-source programming language with a large community
* new libraries or tools are added continuously
* interface to compiled language libraries
- ![Python](figures/python-logo-generic.svg){width=150px}
* general purpose programming language
* clean, efficient, modular design (software deployment)
* scipy, scVelo, scikit-learn, Keras, PyTorch
- ![R](figures/Rlogo.svg){width=60px}
* statistical modeling and data analysis language
* great for visualization
* Bioconductor (edgeR, DESeq2, scran, scater), tidyverse, ggplot2, shiny
## Many "Python versus R" comparisons out there... {.smaller}
<!-- - why choose if you can have both -->
<!-- https://www.guru99.com/r-vs-python.html -->
<!-- https://www.datacamp.com/community/blog/when-to-use-python-or-r -->
<!-- http://ucanalytics.com/blogs/r-vs-python-comparison-and-awsome-books-free-pdfs-to-learn-them/ -->
<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Which <a href="https://twitter.com/hashtag/superheroe?src=hash&amp;ref_src=twsrc%5Etfw">#superheroe</a> are you?(<a href="https://twitter.com/hashtag/batman?src=hash&amp;ref_src=twsrc%5Etfw">#batman</a> Vs. <a href="https://twitter.com/hashtag/Superman?src=hash&amp;ref_src=twsrc%5Etfw">#Superman</a>) == (<a href="https://twitter.com/hashtag/R?src=hash&amp;ref_src=twsrc%5Etfw">#R</a> Vs. <a href="https://twitter.com/hashtag/Python?src=hash&amp;ref_src=twsrc%5Etfw">#Python</a>)? <a href="https://twitter.com/hashtag/datascience?src=hash&amp;ref_src=twsrc%5Etfw">#datascience</a> <a href="https://twitter.com/roopamu?ref_src=twsrc%5Etfw">@roopamu</a> <a href="https://t.co/B1gO8MT1Zr">https://t.co/B1gO8MT1Zr</a> <a href="https://t.co/GR3pUiZ6rS">pic.twitter.com/GR3pUiZ6rS</a></p>&mdash; Antoine (@AntoineTrdc) <a href="https://twitter.com/AntoineTrdc/status/660953806168072193?ref_src=twsrc%5Etfw">November 1, 2015</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
## Why combine the two? {.larger}
- any data scientist constantly combines tools from various sources
- unlikely that all functionality is available from a single source
- nice to have fewer scripts / steps / intermediate files
<!-- * <span class="inline-code">PyTorch</span> is my favorite ML framework. -->
<!-- * I would like to use <span class="inline-code">ggplot2</span> for plotting. -->
- use case (why I started looking into combining R and Python):
* primarily use <span class="inline-code">R/BioC</span> for analysis of single cell data
* want to make use of the <span class="inline-code">scVelo</span> python package for RNA velocity analysis
## Levels of integration {.smaller}
1. <div class="bullet-level1">Break into homogeneous chunks</div>
- each chunk is pure <span class="inline-code">R</span> or pure <span class="inline-code">python</span>
- chunks are run separately from each other
- input and output is read from and stored in intermediate files
2. <div class="bullet-level1">Use a "bridge"</div>
- primarily work with one language
- use a <strong>specialized package</strong> that allows calling the 2nd language from the first
<!-- - learn its syntax and object mapping and add appropriate calls -->
<!-- - works well for few/well encapsulated calls to second language, if required objects can be shared over "bridge" -->
3. <div class="bullet-level1">Truly integrated workflow</div>
- use a single script or notebook
- run it through a <strong>pair of connected R and python processes</strong>
- <strong>objects are shared</strong> between these processes (no need for input/output files)
## Approach 1: break into pure R/python chunks
- <div class="bold-large">can be organized using workflow tools</div>
* (make, Snakemake, knime, custom pipelines, ...)
- <div class="green-large">Advantages</div>
* flexible, can combine any tool (scripts, binaries, ...)
- <div class="red-large">Disadvantages</div>
* no real integration
* need to store state in intermediate files (see the sketch below)
* need for cut-and-glue code
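A minimal sketch of the intermediate-file pattern (file and object names are illustrative):

```{r, eval = FALSE, echo=TRUE, class.source = "rchunk"}
# R step: store state for a downstream python step
saveRDS(sce, "intermediate/sce.rds")                   # R-internal checkpoint
write.csv(logcounts_matrix, "intermediate/counts.csv") # exchange file
# a separate python script would then read intermediate/counts.csv
```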
## Approach 2: Use a "bridge"
- <div class="bold-large">made possible by "bridge" packages</div>
* Call python from R: [reticulate](https://rstudio.github.io/reticulate/) ![reticulate](figures/reticulated_python.png){width=50px}
* Call R from python: [rpy2](https://rpy2.github.io/) ![rpy2](figures/rpy2_logo2013.png){width=40px}
- <div class="green-large">Advantages</div>
* easy to use (primarily use one language)
- <div class="red-large">Disadvantages</div>
* indirect access to "other" language
* need to learn bridge package syntax
## Display conventions (also for exercises)
I will use background colors to indicate code from different languages:
```{r, echo=TRUE, class.source = "rchunk"}
# R code
R.version.string
```
```{python, echo=TRUE, class.source = "pythonchunk"}
# python code
import sys
sys.version
```
```{bash, echo=TRUE, class.source = "bashchunk"}
# shell script (bash)
echo ${BASH_VERSION}
```
## Example: Calling python from R using [reticulate](https://rstudio.github.io/reticulate/) {data-background="figures/reticulated_python.png" data-background-size="120px" data-background-position="right bottom" data-background-repeat="no-repeat"}
```{r pythonFromR, echo=TRUE, class.source = "rchunk"}
library(reticulate)
os <- import("os")
os$listdir(".")
```
```{python, echo=TRUE, class.source = "pythonchunk"}
import os
os.listdir(".")
```
## [reticulate](https://rstudio.github.io/reticulate/) type conversions {.smaller data-background="figures/reticulated_python.png" data-background-size="120px" data-background-position="right bottom" data-background-repeat="no-repeat"}
| R                      | Python            | Examples                                          |
|:-----------------------|:------------------|:--------------------------------------------------|
| Single-element vector  | Scalar            | `1`, `1L`, `TRUE`, `"foo"`                         |
| Multi-element vector   | List              | `c(1.0, 2.0, 3.0)`, `c(1L, 2L, 3L)`                |
| List of multiple types | Tuple             | `list(1L, TRUE, "foo")`                            |
| Named list             | Dict              | `list(a = 1L, b = 2.0)`, `dict(x = x_data)`        |
| Matrix/Array           | NumPy ndarray     | `matrix(c(1,2,3,4), nrow = 2, ncol = 2)`           |
| Data Frame             | Pandas DataFrame  | `data.frame(x = c(1,2,3), y = c("a", "b", "c"))`   |
| Function               | Python function   | `function(x) x + 1`                                |
| Raw                    | Python bytearray  | `as.raw(c(1:10))`                                  |
| NULL, TRUE, FALSE      | None, True, False | `NULL`, `TRUE`, `FALSE`                            |
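A minimal sketch of a few of these conversions, using the explicit `r_to_py()` / `py_to_r()` helpers (the example objects are illustrative):

```{r, echo=TRUE, class.source = "rchunk"}
library(reticulate)
d <- r_to_py(list(a = 1L, b = 2.0)) # named list -> python dict
class(d)
py_to_r(d)                          # ... and back to an R named list
```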
## [reticulate](https://rstudio.github.io/reticulate/) used in other packages {.smaller data-background="figures/reticulated_python.png" data-background-size="120px" data-background-position="right bottom" data-background-repeat="no-repeat"}
- many packages use `reticulate` in the background to bridge to Python
- prominent examples are the R packages [Tensorflow](https://tensorflow.rstudio.com/) and [Keras](https://keras.rstudio.com/), which implement R APIs strongly resembling the original Python APIs ([Tensorflow](https://www.tensorflow.org/) and [Keras](https://keras.io/)) and use `reticulate` for Python calls and object translation
- you will see these packages in use on the third day of this course, in the discussion of deep generative networks
- for package developers, the newly released [basilisk](https://bioconductor.org/packages/basilisk/) Bioconductor package may also be interesting: it provides a mechanism to distribute a controlled Python environment
<div class="hvcentered">![](figures/tensorflow-logo.svg){width=300px} ![](figures/keras_logo.png){width=300px}</div>
## Example: Calling R from python using [rpy2](https://rpy2.github.io/) {data-background="figures/rpy2_logo2013.png" data-background-size="120px" data-background-position="right bottom" data-background-repeat="no-repeat"}
```{python rFromPython, eval = FALSE, echo=TRUE, class.source = "pythonchunk"}
import rpy2.robjects as robjects
pi = robjects.r['pi']
pi[0]
```
<!-- cannot evaluate the above:
Rmarkdown has already setup reticulate to bridge R+python,
cannot at the same time import rpy2 to bridge back!
Fix: use the reticulate "r" object the access pi from python :-) -->
```{python, echo=FALSE}
r.pi
```
```{r, echo=TRUE, class.source = "rchunk"}
pi
```
## [rpy2](https://rpy2.github.io/) type conversions {data-background="figures/rpy2_logo2013.png" data-background-size="120px" data-background-position="right bottom" data-background-repeat="no-repeat"}
- rpy2's object conversion system is complex and powerful:
* a lower-level interface (implemented as protocols): [rpy2.rinterface](https://rpy2.github.io/doc/latest/html/rinterface.html#module-rpy2.rinterface)
* a higher-level interface using converter functions: <span class="inline-code">rpy2.robjects.conversion.Converter()</span>
* custom converters can be implemented for new objects
- for details see: https://rpy2.github.io/doc/latest/html/robjects_convert.html
## Approach 3: Integrated workflow
- <div class="bold-large">use a single script or notebook</div>
* use a pair of connected R and python processes
* processes can share objects, similar to the "bridge" approach (see the sketch below)
<!-- * not new: Emacs org-mode, Beaker notebook (not developped anymore), ... -->
* supported by RStudio: <span class="inline-code">rmarkdown</span> + <span class="inline-code">reticulate</span>, see also [blog post](https://blog.rstudio.com/2018/03/26/reticulate-r-interface-to-python/)
* supported by Jupyter: [<span class="inline-code">rpy2.ipython.rmagic</span>](https://rpy2.github.io/doc/latest/html/interactive.html#module-rpy2.ipython.rmagic)
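A minimal sketch of object sharing (illustrative: here `py_run_string()` stands in for a separate python chunk defining `x`):

```{r, echo=TRUE, class.source = "rchunk"}
library(reticulate)
py_run_string("x = 42") # stands in for a python chunk defining x
py$x                    # access the python object from R, with automatic conversion
```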
- <div class="green-large">Advantages</div>
* easy to use