Commit e282a3ac authored by Cyril Matthey-Doret's avatar Cyril Matthey-Doret
Browse files

add jupytext as a submodule

parent a773d46e
Pipeline #313791 passed with stage
in 13 seconds
[submodule "jupytext"]
path = jupytext
url = https://github.com/mwouts/jupytext
# In this notebook, we benchmark the Jupytext formats for Jupyter notebooks against the base format
# Open this script as a notebook in Jupyter to run it and see the plots
import time
import copy
import pandas as pd
import plotly.graph_objects as go
from plotly.colors import DEFAULT_PLOTLY_COLORS
import nbformat
import jupytext
# The notebook to be tested
notebook = jupytext.read('World population.ipynb')
# Same notebook, with no outputs, for a fair comparison.
# Only code cells carry `outputs` and `execution_count`, so markdown and raw
# cells are left untouched instead of receiving fields they should not have.
notebook_no_outputs = copy.deepcopy(notebook)
for cell in notebook_no_outputs.cells:
    if cell.cell_type == 'code':
        cell.outputs = []
        cell.execution_count = None
# +
JUPYTEXT_FORMATS = ['ipynb', 'md', 'py:light', 'py:percent', 'py:sphinx']

# Optional formats (pandoc first, then myst-parser) are only benchmarked when
# a trial conversion succeeds here; otherwise the error message is printed.
for optional_format in ('md:pandoc', 'myst'):
    try:
        jupytext.writes(notebook, fmt=optional_format)
    except jupytext.formats.JupytextFormatError as err:
        print(str(err))
    else:
        JUPYTEXT_FORMATS.append(optional_format)
# -
def sample_perf(nb, n=30):
    """Benchmark writing and reading ``nb`` with nbformat and Jupytext.

    For each of ``n`` samples and each implementation (``'nbformat'`` plus
    every entry of ``JUPYTEXT_FORMATS``), the notebook is serialized to text
    and that text is parsed back.

    Returns a DataFrame indexed by ``(sample, implementation)`` with the
    columns ``size`` (length of the serialized text), ``read`` and ``write``
    (elapsed seconds).
    """
    samples = pd.DataFrame(
        # Plain float NaN: the `pd.np` alias is no longer available in
        # recent pandas releases.
        float('nan'),
        index=pd.MultiIndex.from_product(
            (range(n), ['nbformat'] + JUPYTEXT_FORMATS), names=['sample', 'implementation']),
        columns=pd.Index(['size', 'read', 'write'], name='measure'))

    for i, fmt in samples.index:
        # perf_counter() is the clock meant for measuring short durations
        t0 = time.perf_counter()
        if fmt == 'nbformat':
            text = nbformat.writes(nb)
        else:
            text = jupytext.writes(nb, fmt)
        t1 = time.perf_counter()
        samples.loc[(i, fmt), 'write'] = t1 - t0
        samples.loc[(i, fmt), 'size'] = len(text)

        # Parse back the text that was just produced by the same implementation
        t0 = time.perf_counter()
        if fmt == 'nbformat':
            nbformat.reads(text, as_version=4)
        else:
            jupytext.reads(text, fmt)
        t1 = time.perf_counter()
        samples.loc[(i, fmt), 'read'] = t1 - t0

    return samples
def performance_plot(perf, title):
    """Bar charts of read time, write time and size per implementation.

    ``perf`` is a samples frame as returned by ``sample_perf``. Bars show the
    per-implementation mean; the standard deviation is drawn as error bars on
    the two timing charts (not on the size chart).
    """
    implementations = ['nbformat'] + JUPYTEXT_FORMATS
    grouped = perf.groupby('implementation')
    mean = grouped.mean().loc[implementations]
    std = grouped.std().loc[implementations]

    # Each measure is drawn against its own y-axis
    axis_of = {'read': 'y1', 'write': 'y2', 'size': 'y3'}

    data = []
    for col, color in zip(mean.columns, DEFAULT_PLOTLY_COLORS):
        if col != 'size':
            error_bars = dict(type='data',
                              array=std[col],
                              color=color,
                              thickness=0.5)
        else:
            error_bars = dict()
        data.append(go.Bar(x=mean.index,
                           y=mean[col],
                           error_y=error_bars,
                           name=col,
                           yaxis=axis_of[col]))

    layout = go.Layout(title=title,
                       xaxis=dict(title='Implementation', anchor='y3'),
                       yaxis=dict(domain=[0.7, 1], title='Read (secs)'),
                       yaxis2=dict(domain=[0.35, .65], title='Write (secs)'),
                       yaxis3=dict(domain=[0, .3], title='Size'))
    return go.Figure(data=data, layout=layout)
# Benchmark the copy of the notebook whose outputs were removed (30 samples)
perf_no_outputs = sample_perf(notebook_no_outputs, 30)
performance_plot(perf_no_outputs, 'Benchmarking Jupytext on the World Population notebook<br>(Outputs filtered)')
# Same benchmark on the notebook as read from disk, outputs included
perf = sample_perf(notebook, 30)
performance_plot(perf, 'Benchmarking Jupytext on the World Population notebook<br>(With outputs)')
# This is a notebook that I used to generate Jupytext's word cloud.
# To open this script as a notebook in JupyterLab, right-click on this file, and select _Open with/Notebook_.
from wordcloud import WordCloud
text = """
Jupytext
Notebook
JupyterLab
Git
GitHub
Version control
Markdown
R Markdown
Text
Scripts
Code
Notebook Template
Binder
Visual Studio Code
PyCharm
Atom
Spyder
Hydrogen
RStudio
Sphinx-Gallery
Documentation
black
pytest
autopep8
Metadata
Reproducible research
R
Julia
Python
Bash
Powershell
Scala
Scheme
Clojure
Matlab
Octave
C++
q/kdb+
IDL
TypeScript
Javascript
Scala
Rust
Robot Framework
"""
# Every non-blank line of `text` is one entry, all with the same weight.
# Blank lines are skipped: the triple-quoted string starts with a newline,
# which would otherwise add an empty-string "word" to the frequencies.
wordcloud = WordCloud(
    random_state=1,
    background_color='white',
    width=1200, height=500
).generate_from_frequencies(
    {word: 1 for word in text.splitlines() if word.strip()})
wordcloud.to_image()
wordcloud.to_file('../docs/jupytext_word_cloud.png')
# Testing a Jupyter notebook with pytest
In this notebook we describe how to test a notebook with `jupytext`.
## Writing assertions and tests in a notebook
Our notebook defines a function that we wish to test. Our function is simply
```python
def f(x, n=5):
return [x + i for i in range(n)]
```
We can check the result in Jupyter with a simple assertion:
```python
assert f(5) == [5,6,7,8,9]
```
Since the assertion above works, we don't get any message. It's more interesting to see what happens when an assertion fails. Remove one element of the list above and change the assertion to, say,
assert f(5) == [5,6,8,9]
When we run the above in Jupyter, we get
```stderr
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-3-1383ac5d204f> in <module>
----> 1 assert f(5) == [5,6,8,9]
AssertionError:
```
Now if we run the notebook with `jupytext --check pytest 'Tests in a notebook.md'`, we get a more detailed description of the issue, thanks to `pytest`'s rewriting of assertions:
```output
[jupytext] Reading Tests in a notebook.md
=========================== test session starts ===========================
platform win32 -- Python 3.7.5, pytest-5.2.2, py-1.8.0, pluggy-0.13.0
rootdir: C:\Users\Marco
collected 0 items / 1 errors
================================ ERRORS ===================================
_________ ERROR collecting Tests in a notebook vhs_lscr.py ________________
Tests in a notebook vhs_lscr.py:19: in <module>
assert f(5) == [5,6,8,9]
E assert [5, 6, 7, 8, 9] == [5, 6, 8, 9]
E + where [5, 6, 7, 8, 9] = <function f at 0x000002440A0D1798>(5)
!!!!!!!!!!!!! Interrupted: 1 errors during collection !!!!!!!!!!!!!!!!!!!!!
=========================== 1 error in 0.09s ==============================
```
Once all of our assertions pass, we can move them to a test function. In Jupyter, running the cell only defines the function; it is actually executed only when we run `jupytext --check pytest` on the notebook.
```python
def test_f():
assert f(5) == [5,6,7,8,9]
```
## Going further
- [nbval](https://github.com/computationalmodelling/nbval) is a plugin for `pytest` that allows you to make sure that Jupyter notebooks run properly, and that their new outputs match the current ones. Use it as `pytest --nbval notebook.ipynb`.
- [ipytest](https://github.com/chmp/ipytest) defines a `%%run_pytest` cell magic that allows you to execute the tests in a cell directly in Jupyter.
---
jupyter:
jupytext:
cell_markers: region,endregion
formats: ipynb,.pct.py:percent,.lgt.py:light,.spx.py:sphinx,md,Rmd,.pandoc.md:pandoc
text_representation:
extension: .Rmd
format_name: rmarkdown
format_version: '1.1'
jupytext_version: 1.1.0
kernelspec:
display_name: Python 3
language: python
name: python3
---
# A quick insight at world population
## Collecting population data
Below, we retrieve population data from the
[World Bank](http://www.worldbank.org/)
using the [wbdata](https://github.com/OliverSherouse/wbdata) python package
```{python}
import pandas as pd
import wbdata as wb
pd.options.display.max_rows = 6
pd.options.display.max_columns = 20
```
The corresponding indicator can be found with the search method - or, directly,
on the World Bank site.
```{python}
wb.search_indicators('Population, total') # SP.POP.TOTL
# wb.search_indicators('area')
# => https://data.worldbank.org/indicator is easier to use
```
Now we download the population data
```{python}
indicators = {'SP.POP.TOTL': 'Population, total',
'AG.SRF.TOTL.K2': 'Surface area (sq. km)',
'AG.LND.TOTL.K2': 'Land area (sq. km)',
'AG.LND.ARBL.ZS': 'Arable land (% of land area)'}
data = wb.get_dataframe(indicators, convert_date=True).sort_index()
data
```
World is one of the countries
```{python}
data.loc['World']
```
Can we classify over continents?
```{python}
data.loc[(slice(None), '2017-01-01'), :]['Population, total'].dropna(
).sort_values().tail(60).index.get_level_values('country')
```
Extract zones manually (in order of increasing population)
```{python}
zones = ['North America', 'Middle East & North Africa',
'Latin America & Caribbean', 'Europe & Central Asia',
'Sub-Saharan Africa', 'South Asia',
'East Asia & Pacific'][::-1]
```
And extract population information (and check total is right)
```{python}
population = data.loc[zones]['Population, total'].swaplevel().unstack()
population = population[zones]
assert all(data.loc['World']['Population, total'] == population.sum(axis=1))
```
## Stacked area plot with matplotlib
```{python}
import matplotlib.pyplot as plt
```
```{python}
plt.figure(figsize=(10, 5), dpi=100)
# No plt.clf() before plt.figure(): with no figure open, clf() would itself
# create an extra, empty figure.
plt.stackplot(population.index, population.values.T / 1e9)
plt.legend(population.columns, loc='upper left')
plt.ylabel('Population count (B)')
plt.show()
```
## Stacked bar plot with plotly
```{python}
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
```
```{python}
data = [go.Scatter(x=population.index, y=population[zone], name=zone, stackgroup='World')
for zone in zones]
fig = go.Figure(data=data,
layout=go.Layout(title='World population'))
offline.iplot(fig)
```
This diff is collapsed.
# ---
# jupyter:
# jupytext:
# cell_markers: region,endregion
# formats: ipynb,.pct.py:percent,.lgt.py:light,.spx.py:sphinx,md,Rmd,.pandoc.md:pandoc
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.4'
# jupytext_version: 1.1.0
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# # A quick insight at world population
#
# ## Collecting population data
#
# Below, we retrieve population data from the
# [World Bank](http://www.worldbank.org/)
# using the [wbdata](https://github.com/OliverSherouse/wbdata) python package
# region
import pandas as pd
import wbdata as wb
pd.options.display.max_rows = 6
pd.options.display.max_columns = 20
# endregion
# The corresponding indicator can be found with the search method - or, directly,
# on the World Bank site.
wb.search_indicators('Population, total') # SP.POP.TOTL
# wb.search_indicators('area')
# => https://data.worldbank.org/indicator is easier to use
# Now we download the population data
indicators = {'SP.POP.TOTL': 'Population, total',
'AG.SRF.TOTL.K2': 'Surface area (sq. km)',
'AG.LND.TOTL.K2': 'Land area (sq. km)',
'AG.LND.ARBL.ZS': 'Arable land (% of land area)'}
data = wb.get_dataframe(indicators, convert_date=True).sort_index()
data
# World is one of the countries
data.loc['World']
# Can we classify over continents?
data.loc[(slice(None), '2017-01-01'), :]['Population, total'].dropna(
).sort_values().tail(60).index.get_level_values('country')
# Extract zones manually (in order of increasing population)
zones = ['North America', 'Middle East & North Africa',
'Latin America & Caribbean', 'Europe & Central Asia',
'Sub-Saharan Africa', 'South Asia',
'East Asia & Pacific'][::-1]
# And extract population information (and check total is right)
population = data.loc[zones]['Population, total'].swaplevel().unstack()
population = population[zones]
assert all(data.loc['World']['Population, total'] == population.sum(axis=1))
# ## Stacked area plot with matplotlib
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 5), dpi=100)
# No plt.clf() before plt.figure(): with no figure open, clf() would itself
# create an extra, empty figure.
plt.stackplot(population.index, population.values.T / 1e9)
plt.legend(population.columns, loc='upper left')
plt.ylabel('Population count (B)')
plt.show()
# ## Stacked bar plot with plotly
# region
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
# endregion
data = [go.Scatter(x=population.index, y=population[zone], name=zone, stackgroup='World')
for zone in zones]
fig = go.Figure(data=data,
layout=go.Layout(title='World population'))
offline.iplot(fig)
---
jupyter:
jupytext:
cell_markers: region,endregion
formats: ipynb,.pct.py:percent,.lgt.py:light,.spx.py:sphinx,md,Rmd,.pandoc.md:pandoc
text_representation:
extension: .md
format_name: markdown
format_version: '1.1'
jupytext_version: 1.1.0
kernelspec:
display_name: Python 3
language: python
name: python3
---
# A quick insight at world population
## Collecting population data
Below, we retrieve population data from the
[World Bank](http://www.worldbank.org/)
using the [wbdata](https://github.com/OliverSherouse/wbdata) python package
```python
import pandas as pd
import wbdata as wb
pd.options.display.max_rows = 6
pd.options.display.max_columns = 20
```
The corresponding indicator can be found with the search method - or, directly,
on the World Bank site.
```python
wb.search_indicators('Population, total') # SP.POP.TOTL
# wb.search_indicators('area')
# => https://data.worldbank.org/indicator is easier to use
```
Now we download the population data
```python
indicators = {'SP.POP.TOTL': 'Population, total',
'AG.SRF.TOTL.K2': 'Surface area (sq. km)',
'AG.LND.TOTL.K2': 'Land area (sq. km)',
'AG.LND.ARBL.ZS': 'Arable land (% of land area)'}
data = wb.get_dataframe(indicators, convert_date=True).sort_index()
data
```
World is one of the countries
```python
data.loc['World']
```
Can we classify over continents?
```python
data.loc[(slice(None), '2017-01-01'), :]['Population, total'].dropna(
).sort_values().tail(60).index.get_level_values('country')
```
Extract zones manually (in order of increasing population)
```python
zones = ['North America', 'Middle East & North Africa',
'Latin America & Caribbean', 'Europe & Central Asia',
'Sub-Saharan Africa', 'South Asia',
'East Asia & Pacific'][::-1]
```
And extract population information (and check total is right)
```python
population = data.loc[zones]['Population, total'].swaplevel().unstack()
population = population[zones]
assert all(data.loc['World']['Population, total'] == population.sum(axis=1))
```
## Stacked area plot with matplotlib
```python
import matplotlib.pyplot as plt
```
```python
plt.figure(figsize=(10, 5), dpi=100)
# No plt.clf() before plt.figure(): with no figure open, clf() would itself
# create an extra, empty figure.
plt.stackplot(population.index, population.values.T / 1e9)
plt.legend(population.columns, loc='upper left')
plt.ylabel('Population count (B)')
plt.show()
```
## Stacked bar plot with plotly
```python
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
```
```python
data = [go.Scatter(x=population.index, y=population[zone], name=zone, stackgroup='World')
for zone in zones]
fig = go.Figure(data=data,
layout=go.Layout(title='World population'))
offline.iplot(fig)
```
---
jupytext:
formats: ipynb,.pct.py:percent,.lgt.py:light,.spx.py:sphinx,md,Rmd,.pandoc.md:pandoc,.myst.md:myst
text_representation:
extension: '.md'
format_name: myst
format_version: '0.7'
jupytext_version: 1.4.0+dev
kernelspec:
display_name: Python 3
language: python
name: python3
---
# A quick insight at world population
## Collecting population data
Below, we retrieve population data from the
[World Bank](http://www.worldbank.org/)
using the [wbdata](https://github.com/OliverSherouse/wbdata) python package
```{code-cell} ipython3
import pandas as pd
import wbdata as wb
pd.options.display.max_rows = 6
pd.options.display.max_columns = 20
```
The corresponding indicator can be found with the search method - or, directly,
on the World Bank site.
```{code-cell} ipython3
wb.search_indicators('Population, total') # SP.POP.TOTL
# wb.search_indicators('area')
# => https://data.worldbank.org/indicator is easier to use
```
Now we download the population data
```{code-cell} ipython3
indicators = {'SP.POP.TOTL': 'Population, total',
'AG.SRF.TOTL.K2': 'Surface area (sq. km)',
'AG.LND.TOTL.K2': 'Land area (sq. km)',
'AG.LND.ARBL.ZS': 'Arable land (% of land area)'}
data = wb.get_dataframe(indicators, convert_date=True).sort_index()
data
```
World is one of the countries
```{code-cell} ipython3
data.loc['World']
```
Can we classify over continents?
```{code-cell} ipython3
data.loc[(slice(None), '2017-01-01'), :]['Population, total'].dropna(
).sort_values().tail(60).index.get_level_values('country')
```