Commit b33110f1 authored by Aaron Spring's avatar Aaron Spring 🚼
Browse files

add year week as coord not replacing forecast_time

parent 3fd65723
Pipeline #231797 passed with stages
in 9 minutes and 13 seconds
%% Cell type:markdown id: tags:
# Create biweekly renku datasets from `climetlab-s2s-ai-challenge`
%% Cell type:markdown id: tags:
Goal:
- Create biweekly renku datasets from [`climetlab-s2s-ai-challenge`](https://github.com/ecmwf-lab/climetlab-s2s-ai-challenge).
- These renku datasets are then used in notebooks:
- `ML_train_and_predict.ipynb` to train the ML model and do ML-based predictions
- `RPSS_verification.ipynb` to calculate RPSS of the ML model
- `mean_bias_reduction.ipynb` to remove the mean bias
Requirements:
- [`climetlab`](https://github.com/ecmwf/climetlab)
- [`climetlab-s2s-ai-challenge`](https://github.com/ecmwf-lab/climetlab-s2s-ai-challenge)
- S2S and CPC observations uploaded on [European Weather Cloud (EWC)](https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-input/0.3.0/netcdf/index.html)
Output: [renku dataset](https://renku-python.readthedocs.io/en/latest/commands.html#module-renku.cli.dataset) `s2s-ai-challenge`
- observations
- deterministic:
- `hindcast-like-observations_2000-2019_biweekly_deterministic.zarr`
- `forecast-like-observations_2020_biweekly_deterministic.zarr`
- edges:
- `hindcast-like-observations_2000-2019_biweekly_tercile-edges.nc`
- probabilistic:
- `hindcast-like-observations_2000-2019_biweekly_terciled.zarr`
- `forecast-like-observations_2020_biweekly_terciled.nc`
- forecasts/hindcasts
- deterministic:
- `ecmwf_hindcast-input_2000-2019_biweekly_deterministic.zarr`
- `ecmwf_forecast-input_2020_biweekly_deterministic.zarr`
- more models could be added
- benchmark:
- probabilistic:
- `ecmwf_recalibrated_benchmark_2020_biweekly_terciled.nc`
%% Cell type:code id: tags:
``` python
# Imports: climetlab + its s2s-ai-challenge plugin provide the data access layer;
# xarray/xskillscore handle the n-dimensional processing and verification.
import matplotlib.pyplot as plt
import xarray as xr
import xskillscore as xs
import pandas as pd
import climetlab_s2s_ai_challenge
import climetlab as cml
# Print versions for reproducibility of the renku datasets created below.
print(f'Climetlab version : {cml.__version__}')
print(f'Climetlab-s2s-ai-challenge plugin version : {climetlab_s2s_ai_challenge.__version__}')
# Keep variable/dataset attrs through xarray operations (needed for CF metadata).
xr.set_options(keep_attrs=True)
xr.set_options(display_style='text')
```
%%%% Output: stream
Matplotlib is building the font cache; this may take a moment.
%%%% Output: stream
Climetlab version : 0.8.0
Climetlab-s2s-ai-challenge plugin version : 0.7.1
%%%% Output: execute_result
<xarray.core.options.set_options at 0x7f2a6e35bac0>
%% Cell type:code id: tags:
``` python
# caching path for climetlab: downloaded netcdf files are cached here so
# repeated cml.load_dataset calls do not re-download (~GBs of data)
cache_path = "/work/mh0727/m300524/S2S_AI/cache" # set your own path
cml.settings.set("cache-directory", cache_path)
```
%% Cell type:code id: tags:
``` python
cache_path = "../data"
```
%% Cell type:markdown id: tags:
# Download and cache
Download all files for the observations, forecast and hindcast.
%% Cell type:code id: tags:
``` python
# shortcut: scripts.download wraps the per-dataset cml.load_dataset calls below;
# uncomment to download everything in one go
from scripts import download
#download()
```
%% Cell type:markdown id: tags:
## hindcast and forecast `input`
%% Cell type:code id: tags:
``` python
# starting dates forecast_time in 2020: 53 weekly Thursdays-equivalent starts
# beginning 2020-01-02, formatted as YYYYMMDD strings for climetlab
dates = xr.cftime_range(start='20200102',freq='7D', periods=53).strftime('%Y%m%d').to_list()
forecast_dataset_labels = ['training-input','test-input'] # ML community
# equiv to (this second assignment intentionally replaces the ML-community labels)
forecast_dataset_labels = ['hindcast-input','forecast-input'] # NWP community
varlist_forecast = ['tp','t2m'] # can add more
center_list = ['ecmwf'] # 'ncep', 'eccc'
```
%% Cell type:code id: tags:
``` python
%%time
# takes ~ 10-30 min to download for one model one variable depending on number of model realizations
# and download settings https://climetlab.readthedocs.io/en/latest/guide/settings.html
for center in center_list:
for ds in forecast_dataset_labels:
cml.load_dataset(f"s2s-ai-challenge-{ds}", origin=center, parameter=varlist_forecast, format='netcdf').to_xarray()
```
%% Cell type:markdown id: tags:
## observations `output-reference`
%% Cell type:code id: tags:
``` python
obs_dataset_labels = ['training-output-reference','test-output-reference'] # ML community
# equiv to (this second assignment intentionally replaces the ML-community labels)
obs_dataset_labels = ['hindcast-like-observations','forecast-like-observations'] # NWP community
varlist_obs = ['tp', 't2m']
```
%% Cell type:code id: tags:
``` python
%%time
# takes 10min to download
for ds in obs_dataset_labels:
print(ds)
# only netcdf, no format choice
cml.load_dataset(f"s2s-ai-challenge-{ds}", date=dates, parameter=varlist_obs).to_xarray()
```
%% Cell type:code id: tags:
``` python
# download obs_time for to create output-reference/observations for other models than ecmwf and eccc,
# i.e. ncep or any S2S or Sub model
# NOTE(review): parameter here is ['t2m', 'pr'] while the rest of the notebook
# uses 'tp' for precipitation — presumably the raw observations dataset names
# precipitation 'pr'; verify against the climetlab plugin.
obs_time = cml.load_dataset(f"s2s-ai-challenge-observations", parameter=['t2m', 'pr']).to_xarray()
```
%% Cell type:markdown id: tags:
# create bi-weekly aggregates
%% Cell type:code id: tags:
``` python
# project helpers: aggregate_biweekly builds the week-3-4 / week-5-6 aggregates,
# ensure_attributes restores CF metadata after the aggregation
from scripts import aggregate_biweekly, ensure_attributes
#aggregate_biweekly??
```
%% Cell type:code id: tags:
``` python
# Build biweekly deterministic zarr stores for all centers, dataset labels and
# variables, then write them to {cache_path}.
# NOTE: indentation restored — the exported cell had lost all loop-body indents.
# BUG FIX: the original `elif 'observations':` tested a non-empty string literal
# (always True) instead of membership in `dsl`; fixed to `'observations' in dsl`.
for c, center in enumerate(center_list):  # forecast centers (could also take models)
    for dsl in obs_dataset_labels + forecast_dataset_labels:  # climetlab dataset labels
        for p, parameter in enumerate(varlist_forecast):  # variables
            if c != 0 and 'observation' in dsl:  # only do once for observations
                continue
            print(f"datasetlabel: {dsl}, center: {center}, parameter: {parameter}")
            if 'input' in dsl:
                ds = cml.load_dataset(f"s2s-ai-challenge-{dsl}", origin=center, parameter=parameter, format='netcdf').to_xarray()
            elif 'observation' in dsl:  # obs only netcdf, no choice
                if parameter not in ['t2m', 'tp']:
                    continue
                ds = cml.load_dataset(f"s2s-ai-challenge-{dsl}", parameter=parameter, date=dates).to_xarray()
            # first variable creates the dataset, later variables are merged in
            if p == 0:
                ds_biweekly = ds.map(aggregate_biweekly)
            else:
                ds_biweekly[parameter] = ds.map(aggregate_biweekly)[parameter]
            ds_biweekly = ds_biweekly.map(ensure_attributes, biweekly=True)
            ds_biweekly = ds_biweekly.sortby('forecast_time')
            # 2020 test data is small -> auto chunks; hindcasts get explicit chunks
            if 'test' in dsl:
                ds_biweekly = ds_biweekly.chunk('auto')
            else:
                ds_biweekly = ds_biweekly.chunk({'forecast_time': 'auto', 'lead_time': -1, 'longitude': -1, 'latitude': -1})
            # derive the {time} and {name} parts of the output filename
            if 'hindcast' in dsl:
                time = f'{int(ds_biweekly.forecast_time.dt.year.min())}-{int(ds_biweekly.forecast_time.dt.year.max())}'
                if 'input' in dsl:
                    name = f'{center}_{dsl}'
                elif 'observations' in dsl:
                    name = dsl
            elif 'forecast' in dsl:
                time = '2020'
                if 'input' in dsl:
                    name = f'{center}_{dsl}'
                elif 'observations' in dsl:
                    name = dsl
            else:
                assert False
            # pattern: {model_if_not_observations}{observations/forecast/hindcast}_{time}_biweekly_deterministic.zarr
            zp = f'{cache_path}/{name}_{time}_biweekly_deterministic.zarr'
            ds_biweekly.attrs.update({'postprocessed_by':'https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/blob/master/notebooks/renku_datasets_biweekly.ipynb'})
            print(f'save to: {zp}')
            ds_biweekly.astype('float32').to_zarr(zp, consolidated=True, mode='w')
```
%% Cell type:markdown id: tags:
## add to `renku` dataset `s2s-ai-challenge`
%% Cell type:code id: tags:
``` python
# observations as hindcast
# run renku commands from projects root directory only
# !renku dataset add s2s-ai-challenge data/hindcast-like-observations_2000-2019_biweekly_deterministic.zarr
```
%% Cell type:code id: tags:
``` python
# for further use retrieve from git lfs
# !renku storage pull ../data/hindcast-like-observations_2000-2019_biweekly_deterministic.zarr
```
%% Cell type:code id: tags:
``` python
# open the biweekly hindcast-like observations written above and report size
obs_2000_2019 = xr.open_zarr(f"{cache_path}/hindcast-like-observations_2000-2019_biweekly_deterministic.zarr", consolidated=True)
print(obs_2000_2019.sizes,'\n',obs_2000_2019.coords,'\n', obs_2000_2019.nbytes/1e6,'MB')
```
%%%% Output: stream
Frozen(SortedKeysDict({'forecast_time': 1060, 'latitude': 121, 'lead_time': 2, 'longitude': 240}))
Coordinates:
* forecast_time (forecast_time) datetime64[ns] 2000-01-02 ... 2019-12-31
* latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0
* lead_time (lead_time) timedelta64[ns] 14 days 28 days
* longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5
valid_time (lead_time, forecast_time) datetime64[ns] dask.array<chunksize=(2, 1060), meta=np.ndarray>
492.546744 MB
%% Cell type:code id: tags:
``` python
# observations as forecast
# run renku commands from projects root directory only
# !renku dataset add s2s-ai-challenge data/forecast-like-observations_2020_biweekly_deterministic.zarr
```
%% Cell type:code id: tags:
``` python
# open the biweekly forecast-like observations (2020) and report size
obs_2020 = xr.open_zarr(f"{cache_path}/forecast-like-observations_2020_biweekly_deterministic.zarr", consolidated=True)
print(obs_2020.sizes,'\n',obs_2020.coords,'\n', obs_2020.nbytes/1e6,'MB')
```
%%%% Output: stream
Frozen(SortedKeysDict({'forecast_time': 53, 'latitude': 121, 'lead_time': 2, 'longitude': 240}))
Coordinates:
* forecast_time (forecast_time) datetime64[ns] 2020-01-02 ... 2020-12-31
* latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0
* lead_time (lead_time) timedelta64[ns] 14 days 28 days
* longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5
valid_time (lead_time, forecast_time) datetime64[ns] dask.array<chunksize=(2, 53), meta=np.ndarray>
24.630096 MB
%% Cell type:code id: tags:
``` python
# ecmwf hindcast-input
# run renku commands from projects root directory only
# !renku dataset add s2s-ai-challenge data/ecmwf_hindcast-input_2000-2019_biweekly_deterministic.zarr
```
%% Cell type:code id: tags:
``` python
# open the biweekly ECMWF hindcast input (2000-2019) and report size
hind_2000_2019 = xr.open_zarr(f"{cache_path}/ecmwf_hindcast-input_2000-2019_biweekly_deterministic.zarr", consolidated=True)
print(hind_2000_2019.sizes,'\n',hind_2000_2019.coords,'\n', hind_2000_2019.nbytes/1e6,'MB')
```
%%%% Output: stream
Frozen(SortedKeysDict({'forecast_time': 1060, 'latitude': 121, 'lead_time': 2, 'longitude': 240, 'realization': 11}))
Coordinates:
* forecast_time (forecast_time) datetime64[ns] 2000-01-02 ... 2019-12-31
* latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0
* lead_time (lead_time) timedelta64[ns] 14 days 28 days
* longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5
* realization (realization) int64 0 1 2 3 4 5 6 7 8 9 10
valid_time (lead_time, forecast_time) datetime64[ns] dask.array<chunksize=(2, 1060), meta=np.ndarray>
5417.730832 MB
%% Cell type:code id: tags:
``` python
# ecmwf forecast-input
# run renku commands from projects root directory only
# !renku dataset add s2s-ai-challenge data/ecmwf_forecast-input_2020_biweekly_deterministic.zarr
```
%% Cell type:code id: tags:
``` python
# open the biweekly ECMWF forecast input (2020, 51 members) and report size
fct_2020 = xr.open_zarr(f"{cache_path}/ecmwf_forecast-input_2020_biweekly_deterministic.zarr", consolidated=True)
print(fct_2020.sizes,'\n',fct_2020.coords,'\n', fct_2020.nbytes/1e6,'MB')
```
%%%% Output: stream
Frozen(SortedKeysDict({'forecast_time': 53, 'latitude': 121, 'lead_time': 2, 'longitude': 240, 'realization': 51}))
Coordinates:
* forecast_time (forecast_time) datetime64[ns] 2020-01-02 ... 2020-12-31
* latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0
* lead_time (lead_time) timedelta64[ns] 14 days 28 days
* longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5
* realization (realization) int64 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50
valid_time (lead_time, forecast_time) datetime64[ns] dask.array<chunksize=(2, 53), meta=np.ndarray>
1255.926504 MB
%% Cell type:markdown id: tags:
# tercile edges
Create 2 tercile edges at 1/3 and 2/3 quantiles of the 2000-2019 biweekly distribution for each week of the year
%% Cell type:code id: tags:
``` python
!renku storage pull ../data/hindcast-like-observations_2000-2019_biweekly_deterministic.zarr
```
%%%% Output: stream
Warning: Run CLI commands only from project's root directory.

%% Cell type:code id: tags:
``` python
obs_2000_2019 = xr.open_zarr(f'{cache_path}/hindcast-like-observations_2000-2019_biweekly_deterministic.zarr', consolidated=True)
```
%% Cell type:code id: tags:
``` python
original_forecast_time = obs_2000_2019.forecast_time
```
%% Cell type:code id: tags:
``` python
def add_year_week_coords(ds):
    """Add `year` and `week` coordinates for groupby, keeping `forecast_time`.

    Adds two new dimension coordinates, ``year`` (each start year present in
    ``forecast_time``) and ``week`` (the forecast start number within a year),
    plus a 2-d ``year_week`` coordinate holding the original ``forecast_time``
    values reshaped to (year, week).

    BUG FIX: the original referenced undefined names ``y`` and ``w`` in
    ``assign_coords`` (NameError); they must be the ``year``/``week`` indexes
    computed above. The reshape was also hard-coded to (20, 53); it is now
    derived from the data, so the function works for other periods too
    (it still assumes every year contains the same number of starts).
    """
    year = ds.forecast_time.dt.year.to_index().unique()
    week = sorted(ds.forecast_time.dt.weekofyear.to_index().unique())
    # original forecast_time values arranged as (year, week) for the 2-d coord
    tmp = ds.forecast_time.values.reshape(len(year), len(week))
    ds = ds.assign_coords(year=year, week=week).stack(year_week=("year", "week")).unstack("year_week")
    ds.coords["year_week"] = (["year", "week"], tmp)
    ds.coords['week'].attrs['comment'] = "This week represents the number of forecast_time starting from 1 to 53. Note: This week is different from the ISO week from groupby('forecast_time.weekofyear'), see https://en.wikipedia.org/wiki/ISO_week_date and https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge/-/issues/29"
    return ds
```
%% Cell type:code id: tags:
``` python
obs_2000_2019 = add_year_week_coords(obs_2000_2019)
```
%% Cell type:code id: tags:
``` python
obs_2000_2019
```
%%%% Output: execute_result
<xarray.Dataset>
Dimensions: (forecast_time: 1060, latitude: 121, lead_time: 2, longitude: 240, week: 53, year: 20)
Coordinates:
* forecast_time (forecast_time) datetime64[ns] 2000-01-02 ... 2019-12-31
* latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0
* lead_time (lead_time) timedelta64[ns] 14 days 28 days
* longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5
valid_time (lead_time, forecast_time) datetime64[ns] dask.array<chunksize=(2, 1060), meta=np.ndarray>
* year (year) int64 2000 2001 2002 2003 2004 ... 2016 2017 2018 2019
* week (week) int64 1 2 3 4 5 6 7 8 9 ... 45 46 47 48 49 50 51 52 53
year_week (year, week) datetime64[ns] 2000-01-02 ... 2019-12-31
Data variables:
t2m (lead_time, forecast_time, latitude, longitude) float32 dask.array<chunksize=(2, 530, 121, 240), meta=np.ndarray>
tp (lead_time, forecast_time, latitude, longitude) float32 dask.array<chunksize=(2, 530, 121, 240), meta=np.ndarray>
Attributes:
created_by_script: tools/observations/makefile
created_by_software: climetlab-s2s-ai-challenge
function: climetlab_s2s_ai_challenge.extra.forecast_like_obse...
postprocessed_by: https://renkulab.io/gitlab/aaron.spring/s2s-ai-chal...
source_dataset_name: NOAA NCEP CPC UNIFIED_PRCP GAUGE_BASED GLOBAL v1p0 ...
source_hosting: IRIDL
source_url: http://iridl.ldeo.columbia.edu/SOURCES/.NOAA/.NCEP/...
%% Cell type:code id: tags:
``` python
tercile_file = f'{cache_path}/hindcast-like-observations_2000-2019_biweekly_tercile-edges.nc'
```
%% Cell type:code id: tags:
``` python
%%time
xr.open_zarr(f'{cache_path}/hindcast-like-observations_2000-2019_biweekly_deterministic.zarr',
consolidated=True).chunk({'forecast_time':-1,'longitude':'auto'}).groupby('forecast_time.weekofyear').quantile(q=[1./3.,2./3.], dim=['forecast_time']).rename({'quantile':'category_edge'}).astype('float32').to_netcdf(tercile_file)
obs_2000_2019.chunk({'forecast_time':-1,'longitude':'auto'}).groupby('week').quantile(q=[1./3.,2./3.], dim=['forecast_time']).rename({'quantile':'category_edge'}).astype('float32').to_netcdf(tercile_file)
```
%%%% Output: stream
/work/mh0727/m300524/conda-envs/s2s-ai/lib/python3.7/site-packages/xarray/core/accessor_dt.py:381: FutureWarning: dt.weekofyear and dt.week have been deprecated. Please use dt.isocalendar().week instead.
FutureWarning,
/work/mh0727/m300524/conda-envs/s2s-ai/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1390: RuntimeWarning: All-NaN slice encountered
overwrite_input, interpolation)
%%%% Output: stream
CPU times: user 21min 25s, sys: 9min 19s, total: 30min 45s
Wall time: 18min 4s
%% Cell type:code id: tags:
``` python
# load the tercile edges back for masking and categorization below
tercile_edges = xr.open_dataset(tercile_file)
tercile_edges
```
%%%% Output: execute_result
<xarray.Dataset>
Dimensions: (category_edge: 2, latitude: 121, lead_time: 2, longitude: 240, weekofyear: 53)
Coordinates:
* latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0
* lead_time (lead_time) timedelta64[ns] 14 days 28 days
* longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5
* category_edge (category_edge) float64 0.3333 0.6667
* weekofyear (weekofyear) int64 1 2 3 4 5 6 7 8 ... 47 48 49 50 51 52 53
Data variables:
t2m (weekofyear, category_edge, lead_time, latitude, longitude) float32 ...
tp (weekofyear, category_edge, lead_time, latitude, longitude) float32 ...
Attributes:
created_by_script: tools/observations/makefile
created_by_software: climetlab-s2s-ai-challenge
function: climetlab_s2s_ai_challenge.extra.forecast_like_obse...
postprocessed: by https://renkulab.io/gitlab/aaron.spring/s2s-ai-c...
source_dataset_name: NOAA NCEP CPC UNIFIED_PRCP GAUGE_BASED GLOBAL v1p0 ...
source_hosting: IRIDL
source_url: http://iridl.ldeo.columbia.edu/SOURCES/.NOAA/.NCEP/...
%% Cell type:code id: tags:
``` python
tercile_edges.nbytes*1e-6,'MB'
```
%%%% Output: execute_result
(49.255184, 'MB')
%% Cell type:code id: tags:
``` python
# run renku commands from projects root directory only
# tercile edges
#!renku dataset add s2s-ai-challenge data/hindcast-like-observations_2000-2019_biweekly_tercile-edges.nc
```
%% Cell type:code id: tags:
``` python
# to use retrieve from git lfs
#!renku storage pull ../data/hindcast-like-observations_2000-2019_biweekly_tercile-edges.nc
#xr.open_dataset("../data/hindcast-like-observations_2000-2019_biweekly_tercile-edges.nc")
```
%% Cell type:markdown id: tags:
# observations in categories
- counting how many deterministic forecasts realizations fall into each category, like counting rps
- categorize forecast-like-observations 2020 into categories
%% Cell type:code id: tags:
``` python
# re-open the 2020 forecast-like observations for categorization
obs_2020 = xr.open_zarr(f'{cache_path}/forecast-like-observations_2020_biweekly_deterministic.zarr', consolidated=True)
obs_2020.sizes
```
%%%% Output: execute_result
Frozen(SortedKeysDict({'forecast_time': 53, 'latitude': 121, 'lead_time': 2, 'longitude': 240}))
%% Cell type:code id: tags:
``` python
# create a mask for land grid: grid cells whose std over all times is NaN
# (i.e. ocean / missing in the CPC observations) become False
mask = obs_2020.std(['lead_time','forecast_time']).notnull()
```
%% Cell type:code id: tags:
``` python
# mask.to_array().plot(col='variable')
```
%% Cell type:code id: tags:
``` python
# total precipitation in arid regions are masked
# Frederic Vitart suggested by email: "Based on your map we could mask all the areas where the lower tercile boundary is lower than 0.1 mm"
# we are using a dry mask as in https://doi.org/10.1175/MWR-D-17-0092.1
# NOTE(review): th = 0.01 while the quoted suggestion says 0.1 mm — confirm
# the intended unit/threshold (tp here is in kg m-2 over the biweekly period).
th = 0.01
# keep grid cells whose lower (1/3) tercile edge for week 3-4 exceeds th
tp_arid_mask = tercile_edges.tp.isel(category_edge=0, lead_time=0, drop=True) > th
#tp_arid_mask.where(mask.tp).plot(col='forecast_time', col_wrap=4)
#plt.suptitle(f'dry mask: week 3-4 tp 1/3 category_edge > {th} kg m-2',y=1., x=.4)
#plt.savefig('dry_mask.png')
```
%% Cell type:code id: tags:
``` python
# look into tercile edges
```
%% Cell type:code id: tags:
``` python
#tercile_edges.isel(forecast_time=0)['tp'].plot(col='lead_time',row='category_edge', robust=True)
```
%% Cell type:code id: tags:
``` python
#tercile_edges.isel(forecast_time=[0,20],category_edge=1)['tp'].plot(col='lead_time', row='forecast_time', robust=True)
```
%% Cell type:code id: tags:
``` python
# tercile_edges.tp.mean(['forecast_time']).plot(col='lead_time',row='category_edge',vmax=.5)
```
%% Cell type:markdown id: tags:
## categorize observations
%% Cell type:markdown id: tags:
### forecast 2020
%% Cell type:code id: tags:
``` python
from scripts import make_probabilistic
```
%% Cell type:code id: tags:
``` python
obs_2020_p = make_probabilistic(obs_2020, tercile_edges, mask=mask)
```
%%%% Output: stream
/work/mh0727/m300524/conda-envs/s2s-ai/lib/python3.7/site-packages/xarray/core/accessor_dt.py:381: FutureWarning: dt.weekofyear and dt.week have been deprecated. Please use dt.isocalendar().week instead.
FutureWarning,
%% Cell type:code id: tags:
``` python
obs_2020_p.nbytes/1e6, 'MB'
```
%%%% Output: execute_result
(147.75984, 'MB')
%% Cell type:code id: tags:
``` python
obs_2020_p.astype('float32').to_netcdf(f'{cache_path}/forecast-like-observations_2020_biweekly_terciled.nc')
```
%%%% Output: stream
/work/mh0727/m300524/conda-envs/s2s-ai/lib/python3.7/site-packages/dask/array/numpy_compat.py:40: RuntimeWarning: invalid value encountered in true_divide
x = np.divide(x1, x2, out)
%% Cell type:code id: tags: