import xarray as xr
import pandas as pd
import numpy as np
import climetlab_s2s_ai_challenge
import climetlab as cml

cache_path = '../data'


def download(varlist_forecast=['tp','t2m'],
             center_list=['ecmwf'],
             forecast_dataset_labels=['hindcast-input','forecast-input'],
             obs_dataset_labels=['hindcast-like-observations','forecast-like-observations'],
             varlist_observations=['t2m','tp'],
             benchmark=True,
             format='netcdf'
            ):
    """Download forecast, observation and benchmark files with climetlab into the
    climetlab cache. Point the cache to cache_path first:
    cml.settings.set("cache-directory", cache_path)
    """
    if isinstance(center_list, str):
        center_list = [center_list]
    if isinstance(varlist_forecast, str):
        varlist_forecast = [varlist_forecast]

    dates = xr.cftime_range(start='20200102', freq='7D', periods=53).strftime('%Y%m%d').to_list()
    
    if forecast_dataset_labels:
        print(f'Downloading variables {varlist_forecast} from datasets {forecast_dataset_labels} of centers {center_list} in {format} format.')
        for center in center_list:
            for ds in forecast_dataset_labels:
                for parameter in varlist_forecast: 
                    try:
                        cml.load_dataset(f"s2s-ai-challenge-{ds}", origin=center, parameter=varlist_forecast, format=format).to_xarray()
                    except Exception:
                        pass  # some center/parameter combinations are not available; skip them
    if obs_dataset_labels:
        print(f'Downloading variables {varlist_observations} from datasets {obs_dataset_labels} in netcdf format. Additionally downloading the raw observations with a time dimension.')
        for ds in obs_dataset_labels:
            for parameter in varlist_observations:
                try:
                    cml.load_dataset(f"s2s-ai-challenge-{ds}", date=dates, parameter=parameter).to_xarray()
                except Exception:
                    pass  # some date/parameter combinations are not available; skip them
        # raw
        cml.load_dataset(f"s2s-ai-challenge-observations", parameter=varlist_observations).to_xarray()
    if benchmark:
        cml.load_dataset("s2s-ai-challenge-test-output-benchmark", parameter=['tp','t2m']).to_xarray()
    print('finished')
    return
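
# Example usage (a sketch; the argument values below are illustrative, see the
# parameter defaults above for the full download):
#
#   cml.settings.set("cache-directory", cache_path)
#   download(varlist_forecast=['t2m'],
#            center_list=['ecmwf'],
#            forecast_dataset_labels=['hindcast-input'],
#            obs_dataset_labels=None,
#            benchmark=False)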


def add_valid_time_from_forecast_reference_time_and_lead_time(forecast, init_dim='forecast_time'):
    """Creates valid_time(forecast_time, lead_time).
    
    lead_time: pd.Timedelta
    forecast_time: datetime
    """
    times = xr.concat(
        [
            xr.DataArray(
                forecast[init_dim] + lead,
                dims=init_dim,
                coords={init_dim: forecast[init_dim]},
            )
            for lead in forecast.lead_time
        ],
        dim="lead_time",
        join="inner",
        compat="broadcast_equals",
    )
    forecast = forecast.assign_coords(valid_time=times)
    return forecast
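
# Example (a sketch with toy coordinates): for forecast_time 2020-01-02 and
# lead_time 14 days, the added valid_time is 2020-01-16:
#
#   ds = xr.Dataset(coords={'forecast_time': pd.to_datetime(['2020-01-02']),
#                           'lead_time': [pd.Timedelta('14 d'), pd.Timedelta('28 d')]})
#   ds = add_valid_time_from_forecast_reference_time_and_lead_time(ds)
#   ds.valid_time  # 2d coordinate over (lead_time, forecast_time)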


def aggregate_biweekly(da):
    """
    Aggregate initialized S2S forecasts biweekly for xr.DataArrays.
    Use ds.map(aggregate_biweekly) for xr.Datasets.
    
    Applies to the ECMWF S2S data model: https://confluence.ecmwf.int/display/S2S/Parameters
    """
    # biweekly aggregation windows: week 3-4 covers days 14-27, week 5-6 covers days 28-41
    w34 = [pd.Timedelta(f'{i} d') for i in range(14, 28)]
    w34 = xr.DataArray(w34, dims='lead_time', coords={'lead_time': w34})

    w56 = [pd.Timedelta(f'{i} d') for i in range(28, 42)]
    w56 = xr.DataArray(w56, dims='lead_time', coords={'lead_time': w56})

    biweekly_lead = [pd.Timedelta(f"{i} d") for i in [14, 28]]  # first day of each biweekly aggregate becomes the new lead_time coordinate

    v = da.name
    if climetlab_s2s_ai_challenge.CF_CELL_METHODS[v] == 'sum':  # biweekly difference for accumulated variables: tp and ttr
        d34 = da.sel(lead_time=pd.Timedelta("28 d")) - da.sel(lead_time=pd.Timedelta("14 d"))  # tp accumulated from day 14 to day 27
        d56 = da.sel(lead_time=pd.Timedelta("42 d")) - da.sel(lead_time=pd.Timedelta("28 d"))  # tp accumulated from day 28 to day 41
        da_biweekly = xr.concat([d34, d56], 'lead_time').assign_coords(lead_time=biweekly_lead)
    else:  # instantaneous variables like t2m, see climetlab_s2s_ai_challenge.CF_CELL_METHODS: biweekly mean over [day 14, day 27] and [day 28, day 41]
        d34 = da.sel(lead_time=w34).mean('lead_time')
        d56 = da.sel(lead_time=w56).mean('lead_time')
        da_biweekly = xr.concat([d34, d56], 'lead_time').assign_coords(lead_time=biweekly_lead)
    
    da_biweekly = add_valid_time_from_forecast_reference_time_and_lead_time(da_biweekly)
    da_biweekly['lead_time'].attrs = {
        'standard_name': 'forecast_period',
        'long_name': 'lead time',
        'description': 'Forecast period is the time interval between the forecast reference time and the validity time.',
        'aggregate': 'The pd.Timedelta corresponds to the first day of a biweekly aggregate.',
        'week34_t2m': 'mean[day 14, 27]',
        'week56_t2m': 'mean[day 28, 41]',
        'week34_tp': 'day 28 minus day 14',
        'week56_tp': 'day 42 minus day 28'}
    return da_biweekly
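
# Example usage (a sketch; assumes `ds` is an S2S dataset with daily
# lead_time steps out to at least 42 days):
#
#   ds_biweekly = ds.map(aggregate_biweekly)
#   ds_biweekly.lead_time  # [Timedelta('14 days'), Timedelta('28 days')]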


def ensure_attributes(da, biweekly=False):
    """Ensure that coordinates and variables have proper attributes. Set biweekly=True to add special comments for the biweekly aggregates."""
    template = cml.load_dataset('s2s-ai-challenge-test-input',parameter='t2m', origin='ecmwf', format='netcdf', date='20200102').to_xarray()
    for c in da.coords:
        if c in template.coords:
            da.coords[c].attrs.update(template.coords[c].attrs)
    
    if 'valid_time' in da.coords:
        da['valid_time'].attrs.update({'long_name': 'validity time',
                                     'standard_name': 'time',
                                     'description': 'time for which the forecast is valid',
                                     'calculate':'forecast_time + lead_time'})
    if 'forecast_time' in da.coords:
        da['forecast_time'].attrs.update({'long_name' : 'initial time of forecast', 'standard_name': 'forecast_reference_time',
                                      'description':'The forecast reference time in NWP is the "data time", the time of the analysis from which the forecast was made. It is not the time for which the forecast is valid.'})
    # fix tp
    if da.name == 'tp':
        da.attrs['units'] = 'kg m-2'
    if biweekly:
        da['lead_time'].attrs.update({'standard_name':'forecast_period', 'long_name': 'lead time',
                                      'description': 'Forecast period is the time interval between the forecast reference time and the validity time.',
                         'aggregate': 'The pd.Timedelta corresponds to the first day of a biweekly aggregate.',
                         'week34_t2m': 'mean[14 days, 27 days]',
                         'week56_t2m': 'mean[28 days, 41 days]',
                         'week34_tp': '28 days minus 14 days',
                         'week56_tp': '42 days minus 28 days'})
        if da.name == 'tp':
            da.attrs.update({'aggregate_week34': '28 days minus 14 days',
                      'aggregate_week56': '42 days minus 28 days',
                      'description': 'https://confluence.ecmwf.int/display/S2S/S2S+Total+Precipitation'})
        if da.name == 't2m':
            da.attrs.update({'aggregate_week34': 'mean[14 days, 27 days]',
                      'aggregate_week56': 'mean[28 days, 41 days]',
                      'variable_before_categorization': 'https://confluence.ecmwf.int/display/S2S/S2S+Surface+Air+Temperature'})
    return da
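
# Example usage (a sketch, chaining both mapping helpers):
#
#   ds_biweekly = ds.map(aggregate_biweekly).map(ensure_attributes, biweekly=True)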


def add_year_week_coords(ds):
    if 'week' not in ds.coords and 'year' not in ds.coords:
        year = ds.forecast_time.dt.year.to_index().unique()
        week = list(np.arange(1, 54))
        weeks = week * len(year)
        years = np.repeat(year, len(week))
        ds.coords["week"] = ("forecast_time", weeks)
        ds.coords['week'].attrs['description'] = "This week represents the number of forecast_time starting from 1 to 53. Note: This week is different from the ISO week from groupby('forecast_time.weekofyear'), see https://en.wikipedia.org/wiki/ISO_week_date and https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge/-/issues/29"
        ds.coords["year"] = ("forecast_time", years)
        ds.coords['year'].attrs['long_name'] = "calendar year"
    return ds
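
# Example (a sketch; assumes forecast_time holds 53 weekly starts per year;
# the `realization` member dimension below is an assumption from this repo's
# data model):
#
#   ds = add_year_week_coords(ds)
#   tercile_edges = ds.groupby('week').quantile(
#       q=[1/3, 2/3], dim=('forecast_time', 'realization')
#   ).rename({'quantile': 'category_edge'})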


def make_probabilistic(ds, tercile_edges, member_dim='realization', mask=None, groupby_coord='week'):
    """Compute probabilities from ds (observations or forecasts) based on tercile_edges."""
    # broadcast tercile_edges along forecast_time via the shared week coordinate
    ds = add_year_week_coords(ds)
    tercile_edges = tercile_edges.sel({groupby_coord: ds.coords[groupby_coord]})
    bn = ds < tercile_edges.isel(category_edge=0, drop=True)  # below normal
    n = (ds >= tercile_edges.isel(category_edge=0, drop=True)) & (ds < tercile_edges.isel(category_edge=1, drop=True))  # normal
    an = ds >= tercile_edges.isel(category_edge=1, drop=True)  # above normal
    if member_dim in ds.dims:
        bn = bn.mean(member_dim)
        an = an.mean(member_dim)
        n = n.mean(member_dim)
    ds_p = xr.concat([bn, n, an],'category').assign_coords(category=['below normal', 'near normal', 'above normal'])
    if mask is not None:
        ds_p = ds_p.where(mask)
    if 'tp' in ds_p.data_vars:
        # mask arid grid cells where category_edge are too close to 0
        # we are using a dry mask as in https://doi.org/10.1175/MWR-D-17-0092.1
        tp_arid_mask = tercile_edges.tp.isel(category_edge=0, lead_time=0, drop=True) > 0.01
        ds_p['tp'] = ds_p['tp'].where(tp_arid_mask)
    ds_p['category'].attrs = {'long_name': 'tercile category probabilities', 'units': '1',
                              'description': 'Probabilities for three tercile categories. All three tercile category probabilities must add up to 1.'}
    if 'tp' in ds_p.data_vars:
        ds_p['tp'].attrs = {'long_name': 'Probability of total precipitation in tercile categories', 'units': '1',
                            'comment': 'All three tercile category probabilities must add up to 1.',
                            'variable_before_categorization': 'https://confluence.ecmwf.int/display/S2S/S2S+Total+Precipitation'}
    if 't2m' in ds_p.data_vars:
        ds_p['t2m'].attrs = {'long_name': 'Probability of 2m temperature in tercile categories', 'units': '1',
                             'comment': 'All three tercile category probabilities must add up to 1.',
                             'variable_before_categorization': 'https://confluence.ecmwf.int/display/S2S/S2S+Surface+Air+Temperature'}
    if 'year' in ds_p.coords:
        del ds_p.coords['year']
    if groupby_coord in ds_p.coords:
        ds_p = ds_p.drop_vars(groupby_coord)
    return ds_p
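
# Example usage (a sketch; `tercile_edges` computed as above with a
# category_edge dimension of size 2; `mask` is an optional, hypothetical
# land-sea mask):
#
#   fct_p = make_probabilistic(fct, tercile_edges, mask=mask)
#   fct_p.category  # ['below normal', 'near normal', 'above normal'], summing to 1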


def skill_by_year(preds, adapt=False):
    """Returns pd.Dataframe of RPSS per year."""
    # similar to verification_RPSS.ipynb and the scorer bot,
    # but returns a score for each year
    import xskillscore as xs
    xr.set_options(keep_attrs=True)

    # from the repository root, fetch the observation files first:
    # renku storage pull data/forecast-like-observations_2020_biweekly_terciled.nc
    # renku storage pull data/hindcast-like-observations_2000-2019_biweekly_terciled.nc
    cache_path = '../data'
    if 2020 in preds.forecast_time.dt.year:
        obs_p = xr.open_dataset(f'{cache_path}/forecast-like-observations_2020_biweekly_terciled.nc').sel(forecast_time=preds.forecast_time)
    else:
        obs_p = xr.open_dataset(f'{cache_path}/hindcast-like-observations_2000-2019_biweekly_terciled.zarr', engine='zarr').sel(forecast_time=preds.forecast_time)
    
    # ML probabilities
    fct_p = preds

    
    # climatology
    clim_p = xr.DataArray([1/3, 1/3, 1/3], dims='category', coords={'category':['below normal', 'near normal', 'above normal']}).to_dataset(name='tp')
    clim_p['t2m'] = clim_p['tp']
    
    if adapt:
        # select only obs_p where fct_p forecasts provided
        for c in ['longitude', 'latitude', 'forecast_time', 'lead_time']:
            obs_p = obs_p.sel({c:fct_p[c]})
        obs_p = obs_p[list(fct_p.data_vars)]
        clim_p = clim_p[list(fct_p.data_vars)]
    
    else:
        # check inputs
        assert_predictions_2020(obs_p)
        assert_predictions_2020(fct_p)
    
    ## RPSS
    # rps_ML
    rps_ML = xs.rps(obs_p, fct_p, category_edges=None, dim=[], input_distributions='p').compute()
    # rps_clim
    rps_clim = xs.rps(obs_p, clim_p, category_edges=None, dim=[], input_distributions='p').compute()
    
    # rpss
    rpss = 1 - (rps_ML / rps_clim)
    
    # penalize, see https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/issues/7
    penalize = obs_p.where(fct_p != 1, other=-10).mean('category')
    rpss = rpss.where(penalize != 0, other=-10)

    # clip
    rpss = rpss.clip(-10, 1)

    # average over all forecasts
    rpss = rpss.groupby('forecast_time.year').mean()
    
    # weighted area mean
    weights = np.cos(np.deg2rad(np.abs(rpss.latitude)))
    # spatially weighted score averaged over lead_times and variables to one single value
    scores = rpss.sel(latitude=slice(None, -60)).weighted(weights).mean('latitude').mean('longitude')
    scores = scores.to_array().mean(['lead_time', 'variable'])
    return scores.to_dataframe('RPSS')
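
# Example usage (a sketch; assumes `preds` holds terciled probabilistic
# predictions matching the pulled observation files):
#
#   skill_by_year(preds)              # strict checks against the 2020 layout
#   skill_by_year(preds, adapt=True)  # subset observations to preds' coords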


def assert_predictions_2020(preds_test, exclude='weekofyear'):
    """Check the variables, coordinates and dimensions of 2020 predictions."""
    from xarray.testing import assert_equal  # doesn't care about attrs but checks coords

    if isinstance(exclude, str):
        exclude = [exclude]
    # is dataset
    assert isinstance(preds_test, xr.Dataset)

    # has both vars: tp and t2m
    if 'data_vars' not in exclude:
        assert 'tp' in preds_test.data_vars
        assert 't2m' in preds_test.data_vars
    
    ## coords
    # ignore weekofyear coord if not dim
    if 'weekofyear' in exclude and 'weekofyear' in preds_test.coords and 'weekofyear' not in preds_test.dims:
        preds_test = preds_test.drop_vars('weekofyear')
    
    # forecast_time
    if 'forecast_time' not in exclude:
        d = pd.date_range(start='2020-01-02', freq='7D', periods=53)
        forecast_time = xr.DataArray(d, dims='forecast_time', coords={'forecast_time': d}, name='forecast_time')
        assert_equal(forecast_time, preds_test['forecast_time'])

    # longitude
    if 'longitude' not in exclude:
        lon = np.arange(0., 360., 1.5)
        longitude = xr.DataArray(lon, dims='longitude', coords={'longitude': lon}, name='longitude')
        assert_equal(longitude, preds_test['longitude'])

    # latitude
    if 'latitude' not in exclude:
        lat = np.arange(-90., 90.1, 1.5)[::-1]
        latitude = xr.DataArray(lat, dims='latitude', coords={'latitude': lat}, name='latitude')
        assert_equal(latitude, preds_test['latitude'])
    
    # lead_time
    if 'lead_time' not in exclude:
        lead = [pd.Timedelta(f'{i} d') for i in [14, 28]]
        lead_time = xr.DataArray(lead, dims='lead_time', coords={'lead_time': lead}, name='lead_time')
        assert_equal(lead_time, preds_test['lead_time'])
    
    # category
    if 'category' not in exclude:
        cat = np.array(['below normal', 'near normal', 'above normal'], dtype='<U12')
        category = xr.DataArray(cat, dims='category', coords={'category': cat}, name='category')
        assert_equal(category, preds_test['category'])
    
    # size
    if 'size' not in exclude:
        from dask.utils import format_bytes
        size_in_MB = float(format_bytes(preds_test.nbytes).split(' ')[0])
        # todo: refine for dtypes
        assert size_in_MB > 50
        assert size_in_MB < 250
    
    # no other dims
    if 'dims' not in exclude:
        assert set(preds_test.dims) - {'category', 'forecast_time', 'latitude', 'lead_time', 'longitude'} == set()
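
# Example usage (a sketch):
#
#   assert_predictions_2020(preds_test)  # all checks, ignoring the weekofyear coord
#   assert_predictions_2020(preds_test, exclude=['weekofyear', 'size'])  # also skip size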