1"""Collection of stastical methods: Transformation and Skill Scores."""
3from scipy import stats
5__author__ = 'Lukas Leufen, Felix Kleinert'
6__date__ = '2019-10-23'
8import numpy as np
9import xarray as xr
10import pandas as pd
11from typing import Union, Tuple, Dict, List
12import itertools
13from collections import OrderedDict
14from mlair.helpers import to_list
15from mlair.helpers.helpers import squeeze_coords
18Data = Union[xr.DataArray, pd.DataFrame]
def apply_inverse_transformation(data: Data, method: str = "standardise", mean: Data = None, std: Data = None,
                                 max: Data = None, min: Data = None, feature_range: Data = None) -> Data:
    """
    Apply inverse transformation for given statistics.

    :param data: transform this data back
    :param method: transformation method (optional)
    :param mean: mean of transformation (optional)
    :param std: standard deviation of transformation (optional)
    :param max: maximum value for min/max transformation (optional)
    :param min: minimum value for min/max transformation (optional)
    :param feature_range: target interval of min/max transformation (optional)

    :return: inverse transformed data
    """
    if method == 'standardise':  # pragma: no branch
        return standardise_inverse(data, mean, std)
    elif method == 'centre':  # pragma: no branch
        return centre_inverse(data, mean)
    elif method == 'min_max':  # pragma: no branch
        return min_max_inverse(data, min, max, feature_range)
    elif method == "log":
        return log_inverse(data, mean, std)
    else:
        raise NotImplementedError
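

# Hedged usage sketch (added for illustration, not part of the original API): the opts
# dictionary returned by the transformation functions below matches the keyword
# arguments of `apply_inverse_transformation`, so a round trip can be written as
# follows. Data values are illustrative only.
def _example_apply_inverse_transformation():
    data = xr.DataArray(np.arange(10, dtype=float), dims=["index"])
    transformed, opts = standardise(data, "index")  # opts: mean, std, method
    restored = apply_inverse_transformation(transformed, **opts)
    assert np.allclose(data.values, restored.values)

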
def standardise(data: Data, dim: Union[str, int]) -> Tuple[Data, Dict[str, Data]]:
    """
    Standardise an xarray.DataArray (along dim) or pandas.DataFrame (along axis) to mean=0 and std=1.

    :param data: data to standardise
    :param dim: name (xarray) or axis (pandas) of dimension which should be standardised
    :return: standardised data, and dictionary with keys method, mean, and standard deviation
    """
    mean, std = data.mean(dim), data.std(dim)
    return (data - mean) / std, {"mean": mean, "std": std, "method": "standardise"}


def standardise_inverse(data: Data, mean: Data, std: Data) -> Data:
    """
    Apply the inverse of `standardise` on data, thereby reversing the standardisation.

    :param data: standardised data
    :param mean: mean of standardisation
    :param std: standard deviation of standardisation

    :return: inverse standardised data
    """
    return data * std + mean


def standardise_apply(data: Data, mean: Data, std: Data) -> Data:
    """
    Apply `standardise` on data using given mean and std.

    :param data: data to transform
    :param mean: mean to use for transformation
    :param std: standard deviation for transformation

    :return: transformed data
    """
    return (data - mean) / std
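

# Hedged sketch (illustrative): the *_apply variants reuse statistics computed on one
# data set (e.g. training data) to transform another (e.g. test data), so both share
# the same scaling. Variable names are assumptions.
def _example_standardise_apply():
    train = xr.DataArray(np.random.randn(100), dims=["index"])
    test = xr.DataArray(np.random.randn(20), dims=["index"])
    _, opts = standardise(train, "index")
    return standardise_apply(test, opts["mean"], opts["std"])

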
def centre(data: Data, dim: Union[str, int]) -> Tuple[Data, Dict[str, Data]]:
    """
    Centre an xarray.DataArray (along dim) or pandas.DataFrame (along axis) to mean=0.

    :param data: data to centre
    :param dim: name (xarray) or axis (pandas) of dimension which should be centred

    :return: centred data, and dictionary with keys method and mean
    """
    mean = data.mean(dim)
    return data - mean, {"mean": mean, "method": "centre"}


def centre_inverse(data: Data, mean: Data) -> Data:
    """
    Apply the inverse of `centre`, i.e. add the given mean back to the data.

    :param data: data to apply inverse centring on
    :param mean: mean to use for inverse transformation

    :return: data with centring reversed
    """
    return data + mean


def centre_apply(data: Data, mean: Data) -> Data:
    """
    Apply `centre` on data using given mean.

    :param data: data to transform
    :param mean: mean to use for transformation

    :return: transformed data
    """
    return data - mean


def min_max(data: Data, dim: Union[str, int], feature_range: Tuple = (0, 1)) -> Tuple[Data, Dict[str, Data]]:
    """
    Apply min/max scaling using (x - x_min) / (x_max - x_min). By default, the returned data lie in [0, 1].

    :param data: data to transform
    :param dim: name (xarray) or axis (pandas) of dimension which should be scaled
    :param feature_range: scale data to any interval given in feature_range. Default is scaling to interval [0, 1].
    :return: transformed data, and dictionary with keys method, min, max, and feature_range
    """
    d_max = data.max(dim)
    d_min = data.min(dim)
    d_scaled = (data - d_min) / (d_max - d_min) * (max(feature_range) - min(feature_range)) + min(feature_range)
    return d_scaled, {"min": d_min, "max": d_max, "method": "min_max", "feature_range": feature_range}


def min_max_inverse(data: Data, _min: Data, _max: Data, feature_range: Tuple = (0, 1)) -> Data:
    """
    Apply inverse transformation of `min_max` scaling.

    :param data: data to apply inverse scaling on
    :param _min: minimum value to use for min/max scaling
    :param _max: maximum value to use for min/max scaling
    :param feature_range: scale data to any interval given in feature_range. Default is scaling to interval [0, 1].
    :return: inverted min/max scaled data
    """
    return (data - min(feature_range)) / (max(feature_range) - min(feature_range)) * (_max - _min) + _min


def min_max_apply(data: Data, _min: Data, _max: Data, feature_range: Data = (0, 1)) -> Data:
    """
    Apply `min_max` scaling with given minimum and maximum.

    :param data: data to apply scaling on
    :param _min: minimum value to use for min/max scaling
    :param _max: maximum value to use for min/max scaling
    :param feature_range: scale data to any interval given in feature_range. Default is scaling to interval [0, 1].
    :return: min/max scaled data
    """
    if not isinstance(feature_range, xr.DataArray):
        return (data - _min) / (_max - _min) * (max(feature_range) - min(feature_range)) + min(feature_range)
    else:
        return (data - _min) / (_max - _min) * (feature_range.max() - feature_range.min()) + feature_range.min()
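

# Hedged sketch (illustrative): min/max scaling to a custom feature range and back.
# With feature_range=(-1, 1), the example data [0, 5, 10] are mapped to [-1, 0, 1];
# `min_max_inverse` restores the original values from the returned opts.
def _example_min_max():
    data = xr.DataArray(np.array([0.0, 5.0, 10.0]), dims=["index"])
    scaled, opts = min_max(data, "index", feature_range=(-1, 1))
    restored = min_max_inverse(scaled, opts["min"], opts["max"], opts["feature_range"])
    assert np.allclose(data.values, restored.values)

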
def log(data: Data, dim: Union[str, int]) -> Tuple[Data, Dict[str, Data]]:
    """
    Apply logarithmic transformation (followed by standardisation). This method first transforms the data with the
    logarithm and then additionally applies the `standardise` method. Numpy's log1p (`res = log(1 + x)`) is used
    instead of the pure logarithm so that values of 0 can be handled, too.

    :param data: transform this data
    :param dim: name (xarray) or axis (pandas) of dimension which should be transformed
    :return: transformed data, and option dictionary with keys method, mean, and std
    """
    transformed_standardized, opts = standardise(np.log1p(data), dim)
    opts.update({"method": "log"})
    return transformed_standardized, opts


def log_inverse(data: Data, mean: Data, std: Data) -> Data:
    """
    Apply inverse log transformation (i.e. an exponential transformation). Because `log` uses `np.log1p`, this
    method is based on the equivalent inverse `np.expm1`. Data are first rescaled using `standardise_inverse` and
    then passed to the exponential function.

    :param data: apply inverse log transformation on this data
    :param mean: mean of the standardisation
    :param std: std of the standardisation
    :return: inverted data
    """
    data_rescaled = standardise_inverse(data, mean, std)
    return np.expm1(data_rescaled)


def log_apply(data: Data, mean: Data, std: Data) -> Data:
    """
    Apply numpy's log1p on given data. Further information can be found in the description of the `log` method.

    :param data: transform this data
    :param mean: mean of the standardisation
    :param std: std of the standardisation
    :return: transformed data
    """
    return standardise_apply(np.log1p(data), mean, std)
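

# Hedged sketch (illustrative): the log transformation composes log1p and
# standardisation, and `apply_inverse_transformation` (method "log") undoes both steps
# via expm1, so zeros in the input survive the round trip.
def _example_log():
    data = xr.DataArray(np.array([0.0, 1.0, 10.0, 100.0]), dims=["index"])
    transformed, opts = log(data, "index")  # opts: mean, std, method="log"
    restored = apply_inverse_transformation(transformed, **opts)
    assert np.allclose(data.values, restored.values)

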
def mean_squared_error(a, b, dim=None):
    """Calculate mean squared error."""
    return np.square(a - b).mean(dim)


def mean_absolute_error(a, b, dim=None):
    """Calculate mean absolute error."""
    return np.abs(a - b).mean(dim)


def mean_error(a, b, dim=None):
    """Calculate mean error where a is the forecast and b the reference (e.g. observation)."""
    return a.mean(dim) - b.mean(dim)


def index_of_agreement(a, b, dim=None):
    """Calculate index of agreement (IOA) where a is the forecast and b the reference (e.g. observation)."""
    num = (np.square(b - a)).sum(dim)
    b_mean = (b * np.ones(1)).mean(dim)
    den = (np.square(np.abs(b - b_mean) + np.abs(a - b_mean))).sum(dim)
    frac = num / den
    # issue with 0/0 division for exactly equal arrays
    if isinstance(frac, (int, float)):
        frac = 0 if num == 0 else frac
    else:
        frac[num == 0] = 0
    return 1 - frac
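

# Hedged sketch (illustrative shapes): the IOA lies in [0, 1], with 1 indicating
# perfect agreement between forecast a and reference b. A 2-d example keeps one value
# per remaining dimension after reducing along "index".
def _example_index_of_agreement():
    obs = xr.DataArray(np.random.rand(50, 2), dims=["index", "ahead"])
    forecast = obs + 0.1 * np.random.randn(50, 2)
    return index_of_agreement(forecast, obs, dim="index")  # one IOA per "ahead"

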
def modified_normalized_mean_bias(a, b, dim=None):
    """Calculate modified normalized mean bias (MNMB) where a is the forecast and b the reference (e.g. observation)."""
    N = np.count_nonzero(a) if len(a.shape) == 1 else a.notnull().sum(dim)
    return 2 * ((a - b) / (a + b)).sum(dim) / N


def calculate_error_metrics(a, b, dim):
    """
    Calculate MSE, ME, RMSE, MAE, IOA, and MNMB. Additionally, return the number of values used for calculation.

    :param a: forecast data to calculate metrics for
    :param b: reference (e.g. observation)
    :param dim: dimension to calculate metrics along

    :returns: dict with results for all metrics indicated by lowercase metric short name
    """
    mse = mean_squared_error(a, b, dim)
    me = mean_error(a, b, dim)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(a, b, dim)
    ioa = index_of_agreement(a, b, dim)
    mnmb = modified_normalized_mean_bias(a, b, dim)
    n = (a - b).notnull().sum(dim)
    results = {"mse": mse, "me": me, "rmse": rmse, "mae": mae, "ioa": ioa, "mnmb": mnmb, "n": n}
    return {k: squeeze_coords(v) for k, v in results.items()}
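

# Hedged usage sketch (illustrative shapes and names): all metrics are reduced along
# the given dimension, so an ("index", "ahead") input yields one value per forecast
# step, plus the sample count under key "n".
def _example_calculate_error_metrics():
    obs = xr.DataArray(np.random.rand(100, 3), dims=["index", "ahead"])
    forecast = obs + 0.05 * np.random.randn(100, 3)
    return calculate_error_metrics(forecast, obs, dim="index")  # keys: mse, me, rmse, mae, ioa, mnmb, n

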
def get_error_metrics_units(base_unit):
    """Return the unit for each error metric, derived from a given base unit (None for unitless metrics)."""
    return {"mse": f"{base_unit}$^2$", "me": base_unit, "rmse": base_unit, "mae": base_unit, "ioa": None, "mnmb": None,
            "n": None}


def get_error_metrics_long_name():
    """Return the long name for each error metric short name."""
    return {"mse": "mean squared error", "me": "mean error", "rmse": "root mean squared error",
            "mae": "mean absolute error", "ioa": "index of agreement", "mnmb": "modified normalized mean bias",
            "n": "count"}


def mann_whitney_u_test(data: pd.DataFrame, reference_col_name: str, **kwargs):
    """
    Calculate the Mann-Whitney U test. Uses pandas' .apply() on scipy.stats.mannwhitneyu(x, y, ...).

    :param data: data frame whose columns are each compared against the reference column (x)
    :type data: pd.DataFrame
    :param reference_col_name: name of column which is used for comparison (y)
    :type reference_col_name: str
    :param kwargs: additional keyword arguments passed on to scipy.stats.mannwhitneyu
    :return: test statistics and p-values, one column per compared column
    :rtype: pd.DataFrame
    """
    res = data.apply(stats.mannwhitneyu, y=data[reference_col_name], **kwargs)
    res = res.rename(index={0: "statistics", 1: "p-value"})
    return res
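

# Hedged usage sketch (column names are assumptions): each column of the frame is
# tested against the reference column; the result has one column per input column with
# rows "statistics" and "p-value".
def _example_mann_whitney_u_test():
    df = pd.DataFrame({"model_a": np.random.rand(30), "model_b": np.random.rand(30),
                       "reference": np.random.rand(30)})
    return mann_whitney_u_test(df, reference_col_name="reference")

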
def represent_p_values_as_asteriks(p_values: pd.Series, threshold_representation: OrderedDict = None):
    """
    Represent p-values as asterisks based on their values.

    :param p_values: p-values to represent
    :type p_values: pd.Series
    :param threshold_representation: mapping from thresholds to labels, in strictly decreasing key order
        (default: "ns", "*", "**", "***" for 1, 0.05, 0.01, 0.001)
    :type threshold_representation: OrderedDict
    :return: labels for all given p-values
    :rtype: pd.Series
    """
    if threshold_representation is None:
        threshold_representation = OrderedDict([(1, "ns"), (0.05, "*"), (0.01, "**"), (0.001, "***")])

    if not all(x > y for x, y in zip(list(threshold_representation.keys()), list(threshold_representation.keys())[1:])):
        raise ValueError(
            f"`threshold_representation` keys must be in strictly "
            f"decreasing order but is: {threshold_representation.keys()}")

    asteriks = pd.Series().reindex_like(p_values)
    for k, v in threshold_representation.items():
        asteriks[p_values < k] = v
    return asteriks
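

# Hedged usage sketch (illustrative p-values): with the default thresholds, p >= 0.05
# maps to "ns", and smaller values gain one asterisk per threshold crossed
# (0.05, 0.01, 0.001).
def _example_represent_p_values_as_asteriks():
    p_values = pd.Series({"model_a": 0.2, "model_b": 0.03, "model_c": 0.0005})
    return represent_p_values_as_asteriks(p_values)  # -> "ns", "*", "***"

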
class SkillScores:
    r"""
    Calculate different kinds of skill scores.

    Skill score on MSE:
        Calculate skill score based on MSE for given forecast, reference and observations.

        .. math::

            \text{SkillScore} = 1 - \frac{\text{MSE(obs, for)}}{\text{MSE(obs, ref)}}

        To run:

        .. code-block:: python

            skill_scores = SkillScores(None).general_skill_score(data, forecast_name, reference_name, observation_name)

    Competitive skill score:
        Calculate skill scores to highlight differences between forecasts. This skill score is also based on the MSE.
        Currently required forecasts are CNN, OLS and persi, as well as the observation obs.

        .. code-block:: python

            skill_scores_class = SkillScores(internal_data)  # must contain columns CNN, OLS, persi and obs.
            skill_scores, counts = skill_scores_class.skill_scores()

    Skill score according to Murphy:
        Follow the climatological skill score definition of Murphy (1988). External data is data from another time
        period than the internal data set given on initialisation. In other terms, the internal data should be the
        train and validation data, whereas the external data is the test data. This might sound counter-intuitive,
        but when one model's skill is compared to another's, the comparison must be performed on the test data set.
        Therefore, in this case the external data is the test data.

        .. code-block:: python

            skill_scores_class = SkillScores(external_data)  # must contain columns obs and CNN.
            skill_scores_clim = skill_scores_class.climatological_skill_scores(internal_data, forecast_name="cnn")

    """

    models_default = ["cnn", "persi", "ols"]

    def __init__(self, external_data: Union[Data, None], models=None, observation_name="obs", ahead_dim="ahead",
                 type_dim="type", index_dim="index"):
        """Set external data, model names and dimension names."""
        self.external_data = external_data
        self.models = self.set_model_names(models)
        self.observation_name = observation_name
        self.ahead_dim = ahead_dim
        self.type_dim = type_dim
        self.index_dim = index_dim

    def set_model_names(self, models: List[str]) -> List[str]:
        """Either use given models or use defaults."""
        if models is None:
            models = self.models_default
        return self._reorder(models)

    @staticmethod
    def _reorder(model_list: List[str]) -> List[str]:
        """Move elements persi and obs to the very end of the given list."""
        for element in ["persi", "obs"]:
            if element in model_list:
                model_list.append(model_list.pop(model_list.index(element)))
        return model_list

    def get_model_name_combinations(self):
        """Return all combinations of two models as tuple and string."""
        combinations = list(itertools.combinations(self.models, 2))
        combination_strings = [f"{first} - {second}" for (first, second) in combinations]
        return combinations, combination_strings

    def skill_scores(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Calculate skill scores for all combinations of model names.

        :return: skill score and number of used values for each comparison and forecast step
        """
        ahead_names = list(self.external_data[self.ahead_dim].data)
        combinations, combination_strings = self.get_model_name_combinations()
        skill_score = pd.DataFrame(index=combination_strings)
        count = pd.DataFrame(index=combination_strings)
        for iahead in ahead_names:
            data = self.external_data.sel({self.ahead_dim: iahead})
            skill_score[iahead] = [self.general_skill_score(data, dim=self.index_dim,
                                                            forecast_name=first,
                                                            reference_name=second,
                                                            observation_name=self.observation_name)
                                   for (first, second) in combinations]
            count[iahead] = [self.get_count(data, dim=self.index_dim) for _ in combinations]
        return skill_score, count

    def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray:
        """
        Calculate climatological skill scores according to Murphy (1988).

        Calculate all CASES I - IV and terms [ABC][I-IV]. External data has to be set on initialisation, internal
        data is passed as parameter.

        :param internal_data: internal data
        :param forecast_name: name of the forecast to use for this calculation (must be available in `data`)

        :return: all CASES as well as all terms
        """
        if self.external_data is not None:
            ahead_names = list(self.external_data[self.ahead_dim].data)
        else:
            ahead_names = list(internal_data[self.ahead_dim].data)

        all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
                     'CASE IV']
        skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan),
                                   coords=[all_terms, ahead_names], dims=['terms', self.ahead_dim])

        for iahead in ahead_names:
            data = internal_data.sel({self.ahead_dim: iahead})

            skill_score.loc[["CASE I", "AI", "BI", "CI"], iahead] = np.stack(self._climatological_skill_score(
                data, mu_type=1, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten())

            skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score(
                data, mu_type=2, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten())

            if self.external_data is not None and self.observation_name in self.external_data.coords[self.type_dim]:
                external_data = self.external_data.sel({self.ahead_dim: iahead, self.type_dim: [self.observation_name]})
                skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score(
                    data, mu_type=3, forecast_name=forecast_name, observation_name=self.observation_name,
                    external_data=external_data).values.flatten())
                try:
                    skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(
                        self._climatological_skill_score(data, mu_type=4, forecast_name=forecast_name,
                                                         observation_name=self.observation_name,
                                                         external_data=external_data).values.flatten())
                except ValueError:
                    pass

        return skill_score

    def _climatological_skill_score(self, internal_data, observation_name, forecast_name, mu_type=1,
                                    external_data=None):
        kwargs = {"external_data": external_data} if external_data is not None else {}
        return getattr(self, f"skill_score_mu_case_{mu_type}")(internal_data, observation_name, forecast_name,
                                                               **kwargs)

    def general_skill_score(self, data: Data, forecast_name: str, reference_name: str,
                            observation_name: str = None, dim: str = "index") -> np.ndarray:
        """
        Calculate general skill score based on mean squared error.

        :param data: internal data containing data for observation, forecast and reference
        :param forecast_name: name of forecast
        :param reference_name: name of reference
        :param observation_name: name of observation (defaults to the class-level observation name)
        :param dim: dimension to calculate the skill score along

        :return: skill score of forecast
        """
        if observation_name is None:
            observation_name = self.observation_name
        data = data.dropna(dim)
        observation = data.sel({self.type_dim: observation_name})
        forecast = data.sel({self.type_dim: forecast_name})
        reference = data.sel({self.type_dim: reference_name})
        mse = mean_squared_error
        skill_score = 1 - mse(observation, forecast, dim=dim) / mse(observation, reference, dim=dim)
        return skill_score.values

    def get_count(self, data: Data, dim: str = "index") -> np.ndarray:
        """Count the number of non-missing values along dim and return the maximum count."""
        data = data.dropna(dim)
        return data.count(dim).max().values

    def skill_score_pre_calculations(self, data: Data, observation_name: str,
                                     forecast_name: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Data,
                                                                  Dict[str, Data]]:
        """
        Calculate terms AI, BI, and CI, mean, variance and Pearson's correlation and clean up data.

        The additional information on mean, variance and Pearson's correlation (and the p-value) are returned as
        dictionary with the corresponding keys mean, sigma, r and p.

        :param data: internal data to use for calculations
        :param observation_name: name of observation
        :param forecast_name: name of forecast

        :returns: terms AI, BI, and CI, internal data without nans, and mean, variance, correlation and its p-value
        """
        data = data.sel({self.type_dim: [observation_name, forecast_name]}).drop(self.ahead_dim)
        data = data.dropna(self.index_dim)

        mean = data.mean(self.index_dim)
        sigma = np.sqrt(data.var(self.index_dim))
        r, p = stats.pearsonr(*[data.sel({self.type_dim: n}).values.flatten()
                                for n in [forecast_name, observation_name]])

        AI = np.array(r ** 2)
        BI = ((r - (sigma.sel({self.type_dim: forecast_name}, drop=True)
                    / sigma.sel({self.type_dim: observation_name}, drop=True))) ** 2).values
        CI = (((mean.sel({self.type_dim: forecast_name}, drop=True)
                - mean.sel({self.type_dim: observation_name}, drop=True))
               / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values

        suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p}
        return AI, BI, CI, data, suffix

    def skill_score_mu_case_1(self, internal_data, observation_name, forecast_name):
        """Calculate CASE I."""
        AI, BI, CI, data, _ = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name)
        skill_score = np.array(AI - BI - CI)
        return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array()

    def skill_score_mu_case_2(self, internal_data, observation_name, forecast_name):
        """Calculate CASE II."""
        AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name)
        monthly_mean = self.create_monthly_mean_from_daily_data(data)
        data = xr.concat([data, monthly_mean], dim=self.type_dim)
        sigma = suffix["sigma"]
        sigma_monthly = np.sqrt(monthly_mean.var())
        r, p = stats.pearsonr(*[data.sel({self.type_dim: n}).values.flatten()
                                for n in [observation_name, observation_name + "X"]])
        AII = np.array(r ** 2)
        BII = ((r - sigma_monthly / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values
        skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII))
        return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()

    def skill_score_mu_case_3(self, internal_data, observation_name, forecast_name, external_data=None):
        """Calculate CASE III."""
        AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name)
        mean, sigma = suffix["mean"], suffix["sigma"]
        AIII = (((external_data.mean().values - mean.sel({self.type_dim: observation_name}, drop=True))
                 / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values
        skill_score = np.array((AI - BI - CI + AIII) / (1 + AIII))
        return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()

    def skill_score_mu_case_4(self, internal_data, observation_name, forecast_name, external_data=None):
        """Calculate CASE IV."""
        AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name)
        monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, index=data[self.index_dim])
        data = xr.concat([data, monthly_mean_external], dim=self.type_dim).dropna(dim=self.index_dim)
        mean, sigma = suffix["mean"], suffix["sigma"]
        mean_external = monthly_mean_external.mean()
        sigma_external = np.sqrt(monthly_mean_external.var())
        r_mu, p_mu = stats.pearsonr(
            *[data.sel({self.type_dim: n}).values.flatten() for n in [observation_name, observation_name + "X"]])
        AIV = np.array(r_mu ** 2)
        BIV = ((r_mu - sigma_external / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values
        CIV = (((mean_external - mean.sel({self.type_dim: observation_name}, drop=True))
                / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values
        skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV))
        return pd.DataFrame(
            {"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": [CIV]}).to_xarray().to_array()

    def create_monthly_mean_from_daily_data(self, data, columns=None, index=None):
        """
        Calculate the average for each month and store it as daily values flagged with 'X'.

        :param data: data to average
        :param columns: columns to work on (all columns from given data are used if empty)
        :param index: index of returned data (index of given data is used if empty)

        :return: data containing monthly means in daily resolution
        """
        if columns is None:
            columns = data.coords[self.type_dim].values
        if index is None:
            index = data.coords[self.index_dim].values
        coordinates = [index, [v + "X" for v in list(columns)]]
        empty_data = np.full((len(index), len(columns)), np.nan)
        monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=[self.index_dim, self.type_dim])
        mu = data.groupby(f"{self.index_dim}.month").mean()

        for month in mu.month:
            monthly_mean[monthly_mean[self.index_dim].dt.month == month, :] = mu[mu.month == month].values.flatten()

        return monthly_mean
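

# Hedged usage sketch (dimension and model names are assumptions): external data for
# `skill_scores` needs the type, ahead and index dimensions used on initialisation,
# with one entry along "type" per model plus the observation.
def _example_skill_scores():
    data = xr.DataArray(np.random.rand(100, 4, 3), dims=["index", "type", "ahead"],
                        coords={"type": ["cnn", "ols", "persi", "obs"], "ahead": [1, 2, 3]})
    skill_score, count = SkillScores(data).skill_scores()
    return skill_score, count  # one row per model pair, one column per forecast step

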
def create_single_bootstrap_realization(data: xr.DataArray, dim_name_time: str) -> xr.DataArray:
    """
    Return a bootstrapped realization of data.

    :param data: data from which to draw ONE bootstrap realization
    :param dim_name_time: name of time dimension
    :return: bootstrapped realization of data
    """
    num_of_blocks = data.coords[dim_name_time].shape[0]
    boot_idx = np.random.choice(num_of_blocks, size=num_of_blocks, replace=True)
    return data.isel({dim_name_time: boot_idx})


def calculate_average(data: xr.DataArray, **kwargs) -> xr.DataArray:
    """
    Calculate the mean of data.

    :param data: data for which to calculate the mean
    :param kwargs: passed on to xarray's mean (e.g. dim, skipna)
    :return: mean of data
    """
    return data.mean(**kwargs)


def create_n_bootstrap_realizations(data: xr.DataArray, dim_name_time: str, dim_name_model: str, n_boots: int = 1000,
                                    dim_name_boots: str = 'boots', seasons: List = None) -> Dict[str, xr.DataArray]:
    """
    Create n bootstrap realizations and calculate averages across realizations.

    :param data: original data from which to create bootstrap realizations
    :param dim_name_time: name of time dimension
    :param dim_name_model: name of model dimension
    :param n_boots: number of bootstrap realizations
    :param dim_name_boots: name of bootstrap dimension
    :param seasons: calculate errors for given seasons in addition (default None)
    :return: dictionary of bootstrap averages, keyed by season ("" holds the all-data averages)
    """
    seasons = [] if seasons is None else to_list(seasons)  # assure seasons to be empty list if None
    res_dims = [dim_name_boots]
    dims = list(data.dims)
    other_dims = [v for v in dims if v != dim_name_time]
    coords = {dim_name_boots: range(n_boots), **{dim_name: data.coords[dim_name] for dim_name in other_dims}}
    if len(dims) > 1:
        res_dims = res_dims + other_dims
    realizations = {k: xr.DataArray(np.nan, dims=res_dims, coords=coords) for k in seasons + [""]}
    for boot in range(n_boots):
        shuffled = create_single_bootstrap_realization(data, dim_name_time=dim_name_time)
        realizations[""][boot] = calculate_average(shuffled, dim=dim_name_time, skipna=True)
        for season in seasons:
            assert season in ["DJF", "MAM", "JJA", "SON"]
            sel = shuffled[dim_name_time].dt.season == season
            realizations[season][boot] = calculate_average(shuffled.sel({dim_name_time: sel}),
                                                           dim=dim_name_time, skipna=True)
    return realizations
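

# Hedged usage sketch (illustrative coordinates): bootstrap averages are drawn along
# the time dimension; a datetime index is required when seasonal subsets are requested
# via `seasons`.
def _example_create_n_bootstrap_realizations():
    data = xr.DataArray(np.random.rand(365, 2), dims=["index", "type"],
                        coords={"index": pd.date_range("2020-01-01", periods=365),
                                "type": ["model", "persi"]})
    boots = create_n_bootstrap_realizations(data, dim_name_time="index", dim_name_model="type",
                                            n_boots=100, seasons=["DJF"])
    return boots  # keys: "" (all data) and "DJF", each with dims ("boots", "type")

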
def calculate_bias_free_data(data, time_dim="index", window_size=30):
    """Remove the overall mean and a rolling (seasonal) mean from data and return both bias-free versions."""
    # overall mean
    overall_mean = data.mean(time_dim)
    bias_free_data = data - overall_mean

    # seasonal mean
    seasonal_mean = data.rolling(**{time_dim: window_size}, min_periods=int(window_size / 2), center=True).mean()
    seasonal_bias_free_data = data - seasonal_mean

    return bias_free_data, seasonal_bias_free_data
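

# Hedged usage sketch (illustrative data): removes the overall mean and a centred
# 30-day rolling mean, yielding de-biased and de-seasonalised series.
def _example_calculate_bias_free_data():
    data = xr.DataArray(np.random.rand(365), dims=["index"],
                        coords={"index": pd.date_range("2020-01-01", periods=365)})
    bias_free, seasonal_bias_free = calculate_bias_free_data(data, time_dim="index")
    return bias_free, seasonal_bias_free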