Coverage for mlair/helpers/data_sources/era5.py: 8%

48 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2023-06-01 13:03 +0000

1"""Methods to load era5 data.""" 

2__author__ = "Lukas Leufen" 

3__date__ = "2022-06-09" 

4 

5import logging 

6import os 

7 

8import pandas as pd 

9import xarray as xr 

10 

11from mlair import helpers 

12from mlair.configuration.era5_settings import era5_settings 

13from mlair.configuration.toar_data_v2_settings import toar_data_v2_settings 

14from mlair.helpers.data_sources.toar_data_v2 import load_station_information, combine_meta_data, correct_timezone 

15from mlair.helpers.data_sources.data_loader import EmptyQueryResult 

16from mlair.helpers.meteo import relative_humidity_from_dewpoint 

17 

18 

def load_era5(station_name, stat_var, sampling, data_origin):
    """
    Load ERA5 data from local files for the grid cell nearest to a station.

    Station meta data is requested through the toar-data v2 helpers. The ERA5 files given by `era5_settings` are
    opened with xarray and reduced to the grid cell closest to the station coordinates. The variables are renamed
    from ERA5 names to the join naming convention before the requested variables are selected.

    :param station_name: name of the station, converted to a list by `helpers.to_list`
    :param stat_var: mapping whose keys are the requested variable names (values are replaced by "values" for
        hourly sampling)
    :param sampling: temporal resolution of the data, only "hourly" is supported
    :param data_origin: mapping handed to `helpers.select_from_dict`; only used to build the error message if
        variables are missing

    :return: tuple of station data (as returned by `correct_timezone`) and meta data as pandas DataFrame with
        `station_name` as columns, or `(None, None)` if the ERA5 files could not be opened (OSError)

    :raises ValueError: if sampling is not "hourly"
    :raises EmptyQueryResult: if at least one requested variable is not available in the loaded data
    """

    # make sure station_name parameter is a list
    station_name = helpers.to_list(station_name)

    # get data path
    data_path, file_names = era5_settings(sampling)

    # correct stat_var values if data is not aggregated (hourly)
    if sampling == "hourly":
        stat_var = {key: "values" for key in stat_var.keys()}
    else:
        raise ValueError(f"Given sampling {sampling} is not supported, only hourly sampling can be used.")

    # load station meta using toar-data v2 API
    meta_url_base, headers = toar_data_v2_settings("meta")
    station_meta = load_station_information(station_name, meta_url_base, headers)

    # sel data for station using sel method nearest
    logging.info(f"load data for {station_meta['codes'][0]} from ERA5")
    try:
        with xr.open_mfdataset(os.path.join(data_path, file_names)) as data:
            lon, lat = station_meta["coordinates"]["lng"], station_meta["coordinates"]["lat"]
            station_dask = data.sel(lon=lon, lat=lat, method="nearest", drop=True)
            # compute inside the context manager so that values are read before the files are closed
            station_data = station_dask.to_array().T.compute()
    except OSError as e:
        # best effort: missing or unreadable files are reported to the caller as (None, None)
        logging.info(f"Cannot load era5 data from path {data_path} and filenames {file_names} due to: {e}")
        return None, None

    # transform data and meta to pandas
    station_data = station_data.to_pandas()
    if "relhum" in stat_var:
        # relative humidity is not read from file but derived from dew point and temperature
        station_data["RHw"] = relative_humidity_from_dewpoint(station_data["D2M"], station_data["T2M"])
    station_data.columns = _rename_era5_variables(station_data.columns)

    # check if all requested variables are available; compare against the loaded columns so that the error
    # message names the variables that are actually missing
    missing_variables = set(stat_var).difference(station_data.columns)
    if missing_variables:
        origin = helpers.select_from_dict(data_origin, missing_variables)
        options = f"station={station_name}, origin={origin}"
        raise EmptyQueryResult(f"No data found for variables {missing_variables} and options {options} in ERA5.")
    station_data = station_data[list(stat_var)]

    # convert to local timezone
    station_data = correct_timezone(station_data, station_meta, sampling)

    variable_meta = _emulate_meta_data(station_data)
    meta = combine_meta_data(station_meta, variable_meta)
    meta = pd.DataFrame.from_dict(meta, orient='index')
    meta.columns = station_name
    return station_data, meta

71 

72 

73def _emulate_meta_data(station_data): 

74 general_meta = {"sampling_frequency": "hourly", "data_origin": "model", "data_origin_type": "model"} 

75 roles_meta = {"roles": [{"contact": {"organisation": {"name": "ERA5", "longname": "ECMWF"}}}]} 

76 variable_meta = {var: {"variable": {"name": var}, **roles_meta, ** general_meta} for var in station_data.columns} 

77 return variable_meta 

78 

79 

80def _rename_era5_variables(era5_names): 

81 mapper = {"SP": "press", "U10M": "u", "V10M": "v", "T2M": "temp", "D2M": "dew", "BLH": "pblheight", 

82 "TCC": "cloudcover", "RHw": "relhum"} 

83 era5_names = list(era5_names) 

84 try: 

85 join_names = list(map(lambda x: mapper[x], era5_names)) 

86 return join_names 

87 except KeyError as e: 

88 raise KeyError(f"Cannot map names from era5 to join naming convention: {e}")