Calibration of HBV model for Leven at Newby Bridge using eWaterCycle#
This notebook uses eWaterCycle to calibrate the HBV model. It is assumed you have seen this notebook that shows how to run a single model.
First we repeat the cells for setting up a single model: paths, region and forcing.
# general python
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import shutil
#general python
import numpy as np
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
#niceties
from rich import print
from ipywidgets import IntProgress
from IPython.display import display
# general eWaterCycle
import ewatercycle
import ewatercycle.models
import ewatercycle.forcing
Choose region#
See the documentation on the single model notebook
camelsgb_id = "camelsgb_73010"
set up paths#
forcing_path = Path.home() / "Forcing"
forcing_path.mkdir(exist_ok=True)
forcing_path_basin = forcing_path / camelsgb_id
forcing_path_basin.mkdir(exist_ok=True)
forcing_path_caravan = forcing_path_basin / "caravan"
forcing_path_caravan.mkdir(exist_ok=True)
prepared_forcing_path_caravan_user = forcing_path_basin / "caravan-prepared"
prepared_forcing_path_caravan_user.mkdir(exist_ok=True)
prepared_forcing_path_caravan_central = Path("/data/eurocsdms-data/forcing/camelsgb_73010/caravan")
shutil.copytree(prepared_forcing_path_caravan_central, prepared_forcing_path_caravan_user, dirs_exist_ok=True)
forcing_path_ERA5 = forcing_path_basin / "ERA5"
forcing_path_ERA5.mkdir(exist_ok=True)
prepared_forcing_path_ERA5_user = forcing_path_basin / "ERA5-prepared"
prepared_forcing_path_ERA5_central = Path("/data/eurocsdms-data/forcing/camelsgb_73010/ERA5")
shutil.copytree(prepared_forcing_path_ERA5_central / "work", prepared_forcing_path_ERA5_user / "work", dirs_exist_ok=True)
PosixPath('/home/rhut/Forcing/camelsgb_73010/ERA5-prepared/work')
experiment_start_date = "1997-08-01T00:00:00Z"
experiment_end_date = "2000-08-31T00:00:00Z"
calibration_start_date = "1997-08-01T00:00:00Z"
calibration_end_date = "2000-08-31T00:00:00Z"
Forcing#
See the documentation on the single model notebook
# # option one: generate forcing data
# camelsgb_forcing = ewatercycle.forcing.sources['CaravanForcing'].generate(start_time = experiment_start_date,
# end_time = experiment_end_date,
# directory = forcing_path_caravan,
# basin_id = camelsgb_id,
# )
# # option two: load data you generated yourself
# camelsgb_forcing = ewatercycle.forcing.sources['CaravanForcing'].load(directory = forcing_path_caravan)
# option three: load data someone else generated
camelsgb_forcing = ewatercycle.forcing.sources['CaravanForcing'].load(directory = prepared_forcing_path_caravan_user)
print(camelsgb_forcing)
CaravanForcing( start_time='1997-08-01T00:00:00Z', end_time='2000-08-31T00:00:00Z', directory=PosixPath('/home/rhut/Forcing/camelsgb_73010/caravan-prepared'), shape=PosixPath('/home/rhut/Forcing/camelsgb_73010/caravan-prepared/camelsgb_73010.shp'), filenames={ 'tasmax': 'camelsgb_73010_1997-08-01_2000-08-31_tasmax.nc', 'evspsblpot': 'camelsgb_73010_1997-08-01_2000-08-31_evspsblpot.nc', 'Q': 'camelsgb_73010_1997-08-01_2000-08-31_Q.nc', 'tasmin': 'camelsgb_73010_1997-08-01_2000-08-31_tasmin.nc', 'pr': 'camelsgb_73010_1997-08-01_2000-08-31_pr.nc', 'tas': 'camelsgb_73010_1997-08-01_2000-08-31_tas.nc' } )
print(camelsgb_forcing)
CaravanForcing( start_time='1997-08-01T00:00:00Z', end_time='2000-08-31T00:00:00Z', directory=PosixPath('/home/rhut/Forcing/camelsgb_73010/caravan-prepared'), shape=PosixPath('/home/rhut/Forcing/camelsgb_73010/caravan-prepared/camelsgb_73010.shp'), filenames={ 'tasmax': 'camelsgb_73010_1997-08-01_2000-08-31_tasmax.nc', 'evspsblpot': 'camelsgb_73010_1997-08-01_2000-08-31_evspsblpot.nc', 'Q': 'camelsgb_73010_1997-08-01_2000-08-31_Q.nc', 'tasmin': 'camelsgb_73010_1997-08-01_2000-08-31_tasmin.nc', 'pr': 'camelsgb_73010_1997-08-01_2000-08-31_pr.nc', 'tas': 'camelsgb_73010_1997-08-01_2000-08-31_tas.nc' } )
ds = xr.open_dataset(camelsgb_forcing['Q'])
caravan_observation_df = ds['Q'].to_dataframe().reset_index()
caravan_observation_df = caravan_observation_df.set_index('time')
caravan_observation_df = caravan_observation_df[['Q']] #NOTE: mm/day
caravan_observation_df.plot()
<Axes: xlabel='time'>
ERA5_forcing_dir = forcing_path / camelsgb_id / "ERA5"
# # option one: generate forcing:
# ERA5_forcing = ewatercycle.forcing.sources["LumpedMakkinkForcing"].generate(
# dataset="ERA5",
# start_time=experiment_start_date,
# end_time=experiment_end_date,
# shape=camelsgb_forcing.shape,
# directory=forcing_path_ERA5
# )
# # option two: load data you generated yourself
# load_location = forcing_path_ERA5 / "work" / "diagnostic" / "script" #this is needed because ERA5 forcing data is stored deep in a sub-directory
# ERA5_forcing = ewatercycle.forcing.sources["LumpedMakkinkForcing"].load(directory=load_location)
# option three: load data someone else generated for you
load_location = prepared_forcing_path_ERA5_user / "work" / "diagnostic" / "script" #this is needed because ERA5 forcing data is stored deep in a sub-directory
ERA5_forcing = ewatercycle.forcing.sources["LumpedMakkinkForcing"].load(directory=load_location)
Create a collection of parameter sets#
We will be running the model a thousand time, each with a different set of parameters, to find the best combination of parameters possible. First we will create arrays to hold these parameters. It is important to limit the range per parameter.
Using just random sampling is not the smartest strategy in calibrating hydrological models, but is presented here for simplicty.
p_names = ["$I_{max}$", "$C_e$", "$Su_{max}$", "β", "$P_{max}$", "$T_{lag}$", "$K_f$", "$K_s$", "FM"]
S_names = ["Interception storage", "Unsaturated Rootzone Storage", "Fastflow storage", "Groundwater storage", "Snowpack storage"]
param_names = ["Imax","Ce", "Sumax", "Beta", "Pmax", "Tlag", "Kf", "Ks", "FM"]
stor_names = ["Si", "Su", "Sf", "Ss", "Sp"]
#The number of ensemble members in our ensemble
N = 1000
## Array of parameters min/max bounds
## Imax, Ce, Sumax, beta, Pmax, T_lag, Kf, Ks, FM
p_min_initial= np.array([0, 0.2, 40, .5, .001, 1, .01, .0001, 6])
p_max_initial = np.array([8, 1, 800, 4, .3, 10, .1, .01, 0.1])
#create empty array to fill
parameters = np.zeros([len(p_min_initial), N])
#fill with random values bounded by the min and max given above.
for param in range(len(p_min_initial)):
parameters[param,:] = np.random.uniform(p_min_initial[param],p_max_initial[param],N)
Just as with the normal model run, we need a set of initial storages:
Si, Su, Sf, Ss
s_0 = np.array([0, 100, 0, 5, 0])
Objective function for calibration#
We need to compare the output of the model to observations and come up with a “score” to indicate how ‘fit’ the combination of parameters is. Here we define a function that takes in two data frames and compares the timeseries in those and returns a root-mean-square value. Other objective functions can be defined of course.
def calibrationObjective(modelOutput,observation,start_calibration,end_calibration):
# a function that takes in two dataFrames, interpolates the model output to the
# observations and calculates the average absolute difference between the two.
#combine the two in one dataFrame
hydro_data = pd.concat([modelOutput.reindex(observation.index, method = 'ffill'), observation], axis=1)
#only select the calibration period
hydro_data = hydro_data[hydro_data.index > pd.to_datetime(pd.Timestamp(start_calibration).date())]
hydro_data = hydro_data[hydro_data.index < pd.to_datetime(pd.Timestamp(end_calibration).date())]
#calculate mean absolute difference
diff = hydro_data['Q'] - hydro_data['model output']
rms = np.sqrt(np.mean(diff**2))
return rms
Setting up an ensemble of models#
Since models are objects in eWaterCycle, arrays of models can be created, making it easy to start a lot of models with roughly the same settings. The one major difference in starting the models is that a specific configuration directory is provided, because otherwise multiple models might end up with the same config file name, because it is timestamp based.
A new development is that an extension of eWaterCycle is near completion called eWaterCycle Data Assimilation (ewatercycle.da), which includes ensembles of models as a native concept and makes running ensemble members in parallel easier.
from ewatercycle_HBV.model import HBVLocal
ensemble = []
for counter in range(N):
#add a model to the ensemble
ensemble.append(HBVLocal(forcing=ERA5_forcing))
for counter in range(N):
#create the config_file for this specific model, make sure the right parameters get passed
config_file, _ = ensemble[counter].setup(
parameters = parameters[:,counter],
initial_storage=s_0,
cfg_dir = "/home/rhut/configFiles/" + camelsgb_id + "_hbv_ensembleMember_" + str(counter),
)
#use this config file to initialize this ensemble member
ensemble[counter].initialize(config_file)
counter
999
Run all the models#
Here we run all the models, one by one, and calculate and save the value of the objective function for this ensemble member.
For more complex models, this loop can easily be run in parallel. Here we keep it simple. We do add a progressbar to indicate things are happening.
#an object to show a progress bar, since this can take a while:
f = IntProgress(min=0, max=N) # instantiate the bar
display(f) # display the bar
#an empty array to store the results in
objectives = []
#loop over all ensemble members
for ensembleMember in ensemble:
Q_m = []
time = []
while ensembleMember.time < ensembleMember.end_time:
ensembleMember.update()
discharge_this_timestep = ensembleMember.get_value("Q") #* Caravan_observations_area[caravan_basin_id] / (1000 * 86400)
Q_m.append(discharge_this_timestep[0])
time.append(pd.Timestamp(ensembleMember.time_as_datetime.date()))
#calculate the objective function
discharge_dataframe = pd.DataFrame({'model output': Q_m}, index=pd.to_datetime(time))
objective_this_model = calibrationObjective(discharge_dataframe,caravan_observation_df,calibration_start_date,calibration_end_date)
objectives.append(objective_this_model)
#it is good practice to remove any variable you don't need anymore to save memory.
del Q_m, time, discharge_dataframe, objective_this_model
#update progress bar
f.value += 1
ensembleMember.finalize()
Save and display results#
We make a plot for each parameter to show how they score on the objective function over the range provided. We print out the combination of parameters with the lowest objective function and we save those values to a csv file to read in the other notebook.
xFigNr = 2
yFigNr = 4
fig, axs = plt.subplots(xFigNr, yFigNr,figsize = (15,15))
for xFig in range(xFigNr):
for yFig in range(yFigNr):
paramCounter = xFig*yFigNr + yFig
axs[xFig,yFig].plot(parameters[paramCounter,:],objectives,'.')
axs[xFig,yFig].set_title(p_names[paramCounter])
plt.title('calibration parameters ' + camelsgb_id)
plt.savefig('/home/rhut/figures/calibration parameters ' + camelsgb_id + '.png')
parameters_minimum_index = np.argmin(np.array(objectives))
parameters_minimum = parameters[:,parameters_minimum_index]
np.savetxt("/home/rhut/configFiles/calibration" + camelsgb_id + ".csv", parameters_minimum, delimiter=",")
print(parameters_minimum)
[7.08470506e+00 8.36714074e-01 7.63731139e+01 1.11241746e+00 2.44743132e-01 7.80079088e+00 9.58685207e-02 3.28903468e-03 2.26225980e-01]