align code with min_coverage and modular newyear clipping (#172)

* supplied min_coverage to calc_havengetallen and calc_gemiddeldgetij
* made newyear clipping modular including test
* cleaned up comments
Deltares-research · Oct 25, 2024 · c800135 · c800135
1 parent 09d5cb6
commit c800135
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 21 deletions.
diff --git a/examples/KWK_process.py b/examples/KWK_process.py
@@ -61,7 +61,7 @@
 
 
 nap_correction = False
-min_coverage = 0.9 # for tidalindicators and slotgemiddelde #TODO: can also be used for havengetallen and gemgetij
+min_coverage = 0.9
 drop_duplicates = True
 
 compute_indicators = True
@@ -122,8 +122,7 @@
 
 
     #### SLOTGEMIDDELDEN
-    # TODO: nodal cycle is not in same phase for all stations, this is not physically correct.
-    # TODO: more data is needed for proper working of fitting for some stations (2011: BAALHK, BRESKVHVN, GATVBSLE, SCHAARVDND)
+    # TODO: more data is needed for proper working of fitting for some stations (2011: BAALHK, BRESKVHVN, GATVBSLE, SCHAARVDND) >> still after linear?
     if compute_slotgem:
         print(f'slotgemiddelden for {current_station}')
 
@@ -176,7 +175,7 @@
     ### HAVENGETALLEN 
     if compute_havengetallen:
         print(f'havengetallen for {current_station}')
-        df_havengetallen, df_HWLW = kw.calc_havengetallen(df_ext=df_ext_todate, return_df_ext=True)
+        df_havengetallen, df_HWLW = kw.calc_havengetallen(df_ext=df_ext_todate, return_df_ext=True, min_coverage=min_coverage)
 
         # plot hwlw per timeclass including median
         fig, axs = kw.plot_HWLW_pertimeclass(df_ext=df_HWLW, df_havengetallen=df_havengetallen)
@@ -201,13 +200,16 @@
         # derive getijkrommes: raw, scaled to havengetallen, scaled to havengetallen and 12h25min period
         gemgetij_raw = kw.calc_gemiddeldgetij(df_meas=df_meas_todate, df_ext=None,
                                               freq=pred_freq, nb=0, nf=0, 
-                                              scale_extremes=False, scale_period=False)
+                                              scale_extremes=False, scale_period=False,
+                                              min_coverage=min_coverage)
         gemgetij_corr = kw.calc_gemiddeldgetij(df_meas=df_meas_todate, df_ext=df_ext_todate,
                                                freq=pred_freq, nb=1, nf=1, 
-                                               scale_extremes=True, scale_period=False)
+                                               scale_extremes=True, scale_period=False,
+                                               min_coverage=min_coverage)
         gemgetij_corr_boi = kw.calc_gemiddeldgetij(df_meas=df_meas_todate, df_ext=df_ext_todate,
                                                    freq=pred_freq, nb=0, nf=4, 
-                                                   scale_extremes=True, scale_period=True)
+                                                   scale_extremes=True, scale_period=True,
+                                                   min_coverage=min_coverage)
 
         # TODO: the shape of the validation lines are different, so compare krommes to gele boekje instead
         # p:\archivedprojects\11205258-005-kpp2020_rmm-g5\C_Work\00_KenmerkendeWaarden\07_Figuren\figures_ppSCL_2\final20201211

diff --git a/kenmerkendewaarden/gemiddeldgetij.py b/kenmerkendewaarden/gemiddeldgetij.py
@@ -52,9 +52,11 @@ def calc_gemiddeldgetij(
         Timeseries of waterlevel extremes (1/2 only). The last 10 years of this 
         timeseries are used to compute the getijkrommes. The default is None.
     min_coverage : float, optional
-        The minimal required coverage of the df_ext timeseries. Passed on to `calc_havengetallen()`. The default is None.
+        The minimal required coverage of the df_ext timeseries. Passed on to 
+        `calc_havengetallen()`. The default is None.
     freq : str, optional
-        Frequency of the prediction, a value of 60 seconds or lower is adivisable for decent results. The default is "60sec".
+        Frequency of the prediction, a value of 60 seconds or lower is advisable for 
+        decent results. The default is "60sec".
     nb : int, optional
         Amount of periods to repeat backward. The default is 0.
     nf : int, optional
@@ -351,12 +353,6 @@ def get_gemgetij_components(data_pd_meas):
     # components should not be reduced, since higher harmonics are necessary
     comp_frommeasurements_avg, _ = calc_getijcomponenten(df_meas=data_pd_meas)
 
-    # #check if all years are available
-    # comp_years = comp_frommeasurements_allyears['A'].columns
-    # expected_years = tstop_dt.year-tstart_dt.year
-    # if len(comp_years) < expected_years:
-    #     raise Exception('ERROR: analysis result contains not all years')
-
     # check if nans in analysis
     if comp_frommeasurements_avg.isnull()["A"].any():
         raise ValueError("analysis result contains nan values")

diff --git a/kenmerkendewaarden/slotgemiddelden.py b/kenmerkendewaarden/slotgemiddelden.py
@@ -12,6 +12,7 @@
     calc_wltidalindicators,
     calc_HWLWtidalindicators,
 )
+from kenmerkendewaarden.utils import clip_timeseries_last_newyearsday
 import logging
 
 __all__ = [
@@ -57,10 +58,7 @@ def calc_slotgemiddelden(
     slotgemiddelden_dict = {}
 
     # clip last value of the timeseries if this is exactly newyearsday
-    if df_meas.index[-1] == pd.Timestamp(
-        df_meas.index[-1].year, 1, 1, tz=df_meas.index.tz
-    ):
-        df_meas = df_meas.iloc[:-1]
+    df_meas = clip_timeseries_last_newyearsday(df_meas)
 
     # calculate yearly means
     dict_wltidalindicators = calc_wltidalindicators(df_meas, min_coverage=min_coverage)

diff --git a/kenmerkendewaarden/utils.py b/kenmerkendewaarden/utils.py
@@ -19,10 +19,16 @@ def raise_extremes_with_aggers(df_ext):
         )
 
 
-def crop_timeseries_last_nyears(df, nyears):
-    # remove last timestep if equal to "yyyy-01-01 00:00:00"
+def clip_timeseries_last_newyearsday(df):
+    # clip last value of the timeseries if this is exactly newyearsday
+    # so remove last timestep if equal to "yyyy-01-01 00:00:00"
     if '-01-01 00:00:00' in str(df.index[-1]):
         df = df.iloc[:-1]
+    return df
+
+
+def crop_timeseries_last_nyears(df, nyears):
+    df = clip_timeseries_last_newyearsday(df)
 
     # last_year, for instance 2020
     last_year = df.index[-1].year

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -6,6 +6,7 @@
 """
 import pytest
 from kenmerkendewaarden.utils import (raise_extremes_with_aggers,
+                                      clip_timeseries_last_newyearsday,
                                       crop_timeseries_last_nyears)
 import pandas as pd
 import numpy as np
@@ -27,6 +28,14 @@ def test_raise_extremes_with_aggers_pass_12df(df_ext_12_2010):
     raise_extremes_with_aggers(df_ext_12_2010)
 
 
+@pytest.mark.unittest
+def test_clip_timeseries_last_newyearsday(df_meas, df_meas_2010):
+    df_meas_clipped = clip_timeseries_last_newyearsday(df_meas)
+    df_meas_2010_clipped = clip_timeseries_last_newyearsday(df_meas_2010)
+    assert len(df_meas_clipped) == len(df_meas)-1
+    assert len(df_meas_2010_clipped) == len(df_meas_2010)
+
+
 @pytest.mark.unittest
 def test_crop_timeseries_last_nyears(df_meas):
     assert df_meas.index[0] == pd.Timestamp("1987-01-01 00:00:00+01:00 ")