Source code for vtools.functions.lag_cross_correlation

import pandas as pd
import numpy as np

__all__ = ["calculate_lag"]


[docs] def icrosscorr(lag, ts0, ts1): """Lag-N cross correlation. Shifted data filled with NaNs Parameters ---------- lag : int, default 0 ts0, ts1 : pandas.Series objects of equal length Returns ---------- crosscorr : float """ return ts0.corr(ts1.shift(int(lag)))
[docs] def mincrosscorr(lag, ts0, ts1): return -icrosscorr(lag, ts0, ts1)
[docs] def calculate_lag(lagged, base, max_lag, res, interpolate_method="linear"): """Calculate shift in lagged, that maximizes cross-correlation with base. Parameters ---------- base,lagged: :class:`Pandas.Series` time series to compare. The result is relative to base max_lag: interval Maximum pos/negative time shift to consider in cross-correlation (ie, from -max_lag to +max_lag). Required windows in lagged will account for this bracket. For series dominated by a single frequency (eg 1c/12.5 hours for tides), the algorithm can tolerate a range of 180 degrees (6 hours) res: interval Resolution of analysis. The series lagged will be interpolated to this resolution using interpolate_method. Unit used here will determine the type of the output. See documentation of the interval concept which is most compatible with pandas.tseries.offsets, not timeDelta, because of better math properties in things like division -- vtime helpers like minutes(1) may be helpful interpolate_method: str, optional Interpolate method to refine lagged to res. Must be compatible with pandas interpolation method names (and hence scipy) Returns ------- lag : interval Shift as a pandas.tseries.offsets subtype that matches units with res This shift is the apparent lateness (pos) or earliness (neg). It must be applied to base or removed to lagged to align the features. """ from scipy.optimize import brent if not isinstance(base, pd.Series): raise ValueError("base and lagged arguments must be Series") if not isinstance(lagged, pd.Series): raise ValueError("base and lagged arguments must be Series") lagged_hr = lagged.resample(res).interpolate(interpolate_method) do_plot = False if do_plot: lagstride = 6 ilag = np.arange(-max_lag / res, max_lag / res, lagstride) ccors = [] for i in ilag: ccors.append(icrosscorr(i, base, lagged_hr)) dflag = pd.DataFrame(data=ccors, index=ilag) dflag.plot() # This uses a simple heuristic for bracketing a maximum # that is guaranteed to work if there is a single local maximum # in the interior. This allows the use of a 3-point bracket in brent, which # won't drift outside of the stipulated interval (the 2-point bracket # tends to do this for max_lag > 0.25 period, wh ereas we generally # want max_lag to be about half a period) mlag = max_lag / res bracket = [-mlag, -mlag // 3, mlag // 3] if icrosscorr(-mlag // 3, base, lagged_hr) > icrosscorr(mlag // 3, base, lagged_hr): bracket = [-mlag, -mlag // 3, mlag // 3] else: bracket = [-mlag // 3, mlag // 3, mlag] try: lagres = brent(mincrosscorr, args=(base, lagged_hr), brack=bracket) except ValueError as e0: if "bracketing" in str(e0).lower(): arglist = list(e0.args) arglist[0] = ( e0.args[0] + " Argument max_interval may not contain a maximizing lag" ) e0.args = tuple(arglist) raise return int(lagres) * res