# Source code for vtools.data.gap

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

__all__ = ["gap_count", "gap_size", "gap_distance"]


def gap_count(ts, state="gap", dtype=int):
    """Count missing data.

    Identifies gaps (runs of missing or non-missing data) and quantifies the
    length of each run in terms of number of samples, which works better for
    regular series. Each time point receives the length of the run it
    belongs to.

    Parameters
    ----------
    ts : :class:`DataFrame <pandas:pandas.DataFrame>` or :class:`Series <pandas:pandas.Series>`
        Time series to analyze.

    state : `str` one of 'gap'|'good'|'both'
        State to count. If state is 'gap', block sizes of missing data are
        counted and reported for time points in the gap (every point in a
        given gap receives the same value); non-missing data get zero.
        Setting state to 'good' inverts this -- missing blocks are reported
        as zero and good-data runs are counted. Any other value ('both')
        reports run sizes for both states.

    dtype : `str` or `type`
        Data type of output, should be acceptable to pandas
        :meth:`astype <pandas:pandas.DataFrame.astype>`.

    Returns
    -------
    result : same shape as `ts`
        Run lengths (in samples) per time point, cast to `dtype`.
    """

    def column_gap_count(ser):
        miss = ser.isna()
        # Label consecutive runs: the label increments each time the
        # missing/not-missing state changes.
        g = miss.ne(miss.shift()).cumsum()
        # Size (number of samples) of each run, then broadcast the size
        # back to every member of the run via the group labels.
        run_size = ser.index.to_series().groupby(g).count()
        out = g.map(run_size)
        if state == "gap":
            out.loc[~miss] = 0
        elif state == "good":
            out.loc[miss] = 0
        # otherwise ('both'): leave run sizes for both states
        return out

    if hasattr(ts, "columns"):
        return ts.apply(
            column_gap_count, axis=0, result_type="broadcast"
        ).astype(dtype)
    else:
        return column_gap_count(ts).astype(dtype)
def gap_size(ts):
    """
    Identifies gaps (runs of missing data) and quantifies the length of the
    gap in *minutes*. Each time point in a gap receives the length of the
    run it belongs to, measured from the time the data first started being
    missing to when the data first starts being not missing; non-missing
    data get zero.

    Note: a gap that runs to the very end of the series is closed with the
    final index value, so its reported duration is one interval short (and
    zero for a single trailing NaN) -- see the last row of the example.

    Parameters
    ----------
    ts : :class:`DataFrame <pandas:pandas.DataFrame>`
        Time series to analyze (must have columns and a datetime index).

    Returns
    -------
    result : :class:`DataFrame <pandas:pandas.DataFrame>`
        A new regular time series with the same freq as the argument
        holding the size of the gap in minutes.

    Examples
    --------
    >>> ndx = pd.date_range(pd.Timestamp(2017,1,1,12),freq='15min',periods=10)
    >>> vals0 = np.arange(0.,10.,dtype='d')
    >>> vals1 = np.arange(0.,10.,dtype='d')
    >>> vals2 = np.arange(0.,10.,dtype='d')
    >>> vals0[0:3] = np.nan
    >>> vals0[7:-1] = np.nan
    >>> vals1[2:4] = np.nan
    >>> vals1[6] = np.nan
    >>> vals1[9] = np.nan
    >>> df = pd.DataFrame({'vals0':vals0,'vals1':vals1,'vals2':vals2},index = ndx)
    >>> out = gap_size(df)
    >>> print(out)
                         vals0  vals1  vals2
    2017-01-01 12:00:00   45.0    0.0    0.0
    2017-01-01 12:15:00   45.0    0.0    0.0
    2017-01-01 12:30:00   45.0   30.0    0.0
    2017-01-01 12:45:00    0.0   30.0    0.0
    2017-01-01 13:00:00    0.0    0.0    0.0
    2017-01-01 13:15:00    0.0    0.0    0.0
    2017-01-01 13:30:00    0.0   15.0    0.0
    2017-01-01 13:45:00   30.0    0.0    0.0
    2017-01-01 14:00:00   30.0    0.0    0.0
    2017-01-01 14:15:00    0.0    0.0    0.0
    """
    ts_out = ts * 0.0
    stamps = ts.index.to_series()
    for c in ts.columns:
        miss = ts[c].isna()
        # Label consecutive runs: the label increments each time the
        # missing/not-missing state changes.
        g = miss.ne(miss.shift()).cumsum()
        # Beginning (min time) of each run.
        start = stamps.groupby(g).min()
        # Beginning of the following run; the final run has no successor
        # and is closed with the last index value (see docstring note).
        end = start.shift(-1).fillna(ts.index[-1])
        # Duration of each run, converted to whole minutes.
        minutes = end.sub(start).dt.total_seconds().div(60).astype(int)
        # Broadcast each run's duration back to every member of the run,
        # then zero out the non-missing points.
        ts_out[c] = g.map(minutes)
        ts_out.loc[~miss, c] = 0.0
    return ts_out
def gap_distance(ts, disttype="count", to="good"):
    """
    For each element of ts, compute the distance to the nearest good
    (non-missing) or missing data point.

    Parameters
    ----------
    ts : :class:`DataFrame <pandas:pandas.DataFrame>` or :class:`Series <pandas:pandas.Series>`
        Time series to analyze.

    disttype : `str` one of 'count'|'freq'
        If disttype = "count" the distance is a number of values. If
        disttype = "freq" it is scaled by ts.index.freq (so if freq ==
        "15min" it is in minutes).

    to : `str` one of 'good'|'bad'
        If to = "good" this is the distance to the nearest good data
        (which is 0 for good data). If to = "bad", this is the distance
        to the nearest nan (which is 0 for nan).

    Returns
    -------
    result : :class:`DataFrame <pandas:pandas.DataFrame>`
        A new regular time series with the same index as the argument
        holding the distance to good/bad data. A Series argument is
        returned as a one-column DataFrame.
    """
    # Work on a DataFrame copy so a Series input is handled uniformly.
    ts_out = ts.to_frame() if isinstance(ts, pd.Series) else ts.copy()
    for col in ts_out.columns:
        miss = ts_out[col].isna().to_numpy()
        if to == "good":
            targets = np.flatnonzero(~miss)  # distances are measured to good data
            sources = np.flatnonzero(miss)
            ts_out.loc[~miss, col] = 0
        elif to == "bad":
            targets = np.flatnonzero(miss)  # distances are measured to nans
            sources = np.flatnonzero(~miss)
            ts_out.loc[miss, col] = 0
        else:
            raise ValueError("invalid input to, must be good or bad")
        # If either state is absent there is nothing to measure against,
        # and the column is left as-is for the missing distances.
        if sources.size and targets.size:
            for i in sources:
                ts_out.loc[ts_out.index[i], col] = np.abs(i - targets).min()
    if disttype == "count":
        return ts_out
    elif disttype == "freq":
        # NOTE(review): this multiplies integer counts by the index freq
        # offset object; confirm downstream callers expect offset values.
        return ts_out * ts.index.freq
    else:
        raise ValueError("invalid input disttype, must be count or freq")
import pandas as pd import numpy as np
def describe_series_gaps(s: pd.Series, name: str, context: int = 2):
    """
    Print gaps in a single Series s, showing `context` non-null points
    before and after each gap, with an ellipsis marker in between.
    """
    missing = s.isna().to_numpy()
    labels = s.index.to_numpy()
    if not missing.any():
        print(f"{name}: no missing values\n")
        return

    # Edge detection on the padded mask: +1 marks a gap opening, -1 marks
    # the position just past a gap closing.
    edges = np.diff(missing.astype(int), prepend=0, append=0)
    gap_starts = np.where(edges == 1)[0]
    gap_ends = np.where(edges == -1)[0] - 1
    n = len(missing)

    for gap_no, (lo, hi) in enumerate(zip(gap_starts, gap_ends), start=1):
        print(f"\n{name} — gap #{gap_no}:")
        print(f"  from {labels[lo]} to {labels[hi]} ({hi - lo + 1} samples missing)")

        # Up to `context` non-null points immediately before the gap,
        # printed in chronological order.
        before = [j for j in range(lo - 1, -1, -1) if not missing[j]][:context]
        for j in reversed(before):
            print(f"   → {labels[j]} : {s.iloc[j]}")

        print("   ... [ missing block ] ...")

        # Up to `context` non-null points immediately after the gap.
        after = [j for j in range(hi + 1, n) if not missing[j]][:context]
        for j in after:
            print(f"   ← {labels[j]} : {s.iloc[j]}")
        print()
def describe_null(dset, name, context=2):
    """
    Report gaps in `dset`: per column when it is a DataFrame, or directly
    when it is a Series, delegating to describe_series_gaps either way.
    """
    if not isinstance(dset, pd.DataFrame):
        describe_series_gaps(dset, name, context=context)
        return
    for col in dset.columns:
        describe_series_gaps(dset[col], f"{name}.{col}", context=context)
def example_gap():
    """Demonstration: build a small 15-minute series with gaps, then print
    the gap_count and gap_distance results for it."""
    import numpy as np

    ndx = pd.date_range(pd.Timestamp(2017, 1, 1, 12), freq="15min", periods=10)
    base = np.arange(0.0, 10.0, dtype="d")
    vals0 = base.copy()
    vals1 = base.copy()
    vals2 = base.copy()
    # Punch some gaps into the first two columns; the third stays complete.
    vals0[0:3] = np.nan
    vals0[7:-1] = np.nan
    vals1[2:4] = np.nan
    vals1[6] = np.nan
    vals1[9] = np.nan
    df = pd.DataFrame({"vals0": vals0, "vals1": vals1, "vals2": vals2}, index=ndx)
    print(df)
    print(gap_count(df))
    print("**")
    print(gap_distance(df))
# Allow running this module directly as a demonstration script.
if __name__ == "__main__":
    example_gap()