Source code for bdschism.hotstart_nudging_data

#!/usr/bin/env python
# -*- coding: utf-8 -*-import pandas as pd

# Use example as command-line interface function:
# python hotstart_nudging_data.py --start_date 2018-02-19  --nudge_len 300 --dest_dir . --repo_dir $repo_path
# or with bdschism in your environment::
# hot_nudge_data --start_date 2018-02-19  --nudge_len 300 --dest_dir . --repo_dir $repo_path

import matplotlib.pyplot as plt
from dms_datastore.read_ts import *
from dms_datastore.dstore_config import *
from dms_datastore.read_multi import *
from vtools.functions.unit_conversions import *
from vtools.data.vtime import days
import glob
import pandas as pd
import os
import logging
import click

logger = logger = logging.getLogger(__name__)

logging.basicConfig(filename="hotstart_nudge_data.log", level=logging.INFO)

stations = [
    "anh",
    "benbr",
    "hsl",
    "bts",
    "snc",
    "ibs",
    "cyg",
    "hun",
    "bdl",
    "fmb",
    "msl",
    "cse",
    "gzl",
    "ryc",
    "hon",
    "c24",
    "pct",
    "flt",
    "mrz",
    "pct",
    "mal",
    "pts",
    "carqb",
    "benbr",
    "co5",
    "ssi",
    "emm",
    "sdi",
    "blp",
    "jer",
    "sjj",
    "dsj",
    "frp",
    "fal",
    "bet",
    "hol2",
    "hll",
    "orq2",
    "frk",
    "holm",
    "bac",
    "mdm",
    "dbi",
    "ori",
    "oh4",
    "mab",
    "pri",
    "ppt",
    "trn",
    "rindg",
    "sjc",
    "rri",
    "sjg",
    "bdt",
    "dvi",
    "sjr",
    "orx",
    "pdup",
    "tpp",
    "uni",
    "sga",
    "gle",
    "old",
    "twa",
    "orm",
    "oad",
    "trp",
    "glc2",
    "wci",
    "vcu",
    "mab",
    "mtb",
    "rri2",
    "mdmzq",
    "sdc",
    "ges",
    "swe",
    "gss",
    "nmr",
    "sus",
    "sss",
    "sut",
    "snod",
    "gln",
    "rye",
    "ryf",
    "rvb",
    "mir",
    "dws",
    "lib",
    "ucs",
    "has",
    "srh",
    "awb",
    "afo",
    "hst",
    "ist",
    "ssw",
    "von",
    "few",
    "fre",
    "wlk",
    "gys",
    "god",
    "sal",
]

# Stations where an "upper" and "lower" sublocation occur and we must distinguish the upper
add_upper = ["anh", "cse", "mrz", "emm", "mal", "pts"]
repo = "//cnrastore-bdo/Modeling_Data/jenkins_repo_staging/continuous/formatted"



[docs]
def hotstart_nudge_data(sdate, ndays, dest, repo_dir):

    t0 = sdate
    nudgelen = pd.Timedelta(days=ndays)

    repo = repo_dir
    ##
    station_df = station_dbase()

    buf = days(5)
    sdata = t0 - buf
    edata = t0 + nudgelen + buf

    station_df = station_df.loc[stations]

    no_such_file = []
    tndx = pd.date_range(t0, t0 + nudgelen, freq="h")
    all_vars = ["temperature", "salinity"]
    used_stations = set()
    nudging_dfs = {}
    accepted_loc = []
    for label_var in all_vars:
        var = {"temperature": "temp", "salinity": "ec"}[
            label_var
        ]  # working variable for data
        print(f"Working on variable: {label_var},{var}")
        logger.info(f"Working on variable: {label_var},{var}")
        vals = []
        accepted = {}

        for ndx, row in station_df.iterrows():
            x = row.x
            y = row.y
            fndx = ndx + "@upper" if ndx in add_upper else ndx

            pat = f"*_{fndx}_*_{var}*_20??.csv"
            pat = os.path.join(repo, pat)
            matches = glob.glob(pat)
            if len(matches) == 0:
                no_such_file.append((ndx, var))
                continue
            try:
                subloc = "upper" if ndx in add_upper else None
                ts = read_ts_repo(
                    ndx, var, subloc=subloc, repo=repo, src_priority="infer"
                )
                ts = ts.loc[sdata:edata]
                ts = ts.interpolate(limit=4)
                if ts.shape[1] > 1:
                    ts = ts.mean(axis=1)
                    ts.name = "value"
                else:
                    ts = ts.squeeze()
                if var == "temp":
                    topquant = ts.quantile(q=0.25)
                    if topquant > 35:
                        print("Transforming F to C based on 25% qyantuke > 35deg")
                        print("Transforming F to C based on 25% qyantuke > 35deg")
                        ts = fahrenheit_to_celsius(ts)
                    if ndx in ["clc"] and (ts < 0.0).all():
                        ts = celsius_to_farenheit(ts)
                elif var == "ec":
                    ts = ec_psu_25c(ts)
                else:
                    raise ValueError(
                        f"Haven't worked out transforms needed except for {var}, only salt/temp"
                    )

                val = ts.at[t0]
                if not np.isnan(val):
                    vals.append((ndx, x, y, val))

                # This is the fraction of missing data
                ts = ts.reindex(tndx)
                gap_frac = ts.isnull().sum() / len(ts)
                print(f"Fraction of mssing data for {ndx} {var} is {gap_frac}")
                if gap_frac < 0.25:
                    print(f"Accepted {ndx} {var}")
                    ts.columns = [ndx]
                    ts = ts.fillna(-9999.0)
                    accepted[ndx] = ts
                    if ndx not in used_stations:
                        accepted_loc.append((ndx, x, y))
                        used_stations.add(ndx)

            except Exception as err:
                print("Exception")
                print(str(err))
                print(ndx, var)
                print(err)
        var_df = pd.DataFrame(data=vals, columns=("station", "x", "y", f"{label_var}"))
        var_df.set_index("station")
        var_df.to_csv(
            os.path.join(dest, f"hotstart_data_{label_var}.csv"),
            sep=",",
            float_format="%.2f",
        )

        # Check if accepted is empty. If so, throw an exception
        if accepted == {}:
            raise ValueError(f"No data found for {label_var} in {repo}")
        else:
            nudging_df = pd.concat(accepted, axis=1)
        nudging_df.index.name = "datetime"
        nudging_dfs[label_var] = nudging_df
        print(nudging_df)
        logger.info(nudging_df)

    obs_xy = pd.DataFrame(data=accepted_loc, columns=["site", "x", "y"])
    print("reindexing and printing")
    logger.info("reindexing and printing")

    for label_var in all_vars:
        nudging_dfs[label_var].to_csv(
            os.path.join(dest, f"nudging_data_{label_var}.csv"),
            sep=",",
            float_format="%.2f",
        )
        # Deprecated
        # nudging_dfs[label_var].reindex(columns=obs_xy.site).to_csv(f"nudging_data_{label_var}_b.csv",sep=",",float_format="%.2f")

    obs_xy = obs_xy.set_index("site", drop=True)
    obs_xy.to_csv(os.path.join(dest, f"obs_xy.csv"), sep=",", float_format="%.2f")

    print("No such file")
    logger.info("No such files")
    for item in no_such_file:
        print(item)
        logger.info(item)



@click.command(
    help="""\n
    Download station data from repo and save in csv format for hotstart nudging.\n
    Usage:\n
    bds hot_nudge_data --start_date 2018-02-19 --nudge_len 300 --dest_dir . --repo_dir $repo_path\n
    """
)
@click.option(
    "--start_date",
    required=True,
    help="starting date of SCHISM model, must be \
                    format like 2018-02-19",
)
@click.option(
    "--nudge_len",
    required=True,
    type=int,
    help="number of days to be downloaded \
                        from starting date",
)
@click.option(
    "--dest_dir",
    default=".",
    help="folder to store downloaded obs nudging data. \
                        Default is current folder",
)
@click.option(
    "--repo_dir",
    default=repo,
    help=f"path to the repo of observed time series. \
                        Default is {repo}",
)
@click.help_option("-h", "--help")
def hotstart_nudge_data_cli(start_date, nudge_len, dest_dir, repo_dir):
    """
    Command-line interface for the hotstart_nudge_data function.
    """
    try:
        sdate = pd.to_datetime(start_date)
    except ValueError:
        raise ValueError("Invalid date format. Use YYYY-MM-DD.")

    hotstart_nudge_data(sdate, nudge_len, dest_dir, repo_dir)


if __name__ == "__main__":
    hotstart_nudge_data_cli()