Anyspell

Utilities for hot, wet, dry, cold or persistent spells, as well as predictions

`ExtremeExperiment`

Bases: object

Obsolete, I think. Use the module-level functions instead; they are up to date.

Parameters:

Name	Type	Description	Default
`object`	`_type_`	description	required

Source code in jetutils/anyspell.py

class ExtremeExperiment(object):
    """
    Obsolete, I think. Use the module-level functions instead; they are up to date.

    Ties a ``DataHandler`` to a quantile threshold, a spatial mask, a season
    and a distance metric, and caches every derived product (linkage matrix,
    cluster map, targets, spells, predictions) below ``data_handler.path``.

    Parameters
    ----------
    data_handler : DataHandler
        Provides ``da``, ``path``, ``sample_dims``, ``get_metadata()`` and
        ``get_feature_dims()``.
    q : float, optional
        Quantile used to define exceedances, by default 0.95.
    mask : xr.DataArray | "land" | None, optional
        ``"land"`` loads the mask returned by ``get_land_mask()``; any other
        value is stored as is. By default ``"land"``.
    season : str | list | None, optional
        Season to restrict to. ``None`` falls back to the ``"season"`` entry
        of the data handler's metadata. By default ``"JJA"``.
    metric : str, optional
        Forwarded to ``spatial_pairwise_jaccard``, by default ``"jaccard"``.
    """

    def __init__(
        self,
        data_handler: DataHandler,
        q: float = 0.95,
        mask: xr.DataArray | Literal["land"] | None = "land",
        season: str | list | None = "JJA",
        metric: str = "jaccard",
    ) -> None:
        self.data_handler = data_handler
        self.da = self.data_handler.da
        self.path = self.data_handler.path
        self.q = q
        self.mask_name = mask_name(mask)
        # Only compare with the literal when ``mask`` is a string: the truth
        # value of a multi-element DataArray is ambiguous and raises.
        if isinstance(mask, str) and mask == "land":
            self.mask = get_land_mask()
        else:
            self.mask = mask
        if season is None:
            self.season = self.data_handler.get_metadata()["season"]
        else:
            self.season = season
        self.metric = metric
        # NOTE(review): the suffix is built from the raw ``season`` argument,
        # so ``season=None`` yields "None" in file names even though
        # ``self.season`` was resolved from the metadata. Kept as is so that
        # existing cache files are still found.
        self.path_suffix = f"{q}_{season}_{metric}_{self.mask_name}mask"
        self.region = self.data_handler.get_metadata()["region"]
        self.pred_path = self.path.joinpath("predictions")
        self.pred_path.mkdir(parents=True, exist_ok=True)

    def load_da(self, **kwargs):
        """Replace ``self.da`` by ``compute(self.da, **kwargs)``."""
        self.da = compute(self.da, **kwargs)

    def compute_linkage_quantile(
        self,
    ) -> np.ndarray:
        """
        Return the Ward linkage of the pairwise spatial distances.

        The result is cached in ``Z_{path_suffix}.npy`` under ``self.path``;
        when that file exists it is loaded and nothing is recomputed.

        Returns
        -------
        np.ndarray
            Linkage matrix as returned by ``linkage(..., method="ward")``.
        """
        Z_path = self.path.joinpath(f"Z_{self.path_suffix}.npy")
        if Z_path.is_file():
            return np.load(Z_path)
        condition_function = partial(quantile_exceedence, q=self.q, dim="time")
        self.load_da()
        distances = spatial_pairwise_jaccard(
            self.da,
            condition_function,
            self.mask,
            season=self.season,
            metric=self.metric,
        )
        Z = linkage(squareform(distances), method="ward")
        np.save(Z_path, Z)
        return Z

    def spatial_clusters_as_da(
        self,
        n_clu: int,
    ) -> xr.DataArray:
        """
        Cut the linkage tree into ``n_clu`` clusters and map them onto the grid.

        The result is cached in ``clusters_{path_suffix}_{n_clu}.nc`` under
        ``self.path``.

        Parameters
        ----------
        n_clu : int
            Number of clusters passed to ``cut_tree``.

        Returns
        -------
        xr.DataArray
            Cluster labels on the unstacked grid. When a mask is set, the
            array is initialised with NaN and labels are written where the
            mask is True.
        """
        feature_dims = self.data_handler.get_feature_dims()
        clusters_da_file = self.path.joinpath(
            f"clusters_{self.path_suffix}_{n_clu}.nc"
        )
        if clusters_da_file.is_file():
            return open_dataarray(clusters_da_file)

        Z = self.compute_linkage_quantile()
        clusters = cut_tree(Z, n_clusters=n_clu)[:, 0]
        lon, lat = feature_dims["lon"], feature_dims["lat"]
        stack_dims = {"lat_lon": ("lat", "lon")}
        if self.mask is not None:
            mask = self.mask.sel(lon=lon, lat=lat)
            mask_flat = mask.stack(stack_dims)
            # Start from an all-NaN float array so that points outside the
            # mask stay NaN.
            clusters_da = mask_flat.copy(data=np.full(mask_flat.shape, np.nan))
            clusters_da[mask_flat] = clusters
        else:
            clusters_da = self.da.copy(data=np.zeros(self.da.shape))
            clusters_da = clusters_da.stack(stack_dims)
            clusters_da[:] = clusters
        clusters_da = clusters_da.unstack()
        to_netcdf(clusters_da, clusters_da_file)
        return clusters_da

    def create_targets(
        self,
        n_clu: int,
        q: float | None = None,
        simple: bool = False,
        return_folder: bool = False,
        **kwargs,
    ):
        """
        Build (or load) regional targets and the spells detected in them.

        The output folder is chosen by ``find_spot`` from the metadata
        ``dict(n_clu=n_clu, q=q, simple=simple, **kwargs)``. If both
        ``targets.parquet`` and ``spells.parquet`` already exist there, they
        are read back instead of being recomputed.

        Parameters
        ----------
        n_clu : int
            Number of spatial clusters used to regionalize ``self.da``.
        q : float | None, optional
            Quantile above which a value counts towards a spell. ``None``
            uses ``self.q``. By default None.
        simple : bool, optional
            Only recorded in the metadata here, by default False.
        return_folder : bool, optional
            If True, return the output folder instead of the data, by
            default False.
        **kwargs
            Only recorded in the metadata here.

        Returns
        -------
        pathlib.Path | tuple
            The output folder if ``return_folder`` is True, otherwise the
            pair ``(targets, spells)``.
        """
        if q is None:
            q = self.q
        metadata = dict(
            n_clu=n_clu,
            q=q,
            simple=simple,
            **kwargs,
        )
        sample_dims = list(self.data_handler.sample_dims)
        sample_dims_no_time = [dim for dim in sample_dims if dim != "time"]
        thispath = find_spot(self.pred_path, metadata)
        ofiles = [
            thispath.joinpath(ofile) for ofile in ("targets.parquet", "spells.parquet")
        ]
        if all(ofile.is_file() for ofile in ofiles):
            if return_folder:
                return thispath
            return tuple(pl.read_parquet(ofile) for ofile in ofiles)
        clusters = self.spatial_clusters_as_da(n_clu)
        targets = regionalize(self.da, clusters, sample_dims)

        targets = extract_season_from_df(targets, self.season)
        expr = pl.col(self.da.name)
        expr = expr > expr.quantile(q)
        spells = get_spells(targets, expr, group_by=[*sample_dims_no_time, "region"])
        # Left join so every target row is kept; rows outside any spell get 0
        # through ``fill_null``.
        targets = targets.join(
            spells[[*sample_dims, "region", "len"]],
            on=[*sample_dims, "region"],
            how="left",
        ).fill_null(0)
        targets = targets.rename({self.da.name: "value"})
        to_ret = targets, spells
        for to_save, ofile in zip(to_ret, ofiles):
            to_save.write_parquet(ofile)
        if return_folder:
            return thispath
        return to_ret

    def mask_timeseries(
        self,
        timeseries: xr.DataArray | xr.Dataset | pl.DataFrame,
        n_clu: int,
        i_clu: int | Sequence[int] | Literal["all"] = "all",
        q: float | None = None,
        simple: bool = False,
        **kwargs,
    ):
        """
        Mask ``timeseries`` with the spells produced by ``create_targets``.

        Parameters
        ----------
        timeseries : xr.DataArray | xr.Dataset | pl.DataFrame
            Data passed to ``mask_from_spells_pl`` together with the spells.
        n_clu : int
            Forwarded to ``create_targets``.
        i_clu : int | Sequence[int] | "all", optional
            Accepted for backward compatibility but not used:
            ``create_targets`` has no such parameter. By default ``"all"``.
        q : float | None, optional
            Forwarded to ``create_targets``, by default None.
        simple : bool, optional
            Forwarded to ``create_targets``, by default False.
        **kwargs
            Forwarded to ``create_targets``.

        Returns
        -------
        Whatever ``mask_from_spells_pl(spells, timeseries)`` returns.
        """
        # Pass by keyword: positionally, ``i_clu`` used to land in the ``q``
        # slot of ``create_targets`` and shift every following argument.
        _, spells = self.create_targets(n_clu, q=q, simple=simple, **kwargs)
        return mask_from_spells_pl(spells, timeseries)

    def full_prediction(
        self,
        predictors: xr.DataArray,
        create_target_kwargs: Mapping,
        type_: Literal["rf", "lr"] = "rf",
        do_base_pred: bool = True,
        n_folds: int = 1,
        prediction_kwargs: Mapping | None = None,
    ):
        """
        Create targets, optionally a time-regression baseline, then predict.

        Parameters
        ----------
        predictors : xr.DataArray
            Must expose a ``predictor`` coordinate, and optionally ``lag``.
        create_target_kwargs : Mapping
            Keyword arguments for ``create_targets``. Must not contain
            ``return_folder``, which is set here.
        type_ : "rf" | "lr", optional
            Forwarded to ``predict_all``, by default ``"rf"``.
        do_base_pred : bool, optional
            If True, load or compute ``base_pred.nc`` with
            ``regress_against_time``, by default True.
        n_folds : int, optional
            Forwarded to ``predict_all``, by default 1.
        prediction_kwargs : Mapping | None, optional
            Extra keyword arguments for ``predict_all``, by default None.

        Returns
        -------
        Whatever ``predict_all`` returns.
        """
        targets_folder = self.create_targets(**create_target_kwargs, return_folder=True)
        # NOTE(review): ``create_targets`` in this class writes
        # "targets.parquet" and "spells.parquet", not "length_targets.nc".
        # This file must come from elsewhere - TODO confirm.
        targets = open_dataarray(targets_folder.joinpath("length_targets.nc")) > 0
        if do_base_pred:
            path_to_base_pred = targets_folder.joinpath("base_pred.nc")
            if path_to_base_pred.is_file():
                base_pred = open_dataarray(path_to_base_pred)
            else:
                base_pred = regress_against_time(targets)
                to_netcdf(base_pred, path_to_base_pred)
        else:
            base_pred = None
            path_to_base_pred = None
        predictor_names = predictors.predictor.values
        # Look the name up among the coordinates: ``in`` on a DataArray tests
        # membership in its data values, not in its coordinate names.
        if "lag" in predictors.coords:
            lags = predictors.lag.values.tolist()
        else:
            lags = [0]
        # The metadata records ``prediction_kwargs`` as passed (possibly
        # None), before it is defaulted to an empty dict below.
        metadata = {
            "predictors": predictor_names.tolist(),
            "type": type_,
            "lags": lags,
            "base_pred": path_to_base_pred,
            "n_folds": n_folds,
            "prediction_kwargs": prediction_kwargs,
        }
        if prediction_kwargs is None:
            prediction_kwargs = {}
        path = targets_folder.joinpath("one_prediction")
        path.mkdir(mode=0o777, parents=True, exist_ok=True)
        path = find_spot(path, metadata)
        return predict_all(
            predictors,
            targets,
            base_pred,
            type_,
            True,
            n_folds=n_folds,
            save_path=path,
            **prediction_kwargs,
        )

`brier_score(y_true, y_proba=None, *, sample_weight=None, pos_label=None)`

Compute the Brier score.

The higher the Brier score, the better: this function returns one minus the Brier score loss. The Brier score loss measures the mean squared difference between the predicted probability and the actual outcome. The loss always takes on a value between zero and one, since this is the largest possible difference between a predicted probability (which must be between zero and one) and the actual outcome (which can take on values of only 0 and 1), so the score returned here also lies between zero and one. The loss can be decomposed as the sum of refinement loss and calibration loss.

The Brier score is appropriate for binary and categorical outcomes that can be structured as true or false, but is inappropriate for ordinal variables which can take on three or more values (this is because the Brier score assumes that all possible outcomes are equivalently "distant" from one another). Which label is considered to be the positive label is controlled via the parameter pos_label, which defaults to the greater label unless y_true is all 0 or all -1, in which case pos_label defaults to 1.

Parameters:

Name	Type	Description	Default
`y_true`	`array-like of shape (n_samples,)`	True targets.	required
`y_proba`	`array-like of shape (n_samples,)`	Probabilities of the positive class.	`None`
`sample_weight`	`array-like of shape (n_samples,)`	Sample weights.	`None`
`pos_label`	`(int, float, bool or str)`	Label of the positive class. `pos_label` will be inferred in the following manner: if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1; else if `y_true` contains string, an error will be raised and `pos_label` should be explicitly specified; otherwise, `pos_label` defaults to the greater label, i.e. `np.unique(y_true)[-1]`.	`None`

Returns:

Name	Type	Description
`score`	`float`	Brier score loss.
	`something else : float`	another thing.

References

[1] Wikipedia entry for the Brier score: <https://en.wikipedia.org/wiki/Brier_score>.

Examples:

>>> import numpy as np
>>> from sklearn.metrics import brier_score_loss
>>> y_true = np.array([0, 1, 1, 0])
>>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"])
>>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])
>>> brier_score_loss(y_true, y_prob)
np.float64(0.037...)
>>> brier_score_loss(y_true, 1-y_prob, pos_label=0)
np.float64(0.037...)
>>> brier_score_loss(y_true_categorical, y_prob, pos_label="ham")
np.float64(0.037...)
>>> brier_score_loss(y_true, np.array(y_prob) > 0.5)
np.float64(0.0)

Source code in jetutils/anyspell.py

def brier_score(y_true, y_proba=None, *, sample_weight=None, pos_label=None):
    """Compute a "higher is better" Brier score.

    This is ``1 - brier_score_loss(...)``: the Brier score loss is the mean
    squared difference between the predicted probability and the actual
    outcome, so a perfect forecast has a loss of 0 and a score of 1 here.
    For probabilities between zero and one the loss lies between zero and
    one, and so does the returned score.

    All arguments are forwarded unchanged to
    ``sklearn.metrics.brier_score_loss``.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True targets.

    y_proba : array-like of shape (n_samples,)
        Probabilities of the positive class.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    pos_label : int, float, bool or str, default=None
        Label of the positive class. See ``brier_score_loss`` for how it is
        inferred when left to None.

    Returns
    -------
    score : float
        One minus the Brier score loss.

    References
    ----------
    .. [1] `Wikipedia entry for the Brier score
            <https://en.wikipedia.org/wiki/Brier_score>`_.

    Examples
    --------
    >>> import numpy as np
    >>> from jetutils.anyspell import brier_score
    >>> y_true = np.array([0, 1, 1, 0])
    >>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"])
    >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])
    >>> round(float(brier_score(y_true, y_prob)), 4)
    0.9625
    >>> round(float(brier_score(y_true, 1-y_prob, pos_label=0)), 4)
    0.9625
    >>> round(float(brier_score(y_true_categorical, y_prob, pos_label="ham")), 4)
    0.9625
    >>> round(float(brier_score(y_true, np.array(y_prob) > 0.5)), 4)
    1.0
    """
    loss = brier_score_loss(
        y_true,
        y_proba,
        sample_weight=sample_weight,
        pos_label=pos_label,
    )
    # Flip the loss so that larger values mean better forecasts.
    return 1 - loss

`mask_from_spells_pl(spells, to_mask, force_pl=False, time_before=datetime.timedelta(0), time_after=datetime.timedelta(0))`

Huh, I think this is broken.

Parameters:

Name	Type	Description	Default
`spells`	`DataFrame`	description	required
`to_mask`	`DataArray \| Dataset \| DataFrame`	description	required
`force_pl`	`bool`	description, by default False	`False`
`time_before`	`timedelta`	description, by default datetime.timedelta(0)	`timedelta(0)`
`time_after`	`timedelta`	description, by default datetime.timedelta(0)	`timedelta(0)`

Returns:

Type	Description
`_type_`	description

Source code in jetutils/anyspell.py

def mask_from_spells_pl(
    spells: pl.DataFrame,
    to_mask: xr.DataArray | xr.Dataset | pl.DataFrame,
    force_pl: bool = False,
    time_before: datetime.timedelta = datetime.timedelta(0),
    time_after: datetime.timedelta = datetime.timedelta(0),
):
    """
    Huh, I think this is broken.

    Join ``to_mask`` onto the rows of ``spells`` so that only samples that
    belong to a spell are kept, optionally reshaping the result to xarray
    with ``spell`` and ``relative_index`` as leading index columns.

    Parameters
    ----------
    spells : pl.DataFrame
        Spell table. It is first passed through ``extend_spells``; after that
        it is expected to contain ``time`` and ``relative_time`` (both are
        cast below), and ``spell``, ``relative_index`` and ``len`` when the
        xarray output path is taken.
    to_mask : xr.DataArray | xr.Dataset | pl.DataFrame
        Data to restrict to the spells. Must expose the index columns found
        in ``spells`` among ``("member", "time")``.
    force_pl : bool, optional
        Return the joined polars DataFrame even if ``to_mask`` was an xarray
        object, by default False
    time_before : datetime.timedelta, optional
        Forwarded to ``extend_spells``, by default datetime.timedelta(0)
    time_after : datetime.timedelta, optional
        Forwarded to ``extend_spells``, by default datetime.timedelta(0)

    Returns
    -------
    pl.DataFrame or xarray object
        The joined polars DataFrame when ``to_mask`` was a DataFrame, when
        no extra xarray index column was found, when the join is empty, or
        when ``force_pl`` is True. Otherwise the output of
        ``polars_to_xarray``, reduced to its single data variable if it has
        exactly one.
    """
    # Presumably widens each spell by the requested margins - behaviour of
    # ``extend_spells`` is not visible here.
    spells = extend_spells(spells, time_before=time_before, time_after=time_after)
    # Presumably the subset of ("member", "time") present in ``spells`` -
    # TODO confirm against ``get_index_columns``.
    index_columns = get_index_columns(spells, ("member", "time"))
    unique_index_spells = spells.select(index_columns).unique(index_columns)
    # Same index columns, but read from ``to_mask``.
    unique_times_to_mask = [
        pl.Series(index_column, to_mask[index_column].to_numpy())
        for index_column in index_columns
    ]
    unique_times_to_mask = pl.DataFrame(unique_times_to_mask).unique(index_columns)
    # NOTE(review): ``np.intersect1d`` flattens its inputs, so if both
    # "member" and "time" are index columns their values end up mixed in one
    # 1-D array that is then used as a ``time`` selector - confirm this is
    # intended.
    unique_times = np.intersect1d(unique_index_spells, unique_times_to_mask)
    if isinstance(to_mask, xr.DataArray | xr.Dataset):
        # Keep only the common times before loading and converting to polars.
        to_mask = compute(to_mask.sel(time=unique_times), progress=True)
        to_mask = xarray_to_polars(to_mask)
        # Extra columns that become dimensions again when converting back.
        index_columns_xarray = get_index_columns(
            to_mask, ["lat", "lon", "jet", "jet ID", "cluster"]
        )
    else:
        to_mask = to_mask.cast({"time": pl.Datetime("ns")})
        index_columns_xarray = None
    # Align dtypes of the join keys on both sides (this repeats the cast
    # already done in the polars branch above).
    to_mask = to_mask.cast({"time": pl.Datetime("ns")})
    spells = spells.cast(
        {"time": pl.Datetime("ns"), "relative_time": pl.Duration("ns")}
    )
    index_columns = get_index_columns(to_mask, ["member", "time"])
    # Also join on "region" when both tables carry it.
    if "region" in spells.columns and "region" in to_mask.columns:
        index_columns.append("region")
    masked = spells.join(to_mask, on=index_columns)
    # A "len" column present on both sides comes out of the join with the
    # "_right" suffix; only the one from ``spells`` is kept.
    if "len_right" in masked:
        masked.drop_in_place("len_right")
    # ``not index_columns_xarray`` covers both None (polars input) and an
    # empty list (no extra index column found).
    if not index_columns_xarray or (masked.shape[0] == 0) or force_pl:
        return masked
    index_to_mask = ["spell", "relative_index"]
    masked = polars_to_xarray(masked, [*index_to_mask, *index_columns_xarray])
    index_to_mask = ["spell", "relative_index"]
    # Axis position of "relative_index" and position of its first 0 entry
    # (``argmax`` returns 0 if there is none).
    i0 = np.argmax(np.asarray(list(masked.dims)) == "relative_index")
    j0 = np.argmax(masked["relative_index"].values == 0)
    ndim = masked["time"].ndim
    # Full slice on the first two axes, element 0 on every other axis.
    # Assumes the first two axes are "spell" and "relative_index" - TODO
    # confirm the dimension order produced by ``polars_to_xarray``.
    indexer = [0 if i >= len(index_to_mask) else slice(None) for i in range(ndim)]
    masked["time"] = masked["time"][*indexer]
    masked["relative_time"] = masked["relative_time"][*indexer]
    # From here on also pick position ``j0`` along the "relative_index"
    # axis, so "len" (and "value") are read where relative_index == 0.
    indexer[i0] = j0
    masked["len"] = masked["len"][*indexer]
    coords = ["time", "relative_time", "len"]
    if "value" in spells.columns:
        masked["value"] = masked["value"][*indexer]
        coords.append("value")
    # Collapse "relative_time" over spells, ignoring missing entries.
    masked["relative_time"] = masked["relative_time"].max(dim="spell", skipna=True)
    masked = masked.set_coords(coords)
    data_vars = list(masked.data_vars)
    # With a single remaining data variable, return it directly rather than
    # the enclosing container.
    if len(data_vars) == 1:
        masked = masked[data_vars[0]]
    return masked