Source code for energy_analysis_toolbox.timeseries.resample.index_transformation

"""Transforms indices of a time series to a new index according to a given function."""

from contextlib import suppress
from typing import TypeVar

import numpy as np
import pandas as pd
from pytz import BaseTzInfo
from scipy.stats import gaussian_kde, mode

from energy_analysis_toolbox.errors import EATUndefinedTimestepError
from energy_analysis_toolbox.timeseries.extract_features.basics import (
    index_to_timesteps,
    timestep_durations,
)

T = TypeVar("T", pd.Series, pd.DataFrame)



[docs]
def tz_convert_or_localize(
    timeseries: pd.Series | pd.DataFrame,
    tz: str | BaseTzInfo | None,
) -> pd.Series | pd.DataFrame:
    """Assign the requested timezone to the index of a timeseries.

    Parameters
    ----------
    timeseries : pd.Series or pd.DataFrame
        Timeseries to convert.
    tz : str or pytz.timezone or None
        Timezone to assign to the index of the timeseries.

    Returns
    -------
    pd.Series
        Timeseries with the requested timezone assigned to its index.

    .. note::

        This function is just syntactic sugar to avoid dealing with the `TypeError`
        when applying `tz_convert` to a time-naive timeseries.


    .. important::

        When localizing a time-naive timeseries, the `ambiguous` and `nonexistent`
        arguments are set to `True` and `'NaT'` respectively. This means that
        ambiguous times are localized to the beginning of the DST period
        and non-existent times are converted to `'NaT'`.

    """
    try:
        return timeseries.tz_convert(tz)
    except TypeError:
        return timeseries.tz_localize(tz, ambiguous=True, nonexistent="NaT")




[docs]
def index_to_freq(
    index: pd.DatetimeIndex,
    freq: str | pd.Timedelta | None,
    origin: str | pd.Timestamp | None = None,
    last_step_duration: float | None = None,
) -> pd.DatetimeIndex:
    """Return the expected index from resampling a time series to a given frequency.

    Parameters
    ----------
    index : pd.DatetimeIndex
        the index of the data to resample
    freq : str, pd.Timedelta
        the freq to which the series is resampled. Must be a valid
        pandas frequency.
    origin : {None, 'floor', 'ceil', pd.Timestamp}
        What origin should be used for the target resampling range. The following
        values are possible :

        - |None| : the default. Use the first index as the data a starting point.
        - ``'floor'`` : use the first index of the data, floored to the passed
          ``freq`` resolution.
        - ``'ceil'`` : use the first index of the data, ceiled to the passed
          ``freq`` resolution.
        - a ``pd.Timestamp`` : use the passed timestamp as starting point. The
          code tries to localize the value to the timezone of the first index in
          the data. Accordingly :

          * if the passed value is time-naive, it is localized to the timezone
            of the data;
          * if the data is time-naive, the timezone of the passed value is ignored
            and it is processed as if it were time-naive.

    last_step_duration : float, optional
        the duration of the last step of the resampling in (s).
        If |None|, the duration of the former-last time-step is used.
        Used to deduce the end of the resampling range.

    Returns
    -------
    pd.DatetimeIndex
        The resulting index of the resampling. Empty if the passed index is empty.

    """
    if index.empty:
        return pd.DatetimeIndex([], name=index.name, tz=index.tz, freq=freq)
    if origin is None:
        start = index[0]
    elif origin == "floor":
        start = index[0].floor(freq)
    elif origin == "ceil":
        start = index[0].ceil(freq)
    else:
        start = pd.Timestamp(origin)
        try:
            start = start.tz_localize(index.tz)
        except TypeError:
            try:
                start = start.tz_convert(index.tz)
            except TypeError:
                warn = (
                    "The passed origin could not be localized or converted to the "
                    "timezone of the original index. It is processed as if it were "
                    "time-naive."
                )
                raise Warning(warn) from None
    if last_step_duration is None:
        try:
            last_step_duration = (index[-1] - index[-2]).seconds
        except IndexError:
            err = (
                "The last step duration could not be determined from the index."
                " Please provide it explicitly."
            )
            raise EATUndefinedTimestepError(err) from None
    actual_end = index[-1] + pd.Timedelta(seconds=last_step_duration)
    return pd.date_range(
        start=start,
        end=actual_end,
        freq=freq,
        inclusive="left",
        name=index.name,
    )




[docs]
def estimate_timestep(
    data: pd.Series | pd.DataFrame | pd.DatetimeIndex,
    method: str = "median",
) -> float:
    """Return an estimation of the sampling period of a time series.

    .. note::
        Each method has its own advantages and drawbacks. The best method
        depends on the data. For instance:

        - if the data is regularly spaced, the ``mode`` is the best choice.
        - if the data is irregularly spaced, the ``kde`` is the best choice.
        - the ``median`` is not sensitive to outliers, and is a good choice if
            the data is irregularly spaced and has outliers.
        - the ``mean`` is almost never a good choice.

    Parameters
    ----------
    data : pd.Series, pd.DataFrame, pd.DatetimeIndex
        the data to analyse. Must have (or be) a DatetimeIndex.
    method : {'mean', 'median', 'mode', 'kde'}, optional
        the method used to compute the expected timestep. Defaults to 'median'.

    Returns
    -------
    float
        the expected timestep of the data in (s).

    Raises
    ------
    ValueError
        If the method is not one of {'mean', 'median', 'mode', 'kde'}.


    .. seealso::
        - :func:`median_time_step`
        - :func:`mean_time_step`
        - :func:`mode_time_step`
        - :func:`max_kde_time_step`

    """
    if method == "mean":
        return mean_time_step(data)
    if method == "median":
        return median_time_step(data)
    if method == "mode":
        return mode_time_step(data)
    if method == "kde":
        return max_kde_time_step(data)
    err = "method must be one of {'mean', 'median', 'mode', 'kde'}"
    raise ValueError(err)




[docs]
def median_time_step(
    data: pd.Series | pd.DataFrame | pd.DatetimeIndex,
) -> float:
    """Return the median timestep of a time series.

    Parameters
    ----------
    data : pd.Series, pd.DataFrame, pd.DatetimeIndex
        the data to analyse. Must have (or be) a DatetimeIndex.

    Returns
    -------
    float
        the median timestep of the data in (s).


    .. seealso::
        - :func:`estimate_timestep`
        - :func:`mode_time_step`

    """
    data = data_to_datetimeindex(data)
    timesteps = index_to_timesteps(data)
    return np.median(timesteps)




[docs]
def mean_time_step(
    data: pd.Series | pd.DataFrame | pd.DatetimeIndex,
) -> float:
    """Return the mean timestep of a time series.

    Parameters
    ----------
    data : pd.Series, pd.DataFrame, pd.DatetimeIndex
        the data to analyse. Must have (or be) a DatetimeIndex.

    Returns
    -------
    float
        the mean timestep of the data in (s).


    .. seealso::
        - :func:`estimate_timestep`
        - :func:`mode_time_step`

    """
    data = data_to_datetimeindex(data)
    timesteps = index_to_timesteps(data)
    return np.mean(timesteps)




[docs]
def mode_time_step(
    data: pd.Series | pd.DataFrame | pd.DatetimeIndex,
) -> float:
    """Return the mode timestep of a time series.

    .. warning:: The mode is the most frequent value. If there are several
        values with the same frequency, the first one is returned.
        If the values vary slightly around a central value, the mode
        is not representative of the data.

    Parameters
    ----------
    data : pd.Series, pd.DataFrame, pd.DatetimeIndex
        the data to analyse. Must have (or be) a DatetimeIndex.

    Returns
    -------
    float
        the mode timestep of the data in (s).


    .. seealso::
        - :func:`estimate_timestep`
        - :func:`max_kde_time_step`

    """
    data = data_to_datetimeindex(data)
    timesteps = index_to_timesteps(data)
    return mode(timesteps, nan_policy="omit").mode




[docs]
def max_kde_time_step(
    data: pd.Series | pd.DataFrame | pd.DatetimeIndex,
) -> float:
    """Return the maximum probable timestep of a time series.

    .. note:: It differs from the Mode as the distribution is first
       estimated using a KDE. Then, the max of this distribution is
       used.

    .. warning:: The KDE cannot be estimated if the data is regularly
        spaced. In this case, use another method.

    Parameters
    ----------
    data : pd.Series, pd.DataFrame, pd.DatetimeIndex
        the data to analyse. Must have (or be) a DatetimeIndex.

    Returns
    -------
    float
        the mode timestep of the data in (s).


    .. seealso::
        - :func:`estimate_timestep`
        - :func:`median_time_step`
        - :func:`mode_time_step`

    """
    data = data_to_datetimeindex(data)
    timesteps = index_to_timesteps(data)
    kde = gaussian_kde(timesteps)
    no_samples = 50
    samples = np.linspace(min(timesteps), max(timesteps), no_samples)
    probs = kde.evaluate(samples)
    maxima_index = probs.argmax()
    return samples[maxima_index]




[docs]
def data_to_datetimeindex(
    data: pd.Series | pd.DataFrame | pd.DatetimeIndex,
) -> pd.DatetimeIndex:
    """Convert the data to DatetimeIndex.

    Used to allow the use of the same functions for Series, DataFrame and
    DatetimeIndex.

    Parameters
    ----------
    data : pd.Series | pd.DataFrame | pd.DatetimeIndex
        A pandas object

    Returns
    -------
    pd.DatetimeIndex
        Return data if already an index, else the index of the data

    Raises
    ------
    ValueError
        If the data cannot be converted to pandas.DateTimeIndex

    """
    with suppress(AttributeError):
        data = data.index
    if not isinstance(data, pd.DatetimeIndex):
        err = "The data cannot be converted to pandas.DateTimeIndex"
        raise TypeError(err)
    return data




[docs]
def fill_missing_entries(
    data: pd.Series | pd.DataFrame,
    sampling_period: float,
    security_factor: float = 2,
    fill_value: float = pd.NA,
) -> pd.Series | pd.DataFrame:
    """Fill the data with new entries where the interval is too long.

    .. note:: The duration between the last new entry of a hole and the next
        (existing) entry is less or equal than the sampling_period.

    Parameters
    ----------
    data : pd.Series | pd.DataFrame
        The Data to process. Must have a DatetimeIndex
    sampling_period : float,
        The expected sampling period in (s)
    security_factor : float, optional
        The factor used to determine when a timestep is too long compared to
        the ``sampling_period``, which means that ``sampling_period * security_factor``
        is the maximum duration (excluded) between two entries.
        By default 2.
    fill_value : float, optional
        The value of the newly created entries, by default pd.NA

    Returns
    -------
    pd.Series | pd.DataFrame
        A copy of `data` with new created entries, sorted by index.


    .. seealso::

        - :func:`fill_data_holes`
        - :func:`estimate_timestep`

    """
    durations = timestep_durations(data)
    intervals_to_fill = durations[durations >= sampling_period * security_factor]
    if intervals_to_fill.empty:
        return data
    new_indexes = []
    for index, duration in intervals_to_fill.items():
        number_missing_entries = int(duration // sampling_period)
        tmp_indexes = [
            index + k * pd.Timedelta(seconds=sampling_period)
            for k in range(1, number_missing_entries)
        ]
        new_indexes += tmp_indexes
    missing_index = pd.DatetimeIndex(new_indexes)
    return data.reindex(
        data.index.append(missing_index).sort_values(),
        fill_value=fill_value,
    )




[docs]
def fill_data_holes(
    data: T,
    method: str = "mode",
    security_factor: float = 2,
    fill_value: float = pd.NA,
) -> T:
    """Return the data with new entries where the interval is too long.

    .. note:: the new indexes are created using the expected timestep determined
        by ``method``. The duration between the last new entry of a hole and the next
        (existing) entry is less or equal than the expected timestep.

    Parameters
    ----------
    data : pd.Series | pd.DataFrame
        The Data to process. Must have a DatetimeIndex
    method : {'mean', 'median', 'mode', 'kde'}, optional
        The method to estimate the expected Frequency, by default "mode".
        See :func:`estimate_timestep` for more details.
    security_factor : float, optional
        The factor used to determine a timestep is too long compared to
        the expected frequency, by default 2.
    fill_value : float, optional
        The value of the newly created entries, by default pd.NA

    Returns
    -------
    pd.Series | pd.DataFrame
        A copy of `data` with new created entries, sorted.


    .. seealso::
        - :func:`fill_missing_entries`
        - :func:`estimate_timestep`

    """
    sampling_period = estimate_timestep(data, method=method)
    return fill_missing_entries(
        data,
        sampling_period,
        security_factor=security_factor,
        fill_value=fill_value,
    )