# Source code for torchtime.impute

"""
=======================
Missing data imputation
=======================

The following functions are provided to impute missing time series data:

* `Replace missing data with a fixed value <#torchtime.impute.replace_missing>`_
* `Forward imputation <#torchtime.impute.forward_impute>`_
"""

import torch
from torch import Tensor


def replace_missing(input: Tensor, fill: Tensor, select: Tensor = None) -> Tensor:
    """**Replace missing data with a fixed value by channel.**

    Imputes missing data by replacing all ``NaNs`` with a fixed value by channel. Fill
    values are specified by the ``fill`` argument. All channels are imputed by default,
    however a subset can be imputed by passing the indices to ``select``.

    Only ``NaN`` entries are replaced; infinite values are left untouched. The input
    tensor is not modified, a new tensor is returned.

    A common choice of ``fill`` is the mean of each channel in the training data. Under
    this approach, no knowledge of the time series at times *t > i* is required when
    imputing values at time *i*. This is essential if you are developing a model that
    will make online predictions.

    Args:
        input: The tensor to impute. The final dimension must hold channel data.
        fill: Fill values for each channel in the same order as the data, i.e. the
            first value is used for the lowest selected channel index, the second for
            the next one, and so on. ``fill`` must be the same length as the number of
            channels to be imputed i.e. the number of channels in the data or the
            length of ``select`` if shorter.
        select: Indices for the channels to be imputed (by default all channels are
            imputed).

    Returns:
        Imputed time series.

    Raises:
        AssertionError: If ``fill`` is not a Tensor, if ``select`` is given but is not
            a Tensor, or if the length of ``fill`` does not match the number of
            channels to impute.
    """
    # AssertionError is raised explicitly (rather than via ``assert``) so that the
    # checks are not stripped under ``python -O`` while callers still see the same
    # exception type and messages.
    if not isinstance(fill, Tensor):
        raise AssertionError("argument 'fill' must be a Tensor")
    n_channels = input.size(-1)
    if select is None:
        if fill.size(0) != n_channels:
            raise AssertionError(
                "Tensor 'fill' must have same number of channels as input ({})".format(
                    n_channels
                )
            )
        select = torch.arange(n_channels)
    elif not isinstance(select, Tensor) or fill.size(0) != len(select):
        raise AssertionError(
            "'select' must be a Tensor the same length as 'fill' ({})".format(
                fill.size(0)
            )
        )
    # Replace missing values by channel
    output = input.clone()
    j = 0  # position in ``fill`` of the next selected channel
    for i in range(n_channels):
        if i in select:
            channel = output[..., i]  # view into ``output``
            value = fill[j].to(dtype=channel.dtype, device=channel.device)
            # Mask on NaN only: ``nan_to_num_`` would also overwrite +/-inf with the
            # largest/smallest finite values of the dtype.
            channel.copy_(torch.where(torch.isnan(channel), value, channel))
            j += 1
    return output


def forward_impute(input: Tensor, fill: Tensor = None, select: Tensor = None) -> Tensor:
    """**Replace missing data with last observation carried forward.**

    Missing data (``NaNs``) are replaced by the previous observation in the channel.

    If the initial value(s) of a channel is ``NaN`` this is replaced with the respective
    value in ``fill`` (only required if an initial value is ``NaN``). All channels are
    imputed by default, however a subset can be imputed by passing the indices to
    ``select``.

    A common choice of ``fill`` is the mean of each channel in the training data. Under
    this approach, no knowledge of the time series at times *t > i* is required when
    imputing values at time *i*. This is essential if you are developing a model that
    will make online predictions.

    .. note::
        Only ``input`` tensors with 3 or fewer dimensions are currently supported. The
        final dimension must hold channel data and the second-to-last dimension the
        time steps. The code itself only enforces the lower bound of two dimensions.

    Args:
        input: The tensor to impute. The final dimension must hold channel data.
        fill: Fill values for each channel in the same order as the data. ``fill`` must
            be the same length as the number of channels to be imputed i.e. the number
            of channels in the data or the length of ``select`` if shorter.
        select: Indices for the channels to be imputed (by default all channels are
            imputed).

    Returns:
        Imputed time series. ``input`` itself is not modified.

    Raises:
        AssertionError: If ``input`` has fewer than two dimensions, or if ``NaNs``
            remain in the selected channels after forward imputation and ``fill`` was
            not provided.
    """
    # AssertionError is raised explicitly (rather than via ``assert``) so that the
    # checks are not stripped under ``python -O`` while callers still see the same
    # exception type and messages.
    if input.dim() < 2:
        raise AssertionError("Tensor 'input' must have at least two dimensions")
    if select is None:
        # Create the default index on the input's device so index_copy accepts it
        select = torch.arange(input.size(-1), device=input.device)
    # Last observation carried forward (all channels)
    x = input.transpose(-2, -1)  # time becomes the final dimension: (..., c, s)
    observed = torch.logical_not(torch.isnan(x))
    # The cummax indices give, for each step, the position of the most recent
    # observed value (or the step itself while nothing has been observed yet).
    last_seen = torch.cummax(observed, -1)[1]
    x_imputed = x.gather(-1, last_seen).transpose(-2, -1)  # back to (..., s, c)
    # Update selected channels with imputed data
    output = input.index_copy(-1, select, x_imputed[..., select])
    # Fill initial NaNs, which have no earlier observation to carry forward
    if torch.isnan(output[..., select]).any():
        if fill is None:
            raise AssertionError("argument 'fill' must be provided")
        output = replace_missing(output, fill, select)
    return output