Source code for remedian.remedian

"""Contains an implementation of Remedian."""

# Author: Stefan Appelhoff <stefan.appelhoff@mailbox.org>
# License: MIT

import numpy as np



[docs]
class Remedian:
    """Remedian object for a robust averaging method for large data sets.

    Implementation of the Remedian algorithm, see [1]_ [2]_ [3]_ for
    references. This algorithm is used to approximate the median of several
    data chunks if these data chunks cannot (or should not) be loaded into
    memory at once. See "Notes" section for further information.

    Parameters
    ----------
    obs_size : ndarray
        The shape of each data chunk (=observation) to be fed into the Remedian
        object.
    n_obs : int
        The number of observations to be stored within each array.
        If `n_obs` >= `t`, Remedian will equal the median. The smaller this
        parameter, the fewer data have to be loaded into memory at once, but
        the less accurate the approximation of the median will be.
    t : int
        The total number of observations from which a median should be
        approximated.

    Attributes
    ----------
    obs_count : int
        Counter of number of observations that have already been given
        to the Remedian object.
    remedian : None | ndarray, shape(obs_size)
        The calculated remedian of the same shape as the input data.
        Will be None until all observations `n_obs` have been fed into
        the object using the add_obs method.

    Notes
    -----
    Given a data chunk of size `obs_size`, and `t` data chunks overall, the
    Remedian class sets up a number `k_arrs` of arrays of length `n_obs`.

    The median of the `t` data chunks of size `obs_size` is then approximated
    as follows: One data chunk after another is fed into the `n_obs` positions
    of the first array. When the first array is full, its median is calculated
    and stored in the first position of the second array. After this, the first
    array is re-used to fill the second position of the second array, etc.
    When the second array is full, the median of its values is stored in the
    first position of the third array, and so on.

    The final "Remedian" is the median of the last array, after all `t` data
    chunks have been fed into the object.

    In other words, given an n-dimensional array, the Remedian class
    approximates the median of this array across the ith dimension and you have
    to break up your n-dimensional array into `t` n-1-dimensional arrays that
    are given to Remedian one after another.

    References
    ----------
    .. [1] P.J. Rousseeuw, G.W. Bassett Jr., "The remedian: A robust averaging
       method for large data sets", Journal of the American Statistical
       Association, vol. 85 (1990), pp. 97-104

    .. [2] M. Chao, G. Lin, "The asymptotic distributions of the remedians",
       Journal of Statistical Planning and Inference, vol. 37 (1993), pp. 1-11

    .. [3] Domenico Cantone, Micha Hofri, "Further analysis of the remedian
       algorithm", Theoretical Computer Science, vol. 495 (2013), pp. 1-16

    """

    def __init__(self, obs_size, n_obs, t):
        """Initialize the Remedian object.

        See class docstring for more thorough information.

        Parameters
        ----------
        obs_size : ndarray
            Size of the observations. Must be (1,) for scalars.
        n_obs : int
            Observations per array.
        t : int
            Number of total observations.

        """
        if n_obs <= 1:
            raise ValueError('`n_obs` of <= 1 does not make sense.')

        self.obs_size = list(obs_size)
        self.n_obs = n_obs
        self.t = t

        # Calculate the number of arrays needed and their sizes
        self.k_arrs = self._calc_k_arrs()
        self.k_arr_sizes = self._calc_k_arr_sizes()

        # Initialize the arrays
        self.arrs = [np.zeros(self.obs_size+[s]) for s in self.k_arr_sizes]

        # counter for observations within each array
        self.obs_idx_counter = [0 for arr in range(self.k_arrs)]

        # Modulos of observations to assign to correct array later
        self.modulos = [self.n_obs**i for i in range(1, 1+self.k_arrs)]

        # Counter of received observations
        self.obs_count = 0

        # Set the median value to None until we have it
        self.remedian = None

    def _calc_k_arrs(self):
        """Calculate number of arrays to accommodate the observations."""
        tmp = self.n_obs
        k_arrs = 1
        while tmp <= self.t:
            tmp *= self.n_obs
            k_arrs += 1
        return k_arrs

    def _calc_k_arr_sizes(self):
        """Calculate the size of each array to accomodate the observations."""
        k_arr_sizes = [self.n_obs for i in range(self.k_arrs)]
        k_arr_sizes[-1] = int(np.ceil(self.t / (self.n_obs**(self.k_arrs-1))))
        return k_arr_sizes


[docs]
    def add_obs(self, obs):
        """Add an observation to the Remedian.

        Parameters
        ----------
        obs : ndarray, shape(obs_size)
            A single data observation.

        """
        # We only work if:
        # ... we get an observation of correct size
        # ... we have not yet received all observations already
        if list(obs.shape) != self.obs_size:
            raise ValueError('Expected observation of size {} but received: '
                             '{}'.format(self.obs_size, list(obs.shape)))
        if self.obs_count > (self.t - 1):
            raise RuntimeError('Already collected {} observations out of t={} '
                               'The remedian is {}'.format(self.obs_count,
                                                           self.t,
                                                           self.remedian))

        # We accept a new observation
        self.obs_count += 1

        # Add the data to the first array
        # and increment the counter for the next data
        obs_idx = self.obs_idx_counter[0]
        self.arrs[0][..., obs_idx] = obs
        self.obs_idx_counter[0] += 1

        # We can notice whenever an array is full using modulo operations
        # on the observation counter self.obs_count.
        # When an array is full, calculate the median and put the result
        # into the next array. Then reset the counters and start filling
        # previous arrays again
        for arr_i, mod in enumerate(self.modulos):
            if self.obs_count % mod == 0:
                data = self.arrs[arr_i]
                m_tmp = np.median(data, axis=-1, overwrite_input=True)
                self.arrs[arr_i+1][..., self.obs_idx_counter[arr_i+1]] = m_tmp
                self.obs_idx_counter[arr_i+1] += 1
                self.obs_idx_counter[arr_i] = 0

        # If all observations have been received,
        # calculate the median of the last array.
        # This is the robust approximation of the median
        if self.obs_count == self.t:
            self.remedian = np.median(self.arrs[-1], axis=-1,
                                      overwrite_input=True)