Source code for remedian.remedian

# -*- coding: utf-8 -*-
"""Contains an implementation of Remedian."""

# Author: Stefan Appelhoff <stefan.appelhoff@mailbox.org>
# License: MIT

import numpy as np


[docs]class Remedian(): """Remedian object for a robust averaging method for large data sets. Implementation of the Remedian algorithm, see [1]_ [2]_ [3]_ for references. This algorithm is used to approximate the median of several data chunks if these data chunks cannot (or should not) be loaded into memory at once. Given a data chunk of size `obs_size`, and `t` data chunks overall, the Remedian class sets up a number `k_arrs` of arrays of length `n_obs`. The median of the `t` data chunks of size `obs_size` is then approximated as follows: One data chunk after another is fed into the `n_obs` positions of the first array. When the first array is full, its median is calculated and stored in the first position of the second array. After this, the first array is re-used to fill the second position of the second array, etc. When the second array is full, the median of its values is stored in the first position of the third array, and so on. The final "Remedian" is the median of the last array, after all `t` data chunks have been fed into the object. Parameters ---------- obs_size : ndarray The shape of each data chunk (=observation) to be fed into the Remedian object. n_obs : int The number of observations to be stored within each array. If `n_obs` >= `t`, Remedian will equal the median. t : int The total number of observations from which a median should be approximated. Attributes ---------- obs_count : int Counter of number of observations that have already been given to the Remedian object. remedian : None | ndarray, shape(obs_size) The calculated remedian of the same shape as the input data. Will be None until all observations `n_obs` have been fed into the object using the add_obs method. References ---------- .. [1] P.J. Rousseeuw, G.W. Bassett Jr., "The remedian: A robust averaging method for large data sets", Journal of the American Statistical Association, vol. 85 (1990), pp. 97-104 .. [2] M. Chao, G. Lin, "The asymptotic distributions of the remedians", Journal of Statistical Planning and Inference, vol. 37 (1993), pp. 1-11 .. [3] Domenico Cantone, Micha Hofri, "Further analysis of the remedian algorithm", Theoretical Computer Science, vol. 495 (2013), pp. 1-16 Examples -------- >>> import numpy as np >>> from remedian.remedian import Remedian >>> # We can have data of any shape ... e.g., 3D: >>> data_shape = (2, 3, 4) >>> # Now we have to decide how many data observations we want to load into >>> # memory at a time before computing a first intermediate median from it >>> n_obs = 100 >>> # Pick some example number, assume we have `t` arrays of `data_shape` >>> # that we want to summarize with Remedian >>> t = 500 >>> # Initialize the object >>> r = Remedian(data_shape, n_obs, t) >>> # Feed it the data- For now, we just generate the data randomly. >>> for obs_i in range(t): >>> obs = np.random.random(data_shape) >>> r.add_obs(obs) >>> # This is the remedian >>> r.remedian assert r.remedian.ndim == data_shape.ndim """ def __init__(self, obs_size, n_obs, t): """Initialize the Remedian object. See class docstring for more thorough information. Parameters ---------- obs_size : ndarray size of the observations. Could be (1,) for scalars. n_obs : int observations per array t : int number of total observations """ if n_obs <= 1: raise ValueError('`n_obs` of <= 1 does not make sense.') self.obs_size = list(obs_size) self.n_obs = n_obs self.t = t # Calculate the number of arrays needed and their sizes self.k_arrs = self._calc_k_arrs() self.k_arr_sizes = self._calc_k_arr_sizes() # Initialize the arrays self.arrs = [np.zeros(self.obs_size+[s]) for s in self.k_arr_sizes] # counter for observations within each array self.obs_idx_counter = [0 for arr in range(self.k_arrs)] # Modulos of observations to assign to correct array later self.modulos = [self.n_obs**i for i in range(1, 1+self.k_arrs)] # Counter of received observations self.obs_count = 0 # Set the median value to None until we have it self.remedian = None def _calc_k_arrs(self): """Calculate number of arrays to acommodate the observations.""" tmp = self.n_obs k_arrs = 1 while tmp <= self.t: tmp *= self.n_obs k_arrs += 1 return k_arrs def _calc_k_arr_sizes(self): """Calculate the size of each array to accomodate the observations.""" k_arr_sizes = [self.n_obs for i in range(self.k_arrs)] k_arr_sizes[-1] = int(np.ceil(self.t / (self.n_obs**(self.k_arrs-1)))) return k_arr_sizes
[docs] def add_obs(self, obs): """Add an observation to the Remedian. Parameters ---------- obs : ndarray, shape(obs_size) """ # We only work if: # ... we get an observation of correct size # ... we have not yet received all observations already if list(obs.shape) != self.obs_size: raise ValueError('Expected observation of size {} but received: ' '{}'.format(self.obs_size, list(obs.shape))) if self.obs_count > (self.t - 1): raise RuntimeError('Already collected {} observations out of t={} ' 'The remedian is {}'.format(self.obs_count, self.t, self.remedian)) # We accept a new observation self.obs_count += 1 # Add the data to the first array # and increment the counter for the next data obs_idx = self.obs_idx_counter[0] self.arrs[0][..., obs_idx] = obs self.obs_idx_counter[0] += 1 # We can notice whenever an array is full using modulo operations # on the observation counter self.obs_count. # When an array is full, calculate the median and put the result # into the next array. Then reset the counters and start filling # previous arrays again for arr_i, mod in enumerate(self.modulos): if self.obs_count % mod == 0: data = self.arrs[arr_i] m_tmp = np.median(data, axis=-1, overwrite_input=True) self.arrs[arr_i+1][..., self.obs_idx_counter[arr_i+1]] = m_tmp self.obs_idx_counter[arr_i+1] += 1 self.obs_idx_counter[arr_i] = 0 # If all observations have been received, # calculate the median of the last array. # This is the robust approximation of the median if self.obs_count == self.t: self.remedian = np.median(self.arrs[-1], axis=-1, overwrite_input=True)