Source code for remedian.remedian
"""Contains an implementation of Remedian."""
# Author: Stefan Appelhoff <stefan.appelhoff@mailbox.org>
# License: MIT
import numpy as np
[docs]
class Remedian:
"""Remedian object for a robust averaging method for large data sets.
Implementation of the Remedian algorithm, see [1]_ [2]_ [3]_ for
references. This algorithm is used to approximate the median of several
data chunks if these data chunks cannot (or should not) be loaded into
memory at once. See "Notes" section for further information.
Parameters
----------
obs_size : ndarray
The shape of each data chunk (=observation) to be fed into the Remedian
object.
n_obs : int
The number of observations to be stored within each array.
If `n_obs` >= `t`, Remedian will equal the median. The smaller this
parameter, the fewer data have to be loaded into memory at once, but
the less accurate the approximation of the median will be.
t : int
The total number of observations from which a median should be
approximated.
Attributes
----------
obs_count : int
Counter of number of observations that have already been given
to the Remedian object.
remedian : None | ndarray, shape(obs_size)
The calculated remedian of the same shape as the input data.
Will be None until all observations `n_obs` have been fed into
the object using the add_obs method.
Notes
-----
Given a data chunk of size `obs_size`, and `t` data chunks overall, the
Remedian class sets up a number `k_arrs` of arrays of length `n_obs`.
The median of the `t` data chunks of size `obs_size` is then approximated
as follows: One data chunk after another is fed into the `n_obs` positions
of the first array. When the first array is full, its median is calculated
and stored in the first position of the second array. After this, the first
array is re-used to fill the second position of the second array, etc.
When the second array is full, the median of its values is stored in the
first position of the third array, and so on.
The final "Remedian" is the median of the last array, after all `t` data
chunks have been fed into the object.
In other words, given an n-dimensional array, the Remedian class
approximates the median of this array across the ith dimension and you have
to break up your n-dimensional array into `t` n-1-dimensional arrays that
are given to Remedian one after another.
References
----------
.. [1] P.J. Rousseeuw, G.W. Bassett Jr., "The remedian: A robust averaging
method for large data sets", Journal of the American Statistical
Association, vol. 85 (1990), pp. 97-104
.. [2] M. Chao, G. Lin, "The asymptotic distributions of the remedians",
Journal of Statistical Planning and Inference, vol. 37 (1993), pp. 1-11
.. [3] Domenico Cantone, Micha Hofri, "Further analysis of the remedian
algorithm", Theoretical Computer Science, vol. 495 (2013), pp. 1-16
"""
def __init__(self, obs_size, n_obs, t):
"""Initialize the Remedian object.
See class docstring for more thorough information.
Parameters
----------
obs_size : ndarray
Size of the observations. Must be (1,) for scalars.
n_obs : int
Observations per array.
t : int
Number of total observations.
"""
if n_obs <= 1:
raise ValueError('`n_obs` of <= 1 does not make sense.')
self.obs_size = list(obs_size)
self.n_obs = n_obs
self.t = t
# Calculate the number of arrays needed and their sizes
self.k_arrs = self._calc_k_arrs()
self.k_arr_sizes = self._calc_k_arr_sizes()
# Initialize the arrays
self.arrs = [np.zeros(self.obs_size+[s]) for s in self.k_arr_sizes]
# counter for observations within each array
self.obs_idx_counter = [0 for arr in range(self.k_arrs)]
# Modulos of observations to assign to correct array later
self.modulos = [self.n_obs**i for i in range(1, 1+self.k_arrs)]
# Counter of received observations
self.obs_count = 0
# Set the median value to None until we have it
self.remedian = None
def _calc_k_arrs(self):
"""Calculate number of arrays to accommodate the observations."""
tmp = self.n_obs
k_arrs = 1
while tmp <= self.t:
tmp *= self.n_obs
k_arrs += 1
return k_arrs
def _calc_k_arr_sizes(self):
"""Calculate the size of each array to accomodate the observations."""
k_arr_sizes = [self.n_obs for i in range(self.k_arrs)]
k_arr_sizes[-1] = int(np.ceil(self.t / (self.n_obs**(self.k_arrs-1))))
return k_arr_sizes
[docs]
def add_obs(self, obs):
"""Add an observation to the Remedian.
Parameters
----------
obs : ndarray, shape(obs_size)
A single data observation.
"""
# We only work if:
# ... we get an observation of correct size
# ... we have not yet received all observations already
if list(obs.shape) != self.obs_size:
raise ValueError('Expected observation of size {} but received: '
'{}'.format(self.obs_size, list(obs.shape)))
if self.obs_count > (self.t - 1):
raise RuntimeError('Already collected {} observations out of t={} '
'The remedian is {}'.format(self.obs_count,
self.t,
self.remedian))
# We accept a new observation
self.obs_count += 1
# Add the data to the first array
# and increment the counter for the next data
obs_idx = self.obs_idx_counter[0]
self.arrs[0][..., obs_idx] = obs
self.obs_idx_counter[0] += 1
# We can notice whenever an array is full using modulo operations
# on the observation counter self.obs_count.
# When an array is full, calculate the median and put the result
# into the next array. Then reset the counters and start filling
# previous arrays again
for arr_i, mod in enumerate(self.modulos):
if self.obs_count % mod == 0:
data = self.arrs[arr_i]
m_tmp = np.median(data, axis=-1, overwrite_input=True)
self.arrs[arr_i+1][..., self.obs_idx_counter[arr_i+1]] = m_tmp
self.obs_idx_counter[arr_i+1] += 1
self.obs_idx_counter[arr_i] = 0
# If all observations have been received,
# calculate the median of the last array.
# This is the robust approximation of the median
if self.obs_count == self.t:
self.remedian = np.median(self.arrs[-1], axis=-1,
overwrite_input=True)