"""Functions operating on arrays."""
# Any commits made to this module between 2015-05-01 and 2017-03-01
# by Gerrit Holl are developed for the EC project “Fidelity and
# Uncertainty in Climate Data Records from Earth Observations (FIDUCEO)”.
# Grant agreement: 638822
#
# All those contributions are dual-licensed under the MIT license for use
# in typhon, and the GNU General Public License version 3.
import numpy as np
import scipy.stats
[docs]
def localmin(arr):
    """Find local minima of a 1-D array.

    Return a boolean mask marking every element that is strictly smaller
    than both of its neighbours.  The first and last elements have only
    one neighbour and are therefore never flagged.

    Arguments:

        arr (numpy.ndarray): 1-D array (or array-like) for which to find
            local minima.  Should have a numeric dtype.

    Returns:

        numpy.ndarray with dtype `bool` and the same shape as `arr`.
        True for any element that is strictly smaller than both
        neighbouring elements.  First and last element are always False,
        so inputs with fewer than three elements give an all-False mask.

    Raises:

        ValueError: If `arr` is not 1-dimensional.
    """
    arr = np.asarray(arr)
    if arr.ndim != 1:
        raise ValueError(
            "localmin expects a 1-D array, got {:d} dimensions".format(arr.ndim)
        )

    # Start from an all-False mask of the input's own shape so that the
    # result always lines up with `arr`, even when it has no interior.
    is_min = np.zeros(arr.shape, dtype=bool)
    interior = arr[1:-1]
    # For fewer than three elements every slice below is empty and the
    # assignment is a no-op.
    is_min[1:-1] = (interior < arr[:-2]) & (interior < arr[2:])
    return is_min
[docs]
def limit_ndarray(M, limits):
    """Select elements from structured ndarray based on value ranges

    This function filters a structured ndarray based on ranges defined for
    zero or more fields.  For each field f with limits (lo, hi), it will
    select only those elements where lo<=X[f]<hi.

    >>> X = np.array([(2, 3), (4, 5), (8, 2), (5, 1)],
    ...              dtype=[("A", "i4"), ("B", "i4")])
    >>> print(limit_ndarray(X, {"A": (2, 5)}))
    [(2, 3) (4, 5)]

    >>> X = np.array([([2, 3], 3), ([4, 6], 5), ([8, 3], 2), ([5, 3], 1)],
    ...              dtype=[("A", "i4", (2,)), ("B", "i4")])
    >>> print(limit_ndarray(X, {"A": (2, 5, "all")}))
    [([2, 3], 3)]

    Arguments:

        M (numpy.ndarray): 1-D structured ndarray
        limits (dict): Dictionary with limits.  Keys must correspond to
            fields in M.  If this is a scalar field
            (`M.dtype[field].shape==()`), values are tuples (lo, hi).
            If this is a multidimensional field, values are tuples (lo,
            hi, mode), where mode must be either `all` or `any`.
            An element is retained if all (or any, respectively) of the
            values in its field lie in the range [lo, hi).

    Returns:

        ndarray subset of M.  Selection uses a boolean mask, so this is
        a copy, not a view.

    Raises:

        ValueError: If a mode other than `all` or `any` is given for a
            multidimensional field.
    """
    reducers = {"all": np.all, "any": np.any}

    selection = np.ones(shape=M.shape, dtype="?")
    for field, val in limits.items():
        values = M[field]
        if M.dtype[field].shape == ():
            lo, hi = val
            selection &= (values >= lo) & (values < hi)
            continue

        lo, hi, mode = val
        if mode not in reducers:
            raise ValueError(
                "mode for field {!r} must be 'all' or 'any', "
                "got {!r}".format(field, mode)
            )
        # Test both bounds per value *before* reducing.  Reducing the two
        # comparisons separately would let "any" accept an element where
        # one value is >= lo and a different value is < hi although no
        # single value lies inside the range.
        in_range = (values >= lo) & (values < hi)
        # Collapse only the axes contributed by the field's own shape.
        field_axes = tuple(range(M.ndim, in_range.ndim))
        selection &= reducers[mode](in_range, axis=field_axes)

    return M[selection]
[docs]
def parity(v):
    """Vectorised parity-checking.

    For any ndarray with an integer dtype, return an equally shaped
    array with the bit parity for each element: 1 if the element has an
    odd number of set bits, 0 if it has an even number.

    Arguments:

        v (numpy.ndarray): Array of integer dtype.  It is not modified.

    Returns:

        ndarray with uint8 dtype with the parity (0 or 1) for each value
        in v

    Raises:

        TypeError: If `v` does not have an integer dtype.
    """
    bits = np.array(v, copy=True)  # don't ruin original
    if not np.issubdtype(bits.dtype, np.integer):
        raise TypeError(
            "parity requires an integer dtype, got {!s}".format(bits.dtype)
        )

    result = np.zeros(shape=bits.shape, dtype=np.uint8)
    while bits.any():
        # Each pass removes exactly one set bit from every nonzero
        # element, so toggling once per pass leaves (number of set bits)
        # modulo 2.  Adding instead of toggling would give the bit count.
        result ^= (bits != 0)
        bits &= bits - 1  # clear the lowest set bit
    return result
[docs]
def mad_outliers(arr, cutoff=10, mad0="raise"):
    """Mask out mad outliers

    Mask out any values that are more than N times the median absolute
    deviation from the median.

    Although I (Gerrit Holl) came up with this myself, it's also
    documented at:

    http://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/

    except that I rolled my own approach for "what if mad==0".

    Note: If all values except one are constant, it is not possible to
    determine whether the remaining one is an outlier or “reasonably
    close” to the rest, without additional hints.  In this case, some
    outliers may go unnoticed.

    Arguments:

        arr (numpy.ndarray): n-D array with numeric dtype
        cutoff (int): Maximum tolerable normalised fractional distance
        mad0 (str): What to do if mad=0.  Can be 'raise', 'ignore', or
            'perc'.  In case of 'raise', a ValueError is raised.  In case
            of 'ignore', no element is flagged.  In case of 'perc', will
            search for the lowest percentile at which the percentile
            absolute deviation is nonzero, increase the cutoff by the
            fractional approach toward percentile 100, and use that
            percentile instead.  So if the first non-zero is at
            percentile 75%, it will use the
            75th-percentile-absolute-deviation and increase the cutoff by
            a factor (100 - 50)/(100 - 75).

    Returns:

        ndarray with bool dtype, True for outliers

    Raises:

        ValueError: If mad=0 and `mad0` is 'raise' or not one of the
            recognised options, or if `mad0` is 'perc' and no percentile
            with a nonzero absolute deviation can be found.
    """
    arr = np.asanyarray(arr)

    # np.ma.ptp works for plain and masked arrays alike; the ndarray.ptp
    # method no longer exists in NumPy 2.
    if np.ma.ptp(arr) == 0:
        return np.zeros(shape=arr.shape, dtype="?")

    ad = abs(arr - np.ma.median(arr))
    mad = np.ma.median(ad)

    if mad is np.ma.masked:
        # all are masked already…
        return np.ones(shape=ad.shape, dtype="?")
    if mad != 0:
        return (ad / mad) > cutoff

    # From here on mad == 0: more than half of the values equal the median.
    if mad0 == "raise":
        raise ValueError("Cannot filter outliers, MAD=0")
    if mad0 == "ignore":
        # Without a usable scale nothing can be classified; flag nothing
        # rather than falling off the end and returning None.
        return np.zeros(shape=ad.shape, dtype="?")
    if mad0 == "perc":
        # try other percentiles
        perc = np.r_[np.arange(50, 99, 1), np.linspace(99, 100, 100)]
        pad = scipy.stats.scoreatpercentile(ad, perc)
        if (pad == 0).all():  # all constant…?
            raise ValueError("These data are weird!")
        p_i = pad.nonzero()[0][0]
        if perc[p_i] >= 100:
            # The first nonzero deviation is only reached at the 100th
            # percentile; the scaled cutoff would be infinite, so nothing
            # can exceed it.
            return np.zeros(shape=ad.shape, dtype="?")
        cutoff *= (100 - 50) / (100 - perc[p_i])
        return (ad / pad[p_i]) > cutoff
    raise ValueError(
        "mad0 must be 'raise', 'ignore' or 'perc', got {!r}".format(mad0)
    )
[docs]
def argclosest(array, value, retvalue=False):
    """Returns the index of the closest value in array.

    Parameters:
        array (ndarray): Input array (or array-like).
        value (float): Value to compare to.
        retvalue (bool): If True, return the index and the closest value.

    Returns:
        int, float:
            Index of closest value, Closest value (if ``retvalue`` is True).
            For multidimensional input the index refers to the flattened
            array; use ``numpy.unravel_index`` to convert it.
    """
    arr = np.asarray(array)

    # Narrow integer types can wrap around in the subtraction below
    # (e.g. uint8: 1 - 5 -> 252), which would pick the wrong element.
    # Widening to int64 is exact for every integer type below 64 bits;
    # 64-bit integer input is used as is.
    work = arr
    if arr.dtype.kind in "iu" and arr.dtype.itemsize < 8:
        work = arr.astype(np.int64)

    idx = np.abs(work - value).argmin()
    # argmin() yields a flat index, so look the value up through .flat to
    # stay correct for multidimensional input.
    return (idx, arr.flat[idx]) if retvalue else idx