diff options
author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch) | |
tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/tools/python3/src/Lib/statistics.py | |
parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff) | |
download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/tools/python3/src/Lib/statistics.py')
-rw-r--r-- | contrib/tools/python3/src/Lib/statistics.py | 1054 |
1 files changed, 527 insertions, 527 deletions
diff --git a/contrib/tools/python3/src/Lib/statistics.py b/contrib/tools/python3/src/Lib/statistics.py index 463ac9e92c..c82ef879db 100644 --- a/contrib/tools/python3/src/Lib/statistics.py +++ b/contrib/tools/python3/src/Lib/statistics.py @@ -7,21 +7,21 @@ averages, variance, and standard deviation. Calculating averages -------------------- -================== ================================================== +================== ================================================== Function Description -================== ================================================== +================== ================================================== mean Arithmetic mean (average) of data. -fmean Fast, floating point arithmetic mean. -geometric_mean Geometric mean of data. +fmean Fast, floating point arithmetic mean. +geometric_mean Geometric mean of data. harmonic_mean Harmonic mean of data. median Median (middle value) of data. median_low Low median of data. median_high High median of data. median_grouped Median, or 50th percentile, of grouped data. mode Mode (most common value) of data. -multimode List of modes (most common values of data). -quantiles Divide data into intervals with equal probability. -================== ================================================== +multimode List of modes (most common values of data). +quantiles Divide data into intervals with equal probability. +================== ================================================== Calculate the arithmetic mean ("the average") of data: @@ -80,37 +80,37 @@ A single exception is defined: StatisticsError is a subclass of ValueError. """ -__all__ = [ - 'NormalDist', - 'StatisticsError', - 'fmean', - 'geometric_mean', - 'harmonic_mean', - 'mean', - 'median', - 'median_grouped', - 'median_high', - 'median_low', - 'mode', - 'multimode', - 'pstdev', - 'pvariance', - 'quantiles', - 'stdev', - 'variance', -] +__all__ = [ + 'NormalDist', + 'StatisticsError', + 'fmean', + 'geometric_mean', + 'harmonic_mean', + 'mean', + 'median', + 'median_grouped', + 'median_high', + 'median_low', + 'mode', + 'multimode', + 'pstdev', + 'pvariance', + 'quantiles', + 'stdev', + 'variance', +] import math import numbers -import random +import random from fractions import Fraction from decimal import Decimal from itertools import groupby from bisect import bisect_left, bisect_right -from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum -from operator import itemgetter -from collections import Counter +from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum +from operator import itemgetter +from collections import Counter # === Exceptions === @@ -163,7 +163,7 @@ def _sum(data, start=0): T = _coerce(int, type(start)) for typ, values in groupby(data, type): T = _coerce(T, typ) # or raise TypeError - for n, d in map(_exact_ratio, values): + for n, d in map(_exact_ratio, values): count += 1 partials[d] = partials_get(d, 0) + n if None in partials: @@ -261,7 +261,7 @@ def _convert(value, T): return T(value) except TypeError: if issubclass(T, Decimal): - return T(value.numerator) / T(value.denominator) + return T(value.numerator) / T(value.denominator) else: raise @@ -277,8 +277,8 @@ def _find_lteq(a, x): def _find_rteq(a, l, x): 'Locate the rightmost value exactly equal to x' i = bisect_right(a, x, lo=l) - if i != (len(a) + 1) and a[i - 1] == x: - return i - 1 + if i != (len(a) + 1) and a[i - 1] == x: + return i - 1 raise ValueError @@ -315,55 +315,55 @@ def mean(data): raise StatisticsError('mean requires at least one data point') T, total, count = _sum(data) assert count == n - return _convert(total / n, T) - - -def fmean(data): - """Convert data to floats and compute the arithmetic mean. - - This runs faster than the mean() function and it always returns a float. - If the input dataset is empty, it raises a StatisticsError. - - >>> fmean([3.5, 4.0, 5.25]) - 4.25 - """ - try: - n = len(data) - except TypeError: - # Handle iterators that do not define __len__(). - n = 0 - def count(iterable): - nonlocal n - for n, x in enumerate(iterable, start=1): - yield x - total = fsum(count(data)) - else: - total = fsum(data) - try: - return total / n - except ZeroDivisionError: - raise StatisticsError('fmean requires at least one data point') from None - - -def geometric_mean(data): - """Convert data to floats and compute the geometric mean. - - Raises a StatisticsError if the input dataset is empty, - if it contains a zero, or if it contains a negative value. - - No special efforts are made to achieve exact results. - (However, this may change in the future.) - - >>> round(geometric_mean([54, 24, 36]), 9) - 36.0 - """ - try: - return exp(fmean(map(log, data))) - except ValueError: - raise StatisticsError('geometric mean requires a non-empty dataset ' - 'containing positive numbers') from None - - + return _convert(total / n, T) + + +def fmean(data): + """Convert data to floats and compute the arithmetic mean. + + This runs faster than the mean() function and it always returns a float. + If the input dataset is empty, it raises a StatisticsError. + + >>> fmean([3.5, 4.0, 5.25]) + 4.25 + """ + try: + n = len(data) + except TypeError: + # Handle iterators that do not define __len__(). + n = 0 + def count(iterable): + nonlocal n + for n, x in enumerate(iterable, start=1): + yield x + total = fsum(count(data)) + else: + total = fsum(data) + try: + return total / n + except ZeroDivisionError: + raise StatisticsError('fmean requires at least one data point') from None + + +def geometric_mean(data): + """Convert data to floats and compute the geometric mean. + + Raises a StatisticsError if the input dataset is empty, + if it contains a zero, or if it contains a negative value. + + No special efforts are made to achieve exact results. + (However, this may change in the future.) + + >>> round(geometric_mean([54, 24, 36]), 9) + 36.0 + """ + try: + return exp(fmean(map(log, data))) + except ValueError: + raise StatisticsError('geometric mean requires a non-empty dataset ' + 'containing positive numbers') from None + + def harmonic_mean(data): """Return the harmonic mean of data. @@ -403,11 +403,11 @@ def harmonic_mean(data): else: raise TypeError('unsupported type') try: - T, total, count = _sum(1 / x for x in _fail_neg(data, errmsg)) + T, total, count = _sum(1 / x for x in _fail_neg(data, errmsg)) except ZeroDivisionError: return 0 assert count == n - return _convert(n / total, T) + return _convert(n / total, T) # FIXME: investigate ways to calculate medians without sorting? Quickselect? @@ -428,11 +428,11 @@ def median(data): n = len(data) if n == 0: raise StatisticsError("no median for empty data") - if n % 2 == 1: - return data[n // 2] + if n % 2 == 1: + return data[n // 2] else: - i = n // 2 - return (data[i - 1] + data[i]) / 2 + i = n // 2 + return (data[i - 1] + data[i]) / 2 def median_low(data): @@ -451,10 +451,10 @@ def median_low(data): n = len(data) if n == 0: raise StatisticsError("no median for empty data") - if n % 2 == 1: - return data[n // 2] + if n % 2 == 1: + return data[n // 2] else: - return data[n // 2 - 1] + return data[n // 2 - 1] def median_high(data): @@ -473,7 +473,7 @@ def median_high(data): n = len(data) if n == 0: raise StatisticsError("no median for empty data") - return data[n // 2] + return data[n // 2] def median_grouped(data, interval=1): @@ -510,15 +510,15 @@ def median_grouped(data, interval=1): return data[0] # Find the value at the midpoint. Remember this corresponds to the # centre of the class interval. - x = data[n // 2] + x = data[n // 2] for obj in (x, interval): if isinstance(obj, (str, bytes)): raise TypeError('expected number but got %r' % obj) try: - L = x - interval / 2 # The lower limit of the median interval. + L = x - interval / 2 # The lower limit of the median interval. except TypeError: # Mixed type. For now we just coerce to float. - L = float(x) - float(interval) / 2 + L = float(x) - float(interval) / 2 # Uses bisection search to search for x in data with log(n) time complexity # Find the position of leftmost occurrence of x in data @@ -528,7 +528,7 @@ def median_grouped(data, interval=1): l2 = _find_rteq(data, l1, x) cf = l1 f = l2 - l1 + 1 - return L + interval * (n / 2 - cf) / f + return L + interval * (n / 2 - cf) / f def mode(data): @@ -537,128 +537,128 @@ def mode(data): ``mode`` assumes discrete data, and returns a single value. This is the standard treatment of the mode as commonly taught in schools: - >>> mode([1, 1, 2, 3, 3, 3, 3, 4]) - 3 + >>> mode([1, 1, 2, 3, 3, 3, 3, 4]) + 3 This also works with nominal (non-numeric) data: - >>> mode(["red", "blue", "blue", "red", "green", "red", "red"]) - 'red' - - If there are multiple modes with same frequency, return the first one - encountered: - - >>> mode(['red', 'red', 'green', 'blue', 'blue']) - 'red' - - If *data* is empty, ``mode``, raises StatisticsError. - - """ - pairs = Counter(iter(data)).most_common(1) - try: - return pairs[0][0] - except IndexError: - raise StatisticsError('no mode for empty data') from None - - -def multimode(data): - """Return a list of the most frequently occurring values. - - Will return more than one result if there are multiple modes - or an empty list if *data* is empty. - - >>> multimode('aabbbbbbbbcc') - ['b'] - >>> multimode('aabbbbccddddeeffffgg') - ['b', 'd', 'f'] - >>> multimode('') - [] - """ - counts = Counter(iter(data)).most_common() - maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, [])) - return list(map(itemgetter(0), mode_items)) - - -# Notes on methods for computing quantiles -# ---------------------------------------- -# -# There is no one perfect way to compute quantiles. Here we offer -# two methods that serve common needs. Most other packages -# surveyed offered at least one or both of these two, making them -# "standard" in the sense of "widely-adopted and reproducible". -# They are also easy to explain, easy to compute manually, and have -# straight-forward interpretations that aren't surprising. - -# The default method is known as "R6", "PERCENTILE.EXC", or "expected -# value of rank order statistics". The alternative method is known as -# "R7", "PERCENTILE.INC", or "mode of rank order statistics". - -# For sample data where there is a positive probability for values -# beyond the range of the data, the R6 exclusive method is a -# reasonable choice. Consider a random sample of nine values from a -# population with a uniform distribution from 0.0 to 1.0. The -# distribution of the third ranked sample point is described by -# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and -# mean=0.300. Only the latter (which corresponds with R6) gives the -# desired cut point with 30% of the population falling below that -# value, making it comparable to a result from an inv_cdf() function. -# The R6 exclusive method is also idempotent. - -# For describing population data where the end points are known to -# be included in the data, the R7 inclusive method is a reasonable -# choice. Instead of the mean, it uses the mode of the beta -# distribution for the interior points. Per Hyndman & Fan, "One nice -# property is that the vertices of Q7(p) divide the range into n - 1 -# intervals, and exactly 100p% of the intervals lie to the left of -# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)." - -# If needed, other methods could be added. However, for now, the -# position is that fewer options make for easier choices and that -# external packages can be used for anything more advanced. - -def quantiles(data, *, n=4, method='exclusive'): - """Divide *data* into *n* continuous intervals with equal probability. - - Returns a list of (n - 1) cut points separating the intervals. - - Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. - Set *n* to 100 for percentiles which gives the 99 cuts points that - separate *data* in to 100 equal sized groups. - - The *data* can be any iterable containing sample. - The cut points are linearly interpolated between data points. - - If *method* is set to *inclusive*, *data* is treated as population - data. The minimum value is treated as the 0th percentile and the - maximum value is treated as the 100th percentile. + >>> mode(["red", "blue", "blue", "red", "green", "red", "red"]) + 'red' + + If there are multiple modes with same frequency, return the first one + encountered: + + >>> mode(['red', 'red', 'green', 'blue', 'blue']) + 'red' + + If *data* is empty, ``mode``, raises StatisticsError. + """ - if n < 1: - raise StatisticsError('n must be at least 1') - data = sorted(data) - ld = len(data) - if ld < 2: - raise StatisticsError('must have at least two data points') - if method == 'inclusive': - m = ld - 1 - result = [] - for i in range(1, n): - j, delta = divmod(i * m, n) - interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n - result.append(interpolated) - return result - if method == 'exclusive': - m = ld + 1 - result = [] - for i in range(1, n): - j = i * m // n # rescale i to m/n - j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1 - delta = i*m - j*n # exact integer math - interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n - result.append(interpolated) - return result - raise ValueError(f'Unknown method: {method!r}') - - + pairs = Counter(iter(data)).most_common(1) + try: + return pairs[0][0] + except IndexError: + raise StatisticsError('no mode for empty data') from None + + +def multimode(data): + """Return a list of the most frequently occurring values. + + Will return more than one result if there are multiple modes + or an empty list if *data* is empty. + + >>> multimode('aabbbbbbbbcc') + ['b'] + >>> multimode('aabbbbccddddeeffffgg') + ['b', 'd', 'f'] + >>> multimode('') + [] + """ + counts = Counter(iter(data)).most_common() + maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, [])) + return list(map(itemgetter(0), mode_items)) + + +# Notes on methods for computing quantiles +# ---------------------------------------- +# +# There is no one perfect way to compute quantiles. Here we offer +# two methods that serve common needs. Most other packages +# surveyed offered at least one or both of these two, making them +# "standard" in the sense of "widely-adopted and reproducible". +# They are also easy to explain, easy to compute manually, and have +# straight-forward interpretations that aren't surprising. + +# The default method is known as "R6", "PERCENTILE.EXC", or "expected +# value of rank order statistics". The alternative method is known as +# "R7", "PERCENTILE.INC", or "mode of rank order statistics". + +# For sample data where there is a positive probability for values +# beyond the range of the data, the R6 exclusive method is a +# reasonable choice. Consider a random sample of nine values from a +# population with a uniform distribution from 0.0 to 1.0. The +# distribution of the third ranked sample point is described by +# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and +# mean=0.300. Only the latter (which corresponds with R6) gives the +# desired cut point with 30% of the population falling below that +# value, making it comparable to a result from an inv_cdf() function. +# The R6 exclusive method is also idempotent. + +# For describing population data where the end points are known to +# be included in the data, the R7 inclusive method is a reasonable +# choice. Instead of the mean, it uses the mode of the beta +# distribution for the interior points. Per Hyndman & Fan, "One nice +# property is that the vertices of Q7(p) divide the range into n - 1 +# intervals, and exactly 100p% of the intervals lie to the left of +# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)." + +# If needed, other methods could be added. However, for now, the +# position is that fewer options make for easier choices and that +# external packages can be used for anything more advanced. + +def quantiles(data, *, n=4, method='exclusive'): + """Divide *data* into *n* continuous intervals with equal probability. + + Returns a list of (n - 1) cut points separating the intervals. + + Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. + Set *n* to 100 for percentiles which gives the 99 cuts points that + separate *data* in to 100 equal sized groups. + + The *data* can be any iterable containing sample. + The cut points are linearly interpolated between data points. + + If *method* is set to *inclusive*, *data* is treated as population + data. The minimum value is treated as the 0th percentile and the + maximum value is treated as the 100th percentile. + """ + if n < 1: + raise StatisticsError('n must be at least 1') + data = sorted(data) + ld = len(data) + if ld < 2: + raise StatisticsError('must have at least two data points') + if method == 'inclusive': + m = ld - 1 + result = [] + for i in range(1, n): + j, delta = divmod(i * m, n) + interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n + result.append(interpolated) + return result + if method == 'exclusive': + m = ld + 1 + result = [] + for i in range(1, n): + j = i * m // n # rescale i to m/n + j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1 + delta = i*m - j*n # exact integer math + interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n + result.append(interpolated) + return result + raise ValueError(f'Unknown method: {method!r}') + + # === Measures of spread === # See http://mathworld.wolfram.com/Variance.html @@ -680,16 +680,16 @@ def _ss(data, c=None): calculated from ``c`` as given. Use the second case with care, as it can lead to garbage results. """ - if c is not None: - T, total, count = _sum((x-c)**2 for x in data) - return (T, total) - c = mean(data) + if c is not None: + T, total, count = _sum((x-c)**2 for x in data) + return (T, total) + c = mean(data) T, total, count = _sum((x-c)**2 for x in data) # The following sum should mathematically equal zero, but due to rounding # error may not. - U, total2, count2 = _sum((x - c) for x in data) + U, total2, count2 = _sum((x - c) for x in data) assert T == U and count == count2 - total -= total2 ** 2 / len(data) + total -= total2 ** 2 / len(data) assert not total < 0, 'negative sum of square deviations: %f' % total return (T, total) @@ -738,13 +738,13 @@ def variance(data, xbar=None): if n < 2: raise StatisticsError('variance requires at least two data points') T, ss = _ss(data, xbar) - return _convert(ss / (n - 1), T) + return _convert(ss / (n - 1), T) def pvariance(data, mu=None): """Return the population variance of ``data``. - data should be a sequence or iterable of Real-valued numbers, with at least one + data should be a sequence or iterable of Real-valued numbers, with at least one value. The optional argument mu, if given, should be the mean of the data. If it is missing or None, the mean is automatically calculated. @@ -782,7 +782,7 @@ def pvariance(data, mu=None): if n < 1: raise StatisticsError('pvariance requires at least one data point') T, ss = _ss(data, mu) - return _convert(ss / n, T) + return _convert(ss / n, T) def stdev(data, xbar=None): @@ -815,306 +815,306 @@ def pstdev(data, mu=None): return var.sqrt() except AttributeError: return math.sqrt(var) - - -## Normal Distribution ##################################################### - - -def _normal_dist_inv_cdf(p, mu, sigma): - # There is no closed-form solution to the inverse CDF for the normal - # distribution, so we use a rational approximation instead: - # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the - # Normal Distribution". Applied Statistics. Blackwell Publishing. 37 - # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330. - q = p - 0.5 - if fabs(q) <= 0.425: - r = 0.180625 - q * q - # Hash sum: 55.88319_28806_14901_4439 - num = (((((((2.50908_09287_30122_6727e+3 * r + - 3.34305_75583_58812_8105e+4) * r + - 6.72657_70927_00870_0853e+4) * r + - 4.59219_53931_54987_1457e+4) * r + - 1.37316_93765_50946_1125e+4) * r + - 1.97159_09503_06551_4427e+3) * r + - 1.33141_66789_17843_7745e+2) * r + - 3.38713_28727_96366_6080e+0) * q - den = (((((((5.22649_52788_52854_5610e+3 * r + - 2.87290_85735_72194_2674e+4) * r + - 3.93078_95800_09271_0610e+4) * r + - 2.12137_94301_58659_5867e+4) * r + - 5.39419_60214_24751_1077e+3) * r + - 6.87187_00749_20579_0830e+2) * r + - 4.23133_30701_60091_1252e+1) * r + - 1.0) - x = num / den - return mu + (x * sigma) - r = p if q <= 0.0 else 1.0 - p - r = sqrt(-log(r)) - if r <= 5.0: - r = r - 1.6 - # Hash sum: 49.33206_50330_16102_89036 - num = (((((((7.74545_01427_83414_07640e-4 * r + - 2.27238_44989_26918_45833e-2) * r + - 2.41780_72517_74506_11770e-1) * r + - 1.27045_82524_52368_38258e+0) * r + - 3.64784_83247_63204_60504e+0) * r + - 5.76949_72214_60691_40550e+0) * r + - 4.63033_78461_56545_29590e+0) * r + - 1.42343_71107_49683_57734e+0) - den = (((((((1.05075_00716_44416_84324e-9 * r + - 5.47593_80849_95344_94600e-4) * r + - 1.51986_66563_61645_71966e-2) * r + - 1.48103_97642_74800_74590e-1) * r + - 6.89767_33498_51000_04550e-1) * r + - 1.67638_48301_83803_84940e+0) * r + - 2.05319_16266_37758_82187e+0) * r + - 1.0) - else: - r = r - 5.0 - # Hash sum: 47.52583_31754_92896_71629 - num = (((((((2.01033_43992_92288_13265e-7 * r + - 2.71155_55687_43487_57815e-5) * r + - 1.24266_09473_88078_43860e-3) * r + - 2.65321_89526_57612_30930e-2) * r + - 2.96560_57182_85048_91230e-1) * r + - 1.78482_65399_17291_33580e+0) * r + - 5.46378_49111_64114_36990e+0) * r + - 6.65790_46435_01103_77720e+0) - den = (((((((2.04426_31033_89939_78564e-15 * r + - 1.42151_17583_16445_88870e-7) * r + - 1.84631_83175_10054_68180e-5) * r + - 7.86869_13114_56132_59100e-4) * r + - 1.48753_61290_85061_48525e-2) * r + - 1.36929_88092_27358_05310e-1) * r + - 5.99832_20655_58879_37690e-1) * r + - 1.0) - x = num / den - if q < 0.0: - x = -x - return mu + (x * sigma) - - -# If available, use C implementation -try: - from _statistics import _normal_dist_inv_cdf -except ImportError: - pass - - -class NormalDist: - "Normal distribution of a random variable" - # https://en.wikipedia.org/wiki/Normal_distribution - # https://en.wikipedia.org/wiki/Variance#Properties - - __slots__ = { - '_mu': 'Arithmetic mean of a normal distribution', - '_sigma': 'Standard deviation of a normal distribution', - } - - def __init__(self, mu=0.0, sigma=1.0): - "NormalDist where mu is the mean and sigma is the standard deviation." - if sigma < 0.0: - raise StatisticsError('sigma must be non-negative') - self._mu = float(mu) - self._sigma = float(sigma) - - @classmethod - def from_samples(cls, data): - "Make a normal distribution instance from sample data." - if not isinstance(data, (list, tuple)): - data = list(data) - xbar = fmean(data) - return cls(xbar, stdev(data, xbar)) - - def samples(self, n, *, seed=None): - "Generate *n* samples for a given mean and standard deviation." - gauss = random.gauss if seed is None else random.Random(seed).gauss - mu, sigma = self._mu, self._sigma - return [gauss(mu, sigma) for i in range(n)] - - def pdf(self, x): - "Probability density function. P(x <= X < x+dx) / dx" - variance = self._sigma ** 2.0 - if not variance: - raise StatisticsError('pdf() not defined when sigma is zero') - return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance) - - def cdf(self, x): - "Cumulative distribution function. P(X <= x)" - if not self._sigma: - raise StatisticsError('cdf() not defined when sigma is zero') - return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0)))) - - def inv_cdf(self, p): - """Inverse cumulative distribution function. x : P(X <= x) = p - - Finds the value of the random variable such that the probability of - the variable being less than or equal to that value equals the given - probability. - - This function is also called the percent point function or quantile - function. - """ - if p <= 0.0 or p >= 1.0: - raise StatisticsError('p must be in the range 0.0 < p < 1.0') - if self._sigma <= 0.0: - raise StatisticsError('cdf() not defined when sigma at or below zero') - return _normal_dist_inv_cdf(p, self._mu, self._sigma) - - def quantiles(self, n=4): - """Divide into *n* continuous intervals with equal probability. - - Returns a list of (n - 1) cut points separating the intervals. - - Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. - Set *n* to 100 for percentiles which gives the 99 cuts points that - separate the normal distribution in to 100 equal sized groups. - """ - return [self.inv_cdf(i / n) for i in range(1, n)] - - def overlap(self, other): - """Compute the overlapping coefficient (OVL) between two normal distributions. - - Measures the agreement between two normal probability distributions. - Returns a value between 0.0 and 1.0 giving the overlapping area in - the two underlying probability density functions. - - >>> N1 = NormalDist(2.4, 1.6) - >>> N2 = NormalDist(3.2, 2.0) - >>> N1.overlap(N2) - 0.8035050657330205 - """ - # See: "The overlapping coefficient as a measure of agreement between - # probability distributions and point estimation of the overlap of two - # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr - # http://dx.doi.org/10.1080/03610928908830127 - if not isinstance(other, NormalDist): - raise TypeError('Expected another NormalDist instance') - X, Y = self, other - if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity - X, Y = Y, X - X_var, Y_var = X.variance, Y.variance - if not X_var or not Y_var: - raise StatisticsError('overlap() not defined when sigma is zero') - dv = Y_var - X_var - dm = fabs(Y._mu - X._mu) - if not dv: - return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0))) - a = X._mu * Y_var - Y._mu * X_var - b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var)) - x1 = (a + b) / dv - x2 = (a - b) / dv - return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2))) - - def zscore(self, x): - """Compute the Standard Score. (x - mean) / stdev - - Describes *x* in terms of the number of standard deviations - above or below the mean of the normal distribution. - """ - # https://www.statisticshowto.com/probability-and-statistics/z-score/ - if not self._sigma: - raise StatisticsError('zscore() not defined when sigma is zero') - return (x - self._mu) / self._sigma - - @property - def mean(self): - "Arithmetic mean of the normal distribution." - return self._mu - - @property - def median(self): - "Return the median of the normal distribution" - return self._mu - - @property - def mode(self): - """Return the mode of the normal distribution - - The mode is the value x where which the probability density - function (pdf) takes its maximum value. - """ - return self._mu - - @property - def stdev(self): - "Standard deviation of the normal distribution." - return self._sigma - - @property - def variance(self): - "Square of the standard deviation." - return self._sigma ** 2.0 - - def __add__(x1, x2): - """Add a constant or another NormalDist instance. - - If *other* is a constant, translate mu by the constant, - leaving sigma unchanged. - - If *other* is a NormalDist, add both the means and the variances. - Mathematically, this works only if the two distributions are - independent or if they are jointly normally distributed. - """ - if isinstance(x2, NormalDist): - return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma)) - return NormalDist(x1._mu + x2, x1._sigma) - - def __sub__(x1, x2): - """Subtract a constant or another NormalDist instance. - - If *other* is a constant, translate by the constant mu, - leaving sigma unchanged. - - If *other* is a NormalDist, subtract the means and add the variances. - Mathematically, this works only if the two distributions are - independent or if they are jointly normally distributed. - """ - if isinstance(x2, NormalDist): - return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma)) - return NormalDist(x1._mu - x2, x1._sigma) - - def __mul__(x1, x2): - """Multiply both mu and sigma by a constant. - - Used for rescaling, perhaps to change measurement units. - Sigma is scaled with the absolute value of the constant. - """ - return NormalDist(x1._mu * x2, x1._sigma * fabs(x2)) - - def __truediv__(x1, x2): - """Divide both mu and sigma by a constant. - - Used for rescaling, perhaps to change measurement units. - Sigma is scaled with the absolute value of the constant. - """ - return NormalDist(x1._mu / x2, x1._sigma / fabs(x2)) - - def __pos__(x1): - "Return a copy of the instance." - return NormalDist(x1._mu, x1._sigma) - - def __neg__(x1): - "Negates mu while keeping sigma the same." - return NormalDist(-x1._mu, x1._sigma) - - __radd__ = __add__ - - def __rsub__(x1, x2): - "Subtract a NormalDist from a constant or another NormalDist." - return -(x1 - x2) - - __rmul__ = __mul__ - - def __eq__(x1, x2): - "Two NormalDist objects are equal if their mu and sigma are both equal." - if not isinstance(x2, NormalDist): - return NotImplemented - return x1._mu == x2._mu and x1._sigma == x2._sigma - - def __hash__(self): - "NormalDist objects hash equal if their mu and sigma are both equal." - return hash((self._mu, self._sigma)) - - def __repr__(self): - return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})' + + +## Normal Distribution ##################################################### + + +def _normal_dist_inv_cdf(p, mu, sigma): + # There is no closed-form solution to the inverse CDF for the normal + # distribution, so we use a rational approximation instead: + # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the + # Normal Distribution". Applied Statistics. Blackwell Publishing. 37 + # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330. + q = p - 0.5 + if fabs(q) <= 0.425: + r = 0.180625 - q * q + # Hash sum: 55.88319_28806_14901_4439 + num = (((((((2.50908_09287_30122_6727e+3 * r + + 3.34305_75583_58812_8105e+4) * r + + 6.72657_70927_00870_0853e+4) * r + + 4.59219_53931_54987_1457e+4) * r + + 1.37316_93765_50946_1125e+4) * r + + 1.97159_09503_06551_4427e+3) * r + + 1.33141_66789_17843_7745e+2) * r + + 3.38713_28727_96366_6080e+0) * q + den = (((((((5.22649_52788_52854_5610e+3 * r + + 2.87290_85735_72194_2674e+4) * r + + 3.93078_95800_09271_0610e+4) * r + + 2.12137_94301_58659_5867e+4) * r + + 5.39419_60214_24751_1077e+3) * r + + 6.87187_00749_20579_0830e+2) * r + + 4.23133_30701_60091_1252e+1) * r + + 1.0) + x = num / den + return mu + (x * sigma) + r = p if q <= 0.0 else 1.0 - p + r = sqrt(-log(r)) + if r <= 5.0: + r = r - 1.6 + # Hash sum: 49.33206_50330_16102_89036 + num = (((((((7.74545_01427_83414_07640e-4 * r + + 2.27238_44989_26918_45833e-2) * r + + 2.41780_72517_74506_11770e-1) * r + + 1.27045_82524_52368_38258e+0) * r + + 3.64784_83247_63204_60504e+0) * r + + 5.76949_72214_60691_40550e+0) * r + + 4.63033_78461_56545_29590e+0) * r + + 1.42343_71107_49683_57734e+0) + den = (((((((1.05075_00716_44416_84324e-9 * r + + 5.47593_80849_95344_94600e-4) * r + + 1.51986_66563_61645_71966e-2) * r + + 1.48103_97642_74800_74590e-1) * r + + 6.89767_33498_51000_04550e-1) * r + + 1.67638_48301_83803_84940e+0) * r + + 2.05319_16266_37758_82187e+0) * r + + 1.0) + else: + r = r - 5.0 + # Hash sum: 47.52583_31754_92896_71629 + num = (((((((2.01033_43992_92288_13265e-7 * r + + 2.71155_55687_43487_57815e-5) * r + + 1.24266_09473_88078_43860e-3) * r + + 2.65321_89526_57612_30930e-2) * r + + 2.96560_57182_85048_91230e-1) * r + + 1.78482_65399_17291_33580e+0) * r + + 5.46378_49111_64114_36990e+0) * r + + 6.65790_46435_01103_77720e+0) + den = (((((((2.04426_31033_89939_78564e-15 * r + + 1.42151_17583_16445_88870e-7) * r + + 1.84631_83175_10054_68180e-5) * r + + 7.86869_13114_56132_59100e-4) * r + + 1.48753_61290_85061_48525e-2) * r + + 1.36929_88092_27358_05310e-1) * r + + 5.99832_20655_58879_37690e-1) * r + + 1.0) + x = num / den + if q < 0.0: + x = -x + return mu + (x * sigma) + + +# If available, use C implementation +try: + from _statistics import _normal_dist_inv_cdf +except ImportError: + pass + + +class NormalDist: + "Normal distribution of a random variable" + # https://en.wikipedia.org/wiki/Normal_distribution + # https://en.wikipedia.org/wiki/Variance#Properties + + __slots__ = { + '_mu': 'Arithmetic mean of a normal distribution', + '_sigma': 'Standard deviation of a normal distribution', + } + + def __init__(self, mu=0.0, sigma=1.0): + "NormalDist where mu is the mean and sigma is the standard deviation." + if sigma < 0.0: + raise StatisticsError('sigma must be non-negative') + self._mu = float(mu) + self._sigma = float(sigma) + + @classmethod + def from_samples(cls, data): + "Make a normal distribution instance from sample data." + if not isinstance(data, (list, tuple)): + data = list(data) + xbar = fmean(data) + return cls(xbar, stdev(data, xbar)) + + def samples(self, n, *, seed=None): + "Generate *n* samples for a given mean and standard deviation." + gauss = random.gauss if seed is None else random.Random(seed).gauss + mu, sigma = self._mu, self._sigma + return [gauss(mu, sigma) for i in range(n)] + + def pdf(self, x): + "Probability density function. P(x <= X < x+dx) / dx" + variance = self._sigma ** 2.0 + if not variance: + raise StatisticsError('pdf() not defined when sigma is zero') + return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance) + + def cdf(self, x): + "Cumulative distribution function. P(X <= x)" + if not self._sigma: + raise StatisticsError('cdf() not defined when sigma is zero') + return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0)))) + + def inv_cdf(self, p): + """Inverse cumulative distribution function. x : P(X <= x) = p + + Finds the value of the random variable such that the probability of + the variable being less than or equal to that value equals the given + probability. + + This function is also called the percent point function or quantile + function. + """ + if p <= 0.0 or p >= 1.0: + raise StatisticsError('p must be in the range 0.0 < p < 1.0') + if self._sigma <= 0.0: + raise StatisticsError('cdf() not defined when sigma at or below zero') + return _normal_dist_inv_cdf(p, self._mu, self._sigma) + + def quantiles(self, n=4): + """Divide into *n* continuous intervals with equal probability. + + Returns a list of (n - 1) cut points separating the intervals. + + Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. + Set *n* to 100 for percentiles which gives the 99 cuts points that + separate the normal distribution in to 100 equal sized groups. + """ + return [self.inv_cdf(i / n) for i in range(1, n)] + + def overlap(self, other): + """Compute the overlapping coefficient (OVL) between two normal distributions. + + Measures the agreement between two normal probability distributions. + Returns a value between 0.0 and 1.0 giving the overlapping area in + the two underlying probability density functions. + + >>> N1 = NormalDist(2.4, 1.6) + >>> N2 = NormalDist(3.2, 2.0) + >>> N1.overlap(N2) + 0.8035050657330205 + """ + # See: "The overlapping coefficient as a measure of agreement between + # probability distributions and point estimation of the overlap of two + # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr + # http://dx.doi.org/10.1080/03610928908830127 + if not isinstance(other, NormalDist): + raise TypeError('Expected another NormalDist instance') + X, Y = self, other + if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity + X, Y = Y, X + X_var, Y_var = X.variance, Y.variance + if not X_var or not Y_var: + raise StatisticsError('overlap() not defined when sigma is zero') + dv = Y_var - X_var + dm = fabs(Y._mu - X._mu) + if not dv: + return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0))) + a = X._mu * Y_var - Y._mu * X_var + b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var)) + x1 = (a + b) / dv + x2 = (a - b) / dv + return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2))) + + def zscore(self, x): + """Compute the Standard Score. (x - mean) / stdev + + Describes *x* in terms of the number of standard deviations + above or below the mean of the normal distribution. + """ + # https://www.statisticshowto.com/probability-and-statistics/z-score/ + if not self._sigma: + raise StatisticsError('zscore() not defined when sigma is zero') + return (x - self._mu) / self._sigma + + @property + def mean(self): + "Arithmetic mean of the normal distribution." + return self._mu + + @property + def median(self): + "Return the median of the normal distribution" + return self._mu + + @property + def mode(self): + """Return the mode of the normal distribution + + The mode is the value x where which the probability density + function (pdf) takes its maximum value. + """ + return self._mu + + @property + def stdev(self): + "Standard deviation of the normal distribution." + return self._sigma + + @property + def variance(self): + "Square of the standard deviation." + return self._sigma ** 2.0 + + def __add__(x1, x2): + """Add a constant or another NormalDist instance. + + If *other* is a constant, translate mu by the constant, + leaving sigma unchanged. + + If *other* is a NormalDist, add both the means and the variances. + Mathematically, this works only if the two distributions are + independent or if they are jointly normally distributed. + """ + if isinstance(x2, NormalDist): + return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma)) + return NormalDist(x1._mu + x2, x1._sigma) + + def __sub__(x1, x2): + """Subtract a constant or another NormalDist instance. + + If *other* is a constant, translate by the constant mu, + leaving sigma unchanged. + + If *other* is a NormalDist, subtract the means and add the variances. + Mathematically, this works only if the two distributions are + independent or if they are jointly normally distributed. + """ + if isinstance(x2, NormalDist): + return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma)) + return NormalDist(x1._mu - x2, x1._sigma) + + def __mul__(x1, x2): + """Multiply both mu and sigma by a constant. + + Used for rescaling, perhaps to change measurement units. + Sigma is scaled with the absolute value of the constant. + """ + return NormalDist(x1._mu * x2, x1._sigma * fabs(x2)) + + def __truediv__(x1, x2): + """Divide both mu and sigma by a constant. + + Used for rescaling, perhaps to change measurement units. + Sigma is scaled with the absolute value of the constant. + """ + return NormalDist(x1._mu / x2, x1._sigma / fabs(x2)) + + def __pos__(x1): + "Return a copy of the instance." + return NormalDist(x1._mu, x1._sigma) + + def __neg__(x1): + "Negates mu while keeping sigma the same." + return NormalDist(-x1._mu, x1._sigma) + + __radd__ = __add__ + + def __rsub__(x1, x2): + "Subtract a NormalDist from a constant or another NormalDist." + return -(x1 - x2) + + __rmul__ = __mul__ + + def __eq__(x1, x2): + "Two NormalDist objects are equal if their mu and sigma are both equal." + if not isinstance(x2, NormalDist): + return NotImplemented + return x1._mu == x2._mu and x1._sigma == x2._sigma + + def __hash__(self): + "NormalDist objects hash equal if their mu and sigma are both equal." + return hash((self._mu, self._sigma)) + + def __repr__(self): + return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})' |