source: trunk/src/allmydata/util/statistics.py

Last change on this file was 1cfe843d, checked in by Alexandre Detiste <alexandre.detiste@…>, at 2024-02-22T23:40:25Z

more python2 removal

  • Property mode set to 100644
File size: 8.7 KB
Line 
1"""
2Statistical utilities.
3
4Ported to Python 3.
5"""
6# Copyright (c) 2009 Shawn Willden
7# mailto:shawn@willden.org
8# I hereby license all patches I have contributed or will contribute to the
9# Allmydata Tahoe-LAFS project, including the file 'statistics.py', under
10# either the GNU General Public License, version 2 or later, or under the
11# Transitive Grace Period Public License, version 1 or later.
12
13
14from allmydata.util.mathutil import round_sigfigs
15import math
16from functools import reduce
17import sys
18
19def pr_file_loss(p_list, k):
20    """
21    Probability of single-file loss for shares with reliabilities in
22    p_list.
23
24    Computes the probability that a single file will become
25    unrecoverable, based on the individual share survival
26    probabilities and and k (number of shares needed for recovery).
27
28    Example: pr_file_loss([.9] * 5 + [.99] * 5, 3) returns the
29    probability that a file with k=3, N=10 and stored on five servers
30    with reliability .9 and five servers with reliability .99 is lost.
31
32    See survival_pmf docstring for important statistical assumptions.
33
34    """
35    assert 0 < k <= len(p_list)
36    assert valid_probability_list(p_list)
37
38    # Sum elements 0 through k-1 of the share set PMF to get the
39    # probability that less than k shares survived.
40    return sum(survival_pmf(p_list)[0:k])
41
42def survival_pmf(p_list):
43    """
44    Return the collective PMF of share survival count for a set of
45    shares with the individual survival probabilities in p_list.
46
47    Example: survival_pmf([.99] * 10 + [.8] * 6) returns the
48    probability mass function for the number of shares that will
49    survive from an initial set of 16, 10 with p=0.99 and 6 with
50    p=0.8.  The ith element of the resulting list is the probability
51    that exactly i shares will survive.
52
53    This calculation makes the following assumptions:
54
55    1.  p_list[i] is the probability that any individual share will
56    will survive during the time period in question (whatever that may
57    be).
58
59    2.  The share failures are "independent", in the statistical
60    sense.  Note that if a group of shares are stored on the same
61    machine or even in the same data center, they are NOT independent
62    and this calculation is therefore wrong.
63    """
64    assert valid_probability_list(p_list)
65
66    pmf = survival_pmf_via_conv(p_list)
67
68    assert valid_pmf(pmf)
69    return pmf
70
71def survival_pmf_via_bd(p_list):
72    """
73    Compute share survival PMF using the binomial distribution PMF as
74    much as possible.
75
76    This is more efficient than the convolution method below, but
77    doesn't work for large numbers of shares because the
78    binomial_coeff calculation blows up.  Since the efficiency gains
79    only matter in the case of large numbers of shares, it's pretty
80    much useless except for testing the convolution methond.
81
82    Note that this function does little to no error checking and is
83    intended for internal use and testing only.
84    """
85    pmf_list = [ binomial_distribution_pmf(p_list.count(p), p)
86                 for p in set(p_list) ]
87    return list(reduce(convolve, pmf_list))
88
89def survival_pmf_via_conv(p_list):
90    """
91    Compute share survival PMF using iterated convolution of trivial
92    PMFs.
93
94    Note that this function does little to no error checking and is
95    intended for internal use and testing only.
96    """
97    pmf_list = [ [1 - p, p] for p in p_list ];
98    return list(reduce(convolve, pmf_list))
99
100def print_pmf(pmf, n=4, out=sys.stdout):
101    """
102    Print a PMF in a readable form, with values rounded to n
103    significant digits.
104    """
105    for k, p in enumerate(pmf):
106        print("i=" + str(k) + ":", round_sigfigs(p, n), file=out)
107
108def pr_backup_file_loss(p_list, backup_p, k):
109    """
110    Probability of single-file loss in a backup context
111
112    Same as pr_file_loss, except it factors in the probability of
113    survival of the original source, specified as backup_p.  Because
114    that's a precondition to caring about the availability of the
115    backup, it's an independent event.
116    """
117    assert valid_probability_list(p_list)
118    assert 0 < backup_p <= 1
119    assert 0 < k <= len(p_list)
120
121    return pr_file_loss(p_list, k) * (1 - backup_p)
122
123
124def find_k(p_list, target_loss_prob):
125    """
126    Find the highest k value that achieves the targeted loss
127    probability, given the share reliabilities given in p_list.
128    """
129    assert valid_probability_list(p_list)
130    assert 0 < target_loss_prob < 1
131
132    pmf = survival_pmf(p_list)
133    return find_k_from_pmf(pmf, target_loss_prob)
134
135def find_k_from_pmf(pmf, target_loss_prob):
136    """
137    Find the highest k value that achieves the targeted loss
138    probability, given the share survival PMF given in pmf.
139    """
140    assert valid_pmf(pmf)
141    assert 0 < target_loss_prob < 1
142
143    loss_prob = 0.0
144    for k, p_k in enumerate(pmf):
145        loss_prob += p_k
146        if loss_prob > target_loss_prob:
147            return k
148
149    # we shouldn't be able to get here, since sum(pmf)==1.0
150    k = len(pmf) - 1
151    return k
152
153def repair_count_pmf(survival_pmf, k):
154    """
155    Return Pr[D=d], where D represents the number of shares that have
156    to be repaired at the end of an interval, starting with a full
157    set and subject to losses described in survival_pmf.
158    """
159    n = len(survival_pmf) - 1
160
161    # Probability of 0 to repair is the probability of all shares
162    # surviving plus the probability of less than k surviving.
163    pmf = [ survival_pmf[n] + sum(survival_pmf[0:k]) ]
164
165    # Probability of more than 0, up to N-k to repair
166    for i in range(1, n-k+1):
167        pmf.append(survival_pmf[n-i])
168
169    # Probability of more than N-k to repair is 0, because that means
170    # there are less than k available and the file is irreparable.
171    for i in range(n-k+1, n+1):
172        pmf.append(0.0)
173
174    assert(valid_pmf(pmf))
175    return pmf
176
177def bandwidth_cost_function(file_size, shares, k, ul_dl_ratio):
178    return file_size + float(file_size) / k * shares * ul_dl_ratio
179
180def mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio):
181    """
182    Return the expected cost for a repair run on a file with the given
183    survival_pmf and requiring k shares, in which upload cost is
184    'ul_dl_ratio' times download cost.
185    """
186    repair_pmf = repair_count_pmf(survival_pmf, k)
187    expected_cost = sum([cost_function(file_size, new_shares, k, ul_dl_ratio)
188                         * repair_pmf[new_shares]
189                         for new_shares in range(1, len(repair_pmf))])
190    return expected_cost
191
192def eternal_repair_cost(cost_function, file_size, survival_pmf, k,
193                        discount_rate=0, ul_dl_ratio=1.0):
194    """
195    Calculate the eternal repair cost for a file that is aggressively
196    repaired, i.e. the sum of repair costs until the file is dead.
197    """
198    c = mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio)
199    f = 1 - sum(survival_pmf[0:k])
200    r = float(discount_rate)
201
202    return (c * (1-r)) / (1 - (1-r) * f)
203
204def valid_pmf(pmf):
205    """
206    Validate that pmf looks like a proper discrete probability mass
207    function in list form.
208
209    Returns true if the elements of pmf sum to 1.
210    """
211    return round(sum(pmf),5) == 1.0
212
213def valid_probability_list(p_list):
214    """
215    Validate that p_list is a list of probibilities
216    """
217    for p in p_list:
218        if p < 0 or p > 1:
219            return False
220
221    return True
222
223def convolve(list_a, list_b):
224    """
225    Returns the discrete convolution of two lists.
226
227    Given two random variables X and Y, the convolution of their
228    probability mass functions Pr(X) and Pr(Y) is equal to the
229    Pr(X+Y).
230    """
231    n = len(list_a)
232    m = len(list_b)
233
234    result = []
235    for i in range(n + m - 1):
236        sum = 0.0
237
238        lower = max(0, i - n + 1)
239        upper = min(m - 1, i)
240
241        for j in range(lower, upper+1):
242            sum += list_a[i-j] * list_b[j]
243
244        result.append(sum)
245
246    return result
247
248def binomial_distribution_pmf(n, p):
249    """
250    Returns Pr(K), where K ~ B(n,p), as a list of values.
251
252    Returns the full probability mass function of a B(n, p) as a list
253    of values, where the kth element is Pr(K=k), or, in the Tahoe
254    context, the probability that exactly k copies of a file share
255    survive, when placed on n independent servers with survival
256    probability p.
257    """
258    assert p >= 0 and p <= 1, 'p=%s must be in the range [0,1]'%p
259    assert n > 0
260
261    result = []
262    for k in range(n+1):
263        result.append(math.pow(p    , k    ) *
264                      math.pow(1 - p, n - k) *
265                      binomial_coeff(n, k))
266
267    assert valid_pmf(result)
268    return result;
269
270def binomial_coeff(n, k):
271    """
272    Returns the number of ways that k items can be chosen from a set
273    of n.
274    """
275    assert n >= k
276
277    if k > n/2:
278        k = n - k
279
280    accum = 1.0
281    for i in range(1, k+1):
282        accum = accum * (n - k + i) // i;
283
284    return int(accum + 0.5)
Note: See TracBrowser for help on using the repository browser.