1 | """ |
---|
2 | Statistical utilities. |
---|
3 | |
---|
4 | Ported to Python 3. |
---|
5 | """ |
---|
6 | # Copyright (c) 2009 Shawn Willden |
---|
7 | # mailto:shawn@willden.org |
---|
8 | # I hereby license all patches I have contributed or will contribute to the |
---|
9 | # Allmydata Tahoe-LAFS project, including the file 'statistics.py', under |
---|
10 | # either the GNU General Public License, version 2 or later, or under the |
---|
11 | # Transitive Grace Period Public License, version 1 or later. |
---|
12 | |
---|
13 | |
---|
14 | from allmydata.util.mathutil import round_sigfigs |
---|
15 | import math |
---|
16 | from functools import reduce |
---|
17 | import sys |
---|
18 | |
---|
19 | def pr_file_loss(p_list, k): |
---|
20 | """ |
---|
21 | Probability of single-file loss for shares with reliabilities in |
---|
22 | p_list. |
---|
23 | |
---|
24 | Computes the probability that a single file will become |
---|
25 | unrecoverable, based on the individual share survival |
---|
26 | probabilities and and k (number of shares needed for recovery). |
---|
27 | |
---|
28 | Example: pr_file_loss([.9] * 5 + [.99] * 5, 3) returns the |
---|
29 | probability that a file with k=3, N=10 and stored on five servers |
---|
30 | with reliability .9 and five servers with reliability .99 is lost. |
---|
31 | |
---|
32 | See survival_pmf docstring for important statistical assumptions. |
---|
33 | |
---|
34 | """ |
---|
35 | assert 0 < k <= len(p_list) |
---|
36 | assert valid_probability_list(p_list) |
---|
37 | |
---|
38 | # Sum elements 0 through k-1 of the share set PMF to get the |
---|
39 | # probability that less than k shares survived. |
---|
40 | return sum(survival_pmf(p_list)[0:k]) |
---|
41 | |
---|
42 | def survival_pmf(p_list): |
---|
43 | """ |
---|
44 | Return the collective PMF of share survival count for a set of |
---|
45 | shares with the individual survival probabilities in p_list. |
---|
46 | |
---|
47 | Example: survival_pmf([.99] * 10 + [.8] * 6) returns the |
---|
48 | probability mass function for the number of shares that will |
---|
49 | survive from an initial set of 16, 10 with p=0.99 and 6 with |
---|
50 | p=0.8. The ith element of the resulting list is the probability |
---|
51 | that exactly i shares will survive. |
---|
52 | |
---|
53 | This calculation makes the following assumptions: |
---|
54 | |
---|
55 | 1. p_list[i] is the probability that any individual share will |
---|
56 | will survive during the time period in question (whatever that may |
---|
57 | be). |
---|
58 | |
---|
59 | 2. The share failures are "independent", in the statistical |
---|
60 | sense. Note that if a group of shares are stored on the same |
---|
61 | machine or even in the same data center, they are NOT independent |
---|
62 | and this calculation is therefore wrong. |
---|
63 | """ |
---|
64 | assert valid_probability_list(p_list) |
---|
65 | |
---|
66 | pmf = survival_pmf_via_conv(p_list) |
---|
67 | |
---|
68 | assert valid_pmf(pmf) |
---|
69 | return pmf |
---|
70 | |
---|
71 | def survival_pmf_via_bd(p_list): |
---|
72 | """ |
---|
73 | Compute share survival PMF using the binomial distribution PMF as |
---|
74 | much as possible. |
---|
75 | |
---|
76 | This is more efficient than the convolution method below, but |
---|
77 | doesn't work for large numbers of shares because the |
---|
78 | binomial_coeff calculation blows up. Since the efficiency gains |
---|
79 | only matter in the case of large numbers of shares, it's pretty |
---|
80 | much useless except for testing the convolution methond. |
---|
81 | |
---|
82 | Note that this function does little to no error checking and is |
---|
83 | intended for internal use and testing only. |
---|
84 | """ |
---|
85 | pmf_list = [ binomial_distribution_pmf(p_list.count(p), p) |
---|
86 | for p in set(p_list) ] |
---|
87 | return list(reduce(convolve, pmf_list)) |
---|
88 | |
---|
89 | def survival_pmf_via_conv(p_list): |
---|
90 | """ |
---|
91 | Compute share survival PMF using iterated convolution of trivial |
---|
92 | PMFs. |
---|
93 | |
---|
94 | Note that this function does little to no error checking and is |
---|
95 | intended for internal use and testing only. |
---|
96 | """ |
---|
97 | pmf_list = [ [1 - p, p] for p in p_list ]; |
---|
98 | return list(reduce(convolve, pmf_list)) |
---|
99 | |
---|
100 | def print_pmf(pmf, n=4, out=sys.stdout): |
---|
101 | """ |
---|
102 | Print a PMF in a readable form, with values rounded to n |
---|
103 | significant digits. |
---|
104 | """ |
---|
105 | for k, p in enumerate(pmf): |
---|
106 | print("i=" + str(k) + ":", round_sigfigs(p, n), file=out) |
---|
107 | |
---|
108 | def pr_backup_file_loss(p_list, backup_p, k): |
---|
109 | """ |
---|
110 | Probability of single-file loss in a backup context |
---|
111 | |
---|
112 | Same as pr_file_loss, except it factors in the probability of |
---|
113 | survival of the original source, specified as backup_p. Because |
---|
114 | that's a precondition to caring about the availability of the |
---|
115 | backup, it's an independent event. |
---|
116 | """ |
---|
117 | assert valid_probability_list(p_list) |
---|
118 | assert 0 < backup_p <= 1 |
---|
119 | assert 0 < k <= len(p_list) |
---|
120 | |
---|
121 | return pr_file_loss(p_list, k) * (1 - backup_p) |
---|
122 | |
---|
123 | |
---|
124 | def find_k(p_list, target_loss_prob): |
---|
125 | """ |
---|
126 | Find the highest k value that achieves the targeted loss |
---|
127 | probability, given the share reliabilities given in p_list. |
---|
128 | """ |
---|
129 | assert valid_probability_list(p_list) |
---|
130 | assert 0 < target_loss_prob < 1 |
---|
131 | |
---|
132 | pmf = survival_pmf(p_list) |
---|
133 | return find_k_from_pmf(pmf, target_loss_prob) |
---|
134 | |
---|
135 | def find_k_from_pmf(pmf, target_loss_prob): |
---|
136 | """ |
---|
137 | Find the highest k value that achieves the targeted loss |
---|
138 | probability, given the share survival PMF given in pmf. |
---|
139 | """ |
---|
140 | assert valid_pmf(pmf) |
---|
141 | assert 0 < target_loss_prob < 1 |
---|
142 | |
---|
143 | loss_prob = 0.0 |
---|
144 | for k, p_k in enumerate(pmf): |
---|
145 | loss_prob += p_k |
---|
146 | if loss_prob > target_loss_prob: |
---|
147 | return k |
---|
148 | |
---|
149 | # we shouldn't be able to get here, since sum(pmf)==1.0 |
---|
150 | k = len(pmf) - 1 |
---|
151 | return k |
---|
152 | |
---|
153 | def repair_count_pmf(survival_pmf, k): |
---|
154 | """ |
---|
155 | Return Pr[D=d], where D represents the number of shares that have |
---|
156 | to be repaired at the end of an interval, starting with a full |
---|
157 | set and subject to losses described in survival_pmf. |
---|
158 | """ |
---|
159 | n = len(survival_pmf) - 1 |
---|
160 | |
---|
161 | # Probability of 0 to repair is the probability of all shares |
---|
162 | # surviving plus the probability of less than k surviving. |
---|
163 | pmf = [ survival_pmf[n] + sum(survival_pmf[0:k]) ] |
---|
164 | |
---|
165 | # Probability of more than 0, up to N-k to repair |
---|
166 | for i in range(1, n-k+1): |
---|
167 | pmf.append(survival_pmf[n-i]) |
---|
168 | |
---|
169 | # Probability of more than N-k to repair is 0, because that means |
---|
170 | # there are less than k available and the file is irreparable. |
---|
171 | for i in range(n-k+1, n+1): |
---|
172 | pmf.append(0.0) |
---|
173 | |
---|
174 | assert(valid_pmf(pmf)) |
---|
175 | return pmf |
---|
176 | |
---|
177 | def bandwidth_cost_function(file_size, shares, k, ul_dl_ratio): |
---|
178 | return file_size + float(file_size) / k * shares * ul_dl_ratio |
---|
179 | |
---|
180 | def mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio): |
---|
181 | """ |
---|
182 | Return the expected cost for a repair run on a file with the given |
---|
183 | survival_pmf and requiring k shares, in which upload cost is |
---|
184 | 'ul_dl_ratio' times download cost. |
---|
185 | """ |
---|
186 | repair_pmf = repair_count_pmf(survival_pmf, k) |
---|
187 | expected_cost = sum([cost_function(file_size, new_shares, k, ul_dl_ratio) |
---|
188 | * repair_pmf[new_shares] |
---|
189 | for new_shares in range(1, len(repair_pmf))]) |
---|
190 | return expected_cost |
---|
191 | |
---|
192 | def eternal_repair_cost(cost_function, file_size, survival_pmf, k, |
---|
193 | discount_rate=0, ul_dl_ratio=1.0): |
---|
194 | """ |
---|
195 | Calculate the eternal repair cost for a file that is aggressively |
---|
196 | repaired, i.e. the sum of repair costs until the file is dead. |
---|
197 | """ |
---|
198 | c = mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio) |
---|
199 | f = 1 - sum(survival_pmf[0:k]) |
---|
200 | r = float(discount_rate) |
---|
201 | |
---|
202 | return (c * (1-r)) / (1 - (1-r) * f) |
---|
203 | |
---|
204 | def valid_pmf(pmf): |
---|
205 | """ |
---|
206 | Validate that pmf looks like a proper discrete probability mass |
---|
207 | function in list form. |
---|
208 | |
---|
209 | Returns true if the elements of pmf sum to 1. |
---|
210 | """ |
---|
211 | return round(sum(pmf),5) == 1.0 |
---|
212 | |
---|
213 | def valid_probability_list(p_list): |
---|
214 | """ |
---|
215 | Validate that p_list is a list of probibilities |
---|
216 | """ |
---|
217 | for p in p_list: |
---|
218 | if p < 0 or p > 1: |
---|
219 | return False |
---|
220 | |
---|
221 | return True |
---|
222 | |
---|
223 | def convolve(list_a, list_b): |
---|
224 | """ |
---|
225 | Returns the discrete convolution of two lists. |
---|
226 | |
---|
227 | Given two random variables X and Y, the convolution of their |
---|
228 | probability mass functions Pr(X) and Pr(Y) is equal to the |
---|
229 | Pr(X+Y). |
---|
230 | """ |
---|
231 | n = len(list_a) |
---|
232 | m = len(list_b) |
---|
233 | |
---|
234 | result = [] |
---|
235 | for i in range(n + m - 1): |
---|
236 | sum = 0.0 |
---|
237 | |
---|
238 | lower = max(0, i - n + 1) |
---|
239 | upper = min(m - 1, i) |
---|
240 | |
---|
241 | for j in range(lower, upper+1): |
---|
242 | sum += list_a[i-j] * list_b[j] |
---|
243 | |
---|
244 | result.append(sum) |
---|
245 | |
---|
246 | return result |
---|
247 | |
---|
248 | def binomial_distribution_pmf(n, p): |
---|
249 | """ |
---|
250 | Returns Pr(K), where K ~ B(n,p), as a list of values. |
---|
251 | |
---|
252 | Returns the full probability mass function of a B(n, p) as a list |
---|
253 | of values, where the kth element is Pr(K=k), or, in the Tahoe |
---|
254 | context, the probability that exactly k copies of a file share |
---|
255 | survive, when placed on n independent servers with survival |
---|
256 | probability p. |
---|
257 | """ |
---|
258 | assert p >= 0 and p <= 1, 'p=%s must be in the range [0,1]'%p |
---|
259 | assert n > 0 |
---|
260 | |
---|
261 | result = [] |
---|
262 | for k in range(n+1): |
---|
263 | result.append(math.pow(p , k ) * |
---|
264 | math.pow(1 - p, n - k) * |
---|
265 | binomial_coeff(n, k)) |
---|
266 | |
---|
267 | assert valid_pmf(result) |
---|
268 | return result; |
---|
269 | |
---|
270 | def binomial_coeff(n, k): |
---|
271 | """ |
---|
272 | Returns the number of ways that k items can be chosen from a set |
---|
273 | of n. |
---|
274 | """ |
---|
275 | assert n >= k |
---|
276 | |
---|
277 | if k > n/2: |
---|
278 | k = n - k |
---|
279 | |
---|
280 | accum = 1.0 |
---|
281 | for i in range(1, k+1): |
---|
282 | accum = accum * (n - k + i) // i; |
---|
283 | |
---|
284 | return int(accum + 0.5) |
---|