source: trunk/misc/operations_helpers/provisioning/reliability.py

Last change on this file was b856238, checked in by Alexandre Detiste <alexandre.detiste@…>, at 2024-02-15T15:53:34Z

remove old Python2 future statements

  • Property mode set to 100644
File size: 11.1 KB
Line 
1#! /usr/bin/python
2
3
4import math
5from allmydata.util import statistics
6from numpy import array, matrix, dot
7
8DAY=24*60*60
9MONTH=31*DAY
10YEAR=365*DAY
11
12class ReliabilityModel(object):
13    """Generate a model of system-wide reliability, given several input
14    parameters.
15
16    This runs a simulation in which time is quantized down to 'delta' seconds
17    (default is one month): a smaller delta will result in a more accurate
18    simulation, but will take longer to run. 'report_span' simulated seconds
19    will be run.
20
21    The encoding parameters are provided as 'k' (minimum number of shares
22    needed to recover the file) and 'N' (total number of shares generated).
23    The default parameters are 3-of-10.
24
25    The first step is to build a probability of individual drive loss during
26    any given delta. This uses a simple exponential model, in which the
27    average drive lifetime is specified by the 'drive_lifetime' parameter
28    (default is 8 years).
29
30    The second step is to calculate a 'transition matrix': a table of
31    probabilities that shows, given A shares at the start of the delta, what
32    the chances are of having B shares left at the end of the delta. The
33    current code optimistically assumes all drives are independent. A
34    subclass could override that assumption.
35
36    An additional 'repair matrix' is created to show what happens when the
37    Checker/Repairer is run. In the simulation, the Checker will be run every
38    'check_period' seconds (default is one month), and the Repairer will be
39    run if it sees fewer than 'R' shares (default 7).
40
41    The third step is to finally run the simulation. An initial probability
42    vector is created (with a 100% chance of N shares and a 0% chance of
43    fewer than N shares), then it is multiplied by the transition matrix for
44    every delta of time. Each time the Checker is to be run, the repair
45    matrix is multiplied in, and some additional stats are accumulated
46    (average number of repairs that occur, average number of shares
47    regenerated per repair).
48
49    The output is a ReliabilityReport instance, which contains a table that
50    samples the state of the simulation once each 'report_period' seconds
51    (defaults to 3 months). Each row of this table will contain the
52    probability vector for one sample period (chance of having X shares, from
53    0 to N, at the end of the period). The report will also contain other
54    information.
55
56    """
57
58    @classmethod
59    def run(klass,
60            drive_lifetime=8*YEAR,
61            k=3, R=7, N=10,
62            delta=1*MONTH,
63            check_period=1*MONTH,
64            report_period=3*MONTH,
65            report_span=5*YEAR,
66            ):
67        self = klass()
68
69        check_period = check_period-1
70        P = self.p_in_period(drive_lifetime, delta)
71
72        decay = self.build_decay_matrix(N, P)
73
74        repair = self.build_repair_matrix(k, N, R)
75
76        #print("DECAY:", decay)
77        #print("OLD-POST-REPAIR:", old_post_repair)
78        #print("NEW-POST-REPAIR:", decay * repair)
79        #print("REPAIR:", repair)
80        #print("DIFF:", (old_post_repair - decay * repair))
81
82        START = array([0]*N + [1])
83        DEAD = array([1]*k + [0]*(1+N-k))
84        REPAIRp = array([0]*k + [1]*(R-k) + [0]*(1+N-R))
85        REPAIR_newshares = array([0]*k +
86                                 [N-i for i in range(k, R)] +
87                                 [0]*(1+N-R))
88        assert REPAIR_newshares.shape[0] == N+1
89        #print("START", START)
90        #print("REPAIRp", REPAIRp)
91        #print("REPAIR_newshares", REPAIR_newshares)
92
93        unmaintained_state = START
94        maintained_state = START
95        last_check = 0
96        last_report = 0
97        P_repaired_last_check_period = 0.0
98        needed_repairs = []
99        needed_new_shares = []
100        report = ReliabilityReport()
101
102        for t in range(0, report_span+delta, delta):
103            # the .A[0] turns the one-row matrix back into an array
104            unmaintained_state = (unmaintained_state * decay).A[0]
105            maintained_state = (maintained_state * decay).A[0]
106            if (t-last_check) > check_period:
107                last_check = t
108                # we do a check-and-repair this frequently
109                need_repair = dot(maintained_state, REPAIRp)
110
111                P_repaired_last_check_period = need_repair
112                new_shares = dot(maintained_state, REPAIR_newshares)
113                needed_repairs.append(need_repair)
114                needed_new_shares.append(new_shares)
115
116                maintained_state = (maintained_state * repair).A[0]
117
118            if (t-last_report) > report_period:
119                last_report = t
120                P_dead_unmaintained = dot(unmaintained_state, DEAD)
121                P_dead_maintained = dot(maintained_state, DEAD)
122                cumulative_number_of_repairs = sum(needed_repairs)
123                cumulative_number_of_new_shares = sum(needed_new_shares)
124                report.add_sample(t, unmaintained_state, maintained_state,
125                                  P_repaired_last_check_period,
126                                  cumulative_number_of_repairs,
127                                  cumulative_number_of_new_shares,
128                                  P_dead_unmaintained, P_dead_maintained)
129
130        # record one more sample at the end of the run
131        P_dead_unmaintained = dot(unmaintained_state, DEAD)
132        P_dead_maintained = dot(maintained_state, DEAD)
133        cumulative_number_of_repairs = sum(needed_repairs)
134        cumulative_number_of_new_shares = sum(needed_new_shares)
135        report.add_sample(t, unmaintained_state, maintained_state,
136                          P_repaired_last_check_period,
137                          cumulative_number_of_repairs,
138                          cumulative_number_of_new_shares,
139                          P_dead_unmaintained, P_dead_maintained)
140
141        #def yandm(seconds):
142        #    return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
143        #needed_repairs_total = sum(needed_repairs)
144        #needed_new_shares_total = sum(needed_new_shares)
145        #print("at 2y:")
146        #print(" unmaintained", unmaintained_state)
147        #print(" maintained", maintained_state)
148        #print(" number of repairs", needed_repairs_total)
149        #print(" new shares generated", needed_new_shares_total)
150        #repair_rate_inv = report_span / needed_repairs_total
151        #print("  avg repair rate: once every %s" % yandm(repair_rate_inv))
152        #print("  avg repair download: one share every %s" % yandm(repair_rate_inv/k))
153        #print("  avg repair upload: one share every %s" % yandm(report_span / needed_new_shares_total))
154
155        return report
156
157    def p_in_period(self, avg_lifetime, period):
158        """Given an average lifetime of a disk (using an exponential model),
159        what is the chance that a live disk will survive the next 'period'
160        seconds?"""
161
162        # eg p_in_period(8*YEAR, MONTH) = 98.94%
163        return math.exp(-1.0*period/avg_lifetime)
164
165    def build_decay_matrix(self, N, P):
166        """Return a decay matrix. decay[start_shares][end_shares] is the
167        conditional probability of finishing with end_shares, given that we
168        started with start_shares."""
169        decay_rows = []
170        decay_rows.append( [0.0]*(N+1) )
171        for start_shares in range(1, (N+1)):
172            end_shares = self.build_decay_row(start_shares, P)
173            decay_row = end_shares + [0.0] * (N-start_shares)
174            assert len(decay_row) == (N+1), len(decay_row)
175            decay_rows.append(decay_row)
176
177        decay = matrix(decay_rows)
178        return decay
179
180    def build_decay_row(self, start_shares, P):
181        """Return a decay row 'end_shares'. end_shares[i] is the chance that
182        we finish with i shares, given that we started with start_shares, for
183        all i between 0 and start_shares, inclusive. This implementation
184        assumes that all shares are independent (IID), but a more complex
185        model could incorporate inter-share failure correlations like having
186        two shares on the same server."""
187        end_shares = statistics.binomial_distribution_pmf(start_shares, P)
188        return end_shares
189
190    def build_repair_matrix(self, k, N, R):
191        """Return a repair matrix. repair[start][end]: is the conditional
192        probability of the repairer finishing with 'end' shares, given that
193        it began with 'start' shares (repair if fewer than R shares). The
194        repairer's behavior is deterministic, so all values in this matrix
195        are either 0 or 1. This matrix should be applied *after* the decay
196        matrix."""
197        new_repair_rows = []
198        for start_shares in range(0, N+1):
199            new_repair_row = [0] * (N+1)
200            if start_shares < k:
201                new_repair_row[start_shares] = 1
202            elif start_shares < R:
203                new_repair_row[N] = 1
204            else:
205                new_repair_row[start_shares] = 1
206            new_repair_rows.append(new_repair_row)
207
208        repair = matrix(new_repair_rows)
209        return repair
210
211class ReliabilityReport(object):
212    def __init__(self):
213        self.samples = []
214
215    def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs,
216                   P_repaired_last_check_period,
217                   cumulative_number_of_repairs,
218                   cumulative_number_of_new_shares,
219                   P_dead_unmaintained, P_dead_maintained):
220        """
221        when: the timestamp at the end of the report period
222        unmaintained_shareprobs: a vector of probabilities, element[S]
223                                 is the chance that there are S shares
224                                 left at the end of the report period.
225                                 This tracks what happens if no repair
226                                 is ever done.
227        maintained_shareprobs: same, but for 'maintained' grids, where
228                               check and repair is done at the end
229                               of each check period
230        P_repaired_last_check_period: a float, with the probability
231                                      that a repair was performed
232                                      at the end of the most recent
233                                      check period.
234        cumulative_number_of_repairs: a float, with the average number
235                                      of repairs that will have been
236                                      performed by the end of the
237                                      report period
238        cumulative_number_of_new_shares: a float, with the average number
239                                         of new shares that repair proceses
240                                         generated by the end of the report
241                                         period
242        P_dead_unmaintained: a float, with the chance that the file will
243                             be unrecoverable at the end of the period
244        P_dead_maintained: same, but for maintained grids
245
246        """
247        row = (when, unmaintained_shareprobs, maintained_shareprobs,
248               P_repaired_last_check_period,
249               cumulative_number_of_repairs,
250               cumulative_number_of_new_shares,
251               P_dead_unmaintained, P_dead_maintained)
252        self.samples.append(row)
Note: See TracBrowser for help on using the repository browser.