1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | |
---|
4 | """ |
---|
5 | This tool estimates how much space would be consumed by a filetree into which |
---|
6 | a native directory was copied. |
---|
7 | |
---|
8 | One open question is how we should encode directories. One approach is to put |
---|
9 | a block of data on a server, one per directory, which effectively contains a |
---|
10 | dictionary that maps child names to targets (URIs for children which are |
---|
11 | files, slotnames for children which are directories). To prevent the server |
---|
12 | which hosts this data from either learning its contents or corrupting them, |
---|
13 | we can add encryption and integrity checks to the data, at the cost of |
---|
14 | storage overhead. |
---|
15 | |
---|
16 | This program is intended to estimate the size of these data blocks using |
---|
17 | real-world filenames and directories. You point it at a real directory, and |
---|
18 | it does a recursive walk of the filesystem, adding up the size of the |
---|
19 | filetree data structures that would be required to represent it. |
---|
20 | |
---|
21 | MODES: |
---|
22 | |
---|
23 | A: no confidentiality or integrity checking. Directories are serialized |
---|
24 | plaintext dictionaries which map file/subdir names to targets (either |
---|
25 | URIs or slotnames). Each entry can be changed independently. |
---|
26 | B1: child names and targets are encrypted. No integrity checks, so the |
---|
27 | server can still corrupt the contents undetectably. Each entry can |
---|
28 | still be changed independently. |
---|
29 | B2: same security properties as B1, but the dictionary is serialized before |
---|
30 | encryption. This reduces overhead at the cost of preventing independent |
---|
31 | updates of entries (all entries must be updated at the same time, so |
---|
32 | test-and-set operations are required to avoid data-losing races) |
---|
33 | C1: like B1, but adding HMACs to each entry to guarantee data integrity |
---|
34 | C2: like B2, but adding a single block-wide HMAC for data integrity |
---|
35 | |
---|
36 | """ |
---|
37 | |
---|
38 | import sys, os.path |
---|
39 | |
---|
#URI:7jzbza6iwdsk5xbxsvdgjaugyrhetw64zpflp4gihmyh5krjblra====:a5qdejwbimu5b2wfke7xwexxlq======:gzeub5v42rjbgd7ccawnahu2evqd42lpdpzd447c6zkmdvjkpowq====:25:100:219889
# that's a printable representation of two 32-byte hashes (storage index, URI
# extension block hash) and a 16-byte AES read-capability key, and some
# share-count and size information. The example above is 164 characters long.
URI_SIZE = 164

#pb://xextf3eap44o3wi27mf7ehiur6wvhzr6@207.7.153.180:56677,127.0.0.1:56677/zilcw5uz2yyyo===
# that's a FURL which points at the slot (the example above is 90 characters
# long). Modes that need to add a read-capability AES key will need more
# space.
SLOTNAME_SIZE = 90

# Extra bytes added to every slotname in the encrypted modes (B1/B2/C1/C2);
# this is the read-capability AES key mentioned above.
_SLOTNAME_KEY_SIZE = 16

# mode -> (per_slot, per_entry) overhead in bytes.
#
# B1/C1 protect every entry independently, so their overhead is paid once per
# entry; B2/C2 serialize the whole dictionary first, so their overhead is
# paid once per slot. C1 and C2 are B1 and B2 plus the integrity-check bytes
# (the HMACs described in the module docstring). The individual terms are
# kept exactly as originally written; their finer breakdown is not
# documented in this file.
_OVERHEAD_BY_MODE = {
    "A": (0, 0),
    "B1": (0, 16+12+12),
    "B2": (12, 0),
    "C1": (0, 16+12+12 + 32+32),
    "C2": (12+32, 0),
}


def slotsize(mode, numfiles, numdirs):
    """Estimate the size in bytes of one directory slot, minus child names.

    mode: one of "A", "B1", "B2", "C1", "C2" (see the module docstring)
    numfiles: number of children that are files
    numdirs: number of children that are directories

    Returns the space taken by the targets (the dictionary values) plus the
    per-slot and per-entry overhead of the chosen mode. The child-name
    strings themselves are NOT included; the caller adds those.

    Raises ValueError if mode is not one of the known modes. Previously an
    unrecognised mode was silently sized as if it were mode "A".
    """
    try:
        per_slot, per_entry = _OVERHEAD_BY_MODE[mode]
    except KeyError:
        raise ValueError("unknown mode %r (expected one of %s)"
                         % (mode, ", ".join(sorted(_OVERHEAD_BY_MODE))))

    # total space taken up by the target strings for all of the targets that
    # are files, instead of directories
    target_sizes_for_files = numfiles * URI_SIZE

    # Targets that are directories are slotnames. These are bigger when the
    # read+write-cap slotname is larger than the store-cap, which happens as
    # soon as we seek to prevent the slot's host from reading or corrupting
    # it, i.e. in every mode except "A".
    slotname_size = SLOTNAME_SIZE
    if mode != "A":
        slotname_size += _SLOTNAME_KEY_SIZE
    target_sizes_for_subdirs = numdirs * slotname_size

    num_entries = numfiles + numdirs
    return (target_sizes_for_files +
            target_sizes_for_subdirs +
            per_slot +
            per_entry * num_entries)
---|
83 | |
---|
MODES = ("A", "B1", "B2", "C1", "C2")


def _name_size(name):
    """Return the length of a child name in bytes, as stored on disk."""
    if isinstance(name, bytes):
        # already a byte string, so its length is the byte count
        return len(name)
    # len() of a text string counts code points, not bytes, which would
    # under-count every non-ASCII filename.
    return len(os.fsencode(name))


def _report_walk_error(err):
    """Tell the user about a directory os.walk() could not list."""
    sys.stderr.write("warning: %s\n" % (err,))


def scan(root):
    """Walk the directory tree at root and print size estimates per mode.

    For every directory visited, the size of the slot that would represent
    it is computed for each mode in MODES (via slotsize()), the byte length
    of all of its child names is added, and the results are summed.

    Prints the number of subdirectories and files seen, then one line per
    mode with the estimated total in bytes. Directories that cannot be
    listed are reported on stderr and skipped. Returns None.
    """
    total = dict.fromkeys(MODES, 0)
    num_files = 0
    num_dirs = 0
    for _dirpath, dirs, files in os.walk(root, onerror=_report_walk_error):
        num_files += len(files)
        num_dirs += len(dirs)
        # space taken by the child names (the dictionary keys) of this slot
        stringsize = sum(_name_size(name) for name in files + dirs)
        for mode in MODES:
            total[mode] += slotsize(mode, len(files), len(dirs)) + stringsize

    print("%d directories" % num_dirs)
    print("%d files" % num_files)
    for mode in sorted(total):
        print("%s: %d bytes" % (mode, total[mode]))
---|
104 | |
---|
105 | |
---|
if __name__ == '__main__':
    # Without a directory argument sys.argv[1] would raise a bare IndexError;
    # print a usage line and exit with a failure status instead. Extra
    # arguments are still ignored, as before.
    if len(sys.argv) < 2:
        sys.exit("usage: %s DIRECTORY" % sys.argv[0])
    scan(sys.argv[1])
---|
108 | |
---|
109 | """ |
---|
110 | 260:warner@monolith% ./count_dirs.py ~ |
---|
111 | 70925 directories |
---|
112 | 457199 files |
---|
113 | A: 90042361 bytes |
---|
114 | B1: 112302121 bytes |
---|
115 | B2: 92027061 bytes |
---|
116 | C1: 146102057 bytes |
---|
117 | C2: 94293461 bytes |
---|
118 | |
---|
119 | """ |
---|