source: trunk/misc/simulators/count_dirs.py

Last change on this file was b856238, checked in by Alexandre Detiste <alexandre.detiste@…>, at 2024-02-15T15:53:34Z

remove old Python2 future statements

  • Property mode set to 100644
File size: 4.4 KB
Line 
1#!/usr/bin/env python
2
3
"""
This tool estimates how much space would be consumed by a filetree into which
a native directory was copied.

One open question is how we should encode directories. One approach is to put
a block of data on a server, one per directory, which effectively contains a
dictionary that maps child names to targets (URIs for children which are
files, slotnames for children which are directories). To prevent the server
which hosts this data from either learning its contents or corrupting them,
we can add encryption and integrity checks to the data, at the cost of
storage overhead.

This program is intended to estimate the size of these data blocks using
real-world filenames and directories. You point it at a real directory, and
it does a recursive walk of the filesystem, adding up the size of the
filetree data structures that would be required to represent it.

MODES:

 A: no confidentiality or integrity checking. Directories are serialized
    plaintext dictionaries which map file/subdir names to targets (either
    URIs or slotnames). Each entry can be changed independently.
 B1: child names and targets are encrypted. No integrity checks, so the
     server can still corrupt the contents undetectably. Each entry can
     still be changed independently.
 B2: same security properties as B1, but the dictionary is serialized before
     encryption. This reduces overhead at the cost of preventing independent
     updates of entries (all entries must be updated at the same time, so
     test-and-set operations are required to avoid data-losing races)
 C1: like B1, but adding HMACs to each entry to guarantee data integrity
 C2: like B2, but adding a single block-wide HMAC for data integrity

"""
37
38import sys, os.path
39
# Example target for a child that is a file:
#URI:7jzbza6iwdsk5xbxsvdgjaugyrhetw64zpflp4gihmyh5krjblra====:a5qdejwbimu5b2wfke7xwexxlq======:gzeub5v42rjbgd7ccawnahu2evqd42lpdpzd447c6zkmdvjkpowq====:25:100:219889
# That's a printable representation of two 32-byte hashes (storage index, URI
# extension block hash) and a 16-byte AES read-capability key, and some
# share-count and size information. URI_SIZE is the length of that example
# string in characters.
URI_SIZE = 164

# Example target for a child that is a directory:
#pb://xextf3eap44o3wi27mf7ehiur6wvhzr6@207.7.153.180:56677,127.0.0.1:56677/zilcw5uz2yyyo===
# That's a FURL which points at the slot. Modes that need to add a
# read-capability AES key will need more space. SLOTNAME_SIZE is the length
# of that example string in characters.
SLOTNAME_SIZE = 90
50
51
def slotsize(mode, numfiles, numdirs, uri_size=None, slotname_size=None):
    """Estimate the size of one directory's data block, excluding child names.

    The result covers the target strings (the dictionary *values*: one URI
    per file child, one slotname per directory child) plus the per-slot and
    per-entry overhead that the chosen mode adds. The lengths of the child
    names themselves are not included; the caller adds those separately.

    Args:
        mode: one of "A", "B1", "B2", "C1", "C2".
        numfiles: number of children that are files.
        numdirs: number of children that are directories.
        uri_size: size of one file target. Defaults to the module-level
            URI_SIZE when omitted.
        slotname_size: size of one plain (mode "A") directory target.
            Defaults to the module-level SLOTNAME_SIZE when omitted.

    Returns:
        The estimated size as an int.

    Raises:
        ValueError: if ``mode`` is not one of the known modes. (Previously an
            unknown mode was silently costed as if it were mode "A".)
    """
    # (per_slot, per_entry) overhead for each mode. The individual terms are
    # kept exactly as in the original estimate; their breakdown is not
    # documented in this file (presumably key/IV material for the 16 and 12
    # terms and HMAC outputs for the 32 terms -- TODO confirm).
    overheads = {
        "A": (0, 0),
        "B1": (0, 16 + 12 + 12),
        "B2": (12, 0),
        "C1": (0, 16 + 12 + 12 + 32 + 32),
        "C2": (12 + 32, 0),
    }
    try:
        per_slot, per_entry = overheads[mode]
    except KeyError:
        raise ValueError(
            "unknown mode %r; expected one of %s"
            % (mode, ", ".join(sorted(overheads)))
        ) from None

    if uri_size is None:
        uri_size = URI_SIZE
    if slotname_size is None:
        slotname_size = SLOTNAME_SIZE

    # Directory targets are bigger when the read+write-cap slotname is larger
    # than the store-cap, which happens as soon as we seek to prevent the
    # slot's host from reading or corrupting it: every mode except "A" adds
    # 16 to each slotname.
    if mode != "A":
        slotname_size += 16

    # Total space taken by the target strings, split by kind of child.
    target_sizes_for_files = numfiles * uri_size
    target_sizes_for_subdirs = numdirs * slotname_size

    num_entries = numfiles + numdirs
    return (target_sizes_for_files
            + target_sizes_for_subdirs
            + per_slot
            + per_entry * num_entries)
83
# Every encoding mode for which scan() accumulates an estimate; each name is
# one of the mode strings understood by slotsize().
MODES = ("A", "B1", "B2", "C1", "C2")
85
def scan(root):
    """Walk ``root`` recursively and print the estimated filetree size per mode.

    For every directory visited by os.walk, the size of its data block is
    estimated once per mode as slotsize(mode, #files, #subdirs) plus the
    total length of its child names, and the per-mode totals are summed over
    the whole tree.

    Prints the number of subdirectories and files found, followed by one
    "<mode>: <n> bytes" line per mode in sorted order. The directory count
    is the sum of the subdirectory lists, so ``root`` itself is not counted.

    Args:
        root: path of the directory to scan.

    Returns:
        None; the results are written to standard output.
    """
    total = dict.fromkeys(MODES, 0)
    num_files = 0
    num_dirs = 0
    for _absroot, dirs, files in os.walk(root):
        num_files += len(files)
        num_dirs += len(dirs)
        # The report is in bytes, so measure each child name as the bytes it
        # occupies on disk. len() on a str counts code points, which
        # under-counts any name containing non-ASCII characters.
        stringsize = sum(len(os.fsencode(name)) for name in files)
        stringsize += sum(len(os.fsencode(name)) for name in dirs)
        for mode in MODES:
            total[mode] += slotsize(mode, len(files), len(dirs)) + stringsize

    print("%d directories" % num_dirs)
    print("%d files" % num_files)
    for mode in sorted(total):
        print("%s: %d bytes" % (mode, total[mode]))
104
105
if __name__ == '__main__':
    # Without a directory argument, exit with a usage message on stderr
    # instead of an IndexError traceback. Extra arguments are ignored.
    if len(sys.argv) < 2:
        sys.exit("usage: %s DIRECTORY" % sys.argv[0])
    scan(sys.argv[1])
108
"""
260:warner@monolith% ./count_dirs.py ~
70925 directories
457199 files
A: 90042361 bytes
B1: 112302121 bytes
B2: 92027061 bytes
C1: 146102057 bytes
C2: 94293461 bytes

"""
Note: See TracBrowser for help on using the repository browser.