55
56
VALUE := no-newline-no-null-bytes
58
def __init__(self, reference_lists=0):
59
def __init__(self, reference_lists=0, key_elements=1):
59
60
"""Create a GraphIndex builder.
61
62
:param reference_lists: The number of node references lists for each
64
:param key_elements: The number of bytestrings in each key.
64
66
self.reference_lists = reference_lists
68
self._nodes_by_key = {}
69
self._key_length = key_elements
71
def _check_key(self, key):
72
"""Raise BadIndexKey if key is not a valid key for this index."""
73
if type(key) != tuple:
74
raise errors.BadIndexKey(key)
75
if self._key_length != len(key):
76
raise errors.BadIndexKey(key)
78
if not element or _whitespace_re.search(element) is not None:
79
raise errors.BadIndexKey(element)
67
81
def add_node(self, key, value, references=()):
68
82
"""Add a node to the index.
70
:param key: The key. keys must be whitespace-free utf8.
84
:param key: The key. keys are non-empty tuples containing
85
as many whitespace-free utf8 bytestrings as the key length
86
defined for this index.
71
87
:param references: An iterable of iterables of keys. Each is a
72
88
reference to another key.
73
89
:param value: The value to associate with the key. It may be any
74
90
bytes as long as it does not contain \0 or \n.
76
if not key or _whitespace_re.search(key) is not None:
77
raise errors.BadIndexKey(key)
78
93
if _newline_null_re.search(value) is not None:
79
94
raise errors.BadIndexValue(value)
80
95
if len(references) != self.reference_lists:
83
98
for reference_list in references:
84
99
for reference in reference_list:
85
if _whitespace_re.search(reference) is not None:
86
raise errors.BadIndexKey(reference)
100
self._check_key(reference)
87
101
if reference not in self._nodes:
88
102
self._nodes[reference] = ('a', (), '')
89
103
node_refs.append(tuple(reference_list))
90
104
if key in self._nodes and self._nodes[key][0] == '':
91
105
raise errors.BadIndexDuplicateKey(key, self)
92
106
self._nodes[key] = ('', tuple(node_refs), value)
107
if self._key_length > 1:
108
key_dict = self._nodes_by_key
109
if self.reference_lists:
110
key_value = key, value, tuple(node_refs)
112
key_value = key, value
113
# possibly should do this on-demand, but it seems likely it is
115
# For a key of (foo, bar, baz) create
116
# _nodes_by_key[foo][bar][baz] = key_value
117
for subkey in key[:-1]:
118
key_dict = key_dict.setdefault(subkey, {})
119
key_dict[key[-1]] = key_value
95
122
lines = [_SIGNATURE]
96
123
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
97
prefix_length = len(lines[0]) + len(lines[1])
124
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
125
prefix_length = sum(len(x) for x in lines)
98
126
# references are byte offsets. To avoid having to do nasty
99
127
# polynomial work to resolve offsets (references to later in the
100
128
# file cannot be determined until all the inbetween references have
125
153
# date - saves reaccumulating on the second pass
126
154
key_offset_info.append((key, non_ref_bytes, total_references))
127
155
# key is literal, value is literal, there are 3 null's, 1 NL
128
non_ref_bytes += len(key) + len(value) + 3 + 1
156
# key is variable length tuple, \x00 between elements
157
non_ref_bytes += sum(len(element) for element in key)
158
if self._key_length > 1:
159
non_ref_bytes += self._key_length - 1
160
# value is literal bytes, there are 3 null's, 1 NL.
161
non_ref_bytes += len(value) + 3 + 1
129
162
# one byte for absent if set.
131
164
non_ref_bytes += 1
159
192
for reference in ref_list:
160
193
ref_addresses.append(format_string % key_addresses[reference])
161
194
flattened_references.append('\r'.join(ref_addresses))
162
lines.append("%s\0%s\0%s\0%s\n" % (key, absent,
195
string_key = '\x00'.join(key)
196
lines.append("%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
163
197
'\t'.join(flattened_references), value))
164
198
lines.append('\n')
165
199
result = StringIO(''.join(lines))
177
211
Each node has the same number of key reference lists. Each key reference
178
212
list can be empty or an arbitrary length. The value is an opaque NULL
179
213
terminated string without any newlines. The storage of the index is
180
hidden in the interface: keys and key references are always bytestrings,
181
never the internal representation (e.g. dictionary offsets).
214
hidden in the interface: keys and key references are always tuples of
215
bytestrings, never the internal representation (e.g. dictionary offsets).
183
217
It is presumed that the index will not be mutated - it is static data.
197
231
self._transport = transport
198
232
self._name = name
200
def iter_all_entries(self):
201
"""Iterate over all keys within the index.
203
:return: An iterable of (key, value) or (key, value, reference_lists).
204
The former tuple is used when there are no reference lists in the
205
index, making the API compatible with simple key:value index types.
206
There is no defined order for the result iteration - it will be in
207
the most efficient order for the index.
234
self._keys_by_offset = None
235
self._nodes_by_key = None
237
def _buffer_all(self):
238
"""Buffer all the index data.
240
Mutates self._nodes and self.keys_by_offset.
209
242
stream = self._transport.get(self._name)
210
243
self._read_prefix(stream)
244
expected_elements = 3 + self._key_length
212
self.keys_by_offset = {}
246
# raw data keyed by offset
247
self._keys_by_offset = {}
248
# ready-to-return key:value or key:value, node_ref_lists
250
self._nodes_by_key = {}
214
252
pos = stream.tell()
215
253
for line in stream.readlines():
219
key, absent, references, value = line.split('\0')
257
elements = line.split('\0')
258
if len(elements) != expected_elements:
259
raise errors.BadIndexData(self)
261
key = tuple(elements[:self._key_length])
262
absent, references, value = elements[-3:]
220
263
value = value[:-1] # remove the newline
222
265
for ref_string in references.split('\t'):
224
267
int(ref) for ref in ref_string.split('\r') if ref
226
269
ref_lists = tuple(ref_lists)
227
self.keys_by_offset[pos] = (key, absent, ref_lists, value)
270
self._keys_by_offset[pos] = (key, absent, ref_lists, value)
229
for key, absent, references, value in self.keys_by_offset.itervalues():
272
for key, absent, references, value in self._keys_by_offset.itervalues():
232
275
# resolve references:
233
276
if self.node_ref_lists:
235
278
for ref_list in references:
236
node_refs.append(tuple([self.keys_by_offset[ref][0] for ref in ref_list]))
237
yield (key, value, tuple(node_refs))
279
node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
280
node_value = (value, tuple(node_refs))
283
self._nodes[key] = node_value
284
if self._key_length > 1:
285
subkey = list(reversed(key[:-1]))
286
key_dict = self._nodes_by_key
287
if self.node_ref_lists:
288
key_value = key, node_value[0], node_value[1]
290
key_value = key, node_value
291
# possibly should do this on-demand, but it seems likely it is
293
# For a key of (foo, bar, baz) create
294
# _nodes_by_key[foo][bar][baz] = key_value
295
for subkey in key[:-1]:
296
key_dict = key_dict.setdefault(subkey, {})
297
key_dict[key[-1]] = key_value
298
self._keys = set(self._nodes)
240
299
if trailers != 1:
241
300
# there must be one line - the empty trailer line.
242
301
raise errors.BadIndexData(self)
303
def iter_all_entries(self):
304
"""Iterate over all keys within the index.
306
:return: An iterable of (key, value) or (key, value, reference_lists).
307
The former tuple is used when there are no reference lists in the
308
index, making the API compatible with simple key:value index types.
309
There is no defined order for the result iteration - it will be in
310
the most efficient order for the index.
312
if self._nodes is None:
314
if self.node_ref_lists:
315
for key, (value, node_ref_lists) in self._nodes.iteritems():
316
yield key, value, node_ref_lists
318
for key, value in self._nodes.iteritems():
244
321
def _read_prefix(self, stream):
245
322
signature = stream.read(len(self._signature()))
246
323
if not signature == self._signature():
252
329
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
253
330
except ValueError:
254
331
raise errors.BadIndexOptions(self)
332
options_line = stream.readline()
333
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
334
raise errors.BadIndexOptions(self)
336
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
338
raise errors.BadIndexOptions(self)
256
340
def iter_entries(self, keys):
257
341
"""Iterate over keys within the index.
267
for node in self.iter_all_entries():
351
if self._nodes is None:
353
keys = keys.intersection(self._keys)
354
if self.node_ref_lists:
356
value, node_refs = self._nodes[key]
357
yield key, value, node_refs
360
yield key, self._nodes[key]
362
def iter_entries_prefix(self, keys):
363
"""Iterate over keys within the index using prefix matching.
365
Prefix matching is applied within the tuple of a key, not to within
366
the bytestring of each key element. e.g. if you have the keys ('foo',
367
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
368
only the former key is returned.
370
:param keys: An iterable providing the key prefixes to be retrieved.
371
Each key prefix takes the form of a tuple the length of a key, but
372
with the last N elements 'None' rather than a regular bytestring.
373
The first element cannot be 'None'.
374
:return: An iterable as per iter_all_entries, but restricted to the
375
keys with a matching prefix to those supplied. No additional keys
376
will be returned, and every match that is in the index will be
382
# load data - also finds key lengths
383
if self._nodes is None:
385
if self._key_length == 1:
389
raise errors.BadIndexKey(key)
390
if len(key) != self._key_length:
391
raise errors.BadIndexKey(key)
392
if self.node_ref_lists:
393
value, node_refs = self._nodes[key]
394
yield key, value, node_refs
396
yield key, self._nodes[key]
401
raise errors.BadIndexKey(key)
402
if len(key) != self._key_length:
403
raise errors.BadIndexKey(key)
404
# find what it refers to:
405
key_dict = self._nodes_by_key
407
# find the subdict whose contents should be returned.
409
while len(elements) and elements[0] is not None:
410
key_dict = key_dict[elements[0]]
413
# a non-existant lookup.
418
key_dict = dicts.pop(-1)
419
# can't be empty or would not exist
420
item, value = key_dict.iteritems().next()
421
if type(value) == dict:
423
dicts.extend(key_dict.itervalues())
426
for value in key_dict.itervalues():
427
# each value is the key:value:node refs tuple
431
# the last thing looked up was a terminal element
274
434
def _signature(self):
275
435
"""The file signature for this index type."""
346
506
keys.remove(node[0])
509
def iter_entries_prefix(self, keys):
510
"""Iterate over keys within the index using prefix matching.
512
Duplicate keys across child indices are presumed to have the same
513
value and are only reported once.
515
Prefix matching is applied within the tuple of a key, not to within
516
the bytestring of each key element. e.g. if you have the keys ('foo',
517
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
518
only the former key is returned.
520
:param keys: An iterable providing the key prefixes to be retrieved.
521
Each key prefix takes the form of a tuple the length of a key, but
522
with the last N elements 'None' rather than a regular bytestring.
523
The first element cannot be 'None'.
524
:return: An iterable as per iter_all_entries, but restricted to the
525
keys with a matching prefix to those supplied. No additional keys
526
will be returned, and every match that is in the index will be
533
for index in self._indices:
534
for node in index.iter_entries_prefix(keys):
535
if node[0] in seen_keys:
537
seen_keys.add(node[0])
349
540
def validate(self):
350
541
"""Validate that everything in the index can be accessed."""
351
542
for index in self._indices:
366
557
:param nodes: An iterable of (key, node_refs, value) entries to add.
368
for (key, value, node_refs) in nodes:
369
self.add_node(key, value, node_refs)
559
if self.reference_lists:
560
for (key, value, node_refs) in nodes:
561
self.add_node(key, value, node_refs)
563
for (key, value) in nodes:
564
self.add_node(key, value)
371
566
def iter_all_entries(self):
372
567
"""Iterate over all keys within the index
405
600
yield key, node[2]
602
def iter_entries_prefix(self, keys):
603
"""Iterate over keys within the index using prefix matching.
605
Prefix matching is applied within the tuple of a key, not to within
606
the bytestring of each key element. e.g. if you have the keys ('foo',
607
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
608
only the former key is returned.
610
:param keys: An iterable providing the key prefixes to be retrieved.
611
Each key prefix takes the form of a tuple the length of a key, but
612
with the last N elements 'None' rather than a regular bytestring.
613
The first element cannot be 'None'.
614
:return: An iterable as per iter_all_entries, but restricted to the
615
keys with a matching prefix to those supplied. No additional keys
616
will be returned, and every match that is in the index will be
619
# XXX: To much duplication with the GraphIndex class; consider finding
620
# a good place to pull out the actual common logic.
624
if self._key_length == 1:
628
raise errors.BadIndexKey(key)
629
if len(key) != self._key_length:
630
raise errors.BadIndexKey(key)
631
node = self._nodes[key]
634
if self.reference_lists:
635
yield key, node[2], node[1]
642
raise errors.BadIndexKey(key)
643
if len(key) != self._key_length:
644
raise errors.BadIndexKey(key)
645
# find what it refers to:
646
key_dict = self._nodes_by_key
648
# find the subdict to return
650
while len(elements) and elements[0] is not None:
651
key_dict = key_dict[elements[0]]
654
# a non-existant lookup.
659
key_dict = dicts.pop(-1)
660
# can't be empty or would not exist
661
item, value = key_dict.iteritems().next()
662
if type(value) == dict:
664
dicts.extend(key_dict.itervalues())
667
for value in key_dict.itervalues():
407
672
def validate(self):
408
673
"""In memory index's have no known corruption at the moment."""