94
97
if not element or _whitespace_re.search(element) is not None:
95
98
raise errors.BadIndexKey(element)
97
def add_node(self, key, value, references=()):
98
"""Add a node to the index.
100
:param key: The key. keys are non-empty tuples containing
101
as many whitespace-free utf8 bytestrings as the key length
102
defined for this index.
103
:param references: An iterable of iterables of keys. Each is a
104
reference to another key.
105
:param value: The value to associate with the key. It may be any
106
bytes as long as it does not contain \0 or \n.
100
def _get_nodes_by_key(self):
101
if self._nodes_by_key is None:
103
if self.reference_lists:
104
for key, (absent, references, value) in self._nodes.iteritems():
107
key_dict = nodes_by_key
108
for subkey in key[:-1]:
109
key_dict = key_dict.setdefault(subkey, {})
110
key_dict[key[-1]] = key, value, references
112
for key, (absent, references, value) in self._nodes.iteritems():
115
key_dict = nodes_by_key
116
for subkey in key[:-1]:
117
key_dict = key_dict.setdefault(subkey, {})
118
key_dict[key[-1]] = key, value
119
self._nodes_by_key = nodes_by_key
120
return self._nodes_by_key
122
def _update_nodes_by_key(self, key, value, node_refs):
123
"""Update the _nodes_by_key dict with a new key.
125
For a key of (foo, bar, baz) create
126
_nodes_by_key[foo][bar][baz] = key_value
128
if self._nodes_by_key is None:
130
key_dict = self._nodes_by_key
131
if self.reference_lists:
132
key_value = key, value, node_refs
134
key_value = key, value
135
for subkey in key[:-1]:
136
key_dict = key_dict.setdefault(subkey, {})
137
key_dict[key[-1]] = key_value
139
def _check_key_ref_value(self, key, references, value):
140
"""Check that 'key' and 'references' are all valid.
142
:param key: A key tuple. Must conform to the key interface (be a tuple,
143
be of the right length, not have any whitespace or nulls in any key
145
:param references: An iterable of reference lists. Something like
146
[[(ref, key)], [(ref, key), (other, key)]]
147
:param value: The value associate with this key. Must not contain
148
newlines or null characters.
149
:return: (node_refs, absent_references)
150
node_refs basically a packed form of 'references' where all
152
absent_references reference keys that are not in self._nodes.
153
This may contain duplicates if the same key is
154
referenced in multiple lists.
108
156
self._check_key(key)
109
157
if _newline_null_re.search(value) is not None:
111
159
if len(references) != self.reference_lists:
112
160
raise errors.BadIndexValue(references)
162
absent_references = []
114
163
for reference_list in references:
115
164
for reference in reference_list:
116
self._check_key(reference)
165
# If reference *is* in self._nodes, then we know it has already
117
167
if reference not in self._nodes:
118
self._nodes[reference] = ('a', (), '')
168
self._check_key(reference)
169
absent_references.append(reference)
119
170
node_refs.append(tuple(reference_list))
120
if key in self._nodes and self._nodes[key][0] == '':
171
return tuple(node_refs), absent_references
173
def add_node(self, key, value, references=()):
174
"""Add a node to the index.
176
:param key: The key. keys are non-empty tuples containing
177
as many whitespace-free utf8 bytestrings as the key length
178
defined for this index.
179
:param references: An iterable of iterables of keys. Each is a
180
reference to another key.
181
:param value: The value to associate with the key. It may be any
182
bytes as long as it does not contain \0 or \n.
185
absent_references) = self._check_key_ref_value(key, references, value)
186
if key in self._nodes and self._nodes[key][0] != 'a':
121
187
raise errors.BadIndexDuplicateKey(key, self)
122
self._nodes[key] = ('', tuple(node_refs), value)
188
for reference in absent_references:
189
# There may be duplicates, but I don't think it is worth worrying
191
self._nodes[reference] = ('a', (), '')
192
self._nodes[key] = ('', node_refs, value)
123
193
self._keys.add(key)
124
if self._key_length > 1:
125
key_dict = self._nodes_by_key
126
if self.reference_lists:
127
key_value = key, value, tuple(node_refs)
129
key_value = key, value
130
# possibly should do this on-demand, but it seems likely it is
132
# For a key of (foo, bar, baz) create
133
# _nodes_by_key[foo][bar][baz] = key_value
134
for subkey in key[:-1]:
135
key_dict = key_dict.setdefault(subkey, {})
136
key_dict[key[-1]] = key_value
194
if self._nodes_by_key is not None and self._key_length > 1:
195
self._update_nodes_by_key(key, value, node_refs)
138
197
def finish(self):
139
198
lines = [_SIGNATURE]
142
201
lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
143
202
prefix_length = sum(len(x) for x in lines)
144
203
# references are byte offsets. To avoid having to do nasty
145
# polynomial work to resolve offsets (references to later in the
204
# polynomial work to resolve offsets (references to later in the
146
205
# file cannot be determined until all the inbetween references have
147
206
# been calculated too) we pad the offsets with 0's to make them be
148
207
# of consistent length. Using binary offsets would break the trivial
219
278
raise errors.BzrError('Failed index creation. Internal error:'
220
279
' mismatched output length and expected length: %d %d' %
221
280
(len(result.getvalue()), expected_bytes))
222
return StringIO(''.join(lines))
283
def set_optimize(self, for_size=True):
284
"""Change how the builder tries to optimize the result.
286
:param for_size: Tell the builder to try and make the index as small as
290
# GraphIndexBuilder itself doesn't pay attention to the flag yet, but
292
self._optimize_for_size = for_size
225
295
class GraphIndex(object):
284
356
def __ne__(self, other):
285
357
return not self.__eq__(other)
287
def _buffer_all(self):
360
return "%s(%r)" % (self.__class__.__name__,
361
self._transport.abspath(self._name))
363
def _buffer_all(self, stream=None):
288
364
"""Buffer all the index data.
290
366
Mutates self._nodes and self.keys_by_offset.
368
if self._nodes is not None:
369
# We already did this
292
371
if 'index' in debug.debug_flags:
293
372
mutter('Reading entire index %s', self._transport.abspath(self._name))
294
stream = self._transport.get(self._name)
374
stream = self._transport.get(self._name)
295
375
self._read_prefix(stream)
296
376
self._expected_elements = 3 + self._key_length
315
395
node_value = value
316
396
self._nodes[key] = node_value
317
if self._key_length > 1:
318
subkey = list(reversed(key[:-1]))
319
key_dict = self._nodes_by_key
320
if self.node_ref_lists:
321
key_value = key, node_value[0], node_value[1]
323
key_value = key, node_value
324
# possibly should do this on-demand, but it seems likely it is
326
# For a key of (foo, bar, baz) create
327
# _nodes_by_key[foo][bar][baz] = key_value
328
for subkey in key[:-1]:
329
key_dict = key_dict.setdefault(subkey, {})
330
key_dict[key[-1]] = key_value
331
397
# cache the keys for quick set intersections
332
398
self._keys = set(self._nodes)
333
399
if trailers != 1:
334
400
# there must be one line - the empty trailer line.
335
401
raise errors.BadIndexData(self)
403
def _get_nodes_by_key(self):
404
if self._nodes_by_key is None:
406
if self.node_ref_lists:
407
for key, (value, references) in self._nodes.iteritems():
408
key_dict = nodes_by_key
409
for subkey in key[:-1]:
410
key_dict = key_dict.setdefault(subkey, {})
411
key_dict[key[-1]] = key, value, references
413
for key, value in self._nodes.iteritems():
414
key_dict = nodes_by_key
415
for subkey in key[:-1]:
416
key_dict = key_dict.setdefault(subkey, {})
417
key_dict[key[-1]] = key, value
418
self._nodes_by_key = nodes_by_key
419
return self._nodes_by_key
337
421
def iter_all_entries(self):
338
422
"""Iterate over all keys within the index.
464
548
keys supplied. No additional keys will be returned, and every
465
549
key supplied that is in the index will be returned.
467
# PERFORMANCE TODO: parse and bisect all remaining data at some
468
# threshold of total-index processing/get calling layers that expect to
469
# read the entire index to use the iter_all_entries method instead.
473
554
if self._size is None and self._nodes is None:
474
555
self._buffer_all()
557
# We fit about 20 keys per minimum-read (4K), so if we are looking for
558
# more than 1/20th of the index its likely (assuming homogenous key
559
# spread) that we'll read the entire index. If we're going to do that,
560
# buffer the whole thing. A better analysis might take key spread into
561
# account - but B+Tree indices are better anyway.
562
# We could look at all data read, and use a threshold there, which will
563
# trigger on ancestry walks, but that is not yet fully mapped out.
564
if self._nodes is None and len(keys) * 20 > self.key_count():
475
566
if self._nodes is not None:
476
567
return self._iter_entries_from_total_buffer(keys)
619
711
if self._bisect_nodes is None:
620
712
readv_ranges.append(_HEADER_READV)
621
713
self._read_and_parse(readv_ranges)
715
if self._nodes is not None:
716
# _read_and_parse triggered a _buffer_all because we requested the
718
for location, key in location_keys:
719
if key not in self._nodes: # not present
720
result.append(((location, key), False))
721
elif self.node_ref_lists:
722
value, refs = self._nodes[key]
723
result.append(((location, key),
724
(self, key, value, refs)))
726
result.append(((location, key),
727
(self, key, self._nodes[key])))
622
729
# generate results:
623
730
# - figure out <, >, missing, present
624
731
# - result present references so we can return them.
626
732
# keys that we cannot answer until we resolve references
627
733
pending_references = []
628
734
pending_locations = set()
679
785
readv_ranges.append((location, length))
680
786
self._read_and_parse(readv_ranges)
787
if self._nodes is not None:
788
# The _read_and_parse triggered a _buffer_all, grab the data and
790
for location, key in pending_references:
791
value, refs = self._nodes[key]
792
result.append(((location, key), (self, key, value, refs)))
681
794
for location, key in pending_references:
682
795
# answer key references we had to look-up-late.
683
index = self._parsed_key_index(key)
684
796
value, refs = self._bisect_nodes[key]
685
797
result.append(((location, key), (self, key,
686
798
value, self._resolve_references(refs))))
877
989
elements = line.split('\0')
878
990
if len(elements) != self._expected_elements:
879
991
raise errors.BadIndexData(self)
881
key = tuple(elements[:self._key_length])
992
# keys are tuples. Each element is a string that may occur many
993
# times, so we intern them to save space. AB, RC, 200807
994
key = tuple([intern(element) for element in elements[:self._key_length]])
882
995
if first_key is None:
884
997
absent, references, value = elements[-3:]
956
1069
:param readv_ranges: A prepared readv range list.
959
readv_data = self._transport.readv(self._name, readv_ranges, True,
962
for offset, data in readv_data:
963
if self._bisect_nodes is None:
964
# this must be the start
965
if not (offset == 0):
966
raise AssertionError()
967
offset, data = self._parse_header_from_bytes(data)
968
# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
969
self._parse_region(offset, data)
1071
if not readv_ranges:
1073
if self._nodes is None and self._bytes_read * 2 >= self._size:
1074
# We've already read more than 50% of the file and we are about to
1075
# request more data, just _buffer_all() and be done
1079
readv_data = self._transport.readv(self._name, readv_ranges, True,
1082
for offset, data in readv_data:
1083
self._bytes_read += len(data)
1084
if offset == 0 and len(data) == self._size:
1085
# We read the whole range, most likely because the
1086
# Transport upcast our readv ranges into one long request
1087
# for enough total data to grab the whole index.
1088
self._buffer_all(StringIO(data))
1090
if self._bisect_nodes is None:
1091
# this must be the start
1092
if not (offset == 0):
1093
raise AssertionError()
1094
offset, data = self._parse_header_from_bytes(data)
1095
# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
1096
self._parse_region(offset, data)
971
1098
def _signature(self):
972
1099
"""The file signature for this index type."""
992
1119
in the index list.
995
def __init__(self, indices):
1122
def __init__(self, indices, reload_func=None):
996
1123
"""Create a CombinedGraphIndex backed by indices.
998
1125
:param indices: An ordered list of indices to query for data.
1126
:param reload_func: A function to call if we find we are missing an
1127
index. Should have the form reload_func() => True/False to indicate
1128
if reloading actually changed anything.
1000
1130
self._indices = indices
1131
self._reload_func = reload_func
1002
1133
def __repr__(self):
1003
1134
return "%s(%s)" % (
1055
1186
the most efficient order for the index.
1057
1188
seen_keys = set()
1058
for index in self._indices:
1059
for node in index.iter_all_entries():
1060
if node[1] not in seen_keys:
1062
seen_keys.add(node[1])
1191
for index in self._indices:
1192
for node in index.iter_all_entries():
1193
if node[1] not in seen_keys:
1195
seen_keys.add(node[1])
1197
except errors.NoSuchFile:
1198
self._reload_or_raise()
1064
1200
def iter_entries(self, keys):
1065
1201
"""Iterate over keys within the index.
1073
1209
efficient order for the index.
1075
1211
keys = set(keys)
1076
for index in self._indices:
1214
for index in self._indices:
1217
for node in index.iter_entries(keys):
1218
keys.remove(node[1])
1079
for node in index.iter_entries(keys):
1080
keys.remove(node[1])
1221
except errors.NoSuchFile:
1222
self._reload_or_raise()
1083
1224
def iter_entries_prefix(self, keys):
1084
1225
"""Iterate over keys within the index using prefix matching.
1106
1247
seen_keys = set()
1107
for index in self._indices:
1108
for node in index.iter_entries_prefix(keys):
1109
if node[1] in seen_keys:
1111
seen_keys.add(node[1])
1250
for index in self._indices:
1251
for node in index.iter_entries_prefix(keys):
1252
if node[1] in seen_keys:
1254
seen_keys.add(node[1])
1257
except errors.NoSuchFile:
1258
self._reload_or_raise()
1114
1260
def key_count(self):
1115
1261
"""Return an estimate of the number of keys in this index.
1117
1263
For CombinedGraphIndex this is approximated by the sum of the keys of
1118
1264
the child indices. As child indices may have duplicate keys this can
1119
1265
have a maximum error of the number of child indices * largest number of
1120
1266
keys in any index.
1122
return sum((index.key_count() for index in self._indices), 0)
1270
return sum((index.key_count() for index in self._indices), 0)
1271
except errors.NoSuchFile:
1272
self._reload_or_raise()
1274
def _reload_or_raise(self):
1275
"""We just got a NoSuchFile exception.
1277
Try to reload the indices, if it fails, just raise the current
1280
if self._reload_func is None:
1282
exc_type, exc_value, exc_traceback = sys.exc_info()
1283
trace.mutter('Trying to reload after getting exception: %s',
1285
if not self._reload_func():
1286
# We tried to reload, but nothing changed, so we fail anyway
1287
trace.mutter('_reload_func indicated nothing has changed.'
1288
' Raising original exception.')
1289
raise exc_type, exc_value, exc_traceback
1124
1291
def validate(self):
1125
1292
"""Validate that everything in the index can be accessed."""
1126
for index in self._indices:
1295
for index in self._indices:
1298
except errors.NoSuchFile:
1299
self._reload_or_raise()
1130
1302
class InMemoryGraphIndex(GraphIndexBuilder):