53
52
# Matches a newline or NUL byte anywhere in a string. Used to reject node
# values that would corrupt the line-based, NUL-delimited index serialization.
_newline_null_re = re.compile('[\n\0]')
56
def _has_key_from_parent_map(self, key):
    """Check if this index has one key.

    If it's possible to check for multiple keys at once through
    calling get_parent_map that should be faster.

    :param key: A single key tuple to look up.
    :return: True if the key is present in this index.
    """
    return (key in self.get_parent_map([key]))
65
def _missing_keys_from_parent_map(self, keys):
    """Return the subset of *keys* that this index does not contain.

    :param keys: An iterable of key tuples.
    :return: A set of the keys absent from this index, computed via a
        single get_parent_map call.
    """
    return set(keys) - set(self.get_parent_map(keys))
69
55
class GraphIndexBuilder(object):
70
56
"""A builder that can build a GraphIndex.
72
58
The resulting graph has the structure:
74
60
_SIGNATURE OPTIONS NODES NEWLINE
75
61
_SIGNATURE := 'Bazaar Graph Index 1' NEWLINE
76
62
OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
111
94
if not element or _whitespace_re.search(element) is not None:
112
95
raise errors.BadIndexKey(element)
114
def _external_references(self):
    """Return references that are not present in this index.

    :return: A set of keys referenced from the second reference list that
        are not themselves nodes of this index.
    """
    keys = set()
    refs = set()
    # TODO: JAM 2008-11-21 This makes an assumption about how the reference
    #       lists are used. It is currently correct for pack-0.92 through
    #       1.9, which use the node references (3rd column) second
    #       reference list as the compression parent. Perhaps this should
    #       be moved into something higher up the stack, since it
    #       makes assumptions about how the index is used.
    if self.reference_lists > 1:
        for node in self.iter_all_entries():
            keys.add(node[1])
            refs.update(node[3][1])
        return refs - keys
    else:
        # If reference_lists == 0 there can be no external references, and
        # if reference_lists == 1, then there isn't a place to store the
        # compression parent, so there are no external references either.
        return set()
136
def _get_nodes_by_key(self):
    """Return (building and caching on first use) the nested-dict view.

    The view maps key elements to nested dicts, with the final element
    mapping to (key, value[, references]) tuples.
    """
    if self._nodes_by_key is None:
        nodes_by_key = {}
        if self.reference_lists:
            for key, (absent, references, value) in self._nodes.iteritems():
                # NOTE(review): skipping placeholder ('absent') nodes —
                # reconstructed from missing lines in the residue; confirm.
                if absent:
                    continue
                key_dict = nodes_by_key
                for subkey in key[:-1]:
                    key_dict = key_dict.setdefault(subkey, {})
                key_dict[key[-1]] = key, value, references
        else:
            for key, (absent, references, value) in self._nodes.iteritems():
                if absent:
                    continue
                key_dict = nodes_by_key
                for subkey in key[:-1]:
                    key_dict = key_dict.setdefault(subkey, {})
                key_dict[key[-1]] = key, value
        self._nodes_by_key = nodes_by_key
    return self._nodes_by_key
158
def _update_nodes_by_key(self, key, value, node_refs):
    """Update the _nodes_by_key dict with a new key.

    For a key of (foo, bar, baz) create
    _nodes_by_key[foo][bar][baz] = key_value
    """
    # If the cache has never been built there is nothing to keep in sync.
    if self._nodes_by_key is None:
        return
    key_dict = self._nodes_by_key
    if self.reference_lists:
        key_value = key, value, node_refs
    else:
        key_value = key, value
    for subkey in key[:-1]:
        key_dict = key_dict.setdefault(subkey, {})
    key_dict[key[-1]] = key_value
175
def _check_key_ref_value(self, key, references, value):
    """Check that 'key' and 'references' are all valid.

    :param key: A key tuple. Must conform to the key interface (be a tuple,
        be of the right length, not have any whitespace or nulls in any key
        element.)
    :param references: An iterable of reference lists. Something like
        [[(ref, key)], [(ref, key), (other, key)]]
    :param value: The value associate with this key. Must not contain
        newlines or null characters.
    :return: (node_refs, absent_references)
        node_refs basically a packed form of 'references' where all
        keys are tuples.
        absent_references reference keys that are not in self._nodes.
            This may contain duplicates if the same key is
            referenced in multiple lists.
    """
    self._check_key(key)
    if _newline_null_re.search(value) is not None:
        raise errors.BadIndexValue(value)
    if len(references) != self.reference_lists:
        raise errors.BadIndexValue(references)
    node_refs = []
    absent_references = []
    for reference_list in references:
        for reference in reference_list:
            # If reference *is* in self._nodes, then we know it has already
            # been checked.
            if reference not in self._nodes:
                self._check_key(reference)
                absent_references.append(reference)
        node_refs.append(tuple(reference_list))
    return tuple(node_refs), absent_references
209
def add_node(self, key, value, references=()):
    """Add a node to the index.

    :param key: The key. keys are non-empty tuples containing
        as many whitespace-free utf8 bytestrings as the key length
        defined for this index.
    :param references: An iterable of iterables of keys. Each is a
        reference to another key.
    :param value: The value to associate with the key. It may be any
        bytes as long as it does not contain \0 or \n.
    """
    (node_refs,
     absent_references) = self._check_key_ref_value(key, references, value)
    # Re-adding a key is only an error when the key is actually present;
    # an 'absent' placeholder (added for a dangling reference) may be filled.
    if key in self._nodes and self._nodes[key][0] != 'a':
        raise errors.BadIndexDuplicateKey(key, self)
    for reference in absent_references:
        # There may be duplicates, but I don't think it is worth worrying
        # about.
        self._nodes[reference] = ('a', (), '')
    self._nodes[key] = ('', node_refs, value)
    self._keys.add(key)
    # Only maintain the by-key cache if it has already been built.
    if self._nodes_by_key is not None and self._key_length > 1:
        self._update_nodes_by_key(key, value, node_refs)
233
138
def finish(self):
234
139
lines = [_SIGNATURE]
314
219
raise errors.BzrError('Failed index creation. Internal error:'
315
220
' mismatched output length and expected length: %d %d' %
316
221
(len(result.getvalue()), expected_bytes))
319
def set_optimize(self, for_size=None, combine_backing_indices=None):
    """Change how the builder tries to optimize the result.

    :param for_size: Tell the builder to try and make the index as small as
        possible.
    :param combine_backing_indices: If the builder spills to disk to save
        memory, should the on-disk indices be combined. Set to True if you
        are going to be probing the index, but to False if you are not. (If
        you are not querying, then the time spent combining is wasted.)
    :return: None
    """
    # GraphIndexBuilder itself doesn't pay attention to the flag yet, but
    # other builders do.
    if for_size is not None:
        self._optimize_for_size = for_size
    if combine_backing_indices is not None:
        self._combine_backing_indices = combine_backing_indices
222
return StringIO(''.join(lines))
338
225
class GraphIndex(object):
339
226
"""An index for data with embedded graphs.
341
228
The index maps keys to a list of key reference lists, and a value.
342
229
Each node has the same number of key reference lists. Each key reference
343
230
list can be empty or an arbitrary length. The value is an opaque NULL
344
terminated string without any newlines. The storage of the index is
231
terminated string without any newlines. The storage of the index is
345
232
hidden in the interface: keys and key references are always tuples of
346
233
bytestrings, never the internal representation (e.g. dictionary offsets).
399
284
def __ne__(self, other):
    """Inequality is defined as the negation of __eq__."""
    is_equal = self.__eq__(other)
    return not is_equal
403
def __repr__(self):
    """Show the class name and the transport path of the index file."""
    return "%s(%r)" % (self.__class__.__name__,
        self._transport.abspath(self._name))
406
def _buffer_all(self, stream=None):
287
def _buffer_all(self):
407
288
"""Buffer all the index data.
409
290
Mutates self._nodes and self.keys_by_offset.
411
if self._nodes is not None:
412
# We already did this
414
292
if 'index' in debug.debug_flags:
415
293
mutter('Reading entire index %s', self._transport.abspath(self._name))
417
stream = self._transport.get(self._name)
294
stream = self._transport.get(self._name)
418
295
self._read_prefix(stream)
419
296
self._expected_elements = 3 + self._key_length
438
315
node_value = value
439
316
self._nodes[key] = node_value
317
if self._key_length > 1:
318
subkey = list(reversed(key[:-1]))
319
key_dict = self._nodes_by_key
320
if self.node_ref_lists:
321
key_value = key, node_value[0], node_value[1]
323
key_value = key, node_value
324
# possibly should do this on-demand, but it seems likely it is
326
# For a key of (foo, bar, baz) create
327
# _nodes_by_key[foo][bar][baz] = key_value
328
for subkey in key[:-1]:
329
key_dict = key_dict.setdefault(subkey, {})
330
key_dict[key[-1]] = key_value
440
331
# cache the keys for quick set intersections
441
332
self._keys = set(self._nodes)
442
333
if trailers != 1:
443
334
# there must be one line - the empty trailer line.
444
335
raise errors.BadIndexData(self)
446
def external_references(self, ref_list_num):
    """Return references that are not present in this index.

    :param ref_list_num: Which reference list to scan (0-based).
    :return: A set of referenced keys not present as nodes in this index.
    :raises ValueError: If ref_list_num is out of range.
    """
    # NOTE(review): the index must be fully read before self._nodes can be
    # scanned — reconstructed from missing lines; confirm _buffer_all here.
    self._buffer_all()
    if ref_list_num + 1 > self.node_ref_lists:
        raise ValueError('No ref list %d, index has %d ref lists'
            % (ref_list_num, self.node_ref_lists))
    refs = set()
    for key, (value, ref_lists) in self._nodes.iteritems():
        ref_list = ref_lists[ref_list_num]
        refs.update(ref_list)
    return refs - self._keys
459
def _get_nodes_by_key(self):
    """Return (building and caching on first use) the nested-dict view.

    The view maps key elements to nested dicts, with the final element
    mapping to (key, value[, references]) tuples.
    """
    if self._nodes_by_key is None:
        nodes_by_key = {}
        if self.node_ref_lists:
            for key, (value, references) in self._nodes.iteritems():
                key_dict = nodes_by_key
                for subkey in key[:-1]:
                    key_dict = key_dict.setdefault(subkey, {})
                key_dict[key[-1]] = key, value, references
        else:
            for key, value in self._nodes.iteritems():
                key_dict = nodes_by_key
                for subkey in key[:-1]:
                    key_dict = key_dict.setdefault(subkey, {})
                key_dict[key[-1]] = key, value
        self._nodes_by_key = nodes_by_key
    return self._nodes_by_key
477
337
def iter_all_entries(self):
478
338
"""Iterate over all keys within the index.
604
464
keys supplied. No additional keys will be returned, and every
605
465
key supplied that is in the index will be returned.
467
# PERFORMANCE TODO: parse and bisect all remaining data at some
468
# threshold of total-index processing/get calling layers that expect to
469
# read the entire index to use the iter_all_entries method instead.
610
473
if self._size is None and self._nodes is None:
611
474
self._buffer_all()
613
# We fit about 20 keys per minimum-read (4K), so if we are looking for
614
# more than 1/20th of the index its likely (assuming homogenous key
615
# spread) that we'll read the entire index. If we're going to do that,
616
# buffer the whole thing. A better analysis might take key spread into
617
# account - but B+Tree indices are better anyway.
618
# We could look at all data read, and use a threshold there, which will
619
# trigger on ancestry walks, but that is not yet fully mapped out.
620
if self._nodes is None and len(keys) * 20 > self.key_count():
622
475
if self._nodes is not None:
623
476
return self._iter_entries_from_total_buffer(keys)
767
619
if self._bisect_nodes is None:
768
620
readv_ranges.append(_HEADER_READV)
769
621
self._read_and_parse(readv_ranges)
771
if self._nodes is not None:
772
# _read_and_parse triggered a _buffer_all because we requested the
774
for location, key in location_keys:
775
if key not in self._nodes: # not present
776
result.append(((location, key), False))
777
elif self.node_ref_lists:
778
value, refs = self._nodes[key]
779
result.append(((location, key),
780
(self, key, value, refs)))
782
result.append(((location, key),
783
(self, key, self._nodes[key])))
785
622
# generate results:
786
623
# - figure out <, >, missing, present
787
624
# - result present references so we can return them.
788
626
# keys that we cannot answer until we resolve references
789
627
pending_references = []
790
628
pending_locations = set()
841
679
readv_ranges.append((location, length))
842
680
self._read_and_parse(readv_ranges)
843
if self._nodes is not None:
844
# The _read_and_parse triggered a _buffer_all, grab the data and
846
for location, key in pending_references:
847
value, refs = self._nodes[key]
848
result.append(((location, key), (self, key, value, refs)))
850
681
for location, key in pending_references:
851
682
# answer key references we had to look-up-late.
683
index = self._parsed_key_index(key)
852
684
value, refs = self._bisect_nodes[key]
853
685
result.append(((location, key), (self, key,
854
686
value, self._resolve_references(refs))))
1007
838
trim_end = data.rfind('\n') + 1
1009
840
trim_end = data.rfind('\n', None, trim_end) + 1
1010
if not (trim_end != 0):
1011
raise AssertionError('no \n was present')
841
assert trim_end != 0, 'no \n was present'
1012
842
# print 'removing end', offset, trim_end, repr(data[trim_end:])
1013
843
# adjust offset and data to the parseable data.
1014
844
trimmed_data = data[trim_start:trim_end]
1015
if not (trimmed_data):
1016
raise AssertionError('read unneeded data [%d:%d] from [%d:%d]'
1017
% (trim_start, trim_end, offset, offset + len(data)))
845
assert trimmed_data, 'read unneeded data [%d:%d] from [%d:%d]' % (
846
trim_start, trim_end, offset, offset + len(data))
1019
848
offset += trim_start
1020
849
# print "parsing", repr(trimmed_data)
1039
868
# must be at the end
1041
if not (self._size == pos + 1):
1042
raise AssertionError("%s %s" % (self._size, pos))
870
assert self._size == pos + 1, "%s %s" % (self._size, pos)
1045
873
elements = line.split('\0')
1046
874
if len(elements) != self._expected_elements:
1047
875
raise errors.BadIndexData(self)
1048
# keys are tuples. Each element is a string that may occur many
1049
# times, so we intern them to save space. AB, RC, 200807
1050
key = tuple([intern(element) for element in elements[:self._key_length]])
877
key = tuple(elements[:self._key_length])
1051
878
if first_key is None:
1053
880
absent, references, value = elements[-3:]
1125
952
:param readv_ranges: A prepared readv range list.
1127
if not readv_ranges:
1129
if self._nodes is None and self._bytes_read * 2 >= self._size:
1130
# We've already read more than 50% of the file and we are about to
1131
# request more data, just _buffer_all() and be done
1135
readv_data = self._transport.readv(self._name, readv_ranges, True,
1138
for offset, data in readv_data:
1139
self._bytes_read += len(data)
1140
if offset == 0 and len(data) == self._size:
1141
# We read the whole range, most likely because the
1142
# Transport upcast our readv ranges into one long request
1143
# for enough total data to grab the whole index.
1144
self._buffer_all(StringIO(data))
1146
if self._bisect_nodes is None:
1147
# this must be the start
1148
if not (offset == 0):
1149
raise AssertionError()
1150
offset, data = self._parse_header_from_bytes(data)
1151
# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
1152
self._parse_region(offset, data)
955
readv_data = self._transport.readv(self._name, readv_ranges, True,
958
for offset, data in readv_data:
959
if self._bisect_nodes is None:
960
# this must be the start
962
offset, data = self._parse_header_from_bytes(data)
963
# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
964
self._parse_region(offset, data)
1154
966
def _signature(self):
1155
967
"""The file signature for this index type."""
1175
987
in the index list.
1178
def __init__(self, indices, reload_func=None):
    """Create a CombinedGraphIndex backed by indices.

    :param indices: An ordered list of indices to query for data.
    :param reload_func: A function to call if we find we are missing an
        index. Should have the form reload_func() => True/False to indicate
        if reloading actually changed anything.
    """
    self._indices = indices
    self._reload_func = reload_func
1189
997
def __repr__(self):
    """Show the class name and the repr of each combined index."""
    index_reprs = ', '.join(repr(index) for index in self._indices)
    return "%s(%s)" % (self.__class__.__name__, index_reprs)
1002
@symbol_versioning.deprecated_method(symbol_versioning.one_one)
def get_parents(self, revision_ids):
    """See graph._StackedParentsProvider.get_parents.

    This implementation thunks the graph.Graph.get_parents api across to
    GraphIndex.

    :param revision_ids: An iterable of graph keys for this graph.
    :return: A list of parent details for each key in revision_ids.
        Each parent details will be one of:
         * None when the key was missing
         * (NULL_REVISION,) when the key has no parents.
         * (parent_key, parent_key...) otherwise.
    """
    parent_map = self.get_parent_map(revision_ids)
    return [parent_map.get(r, None) for r in revision_ids]
1194
1019
def get_parent_map(self, keys):
1195
1020
"""See graph._StackedParentsProvider.get_parent_map"""
1196
1021
search_keys = set(keys)
1288
1101
seen_keys = set()
1291
for index in self._indices:
1292
for node in index.iter_entries_prefix(keys):
1293
if node[1] in seen_keys:
1295
seen_keys.add(node[1])
1298
except errors.NoSuchFile:
1299
self._reload_or_raise()
1102
for index in self._indices:
1103
for node in index.iter_entries_prefix(keys):
1104
if node[1] in seen_keys:
1106
seen_keys.add(node[1])
1301
1109
def key_count(self):
    """Return an estimate of the number of keys in this index.

    For CombinedGraphIndex this is approximated by the sum of the keys of
    the child indices. As child indices may have duplicate keys this can
    have a maximum error of the number of child indices * largest number of
    keys in any index.
    """
    # Loop so that a missing backing file can trigger a reload and retry.
    while True:
        try:
            return sum((index.key_count() for index in self._indices), 0)
        except errors.NoSuchFile:
            self._reload_or_raise()
1315
missing_keys = _missing_keys_from_parent_map
1317
def _reload_or_raise(self):
1318
"""We just got a NoSuchFile exception.
1320
Try to reload the indices, if it fails, just raise the current
1323
if self._reload_func is None:
1325
exc_type, exc_value, exc_traceback = sys.exc_info()
1326
trace.mutter('Trying to reload after getting exception: %s',
1328
if not self._reload_func():
1329
# We tried to reload, but nothing changed, so we fail anyway
1330
trace.mutter('_reload_func indicated nothing has changed.'
1331
' Raising original exception.')
1332
raise exc_type, exc_value, exc_traceback
1117
return sum((index.key_count() for index in self._indices), 0)
1334
1119
def validate(self):
    """Validate that everything in the index can be accessed."""
    # Loop so that a missing backing file can trigger a reload and retry.
    while True:
        try:
            for index in self._indices:
                index.validate()
            return
        except errors.NoSuchFile:
            self._reload_or_raise()
1345
1125
class InMemoryGraphIndex(GraphIndexBuilder):