# Copyright (C) 2007 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""Indexing facilities."""
23
'GraphIndexPrefixAdapter',
27
from bisect import bisect_right
28
from cStringIO import StringIO
31
from bzrlib.lazy_import import lazy_import
32
lazy_import(globals(), """
33
from bzrlib import trace
34
from bzrlib.bisect_multi import bisect_multi_bytes
35
from bzrlib.revision import NULL_REVISION
36
from bzrlib.trace import mutter
44
_HEADER_READV = (0, 200)
45
_OPTION_KEY_ELEMENTS = "key_elements="
47
_OPTION_NODE_REFS = "node_ref_lists="
48
_SIGNATURE = "Bazaar Graph Index 1\n"
51
_whitespace_re = re.compile('[\t\n\x0b\x0c\r\x00 ]')
52
_newline_null_re = re.compile('[\n\0]')
55
class GraphIndexBuilder(object):
56
"""A builder that can build a GraphIndex.
58
The resulting graph has the structure:
60
_SIGNATURE OPTIONS NODES NEWLINE
61
_SIGNATURE := 'Bazaar Graph Index 1' NEWLINE
62
OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
64
NODE := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
65
KEY := Not-whitespace-utf8
67
REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
68
REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
69
REFERENCE := DIGITS ; digits is the byte offset in the index of the
71
VALUE := no-newline-no-null-bytes
74
def __init__(self, reference_lists=0, key_elements=1):
75
"""Create a GraphIndex builder.
77
:param reference_lists: The number of node references lists for each
79
:param key_elements: The number of bytestrings in each key.
81
self.reference_lists = reference_lists
83
# A dict of {key: (absent, ref_lists, value)}
85
self._nodes_by_key = None
86
self._key_length = key_elements
88
def _check_key(self, key):
89
"""Raise BadIndexKey if key is not a valid key for this index."""
90
if type(key) != tuple:
91
raise errors.BadIndexKey(key)
92
if self._key_length != len(key):
93
raise errors.BadIndexKey(key)
95
if not element or _whitespace_re.search(element) is not None:
96
raise errors.BadIndexKey(element)
98
def _get_nodes_by_key(self):
99
if self._nodes_by_key is None:
101
if self.reference_lists:
102
for key, (absent, references, value) in self._nodes.iteritems():
105
key_dict = nodes_by_key
106
for subkey in key[:-1]:
107
key_dict = key_dict.setdefault(subkey, {})
108
key_dict[key[-1]] = key, value, references
110
for key, (absent, references, value) in self._nodes.iteritems():
113
key_dict = nodes_by_key
114
for subkey in key[:-1]:
115
key_dict = key_dict.setdefault(subkey, {})
116
key_dict[key[-1]] = key, value
117
self._nodes_by_key = nodes_by_key
118
return self._nodes_by_key
120
def _update_nodes_by_key(self, key, value, node_refs):
121
"""Update the _nodes_by_key dict with a new key.
123
For a key of (foo, bar, baz) create
124
_nodes_by_key[foo][bar][baz] = key_value
126
if self._nodes_by_key is None:
128
key_dict = self._nodes_by_key
129
if self.reference_lists:
130
key_value = key, value, node_refs
132
key_value = key, value
133
for subkey in key[:-1]:
134
key_dict = key_dict.setdefault(subkey, {})
135
key_dict[key[-1]] = key_value
137
def _check_key_ref_value(self, key, references, value):
138
"""Check that 'key' and 'references' are all valid.
140
:param key: A key tuple. Must conform to the key interface (be a tuple,
141
be of the right length, not have any whitespace or nulls in any key
143
:param references: An iterable of reference lists. Something like
144
[[(ref, key)], [(ref, key), (other, key)]]
145
:param value: The value associate with this key. Must not contain
146
newlines or null characters.
147
:return: (node_refs, absent_references)
148
node_refs basically a packed form of 'references' where all
150
absent_references reference keys that are not in self._nodes.
151
This may contain duplicates if the same key is
152
referenced in multiple lists.
155
if _newline_null_re.search(value) is not None:
156
raise errors.BadIndexValue(value)
157
if len(references) != self.reference_lists:
158
raise errors.BadIndexValue(references)
160
absent_references = []
161
for reference_list in references:
162
for reference in reference_list:
163
# If reference *is* in self._nodes, then we know it has already
165
if reference not in self._nodes:
166
self._check_key(reference)
167
absent_references.append(reference)
168
node_refs.append(tuple(reference_list))
169
return tuple(node_refs), absent_references
171
def add_node(self, key, value, references=()):
172
"""Add a node to the index.
174
:param key: The key. keys are non-empty tuples containing
175
as many whitespace-free utf8 bytestrings as the key length
176
defined for this index.
177
:param references: An iterable of iterables of keys. Each is a
178
reference to another key.
179
:param value: The value to associate with the key. It may be any
180
bytes as long as it does not contain \0 or \n.
183
absent_references) = self._check_key_ref_value(key, references, value)
184
if key in self._nodes and self._nodes[key][0] != 'a':
185
raise errors.BadIndexDuplicateKey(key, self)
186
for reference in absent_references:
187
# There may be duplicates, but I don't think it is worth worrying
189
self._nodes[reference] = ('a', (), '')
190
self._nodes[key] = ('', node_refs, value)
192
if self._nodes_by_key is not None and self._key_length > 1:
193
self._update_nodes_by_key(key, value, node_refs)
197
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
198
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
199
lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
200
prefix_length = sum(len(x) for x in lines)
201
# references are byte offsets. To avoid having to do nasty
202
# polynomial work to resolve offsets (references to later in the
203
# file cannot be determined until all the inbetween references have
204
# been calculated too) we pad the offsets with 0's to make them be
205
# of consistent length. Using binary offsets would break the trivial
207
# to calculate the width of zero's needed we do three passes:
208
# one to gather all the non-reference data and the number of references.
209
# one to pad all the data with reference-length and determine entry
213
# forward sorted by key. In future we may consider topological sorting,
214
# at the cost of table scans for direct lookup, or a second index for
216
nodes = sorted(self._nodes.items())
217
# if we do not prepass, we don't know how long it will be up front.
218
expected_bytes = None
219
# we only need to pre-pass if we have reference lists at all.
220
if self.reference_lists:
222
non_ref_bytes = prefix_length
224
# TODO use simple multiplication for the constants in this loop.
225
for key, (absent, references, value) in nodes:
226
# record the offset known *so far* for this key:
227
# the non reference bytes to date, and the total references to
228
# date - saves reaccumulating on the second pass
229
key_offset_info.append((key, non_ref_bytes, total_references))
230
# key is literal, value is literal, there are 3 null's, 1 NL
231
# key is variable length tuple, \x00 between elements
232
non_ref_bytes += sum(len(element) for element in key)
233
if self._key_length > 1:
234
non_ref_bytes += self._key_length - 1
235
# value is literal bytes, there are 3 null's, 1 NL.
236
non_ref_bytes += len(value) + 3 + 1
237
# one byte for absent if set.
240
elif self.reference_lists:
241
# (ref_lists -1) tabs
242
non_ref_bytes += self.reference_lists - 1
243
# (ref-1 cr's per ref_list)
244
for ref_list in references:
245
# how many references across the whole file?
246
total_references += len(ref_list)
247
# accrue reference separators
249
non_ref_bytes += len(ref_list) - 1
250
# how many digits are needed to represent the total byte count?
252
possible_total_bytes = non_ref_bytes + total_references*digits
253
while 10 ** digits < possible_total_bytes:
255
possible_total_bytes = non_ref_bytes + total_references*digits
256
expected_bytes = possible_total_bytes + 1 # terminating newline
257
# resolve key addresses.
259
for key, non_ref_bytes, total_references in key_offset_info:
260
key_addresses[key] = non_ref_bytes + total_references*digits
262
format_string = '%%0%sd' % digits
263
for key, (absent, references, value) in nodes:
264
flattened_references = []
265
for ref_list in references:
267
for reference in ref_list:
268
ref_addresses.append(format_string % key_addresses[reference])
269
flattened_references.append('\r'.join(ref_addresses))
270
string_key = '\x00'.join(key)
271
lines.append("%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
272
'\t'.join(flattened_references), value))
274
result = StringIO(''.join(lines))
275
if expected_bytes and len(result.getvalue()) != expected_bytes:
276
raise errors.BzrError('Failed index creation. Internal error:'
277
' mismatched output length and expected length: %d %d' %
278
(len(result.getvalue()), expected_bytes))
282
class GraphIndex(object):
283
"""An index for data with embedded graphs.
285
The index maps keys to a list of key reference lists, and a value.
286
Each node has the same number of key reference lists. Each key reference
287
list can be empty or an arbitrary length. The value is an opaque NULL
288
terminated string without any newlines. The storage of the index is
289
hidden in the interface: keys and key references are always tuples of
290
bytestrings, never the internal representation (e.g. dictionary offsets).
292
It is presumed that the index will not be mutated - it is static data.
294
Successive iter_all_entries calls will read the entire index each time.
295
Additionally, iter_entries calls will read the index linearly until the
296
desired keys are found. XXX: This must be fixed before the index is
297
suitable for production use. :XXX
300
def __init__(self, transport, name, size):
301
"""Open an index called name on transport.
303
:param transport: A bzrlib.transport.Transport.
304
:param name: A path to provide to transport API calls.
305
:param size: The size of the index in bytes. This is used for bisection
306
logic to perform partial index reads. While the size could be
307
obtained by statting the file this introduced an additional round
308
trip as well as requiring stat'able transports, both of which are
309
avoided by having it supplied. If size is None, then bisection
310
support will be disabled and accessing the index will just stream
313
self._transport = transport
315
# Becomes a dict of key:(value, reference-list-byte-locations) used by
316
# the bisection interface to store parsed but not resolved keys.
317
self._bisect_nodes = None
318
# Becomes a dict of key:(value, reference-list-keys) which are ready to
319
# be returned directly to callers.
321
# a sorted list of slice-addresses for the parsed bytes of the file.
322
# e.g. (0,1) would mean that byte 0 is parsed.
323
self._parsed_byte_map = []
324
# a sorted list of keys matching each slice address for parsed bytes
325
# e.g. (None, 'foo@bar') would mean that the first byte contained no
326
# key, and the end byte of the slice is the of the data for 'foo@bar'
327
self._parsed_key_map = []
328
self._key_count = None
329
self._keys_by_offset = None
330
self._nodes_by_key = None
332
# The number of bytes we've read so far in trying to process this file
335
def __eq__(self, other):
336
"""Equal when self and other were created with the same parameters."""
338
type(self) == type(other) and
339
self._transport == other._transport and
340
self._name == other._name and
341
self._size == other._size)
343
def __ne__(self, other):
    """Inverse of __eq__: unequal unless built with the same parameters."""
    equal = self.__eq__(other)
    return not equal
347
return "%s(%r)" % (self.__class__.__name__,
348
self._transport.abspath(self._name))
350
def _buffer_all(self, stream=None):
351
"""Buffer all the index data.
353
Mutates self._nodes and self.keys_by_offset.
355
if self._nodes is not None:
356
# We already did this
358
if 'index' in debug.debug_flags:
359
mutter('Reading entire index %s', self._transport.abspath(self._name))
361
stream = self._transport.get(self._name)
362
self._read_prefix(stream)
363
self._expected_elements = 3 + self._key_length
365
# raw data keyed by offset
366
self._keys_by_offset = {}
367
# ready-to-return key:value or key:value, node_ref_lists
369
self._nodes_by_key = {}
372
lines = stream.read().split('\n')
374
_, _, _, trailers = self._parse_lines(lines, pos)
375
for key, absent, references, value in self._keys_by_offset.itervalues():
378
# resolve references:
379
if self.node_ref_lists:
380
node_value = (value, self._resolve_references(references))
383
self._nodes[key] = node_value
384
if self._key_length > 1:
385
# TODO: We may want to do this lazily, but if we are calling
386
# _buffer_all, we are likely to be doing
387
# iter_entries_prefix
388
key_dict = self._nodes_by_key
389
if self.node_ref_lists:
390
key_value = key, node_value[0], node_value[1]
392
key_value = key, node_value
393
# For a key of (foo, bar, baz) create
394
# _nodes_by_key[foo][bar][baz] = key_value
395
for subkey in key[:-1]:
396
key_dict = key_dict.setdefault(subkey, {})
397
key_dict[key[-1]] = key_value
398
# cache the keys for quick set intersections
399
self._keys = set(self._nodes)
401
# there must be one line - the empty trailer line.
402
raise errors.BadIndexData(self)
404
def iter_all_entries(self):
405
"""Iterate over all keys within the index.
407
:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
408
The former tuple is used when there are no reference lists in the
409
index, making the API compatible with simple key:value index types.
410
There is no defined order for the result iteration - it will be in
411
the most efficient order for the index.
413
if 'evil' in debug.debug_flags:
414
trace.mutter_callsite(3,
415
"iter_all_entries scales with size of history.")
416
if self._nodes is None:
418
if self.node_ref_lists:
419
for key, (value, node_ref_lists) in self._nodes.iteritems():
420
yield self, key, value, node_ref_lists
422
for key, value in self._nodes.iteritems():
423
yield self, key, value
425
def _read_prefix(self, stream):
426
signature = stream.read(len(self._signature()))
427
if not signature == self._signature():
428
raise errors.BadIndexFormatSignature(self._name, GraphIndex)
429
options_line = stream.readline()
430
if not options_line.startswith(_OPTION_NODE_REFS):
431
raise errors.BadIndexOptions(self)
433
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
435
raise errors.BadIndexOptions(self)
436
options_line = stream.readline()
437
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
438
raise errors.BadIndexOptions(self)
440
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
442
raise errors.BadIndexOptions(self)
443
options_line = stream.readline()
444
if not options_line.startswith(_OPTION_LEN):
445
raise errors.BadIndexOptions(self)
447
self._key_count = int(options_line[len(_OPTION_LEN):-1])
449
raise errors.BadIndexOptions(self)
451
def _resolve_references(self, references):
452
"""Return the resolved key references for references.
454
References are resolved by looking up the location of the key in the
455
_keys_by_offset map and substituting the key name, preserving ordering.
457
:param references: An iterable of iterables of key locations. e.g.
459
:return: A tuple of tuples of keys.
462
for ref_list in references:
463
node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
464
return tuple(node_refs)
466
def _find_index(self, range_map, key):
467
"""Helper for the _parsed_*_index calls.
469
Given a range map - [(start, end), ...], finds the index of the range
470
in the map for key if it is in the map, and if it is not there, the
471
immediately preceeding range in the map.
473
result = bisect_right(range_map, key) - 1
474
if result + 1 < len(range_map):
475
# check the border condition, it may be in result + 1
476
if range_map[result + 1][0] == key[0]:
480
def _parsed_byte_index(self, offset):
    """Return the index of the entry immediately before offset.

    e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that
    there is one unparsed byte (the 11th, addressed as[10]). then:
    asking for 0 will return 0
    asking for 10 will return 0
    asking for 11 will return 1
    asking for 12 will return 1

    :param offset: A byte offset into the index file.
    :return: The index into _parsed_byte_map of the region containing (or
        immediately preceding) offset.
    """
    # Probe with a (offset, 0) tuple so bisection compares cleanly against
    # the (start, end) pairs stored in _parsed_byte_map, mirroring the
    # (key, None) probe used by _parsed_key_index.  Without this the
    # visible `return` referenced an undefined name.
    key = (offset, 0)
    return self._find_index(self._parsed_byte_map, key)
493
def _parsed_key_index(self, key):
    """Return the index of the entry immediately before key.

    e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,
    meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive
    have been parsed, then:
    asking for '' will return 0
    asking for 'a' will return 0
    asking for 'b' will return 1
    asking for 'e' will return 1
    """
    # Pad the probe with None so it sorts correctly against the
    # (start_key, end_key) pairs held in _parsed_key_map.
    return self._find_index(self._parsed_key_map, (key, None))
507
def _is_parsed(self, offset):
    """Return True when the byte at offset lies in a parsed region."""
    region = self._parsed_byte_index(offset)
    if region == len(self._parsed_byte_map):
        # Past the last recorded region: parsed only if offset falls
        # before the end of that final region.
        return offset < self._parsed_byte_map[region - 1][1]
    start, end = self._parsed_byte_map[region]
    return start <= offset < end
515
def _iter_entries_from_total_buffer(self, keys):
516
"""Iterate over keys when the entire index is parsed."""
517
keys = keys.intersection(self._keys)
518
if self.node_ref_lists:
520
value, node_refs = self._nodes[key]
521
yield self, key, value, node_refs
524
yield self, key, self._nodes[key]
526
def iter_entries(self, keys):
527
"""Iterate over keys within the index.
529
:param keys: An iterable providing the keys to be retrieved.
530
:return: An iterable as per iter_all_entries, but restricted to the
531
keys supplied. No additional keys will be returned, and every
532
key supplied that is in the index will be returned.
537
if self._size is None and self._nodes is None:
540
# We fit about 20 keys per minimum-read (4K), so if we are looking for
541
# more than 1/20th of the index its likely (assuming homogenous key
542
# spread) that we'll read the entire index. If we're going to do that,
543
# buffer the whole thing. A better analysis might take key spread into
544
# account - but B+Tree indices are better anyway.
545
# We could look at all data read, and use a threshold there, which will
546
# trigger on ancestry walks, but that is not yet fully mapped out.
547
if self._nodes is None and len(keys) * 20 > self.key_count():
549
if self._nodes is not None:
550
return self._iter_entries_from_total_buffer(keys)
552
return (result[1] for result in bisect_multi_bytes(
553
self._lookup_keys_via_location, self._size, keys))
555
def iter_entries_prefix(self, keys):
556
"""Iterate over keys within the index using prefix matching.
558
Prefix matching is applied within the tuple of a key, not to within
559
the bytestring of each key element. e.g. if you have the keys ('foo',
560
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
561
only the former key is returned.
563
WARNING: Note that this method currently causes a full index parse
564
unconditionally (which is reasonably appropriate as it is a means for
565
thunking many small indices into one larger one and still supplies
566
iter_all_entries at the thunk layer).
568
:param keys: An iterable providing the key prefixes to be retrieved.
569
Each key prefix takes the form of a tuple the length of a key, but
570
with the last N elements 'None' rather than a regular bytestring.
571
The first element cannot be 'None'.
572
:return: An iterable as per iter_all_entries, but restricted to the
573
keys with a matching prefix to those supplied. No additional keys
574
will be returned, and every match that is in the index will be
580
# load data - also finds key lengths
581
if self._nodes is None:
583
if self._key_length == 1:
587
raise errors.BadIndexKey(key)
588
if len(key) != self._key_length:
589
raise errors.BadIndexKey(key)
590
if self.node_ref_lists:
591
value, node_refs = self._nodes[key]
592
yield self, key, value, node_refs
594
yield self, key, self._nodes[key]
599
raise errors.BadIndexKey(key)
600
if len(key) != self._key_length:
601
raise errors.BadIndexKey(key)
602
# find what it refers to:
603
key_dict = self._nodes_by_key
605
# find the subdict whose contents should be returned.
607
while len(elements) and elements[0] is not None:
608
key_dict = key_dict[elements[0]]
611
# a non-existant lookup.
616
key_dict = dicts.pop(-1)
617
# can't be empty or would not exist
618
item, value = key_dict.iteritems().next()
619
if type(value) == dict:
621
dicts.extend(key_dict.itervalues())
624
for value in key_dict.itervalues():
625
# each value is the key:value:node refs tuple
627
yield (self, ) + value
629
# the last thing looked up was a terminal element
630
yield (self, ) + key_dict
633
"""Return an estimate of the number of keys in this index.
635
For GraphIndex the estimate is exact.
637
if self._key_count is None:
638
self._read_and_parse([_HEADER_READV])
639
return self._key_count
641
def _lookup_keys_via_location(self, location_keys):
642
"""Public interface for implementing bisection.
644
If _buffer_all has been called, then all the data for the index is in
645
memory, and this method should not be called, as it uses a separate
646
cache because it cannot pre-resolve all indices, which buffer_all does
649
:param location_keys: A list of location(byte offset), key tuples.
650
:return: A list of (location_key, result) tuples as expected by
651
bzrlib.bisect_multi.bisect_multi_bytes.
653
# Possible improvements:
654
# - only bisect lookup each key once
655
# - sort the keys first, and use that to reduce the bisection window
657
# this progresses in three parts:
660
# attempt to answer the question from the now in memory data.
661
# build the readv request
662
# for each location, ask for 800 bytes - much more than rows we've seen
665
for location, key in location_keys:
666
# can we answer from cache?
667
if self._bisect_nodes and key in self._bisect_nodes:
668
# We have the key parsed.
670
index = self._parsed_key_index(key)
671
if (len(self._parsed_key_map) and
672
self._parsed_key_map[index][0] <= key and
673
(self._parsed_key_map[index][1] >= key or
674
# end of the file has been parsed
675
self._parsed_byte_map[index][1] == self._size)):
676
# the key has been parsed, so no lookup is needed even if its
679
# - if we have examined this part of the file already - yes
680
index = self._parsed_byte_index(location)
681
if (len(self._parsed_byte_map) and
682
self._parsed_byte_map[index][0] <= location and
683
self._parsed_byte_map[index][1] > location):
684
# the byte region has been parsed, so no read is needed.
687
if location + length > self._size:
688
length = self._size - location
689
# todo, trim out parsed locations.
691
readv_ranges.append((location, length))
692
# read the header if needed
693
if self._bisect_nodes is None:
694
readv_ranges.append(_HEADER_READV)
695
self._read_and_parse(readv_ranges)
697
if self._nodes is not None:
698
# _read_and_parse triggered a _buffer_all because we requested the
700
for location, key in location_keys:
701
if key not in self._nodes: # not present
702
result.append(((location, key), False))
703
elif self.node_ref_lists:
704
value, refs = self._nodes[key]
705
result.append(((location, key),
706
(self, key, value, refs)))
708
result.append(((location, key),
709
(self, key, self._nodes[key])))
712
# - figure out <, >, missing, present
713
# - result present references so we can return them.
714
# keys that we cannot answer until we resolve references
715
pending_references = []
716
pending_locations = set()
717
for location, key in location_keys:
718
# can we answer from cache?
719
if key in self._bisect_nodes:
720
# the key has been parsed, so no lookup is needed
721
if self.node_ref_lists:
722
# the references may not have been all parsed.
723
value, refs = self._bisect_nodes[key]
724
wanted_locations = []
725
for ref_list in refs:
727
if ref not in self._keys_by_offset:
728
wanted_locations.append(ref)
730
pending_locations.update(wanted_locations)
731
pending_references.append((location, key))
733
result.append(((location, key), (self, key,
734
value, self._resolve_references(refs))))
736
result.append(((location, key),
737
(self, key, self._bisect_nodes[key])))
740
# has the region the key should be in, been parsed?
741
index = self._parsed_key_index(key)
742
if (self._parsed_key_map[index][0] <= key and
743
(self._parsed_key_map[index][1] >= key or
744
# end of the file has been parsed
745
self._parsed_byte_map[index][1] == self._size)):
746
result.append(((location, key), False))
748
# no, is the key above or below the probed location:
749
# get the range of the probed & parsed location
750
index = self._parsed_byte_index(location)
751
# if the key is below the start of the range, its below
752
if key < self._parsed_key_map[index][0]:
756
result.append(((location, key), direction))
758
# lookup data to resolve references
759
for location in pending_locations:
761
if location + length > self._size:
762
length = self._size - location
763
# TODO: trim out parsed locations (e.g. if the 800 is into the
764
# parsed region trim it, and dont use the adjust_for_latency
767
readv_ranges.append((location, length))
768
self._read_and_parse(readv_ranges)
769
if self._nodes is not None:
770
# The _read_and_parse triggered a _buffer_all, grab the data and
772
for location, key in pending_references:
773
value, refs = self._nodes[key]
774
result.append(((location, key), (self, key, value, refs)))
776
for location, key in pending_references:
777
# answer key references we had to look-up-late.
778
value, refs = self._bisect_nodes[key]
779
result.append(((location, key), (self, key,
780
value, self._resolve_references(refs))))
783
def _parse_header_from_bytes(self, bytes):
784
"""Parse the header from a region of bytes.
786
:param bytes: The data to parse.
787
:return: An offset, data tuple such as readv yields, for the unparsed
788
data. (which may length 0).
790
signature = bytes[0:len(self._signature())]
791
if not signature == self._signature():
792
raise errors.BadIndexFormatSignature(self._name, GraphIndex)
793
lines = bytes[len(self._signature()):].splitlines()
794
options_line = lines[0]
795
if not options_line.startswith(_OPTION_NODE_REFS):
796
raise errors.BadIndexOptions(self)
798
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
800
raise errors.BadIndexOptions(self)
801
options_line = lines[1]
802
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
803
raise errors.BadIndexOptions(self)
805
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
807
raise errors.BadIndexOptions(self)
808
options_line = lines[2]
809
if not options_line.startswith(_OPTION_LEN):
810
raise errors.BadIndexOptions(self)
812
self._key_count = int(options_line[len(_OPTION_LEN):])
814
raise errors.BadIndexOptions(self)
815
# calculate the bytes we have processed
816
header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
818
self._parsed_bytes(0, None, header_end, None)
819
# setup parsing state
820
self._expected_elements = 3 + self._key_length
821
# raw data keyed by offset
822
self._keys_by_offset = {}
823
# keys with the value and node references
824
self._bisect_nodes = {}
825
return header_end, bytes[header_end:]
827
def _parse_region(self, offset, data):
828
"""Parse node data returned from a readv operation.
830
:param offset: The byte offset the data starts at.
831
:param data: The data to parse.
835
end = offset + len(data)
838
# Trivial test - if the current index's end is within the
839
# low-matching parsed range, we're done.
840
index = self._parsed_byte_index(high_parsed)
841
if end < self._parsed_byte_map[index][1]:
843
# print "[%d:%d]" % (offset, end), \
844
# self._parsed_byte_map[index:index + 2]
845
high_parsed, last_segment = self._parse_segment(
846
offset, data, end, index)
850
def _parse_segment(self, offset, data, end, index):
851
"""Parse one segment of data.
853
:param offset: Where 'data' begins in the file.
854
:param data: Some data to parse a segment of.
855
:param end: Where data ends
856
:param index: The current index into the parsed bytes map.
857
:return: True if the parsed segment is the last possible one in the
859
:return: high_parsed_byte, last_segment.
860
high_parsed_byte is the location of the highest parsed byte in this
861
segment, last_segment is True if the parsed segment is the last
862
possible one in the data block.
864
# default is to use all data
866
# accomodate overlap with data before this.
867
if offset < self._parsed_byte_map[index][1]:
868
# overlaps the lower parsed region
869
# skip the parsed data
870
trim_start = self._parsed_byte_map[index][1] - offset
871
# don't trim the start for \n
872
start_adjacent = True
873
elif offset == self._parsed_byte_map[index][1]:
874
# abuts the lower parsed region
877
# do not trim anything
878
start_adjacent = True
880
# does not overlap the lower parsed region
883
# but trim the leading \n
884
start_adjacent = False
885
if end == self._size:
886
# lines up to the end of all data:
889
# do not strip to the last \n
892
elif index + 1 == len(self._parsed_byte_map):
893
# at the end of the parsed data
896
# but strip to the last \n
899
elif end == self._parsed_byte_map[index + 1][0]:
900
# buts up against the next parsed region
903
# do not strip to the last \n
906
elif end > self._parsed_byte_map[index + 1][0]:
907
# overlaps into the next parsed region
908
# only consider the unparsed data
909
trim_end = self._parsed_byte_map[index + 1][0] - offset
910
# do not strip to the last \n as we know its an entire record
912
last_segment = end < self._parsed_byte_map[index + 1][1]
914
# does not overlap into the next region
917
# but strip to the last \n
920
# now find bytes to discard if needed
921
if not start_adjacent:
922
# work around python bug in rfind
923
if trim_start is None:
924
trim_start = data.find('\n') + 1
926
trim_start = data.find('\n', trim_start) + 1
927
if not (trim_start != 0):
928
raise AssertionError('no \n was present')
929
# print 'removing start', offset, trim_start, repr(data[:trim_start])
931
# work around python bug in rfind
933
trim_end = data.rfind('\n') + 1
935
trim_end = data.rfind('\n', None, trim_end) + 1
936
if not (trim_end != 0):
937
raise AssertionError('no \n was present')
938
# print 'removing end', offset, trim_end, repr(data[trim_end:])
939
# adjust offset and data to the parseable data.
940
trimmed_data = data[trim_start:trim_end]
941
if not (trimmed_data):
942
raise AssertionError('read unneeded data [%d:%d] from [%d:%d]'
943
% (trim_start, trim_end, offset, offset + len(data)))
946
# print "parsing", repr(trimmed_data)
947
# splitlines mangles the \r delimiters.. don't use it.
948
lines = trimmed_data.split('\n')
951
first_key, last_key, nodes, _ = self._parse_lines(lines, pos)
952
for key, value in nodes:
953
self._bisect_nodes[key] = value
954
self._parsed_bytes(offset, first_key,
955
offset + len(trimmed_data), last_key)
956
return offset + len(trimmed_data), last_segment
958
def _parse_lines(self, lines, pos):
    """Parse complete index records out of a list of lines.

    :param lines: The lines of a parsed region of the index, already split
        on '\n'. An empty string entry corresponds to the file's trailing
        newline.
    :param pos: The byte offset in the index file of the first line.
    :return: A tuple (first_key, last_key, nodes, trailers); nodes is a
        list of (key, value) or (key, (value, ref_lists)) entries, and
        trailers counts end-of-file markers encountered.
    """
    key = None
    first_key = None
    trailers = 0
    nodes = []
    for line in lines:
        if line == '':
            # must be at the end
            if self._size:
                if not (self._size == pos + 1):
                    raise AssertionError("%s %s" % (self._size, pos))
            trailers += 1
            continue
        elements = line.split('\0')
        if len(elements) != self._expected_elements:
            raise errors.BadIndexData(self)
        # keys are tuples. Each element is a string that may occur many
        # times, so we intern them to save space. AB, RC, 200807
        key = tuple(intern(element) for element in elements[:self._key_length])
        if first_key is None:
            first_key = key
        absent, references, value = elements[-3:]
        ref_lists = []
        for ref_string in references.split('\t'):
            ref_lists.append(tuple([
                int(ref) for ref in ref_string.split('\r') if ref
                ]))
        ref_lists = tuple(ref_lists)
        self._keys_by_offset[pos] = (key, absent, ref_lists, value)
        pos += len(line) + 1 # +1 for the \n
        # absent records are tracked by offset but not surfaced as nodes.
        if absent:
            continue
        if self.node_ref_lists:
            node_value = (value, ref_lists)
        else:
            node_value = value
        nodes.append((key, node_value))
        # print "parsed ", key
    return first_key, key, nodes, trailers
998
def _parsed_bytes(self, start, start_key, end, end_key):
    """Mark the bytes from start to end as parsed.

    Calling self._parsed_bytes(1,2) will mark one byte (the one at offset
    1) as parsed.

    :param start: The start of the parsed region.
    :param start_key: The first key found in the parsed region.
    :param end: The end of the parsed region.
    :param end_key: The last key found in the parsed region.
    """
    index = self._parsed_byte_index(start)
    new_value = (start, end)
    new_key = (start_key, end_key)
    if index == -1:
        # first range parsed is always the beginning.
        self._parsed_byte_map.insert(index, new_value)
        self._parsed_key_map.insert(index, new_key)
        return
    # four cases:
    # new region
    # extend lower region
    # extend higher region
    # combine two regions
    if (index + 1 < len(self._parsed_byte_map) and
        self._parsed_byte_map[index][1] == start and
        self._parsed_byte_map[index + 1][0] == end):
        # combine two regions
        self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],
            self._parsed_byte_map[index + 1][1])
        self._parsed_key_map[index] = (self._parsed_key_map[index][0],
            self._parsed_key_map[index + 1][1])
        del self._parsed_byte_map[index + 1]
        del self._parsed_key_map[index + 1]
    elif self._parsed_byte_map[index][1] == start:
        # extend the lower entry
        self._parsed_byte_map[index] = (
            self._parsed_byte_map[index][0], end)
        self._parsed_key_map[index] = (
            self._parsed_key_map[index][0], end_key)
    elif (index + 1 < len(self._parsed_byte_map) and
        self._parsed_byte_map[index + 1][0] == end):
        # extend the higher entry
        self._parsed_byte_map[index + 1] = (
            start, self._parsed_byte_map[index + 1][1])
        self._parsed_key_map[index + 1] = (
            start_key, self._parsed_key_map[index + 1][1])
    else:
        # new entry
        self._parsed_byte_map.insert(index + 1, new_value)
        self._parsed_key_map.insert(index + 1, new_key)
1048
def _read_and_parse(self, readv_ranges):
    """Read the ranges and parse the resulting data.

    :param readv_ranges: A prepared readv range list.
    """
    if not readv_ranges:
        return
    if self._nodes is None and self._bytes_read * 2 >= self._size:
        # We've already read more than 50% of the file and we are about to
        # request more data, just _buffer_all() and be done
        self._buffer_all()
        return

    readv_data = self._transport.readv(self._name, readv_ranges, True,
        self._size)
    # parse
    for offset, data in readv_data:
        self._bytes_read += len(data)
        if offset == 0 and len(data) == self._size:
            # We read the whole range, most likely because the
            # Transport upcast our readv ranges into one long request
            # for enough total data to grab the whole index.
            self._buffer_all(StringIO(data))
            return
        if self._bisect_nodes is None:
            # this must be the start
            if not (offset == 0):
                raise AssertionError()
            offset, data = self._parse_header_from_bytes(data)
        # print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
        self._parse_region(offset, data)
1080
def _signature(self):
    """The file signature for this index type."""
    return _SIGNATURE

def validate(self):
    """Validate that everything in the index can be accessed."""
    # iter_all validates completely at the moment, so just do that.
    for node in self.iter_all_entries():
        pass
1091
class CombinedGraphIndex(object):
    """A GraphIndex made up from smaller GraphIndices.

    The backing indices must implement GraphIndex, and are presumed to be
    static data.

    Queries against the combined index will be made against the first index,
    and then the second and so on. The order of index's can thus influence
    performance significantly. For example, if one index is on local disk and a
    second on a remote server, the local disk index should be before the other
    in the index list.
    """

    def __init__(self, indices):
        """Create a CombinedGraphIndex backed by indices.

        :param indices: An ordered list of indices to query for data.
        """
        self._indices = indices

    def __repr__(self):
        return "%s(%s)" % (
                self.__class__.__name__,
                ', '.join(map(repr, self._indices)))

    @symbol_versioning.deprecated_method(symbol_versioning.one_one)
    def get_parents(self, revision_ids):
        """See graph._StackedParentsProvider.get_parents.

        This implementation thunks the graph.Graph.get_parents api across to
        GraphIndex.

        :param revision_ids: An iterable of graph keys for this graph.
        :return: A list of parent details for each key in revision_ids.
            Each parent details will be one of:
             * None when the key was missing
             * (NULL_REVISION,) when the key has no parents.
             * (parent_key, parent_key...) otherwise.
        """
        parent_map = self.get_parent_map(revision_ids)
        return [parent_map.get(r, None) for r in revision_ids]

    def get_parent_map(self, keys):
        """See graph._StackedParentsProvider.get_parent_map"""
        search_keys = set(keys)
        if NULL_REVISION in search_keys:
            search_keys.discard(NULL_REVISION)
            found_parents = {NULL_REVISION:[]}
        else:
            found_parents = {}
        for index, key, value, refs in self.iter_entries(search_keys):
            parents = refs[0]
            if not parents:
                parents = (NULL_REVISION,)
            found_parents[key] = parents
        return found_parents

    def insert_index(self, pos, index):
        """Insert a new index in the list of indices to query.

        :param pos: The position to insert the index.
        :param index: The index to insert.
        """
        self._indices.insert(pos, index)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :return: An iterable of (index, key, reference_lists, value).
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        seen_keys = set()
        for index in self._indices:
            for node in index.iter_all_entries():
                if node[1] not in seen_keys:
                    yield node
                    seen_keys.add(node[1])

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index.
        """
        keys = set(keys)
        for index in self._indices:
            # stop querying child indices once every key has been found.
            if not keys:
                return
            for node in index.iter_entries(keys):
                keys.remove(node[1])
                yield node

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        seen_keys = set()
        for index in self._indices:
            for node in index.iter_entries_prefix(keys):
                if node[1] in seen_keys:
                    continue
                seen_keys.add(node[1])
                yield node

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For CombinedGraphIndex this is approximated by the sum of the keys of
        the child indices. As child indices may have duplicate keys this can
        have a maximum error of the number of child indices * largest number of
        duplicate keys.
        """
        return sum((index.key_count() for index in self._indices), 0)

    def validate(self):
        """Validate that everything in the index can be accessed."""
        for index in self._indices:
            index.validate()
1239
class InMemoryGraphIndex(GraphIndexBuilder):
    """A GraphIndex which operates entirely out of memory and is mutable.

    This is designed to allow the accumulation of GraphIndex entries during a
    single write operation, where the accumulated entries need to be immediately
    available - for example via a CombinedGraphIndex.
    """

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, node_refs, value) entries to add.
        """
        if self.reference_lists:
            for (key, value, node_refs) in nodes:
                self.add_node(key, value, node_refs)
        else:
            for (key, value) in nodes:
                self.add_node(key, value)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if self.reference_lists:
            for key, (absent, references, value) in self._nodes.iteritems():
                if not absent:
                    yield self, key, value, references
        else:
            for key, (absent, references, value) in self._nodes.iteritems():
                if not absent:
                    yield self, key, value

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        keys = set(keys)
        if self.reference_lists:
            for key in keys.intersection(self._keys):
                node = self._nodes[key]
                if not node[0]:
                    yield self, key, node[2], node[1]
        else:
            for key in keys.intersection(self._keys):
                node = self._nodes[key]
                if not node[0]:
                    yield self, key, node[2]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        # XXX: To much duplication with the GraphIndex class; consider finding
        # a good place to pull out the actual common logic.
        keys = set(keys)
        if not keys:
            return
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                node = self._nodes[key]
                if node[0]:
                    continue
                if self.reference_lists:
                    yield self, key, node[2], node[1]
                else:
                    yield self, key, node[2]
            return
        nodes_by_key = self._get_nodes_by_key()
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = nodes_by_key
            elements = list(key)
            # find the subdict to return
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existant lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push keys
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            yield (self, ) + value
            else:
                yield (self, ) + key_dict

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._keys)

    def validate(self):
        """In memory index's have no known corruption at the moment."""
1380
class GraphIndexPrefixAdapter(object):
    """An adapter between GraphIndex with different key lengths.

    Queries against this will emit queries against the adapted Graph with the
    prefix added, queries for all items use iter_entries_prefix. The returned
    nodes will have their keys and node references adjusted to remove the
    prefix. Finally, an add_nodes_callback can be supplied - when called the
    nodes and references being added will have prefix prepended.
    """

    def __init__(self, adapted, prefix, missing_key_length,
        add_nodes_callback=None):
        """Construct an adapter against adapted with prefix."""
        self.adapted = adapted
        self.prefix_key = prefix + (None,)*missing_key_length
        self.prefix = prefix
        self.prefix_len = len(prefix)
        self.add_nodes_callback = add_nodes_callback

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, node_refs, value) entries to add.
        """
        # save nodes in case its an iterator
        nodes = tuple(nodes)
        translated_nodes = []
        try:
            # Add prefix_key to each reference node_refs is a tuple of tuples,
            # so split it apart, and add prefix_key to the internal reference
            for (key, value, node_refs) in nodes:
                adjusted_references = (
                    tuple(tuple(self.prefix + ref_node for ref_node in ref_list)
                        for ref_list in node_refs))
                translated_nodes.append((self.prefix + key, value,
                    adjusted_references))
        except ValueError:
            # XXX: TODO add an explicit interface for getting the reference list
            # status, to handle this bit of user-friendliness in the API more
            # explicitly.
            for (key, value) in nodes:
                translated_nodes.append((self.prefix + key, value))
        self.add_nodes_callback(translated_nodes)

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \0 or \n.
        """
        self.add_nodes(((key, value, references), ))

    def _strip_prefix(self, an_iter):
        """Strip prefix data from nodes and return it."""
        for node in an_iter:
            # cross checks
            if node[1][:self.prefix_len] != self.prefix:
                raise errors.BadIndexData(self)
            for ref_list in node[3]:
                for ref_node in ref_list:
                    if ref_node[:self.prefix_len] != self.prefix:
                        raise errors.BadIndexData(self)
            yield node[0], node[1][self.prefix_len:], node[2], (
                tuple(tuple(ref_node[self.prefix_len:] for ref_node in ref_list)
                for ref_list in node[3]))

    def iter_all_entries(self):
        """Iterate over all keys within the index

        iter_all_entries is implemented against the adapted index using
        iter_entries_prefix.

        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix([self.prefix_key]))

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        return self._strip_prefix(self.adapted.iter_entries(
            self.prefix + key for key in keys))

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix(
            self.prefix + key for key in keys))

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For GraphIndexPrefixAdapter this is relatively expensive - key
        iteration with the prefix is done.
        """
        return len(list(self.iter_all_entries()))

    def validate(self):
        """Call the adapted's validate."""
        self.adapted.validate()