361
364
except ValueError:
362
365
raise errors.BadIndexOptions(self)
364
def iter_entries(self, keys):
365
"""Iterate over keys within the index.
367
:param keys: An iterable providing the keys to be retrieved.
368
:return: An iterable as per iter_all_entries, but restricted to the
369
keys supplied. No additional keys will be returned, and every
370
key supplied that is in the index will be returned.
375
if self._nodes is None:
367
def _resolve_references(self, references):
368
"""Return the resolved key references for references.
370
References are resolved by looking up the location of the key in the
371
_keys_by_offset map and substituting the key name, preserving ordering.
373
:param references: An iterable of iterables of key locations. e.g.
375
:return: A tuple of tuples of keys.
378
for ref_list in references:
379
node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
380
return tuple(node_refs)
382
def _find_index(self, range_map, key):
383
"""Helper for the _parsed_*_index calls.
385
Given a range map - [(start, end), ...], finds the index of the range
386
in the map for key if it is in the map, and if it is not there, the
387
immediately preceeding range in the map.
389
result = bisect_right(range_map, key) - 1
390
if result + 1 < len(range_map):
391
# check the border condition, it may be in result + 1
392
if range_map[result + 1][0] == key[0]:
396
def _parsed_byte_index(self, offset):
397
"""Return the index of the entry immediately before offset.
399
e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that
400
there is one unparsed byte (the 11th, addressed as[10]). then:
401
asking for 0 will return 0
402
asking for 10 will return 0
403
asking for 11 will return 1
404
asking for 12 will return 1
407
return self._find_index(self._parsed_byte_map, key)
409
def _parsed_key_index(self, key):
410
"""Return the index of the entry immediately before key.
412
e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,
413
meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive
414
have been parsed, then:
415
asking for '' will return 0
416
asking for 'a' will return 0
417
asking for 'b' will return 1
418
asking for 'e' will return 1
420
search_key = (key, None)
421
return self._find_index(self._parsed_key_map, search_key)
423
def _is_parsed(self, offset):
424
"""Returns True if offset has been parsed."""
425
index = self._parsed_byte_index(offset)
426
if index == len(self._parsed_byte_map):
427
return offset < self._parsed_byte_map[index - 1][1]
428
start, end = self._parsed_byte_map[index]
429
return offset >= start and offset < end
431
def _iter_entries_from_total_buffer(self, keys):
432
"""Iterate over keys when the entire index is parsed."""
377
433
keys = keys.intersection(self._keys)
378
434
if self.node_ref_lists:
465
548
self._buffer_all()
466
549
return self._key_count
551
def _lookup_keys_via_location(self, location_keys):
552
"""Public interface for implementing bisection.
554
If _buffer_all has been called, then all the data for the index is in
555
memory, and this method should not be called, as it uses a separate
556
cache because it cannot pre-resolve all indices, which buffer_all does
559
:param location_keys: A list of location(byte offset), key tuples.
560
:return: A list of (location_key, result) tuples as expected by
561
bzrlib.bisect_multi.bisect_multi_bytes.
563
# Possible improvements:
564
# - only bisect lookup each key once
565
# - sort the keys first, and use that to reduce the bisection window
567
# this progresses in three parts:
570
# attempt to answer the question from the now in memory data.
571
# build the readv request
572
# for each location, ask for 800 bytes - much more than rows we've seen
575
for location, key in location_keys:
576
# can we answer from cache?
577
# - if we know the answer - yes
578
index = self._parsed_key_index(key)
579
if (len(self._parsed_key_map) and
580
self._parsed_key_map[index][0] <= key and
581
(self._parsed_key_map[index][1] > key or
582
# end of the file has been parsed
583
self._parsed_byte_map[index][1] == self._size)):
584
# the key has been parsed, so no lookup is needed
586
# - if we have examined this part of the file already - yes
587
index = self._parsed_byte_index(location)
588
if (len(self._parsed_byte_map) and
589
self._parsed_byte_map[index][0] <= location and
590
self._parsed_byte_map[index][1] > location):
591
# the byte region has been parsed, so no read is needed.
594
if location + length > self._size:
595
length = self._size - location
596
# todo, trim out parsed locations.
598
readv_ranges.append((location, length))
599
# read the header if needed
600
if self._bisect_nodes is None:
601
readv_ranges.append((0, 200))
602
self._read_and_parse(readv_ranges)
604
# - figure out <, >, missing, present
605
# - result present references so we can return them.
607
# keys that we cannot answer until we resolve references
608
pending_references = []
609
pending_locations = set()
610
for location, key in location_keys:
611
# can we answer from cache?
612
index = self._parsed_key_index(key)
613
if (self._parsed_key_map[index][0] <= key and
614
(self._parsed_key_map[index][1] > key or
615
# end of the file has been parsed
616
self._parsed_byte_map[index][1] == self._size)):
617
# the key has been parsed, so no lookup is needed
618
if key in self._bisect_nodes:
619
if self.node_ref_lists:
620
# the references may not have been all parsed.
621
value, refs = self._bisect_nodes[key]
622
wanted_locations = []
623
for ref_list in refs:
625
if ref not in self._keys_by_offset:
626
wanted_locations.append(ref)
628
pending_locations.update(wanted_locations)
629
pending_references.append((location, key))
631
result.append(((location, key), (self, key,
632
value, self._resolve_references(refs))))
634
result.append(((location, key),
635
(self, key, self._bisect_nodes[key])))
637
result.append(((location, key), False))
639
# no, is the key above or below the probed location:
640
# get the range of the probed & parsed location
641
index = self._parsed_byte_index(location)
642
# if the key is below the start of the range, its below
643
if key < self._parsed_key_map[index][0]:
647
result.append(((location, key), direction))
649
# lookup data to resolve references
650
for location in pending_locations:
652
if location + length > self._size:
653
length = self._size - location
654
# TODO: trim out parsed locations (e.g. if the 800 is into the
655
# parsed region trim it, and dont use the adjust_for_latency
658
readv_ranges.append((location, length))
659
self._read_and_parse(readv_ranges)
660
for location, key in pending_references:
661
# answer key references we had to look-up-late.
662
index = self._parsed_key_index(key)
663
value, refs = self._bisect_nodes[key]
664
result.append(((location, key), (self, key,
665
value, self._resolve_references(refs))))
668
def _parse_header_from_bytes(self, bytes):
669
"""Parse the header from a region of bytes.
671
:param bytes: The data to parse.
672
:return: An offset, data tuple such as readv yields, for the unparsed
673
data. (which may length 0).
675
signature = bytes[0:len(self._signature())]
676
if not signature == self._signature():
677
raise errors.BadIndexFormatSignature(self._name, GraphIndex)
678
lines = bytes[len(self._signature()):].splitlines()
679
options_line = lines[0]
680
if not options_line.startswith(_OPTION_NODE_REFS):
681
raise errors.BadIndexOptions(self)
683
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
685
raise errors.BadIndexOptions(self)
686
options_line = lines[1]
687
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
688
raise errors.BadIndexOptions(self)
690
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
692
raise errors.BadIndexOptions(self)
693
options_line = lines[2]
694
if not options_line.startswith(_OPTION_LEN):
695
raise errors.BadIndexOptions(self)
697
self._key_count = int(options_line[len(_OPTION_LEN):])
699
raise errors.BadIndexOptions(self)
700
# calculate the bytes we have processed
701
header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
703
self._parsed_bytes(0, None, header_end, None)
704
# setup parsing state
705
self._expected_elements = 3 + self._key_length
706
# raw data keyed by offset
707
self._keys_by_offset = {}
708
# keys with the value and node references
709
self._bisect_nodes = {}
710
return header_end, bytes[header_end:]
712
def _parse_region(self, offset, data):
713
"""Parse node data returned from a readv operation.
715
:param offset: The byte offset the data starts at.
716
:param data: The data to parse.
720
end = offset + len(data)
723
# Trivial test - if the current index's end is within the
724
# low-matching parsed range, we're done.
725
index = self._parsed_byte_index(high_parsed)
726
if end < self._parsed_byte_map[index][1]:
728
# print "[%d:%d]" % (offset, end), \
729
# self._parsed_byte_map[index:index + 2]
730
high_parsed, last_segment = self._parse_segment(
731
offset, data, end, index)
735
def _parse_segment(self, offset, data, end, index):
736
"""Parse one segment of data.
738
:param offset: Where 'data' begins in the file.
739
:param data: Some data to parse a segment of.
740
:param end: Where data ends
741
:param index: The current index into the parsed bytes map.
742
:return: True if the parsed segment is the last possible one in the
744
:return: high_parsed_byte, last_segment.
745
high_parsed_byte is the location of the highest parsed byte in this
746
segment, last_segment is True if the parsed segment is the last
747
possible one in the data block.
749
# default is to use all data
751
# accomodate overlap with data before this.
752
if offset < self._parsed_byte_map[index][1]:
753
# overlaps the lower parsed region
754
# skip the parsed data
755
trim_start = self._parsed_byte_map[index][1] - offset
756
# don't trim the start for \n
757
start_adjacent = True
758
elif offset == self._parsed_byte_map[index][1]:
759
# abuts the lower parsed region
762
# do not trim anything
763
start_adjacent = True
765
# does not overlap the lower parsed region
768
# but trim the leading \n
769
start_adjacent = False
770
if end == self._size:
771
# lines up to the end of all data:
774
# do not strip to the last \n
777
elif index + 1 == len(self._parsed_byte_map):
778
# at the end of the parsed data
781
# but strip to the last \n
784
elif end == self._parsed_byte_map[index + 1][0]:
785
# buts up against the next parsed region
788
# do not strip to the last \n
791
elif end > self._parsed_byte_map[index + 1][0]:
792
# overlaps into the next parsed region
793
# only consider the unparsed data
794
trim_end = self._parsed_byte_map[index + 1][0] - offset
795
# do not strip to the last \n as we know its an entire record
797
last_segment = end < self._parsed_byte_map[index + 1][1]
799
# does not overlap into the next region
802
# but strip to the last \n
805
# now find bytes to discard if needed
806
if not start_adjacent:
807
# work around python bug in rfind
808
if trim_start is None:
809
trim_start = data.find('\n') + 1
811
trim_start = data.find('\n', trim_start) + 1
812
assert trim_start != 0, 'no \n was present'
813
# print 'removing start', offset, trim_start, repr(data[:trim_start])
815
# work around python bug in rfind
817
trim_end = data.rfind('\n') + 1
819
trim_end = data.rfind('\n', None, trim_end) + 1
820
assert trim_end != 0, 'no \n was present'
821
# print 'removing end', offset, trim_end, repr(data[trim_end:])
822
# adjust offset and data to the parseable data.
823
trimmed_data = data[trim_start:trim_end]
824
assert trimmed_data, 'read unneeded data [%d:%d] from [%d:%d]' % (
825
trim_start, trim_end, offset, offset + len(data))
828
# print "parsing", repr(trimmed_data)
829
# splitlines mangles the \r delimiters.. don't use it.
830
lines = trimmed_data.split('\n')
833
first_key, last_key, nodes, _ = self._parse_lines(lines, pos)
834
for key, value in nodes:
835
self._bisect_nodes[key] = value
836
self._parsed_bytes(offset, first_key,
837
offset + len(trimmed_data), last_key)
838
return offset + len(trimmed_data), last_segment
840
def _parse_lines(self, lines, pos):
849
assert self._size == pos + 1, "%s %s" % (self._size, pos)
852
elements = line.split('\0')
853
if len(elements) != self._expected_elements:
854
raise errors.BadIndexData(self)
856
key = tuple(elements[:self._key_length])
857
if first_key is None:
859
absent, references, value = elements[-3:]
861
for ref_string in references.split('\t'):
862
ref_lists.append(tuple([
863
int(ref) for ref in ref_string.split('\r') if ref
865
ref_lists = tuple(ref_lists)
866
self._keys_by_offset[pos] = (key, absent, ref_lists, value)
867
pos += len(line) + 1 # +1 for the \n
870
if self.node_ref_lists:
871
node_value = (value, ref_lists)
874
nodes.append((key, node_value))
875
# print "parsed ", key
876
return first_key, key, nodes, trailers
878
def _parsed_bytes(self, start, start_key, end, end_key):
879
"""Mark the bytes from start to end as parsed.
881
Calling self._parsed_bytes(1,2) will mark one byte (the one at offset
884
:param start: The start of the parsed region.
885
:param end: The end of the parsed region.
887
index = self._parsed_byte_index(start)
888
new_value = (start, end)
889
new_key = (start_key, end_key)
891
# first range parsed is always the beginning.
892
self._parsed_byte_map.insert(index, new_value)
893
self._parsed_key_map.insert(index, new_key)
897
# extend lower region
898
# extend higher region
899
# combine two regions
900
if (index + 1 < len(self._parsed_byte_map) and
901
self._parsed_byte_map[index][1] == start and
902
self._parsed_byte_map[index + 1][0] == end):
903
# combine two regions
904
self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],
905
self._parsed_byte_map[index + 1][1])
906
self._parsed_key_map[index] = (self._parsed_key_map[index][0],
907
self._parsed_key_map[index + 1][1])
908
del self._parsed_byte_map[index + 1]
909
del self._parsed_key_map[index + 1]
910
elif self._parsed_byte_map[index][1] == start:
911
# extend the lower entry
912
self._parsed_byte_map[index] = (
913
self._parsed_byte_map[index][0], end)
914
self._parsed_key_map[index] = (
915
self._parsed_key_map[index][0], end_key)
916
elif (index + 1 < len(self._parsed_byte_map) and
917
self._parsed_byte_map[index + 1][0] == end):
918
# extend the higher entry
919
self._parsed_byte_map[index + 1] = (
920
start, self._parsed_byte_map[index + 1][1])
921
self._parsed_key_map[index + 1] = (
922
start_key, self._parsed_key_map[index + 1][1])
925
self._parsed_byte_map.insert(index + 1, new_value)
926
self._parsed_key_map.insert(index + 1, new_key)
928
def _read_and_parse(self, readv_ranges):
929
"""Read the the ranges and parse the resulting data.
931
:param readv_ranges: A prepared readv range list.
934
readv_data = self._transport.readv(self._name, readv_ranges, True,
937
for offset, data in readv_data:
938
if self._bisect_nodes is None:
939
# this must be the start
941
offset, data = self._parse_header_from_bytes(data)
942
# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
943
self._parse_region(offset, data)
468
945
def _signature(self):
469
946
"""The file signature for this index type."""
470
947
return _SIGNATURE