373
337
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
374
338
except ValueError:
375
339
raise errors.BadIndexOptions(self)
376
options_line = stream.readline()
377
if not options_line.startswith(_OPTION_LEN):
378
raise errors.BadIndexOptions(self)
380
self._key_count = int(options_line[len(_OPTION_LEN):-1])
382
raise errors.BadIndexOptions(self)
384
def _resolve_references(self, references):
385
"""Return the resolved key references for references.
387
References are resolved by looking up the location of the key in the
388
_keys_by_offset map and substituting the key name, preserving ordering.
390
:param references: An iterable of iterables of key locations. e.g.
392
:return: A tuple of tuples of keys.
395
for ref_list in references:
396
node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
397
return tuple(node_refs)
399
def _find_index(self, range_map, key):
400
"""Helper for the _parsed_*_index calls.
402
Given a range map - [(start, end), ...], finds the index of the range
403
in the map for key if it is in the map, and if it is not there, the
404
immediately preceeding range in the map.
406
result = bisect_right(range_map, key) - 1
407
if result + 1 < len(range_map):
408
# check the border condition, it may be in result + 1
409
if range_map[result + 1][0] == key[0]:
413
def _parsed_byte_index(self, offset):
414
"""Return the index of the entry immediately before offset.
416
e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that
417
there is one unparsed byte (the 11th, addressed as[10]). then:
418
asking for 0 will return 0
419
asking for 10 will return 0
420
asking for 11 will return 1
421
asking for 12 will return 1
424
return self._find_index(self._parsed_byte_map, key)
426
def _parsed_key_index(self, key):
427
"""Return the index of the entry immediately before key.
429
e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,
430
meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive
431
have been parsed, then:
432
asking for '' will return 0
433
asking for 'a' will return 0
434
asking for 'b' will return 1
435
asking for 'e' will return 1
437
search_key = (key, None)
438
return self._find_index(self._parsed_key_map, search_key)
440
def _is_parsed(self, offset):
441
"""Returns True if offset has been parsed."""
442
index = self._parsed_byte_index(offset)
443
if index == len(self._parsed_byte_map):
444
return offset < self._parsed_byte_map[index - 1][1]
445
start, end = self._parsed_byte_map[index]
446
return offset >= start and offset < end
448
def _iter_entries_from_total_buffer(self, keys):
449
"""Iterate over keys when the entire index is parsed."""
341
def iter_entries(self, keys):
342
"""Iterate over keys within the index.
344
:param keys: An iterable providing the keys to be retrieved.
345
:return: An iterable as per iter_all_entries, but restricted to the
346
keys supplied. No additional keys will be returned, and every
347
key supplied that is in the index will be returned.
352
if self._nodes is None:
450
354
keys = keys.intersection(self._keys)
451
355
if self.node_ref_lists:
555
432
# the last thing looked up was a terminal element
556
433
yield (self, ) + key_dict
559
"""Return an estimate of the number of keys in this index.
561
For GraphIndex the estimate is exact.
563
if self._key_count is None:
564
self._read_and_parse([_HEADER_READV])
565
return self._key_count
567
def _lookup_keys_via_location(self, location_keys):
568
"""Public interface for implementing bisection.
570
If _buffer_all has been called, then all the data for the index is in
571
memory, and this method should not be called, as it uses a separate
572
cache because it cannot pre-resolve all indices, which buffer_all does
575
:param location_keys: A list of location(byte offset), key tuples.
576
:return: A list of (location_key, result) tuples as expected by
577
bzrlib.bisect_multi.bisect_multi_bytes.
579
# Possible improvements:
580
# - only bisect lookup each key once
581
# - sort the keys first, and use that to reduce the bisection window
583
# this progresses in three parts:
586
# attempt to answer the question from the now in memory data.
587
# build the readv request
588
# for each location, ask for 800 bytes - much more than rows we've seen
591
for location, key in location_keys:
592
# can we answer from cache?
593
if self._bisect_nodes and key in self._bisect_nodes:
594
# We have the key parsed.
596
index = self._parsed_key_index(key)
597
if (len(self._parsed_key_map) and
598
self._parsed_key_map[index][0] <= key and
599
(self._parsed_key_map[index][1] >= key or
600
# end of the file has been parsed
601
self._parsed_byte_map[index][1] == self._size)):
602
# the key has been parsed, so no lookup is needed even if its
605
# - if we have examined this part of the file already - yes
606
index = self._parsed_byte_index(location)
607
if (len(self._parsed_byte_map) and
608
self._parsed_byte_map[index][0] <= location and
609
self._parsed_byte_map[index][1] > location):
610
# the byte region has been parsed, so no read is needed.
613
if location + length > self._size:
614
length = self._size - location
615
# todo, trim out parsed locations.
617
readv_ranges.append((location, length))
618
# read the header if needed
619
if self._bisect_nodes is None:
620
readv_ranges.append(_HEADER_READV)
621
self._read_and_parse(readv_ranges)
623
# - figure out <, >, missing, present
624
# - result present references so we can return them.
626
# keys that we cannot answer until we resolve references
627
pending_references = []
628
pending_locations = set()
629
for location, key in location_keys:
630
# can we answer from cache?
631
if key in self._bisect_nodes:
632
# the key has been parsed, so no lookup is needed
633
if self.node_ref_lists:
634
# the references may not have been all parsed.
635
value, refs = self._bisect_nodes[key]
636
wanted_locations = []
637
for ref_list in refs:
639
if ref not in self._keys_by_offset:
640
wanted_locations.append(ref)
642
pending_locations.update(wanted_locations)
643
pending_references.append((location, key))
645
result.append(((location, key), (self, key,
646
value, self._resolve_references(refs))))
648
result.append(((location, key),
649
(self, key, self._bisect_nodes[key])))
652
# has the region the key should be in, been parsed?
653
index = self._parsed_key_index(key)
654
if (self._parsed_key_map[index][0] <= key and
655
(self._parsed_key_map[index][1] >= key or
656
# end of the file has been parsed
657
self._parsed_byte_map[index][1] == self._size)):
658
result.append(((location, key), False))
660
# no, is the key above or below the probed location:
661
# get the range of the probed & parsed location
662
index = self._parsed_byte_index(location)
663
# if the key is below the start of the range, its below
664
if key < self._parsed_key_map[index][0]:
668
result.append(((location, key), direction))
670
# lookup data to resolve references
671
for location in pending_locations:
673
if location + length > self._size:
674
length = self._size - location
675
# TODO: trim out parsed locations (e.g. if the 800 is into the
676
# parsed region trim it, and dont use the adjust_for_latency
679
readv_ranges.append((location, length))
680
self._read_and_parse(readv_ranges)
681
for location, key in pending_references:
682
# answer key references we had to look-up-late.
683
index = self._parsed_key_index(key)
684
value, refs = self._bisect_nodes[key]
685
result.append(((location, key), (self, key,
686
value, self._resolve_references(refs))))
689
def _parse_header_from_bytes(self, bytes):
690
"""Parse the header from a region of bytes.
692
:param bytes: The data to parse.
693
:return: An offset, data tuple such as readv yields, for the unparsed
694
data. (which may length 0).
696
signature = bytes[0:len(self._signature())]
697
if not signature == self._signature():
698
raise errors.BadIndexFormatSignature(self._name, GraphIndex)
699
lines = bytes[len(self._signature()):].splitlines()
700
options_line = lines[0]
701
if not options_line.startswith(_OPTION_NODE_REFS):
702
raise errors.BadIndexOptions(self)
704
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
706
raise errors.BadIndexOptions(self)
707
options_line = lines[1]
708
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
709
raise errors.BadIndexOptions(self)
711
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
713
raise errors.BadIndexOptions(self)
714
options_line = lines[2]
715
if not options_line.startswith(_OPTION_LEN):
716
raise errors.BadIndexOptions(self)
718
self._key_count = int(options_line[len(_OPTION_LEN):])
720
raise errors.BadIndexOptions(self)
721
# calculate the bytes we have processed
722
header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
724
self._parsed_bytes(0, None, header_end, None)
725
# setup parsing state
726
self._expected_elements = 3 + self._key_length
727
# raw data keyed by offset
728
self._keys_by_offset = {}
729
# keys with the value and node references
730
self._bisect_nodes = {}
731
return header_end, bytes[header_end:]
733
def _parse_region(self, offset, data):
734
"""Parse node data returned from a readv operation.
736
:param offset: The byte offset the data starts at.
737
:param data: The data to parse.
741
end = offset + len(data)
744
# Trivial test - if the current index's end is within the
745
# low-matching parsed range, we're done.
746
index = self._parsed_byte_index(high_parsed)
747
if end < self._parsed_byte_map[index][1]:
749
# print "[%d:%d]" % (offset, end), \
750
# self._parsed_byte_map[index:index + 2]
751
high_parsed, last_segment = self._parse_segment(
752
offset, data, end, index)
756
def _parse_segment(self, offset, data, end, index):
757
"""Parse one segment of data.
759
:param offset: Where 'data' begins in the file.
760
:param data: Some data to parse a segment of.
761
:param end: Where data ends
762
:param index: The current index into the parsed bytes map.
763
:return: True if the parsed segment is the last possible one in the
765
:return: high_parsed_byte, last_segment.
766
high_parsed_byte is the location of the highest parsed byte in this
767
segment, last_segment is True if the parsed segment is the last
768
possible one in the data block.
770
# default is to use all data
772
# accomodate overlap with data before this.
773
if offset < self._parsed_byte_map[index][1]:
774
# overlaps the lower parsed region
775
# skip the parsed data
776
trim_start = self._parsed_byte_map[index][1] - offset
777
# don't trim the start for \n
778
start_adjacent = True
779
elif offset == self._parsed_byte_map[index][1]:
780
# abuts the lower parsed region
783
# do not trim anything
784
start_adjacent = True
786
# does not overlap the lower parsed region
789
# but trim the leading \n
790
start_adjacent = False
791
if end == self._size:
792
# lines up to the end of all data:
795
# do not strip to the last \n
798
elif index + 1 == len(self._parsed_byte_map):
799
# at the end of the parsed data
802
# but strip to the last \n
805
elif end == self._parsed_byte_map[index + 1][0]:
806
# buts up against the next parsed region
809
# do not strip to the last \n
812
elif end > self._parsed_byte_map[index + 1][0]:
813
# overlaps into the next parsed region
814
# only consider the unparsed data
815
trim_end = self._parsed_byte_map[index + 1][0] - offset
816
# do not strip to the last \n as we know its an entire record
818
last_segment = end < self._parsed_byte_map[index + 1][1]
820
# does not overlap into the next region
823
# but strip to the last \n
826
# now find bytes to discard if needed
827
if not start_adjacent:
828
# work around python bug in rfind
829
if trim_start is None:
830
trim_start = data.find('\n') + 1
832
trim_start = data.find('\n', trim_start) + 1
833
assert trim_start != 0, 'no \n was present'
834
# print 'removing start', offset, trim_start, repr(data[:trim_start])
836
# work around python bug in rfind
838
trim_end = data.rfind('\n') + 1
840
trim_end = data.rfind('\n', None, trim_end) + 1
841
assert trim_end != 0, 'no \n was present'
842
# print 'removing end', offset, trim_end, repr(data[trim_end:])
843
# adjust offset and data to the parseable data.
844
trimmed_data = data[trim_start:trim_end]
845
assert trimmed_data, 'read unneeded data [%d:%d] from [%d:%d]' % (
846
trim_start, trim_end, offset, offset + len(data))
849
# print "parsing", repr(trimmed_data)
850
# splitlines mangles the \r delimiters.. don't use it.
851
lines = trimmed_data.split('\n')
854
first_key, last_key, nodes, _ = self._parse_lines(lines, pos)
855
for key, value in nodes:
856
self._bisect_nodes[key] = value
857
self._parsed_bytes(offset, first_key,
858
offset + len(trimmed_data), last_key)
859
return offset + len(trimmed_data), last_segment
861
def _parse_lines(self, lines, pos):
870
assert self._size == pos + 1, "%s %s" % (self._size, pos)
873
elements = line.split('\0')
874
if len(elements) != self._expected_elements:
875
raise errors.BadIndexData(self)
877
key = tuple(elements[:self._key_length])
878
if first_key is None:
880
absent, references, value = elements[-3:]
882
for ref_string in references.split('\t'):
883
ref_lists.append(tuple([
884
int(ref) for ref in ref_string.split('\r') if ref
886
ref_lists = tuple(ref_lists)
887
self._keys_by_offset[pos] = (key, absent, ref_lists, value)
888
pos += len(line) + 1 # +1 for the \n
891
if self.node_ref_lists:
892
node_value = (value, ref_lists)
895
nodes.append((key, node_value))
896
# print "parsed ", key
897
return first_key, key, nodes, trailers
899
def _parsed_bytes(self, start, start_key, end, end_key):
900
"""Mark the bytes from start to end as parsed.
902
Calling self._parsed_bytes(1,2) will mark one byte (the one at offset
905
:param start: The start of the parsed region.
906
:param end: The end of the parsed region.
908
index = self._parsed_byte_index(start)
909
new_value = (start, end)
910
new_key = (start_key, end_key)
912
# first range parsed is always the beginning.
913
self._parsed_byte_map.insert(index, new_value)
914
self._parsed_key_map.insert(index, new_key)
918
# extend lower region
919
# extend higher region
920
# combine two regions
921
if (index + 1 < len(self._parsed_byte_map) and
922
self._parsed_byte_map[index][1] == start and
923
self._parsed_byte_map[index + 1][0] == end):
924
# combine two regions
925
self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],
926
self._parsed_byte_map[index + 1][1])
927
self._parsed_key_map[index] = (self._parsed_key_map[index][0],
928
self._parsed_key_map[index + 1][1])
929
del self._parsed_byte_map[index + 1]
930
del self._parsed_key_map[index + 1]
931
elif self._parsed_byte_map[index][1] == start:
932
# extend the lower entry
933
self._parsed_byte_map[index] = (
934
self._parsed_byte_map[index][0], end)
935
self._parsed_key_map[index] = (
936
self._parsed_key_map[index][0], end_key)
937
elif (index + 1 < len(self._parsed_byte_map) and
938
self._parsed_byte_map[index + 1][0] == end):
939
# extend the higher entry
940
self._parsed_byte_map[index + 1] = (
941
start, self._parsed_byte_map[index + 1][1])
942
self._parsed_key_map[index + 1] = (
943
start_key, self._parsed_key_map[index + 1][1])
946
self._parsed_byte_map.insert(index + 1, new_value)
947
self._parsed_key_map.insert(index + 1, new_key)
949
def _read_and_parse(self, readv_ranges):
950
"""Read the the ranges and parse the resulting data.
952
:param readv_ranges: A prepared readv range list.
955
readv_data = self._transport.readv(self._name, readv_ranges, True,
958
for offset, data in readv_data:
959
if self._bisect_nodes is None:
960
# this must be the start
962
offset, data = self._parse_header_from_bytes(data)
963
# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
964
self._parse_region(offset, data)
966
435
def _signature(self):
967
436
"""The file signature for this index type."""
968
437
return _SIGNATURE