        output.append(cur_out)

    def _walk_through_internal_nodes(self, keys):
        """Take the given set of keys, and find the corresponding LeafNodes.

        :param keys: An unsorted iterable of keys to search for
        :return: (nodes, keys_at_index)
            nodes is a dict mapping {index: LeafNode}
            keys_at_index is a list of tuples of [(index, [keys for Leaf])]
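
            For example, a hypothetical two-leaf lookup (illustrative data,
            not from a real index) might return::

                nodes = {5: <LeafNode>, 6: <LeafNode>}
                keys_at_index = [(5, [('anne',), ('bob',)]),
                                 (6, [('zoe',)])]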
        """
        # 6 seconds spent in miss_torture using the sorted() line.
        # Even with out of order disk IO it seems faster not to sort it when
        # large queries are being made.
        keys_at_index = [(0, sorted(keys))]
        for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
            node_indexes = [idx for idx, s_keys in keys_at_index]
            nodes = self._get_internal_nodes(node_indexes)
            next_nodes_and_keys = []
            for node_index, sub_keys in keys_at_index:
                node = nodes[node_index]
                positions = self._multi_bisect_right(sub_keys, node.keys)
                node_offset = next_row_start + node.offset
                next_nodes_and_keys.extend([(node_offset + pos, s_keys)
                                            for pos, s_keys in positions])
            keys_at_index = next_nodes_and_keys
        # We should now be at the _LeafNodes
        node_indexes = [idx for idx, s_keys in keys_at_index]
        # TODO: We may *not* want to always read all the nodes in one
        #       big go. Consider setting a max size on this.
        nodes = self._get_leaf_nodes(node_indexes)
        return nodes, keys_at_index
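
    # The descent above hinges on _multi_bisect_right, which splits one
    # sorted list of search keys across a node's separator keys in a single
    # pass. A minimal stand-alone sketch of the idea (assuming the same
    # [(position, keys)] result shape; not this module's implementation):
    #
    #     import bisect
    #
    #     def multi_bisect_right(in_keys, fixed_keys):
    #         positions = {}
    #         for key in in_keys:
    #             pos = bisect.bisect_right(fixed_keys, key)
    #             positions.setdefault(pos, []).append(key)
    #         return sorted(positions.items())
    #
    #     multi_bisect_right([1, 4, 9], [3, 6])
    #     # -> [(0, [1]), (1, [4]), (2, [9])]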

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys supplied. No additional keys will be returned, and every
            key supplied that is in the index will be returned.
        """
        needed_keys = keys
        if not needed_keys:
            return
        nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
        for node_index, sub_keys in nodes_and_keys:
            if not sub_keys:
                continue
            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node.keys:
                    value, refs = node.keys[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)
                    else:
                        yield (self, next_sub_key, value)
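
    # Usage sketch (hypothetical; assumes an index opened without ref lists,
    # so iter_entries yields 3-tuples):
    #
    #     wanted = [('k1',), ('k2',)]
    #     found = dict((key, value)
    #                  for _, key, value in idx.iter_entries(wanted))
    #     absent = set(wanted) - set(found)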

    def get_ancestry(self, keys, ref_list_num, parent_map, missing_keys):
"""Iterate over the given keys and all parents that are found.
1132
1140
:param keys: A sorted list keys whose ancestry we want to return
1135
1143
:param parent_map: keys that we already know the parents to. when
1136
1144
finding new keys we will add nodes to this dict.
1137
:return: [not_present_keys], [sorted_search_tips]
1145
:param missing_keys: keys which are known to be missing in this index.
1146
New entries that are found to be missing will be added to this set.
1147
:return: [not_present_keys], [search_keys]
1138
1148
A dict mapping {key: parent_keys} but including all
1139
1149
parent_keys that we encounter.
1140
1150
not_present_keys are keys where we found the LeafNode, but the key
1141
1151
just isn't there.
1142
search_tips parents that we found, which might exist in this
1152
search_keys parents that we found, which might exist in this
1143
1153
index, but which we were unable to find immediately, callers
1144
1154
should re-query this index for those keys.
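
        Example caller loop (hypothetical; assumes ref list 0 holds the
        parent references)::

            parent_map = {}
            missing_keys = set()
            search_keys = keys
            while search_keys:
                _, search_keys = index.get_ancestry(sorted(search_keys), 0,
                                                    parent_map, missing_keys)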
        """
        # The main trick we are trying to accomplish is that when we find a
        # key listing its parents, we expect that the parent key is also
        # likely to sit on the same page, allowing us to expand parents
        # quickly without suffering the full stack of bisecting, etc.
        nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)

        # These are parent keys which could not be immediately resolved on the
        # page where the child was present. Note that we may already be
        # searching for that key, and it may actually be present [or known
        # missing] on one of the other pages we are reading.
        # TODO:
        #   We could try searching for them in the immediate previous or next
        #   page. If they occur "later" we could put them in a pending lookup
        #   set, and then for each node we read thereafter we could check to
        #   see if they are present.
        #   However, we don't know the impact of keeping this list of things
        #   to check, and the benefit of it.
        # Mostly, it is an idea, one which should be benchmarked.
        parents_not_on_page = set()
        for node_index, sub_keys in nodes_and_keys:
            if not sub_keys:
                continue
            node = nodes[node_index]
            node_keys = node.keys
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node_keys:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node_keys[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)

            # Don't look for things we've already found
            parents_to_check = parents_to_check.difference(parent_map)
            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node_keys:
                        value, refs = node_keys[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
                    else:
                        # This parent is either genuinely missing, or should
                        # be found on another page. The min/max check below
                        # costs a little in the single-pack case, but the
                        # standard 'lots of packs' is going to show a
                        # reasonable improvement from the check, because it
                        # avoids 'going around again' for everything that is
                        # in another index
                        # parents_not_on_page.add(key)
                        # Missing for some reason
                        if key < node.min_key:
                            # in the case of bzr.dev, 3.4k/5.3k misses are
                            # 'earlier' misses (65%)
                            parents_not_on_page.add(key)
                        elif key > node.max_key:
                            # This parent key would be present on a different
                            # LeafNode
                            parents_not_on_page.add(key)
                        else:
                            # assert key != node.min_key and key != node.max_key
                            # If it was going to be present, it would be on
                            # *this* page, so mark it missing.
                            missing_keys.add(key)
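                        # e.g. (hypothetical data): with node.min_key ('m',)
                        # and node.max_key ('t',), a parent ('a',) or ('z',)
                        # may live on another page, so it goes into
                        # parents_not_on_page; a parent ('p',) would have to
                        # be on *this* page, so it is genuinely missing.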
                parents_to_check = next_parents_to_check.difference(parent_map)
                # Might want to do another .difference() from missing_keys
        # parents_not_on_page could have been found on a different page, or be
        # known to be missing. So cull out everything that has already been
        # found.
        search_keys = parents_not_on_page.difference(
            parent_map).difference(missing_keys)
        return missing_keys, search_keys

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.