~bzr-pqm/bzr/bzr.dev

Viewing changes to bzrlib/btree_index.py

Committer: John Arbash Meinel
Date: 2009-08-07 18:52:37 UTC
mto: This revision was merged to the branch mainline in revision 4613.
Revision ID: john@arbash-meinel.com-20090807185237-itdqujj0f88udx1p

Start adding some tests.
Extend the code a little bit, to make parent_map and missing_keys be state
that is passed in and then mutated, rather than one and not the other.
We need the state since we are walking more than the minimal, though passing
in the 'goal' of the function does seem a little silly.

files modified:
bzrlib/btree_index.py

bzrlib/tests/test_btree_index.py

Show diffs side-by-side

added added

removed removed

bzrlib/btree_index.py

1045

output.append(cur_out)

1046

return output

1047

1048

def _walk_through_internal_nodes(self, keys):

1049

"""Take the given set of keys, and find the corresponding LeafNodes.

1050

1051

:param keys: An unsorted iterable of keys to search for

1052

:return: (nodes, index_and_keys)

1053

nodes is a dict mapping {index: LeafNode}

1054

keys_at_index is a list of tuples of [(index, [keys for Leaf])]

1055

"""

1056

# 6 seconds spent in miss_torture using the sorted() line.

1057

# Even with out of order disk IO it seems faster not to sort it when

1058

# large queries are being made.

1059

keys_at_index = [(0, sorted(keys))]

1060

1061

for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):

1062

node_indexes = [idx for idx, s_keys in keys_at_index]

1063

nodes = self._get_internal_nodes(node_indexes)

1064

1065

next_nodes_and_keys = []

1066

for node_index, sub_keys in keys_at_index:

1067

node = nodes[node_index]

1068

positions = self._multi_bisect_right(sub_keys, node.keys)

1069

node_offset = next_row_start + node.offset

1070

next_nodes_and_keys.extend([(node_offset + pos, s_keys)

1071

for pos, s_keys in positions])

1072

keys_at_index = next_nodes_and_keys

1073

# We should now be at the _LeafNodes

1074

node_indexes = [idx for idx, s_keys in keys_at_index]

1075

1076

# TODO: We may *not* want to always read all the nodes in one

1077

# big go. Consider setting a max size on this.

1078

nodes = self._get_leaf_nodes(node_indexes)

1079

return nodes, keys_at_index

1080

1048

1081

def iter_entries(self, keys):

1049

1082

"""Iterate over keys within the index.

1050

1083

1088

1121

needed_keys = keys

1089

1122

if not needed_keys:

1090

1123

return

1091

# 6 seconds spent in miss_torture using the sorted() line.

1092

# Even with out of order disk IO it seems faster not to sort it when

1093

# large queries are being made.

1094

needed_keys = sorted(needed_keys)

1095

1096

nodes_and_keys = [(0, needed_keys)]

1097

1098

for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):

1099

node_indexes = [idx for idx, s_keys in nodes_and_keys]

1100

nodes = self._get_internal_nodes(node_indexes)

1101

1102

next_nodes_and_keys = []

1103

for node_index, sub_keys in nodes_and_keys:

1104

node = nodes[node_index]

1105

positions = self._multi_bisect_right(sub_keys, node.keys)

1106

node_offset = next_row_start + node.offset

1107

next_nodes_and_keys.extend([(node_offset + pos, s_keys)

1108

for pos, s_keys in positions])

1109

nodes_and_keys = next_nodes_and_keys

1110

# We should now be at the _LeafNodes

1111

node_indexes = [idx for idx, s_keys in nodes_and_keys]

1112

1113

# TODO: We may *not* want to always read all the nodes in one

1114

# big go. Consider setting a max size on this.

1115

1116

nodes = self._get_leaf_nodes(node_indexes)

1124

nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)

1117

1125

for node_index, sub_keys in nodes_and_keys:

1118

1126

if not sub_keys:

1119

1127

continue

1126

1134

else:

1127

1135

yield (self, next_sub_key, value)

1128

1136

1129

def get_ancestry(self, keys, ref_list_num, parent_map):

1137

def get_ancestry(self, keys, ref_list_num, parent_map, missing_keys):

1130

1138

"""Iterate over the given keys and all parents that are found.

1131

1139

1132

1140

:param keys: A sorted list keys whose ancestry we want to return

1134

1142

care about.

1135

1143

:param parent_map: keys that we already know the parents to. when

1136

1144

finding new keys we will add nodes to this dict.

1137

:return: [not_present_keys], [sorted_search_tips]

1145

:param missing_keys: keys which are known to be missing in this index.

1146

New entries that are found to be missing will be added to this set.

1147

:return: [not_present_keys], [search_keys]

1138

1148

A dict mapping {key: parent_keys} but including all

1139

1149

parent_keys that we encounter.

1140

1150

not_present_keys are keys where we found the LeafNode, but the key

1141

1151

just isn't there.

1142

search_tips parents that we found, which might exist in this

1152

search_keys parents that we found, which might exist in this

1143

1153

index, but which we were unable to find immediately, callers

1144

1154

should re-query this index for those keys.

1145

1155

"""

1156

1166

# key listing its parents, we expect that the parent key is also likely

1157

1167

# to sit on the same page. Allowing us to expand parents quickly

1158

1168

# without suffering the full stack of bisecting, etc.

1159

nodes_and_keys = [(0, sorted(keys))]

1160

1161

# Search through the internal nodes, until we get to the leaf nodes

1162

for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):

1163

node_indexes = [idx for idx, s_keys in nodes_and_keys]

1164

nodes = self._get_internal_nodes(node_indexes)

1165

1166

next_nodes_and_keys = []

1167

for node_index, sub_keys in nodes_and_keys:

1168

node = nodes[node_index]

1169

positions = self._multi_bisect_right(sub_keys, node.keys)

1170

node_offset = next_row_start + node.offset

1171

next_nodes_and_keys.extend([(node_offset + pos, s_keys)

1172

for pos, s_keys in positions])

1173

nodes_and_keys = next_nodes_and_keys

1174

# We should now be at the _LeafNodes

1175

node_indexes = [idx for idx, s_keys in nodes_and_keys]

1176

1177

# TODO: We may *not* want to always read all the nodes in one

1178

# big go. Consider setting a max size on this.

1179

1180

# Should missing_keys be a set?

1181

missing_keys = set()

1169

nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)

1170

1182

1171

# These are parent keys which could not be immediately resolved on the

1183

1172

# page where the child was present. Note that we may already be

1184

1173

# searching for that key, and it may actually be present [or known

1201

1190

# Mostly, it is an idea, one which should be benchmarked.

1202

1191

parents_not_on_page = set()

1203

1192

1204

nodes = self._get_leaf_nodes(node_indexes)

1205

1193

for node_index, sub_keys in nodes_and_keys:

1206

1194

if not sub_keys:

1207

1195

continue

1211

1199

node_keys = node.keys

1212

1200

parents_to_check = set()

1213

1201

for next_sub_key in sub_keys:

1214

try:

1202

if next_sub_key not in node_keys:

1203

# This one is just not present in the index at all

1204

missing_keys.add(next_sub_key)

1205

else:

1215

1206

value, refs = node_keys[next_sub_key]

1216

except KeyError:

1217

# This one is just not present

1218

missing_keys.add(next_sub_key)

1219

else:

1220

1207

parent_keys = refs[ref_list_num]

1221

1208

parent_map[next_sub_key] = parent_keys

1222

1209

parents_to_check.update(parent_keys)

1244

1231

# of packs' is going to show a reasonable improvement

1245

1232

# from the check, because it avoids 'going around

1246

1233

# again' for everything that is in another index

1247

parents_not_on_page.add(key)

1248

# # Missing for some reason

1249

# if key < node.min_key:

1250

# # in the case of bzr.dev, 3.4k/5.3k misses are

1251

# # 'earlier' misses (65%)

1252

# parents_not_on_page.add(key)

1253

# elif key > node.max_key:

1254

# # This parent key would be present on a different

1255

# # LeafNode

1256

# parents_not_on_page.add(key)

1257

# else:

1258

# # assert key != node.min_key and key != node.max_key

1259

# # If it was going to be present, it would be on

1260

# # *this* page, so mark it missing.

1261

# missing_keys.add(key)

1234

# parents_not_on_page.add(key)

1235

# Missing for some reason

1236

if key < node.min_key:

1237

# in the case of bzr.dev, 3.4k/5.3k misses are

1238

# 'earlier' misses (65%)

1239

parents_not_on_page.add(key)

1240

elif key > node.max_key:

1241

# This parent key would be present on a different

1242

# LeafNode

1243

parents_not_on_page.add(key)

1244

else:

1245

# assert key != node.min_key and key != node.max_key

1246

# If it was going to be present, it would be on

1247

# *this* page, so mark it missing.

1248

missing_keys.add(key)

1262

1249

parents_to_check = next_parents_to_check.difference(parent_map)

1263

1250

# Might want to do another .difference() from missing_keys

1264

1251

# parents_not_on_page could have been found on a different page, or be

1265

1252

# known to be missing. So cull out everything that has already been

1266

1253

# found.

1267

search_tips = parents_not_on_page.difference(

1254

search_keys = parents_not_on_page.difference(

1268

1255

parent_map).difference(missing_keys)

1269

return missing_keys, search_tips

1256

return search_keys

1270

1257

1271

1258

def iter_entries_prefix(self, keys):

1272

1259

"""Iterate over keys within the index using prefix matching.

Older »