~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/index.py

Committer: Canonical.com Patch Queue Manager
Date: 2007-06-28 07:08:27 UTC
mfrom: (2553.1.3 integration)
Revision ID: pqm@pqm.ubuntu.com-20070628070827-h5s313dg5tnag9vj

(robertc) Show the names of commit hooks during commit.

files added:
bzrlib/bundle/common.py

bzrlib/bundle/old

bzrlib/bundle/old/send_changeset.py

files removed:
bzrlib/_dirstate_helpers_c.pyx

bzrlib/_dirstate_helpers_py.py

bzrlib/_knit_load_data_c.pyx

bzrlib/_knit_load_data_py.py

bzrlib/benchmarks/bench_dirstate.py

bzrlib/benchmarks/bench_knit.py

bzrlib/bundle/serializer/v4.py

bzrlib/email_message.py

bzrlib/file_names.py

bzrlib/index.py

bzrlib/multiparent.py

bzrlib/pack.py

bzrlib/plugins/multiparent.py

bzrlib/tests/blackbox/test_pack.py

bzrlib/tests/commands

bzrlib/tests/commands/__init__.py

bzrlib/tests/commands/test_branch.py

bzrlib/tests/commands/test_cat.py

bzrlib/tests/commands/test_checkout.py

bzrlib/tests/commands/test_init.py

bzrlib/tests/commands/test_init_repository.py

bzrlib/tests/commands/test_merge.py

bzrlib/tests/commands/test_missing.py

bzrlib/tests/commands/test_pull.py

bzrlib/tests/commands/test_push.py

bzrlib/tests/repository_implementations/test_pack.py

bzrlib/tests/test__dirstate_helpers.py

bzrlib/tests/test_email_message.py

bzrlib/tests/test_file_names.py

bzrlib/tests/test_index.py

bzrlib/tests/test_multiparent.py

bzrlib/tests/test_pack.py

bzrlib/tests/transport_util.py

bzrlib/tests/workingtree_implementations/test_uncommit.py

bzrlib/transport/brokenrename.py

bzrlib/transport/unlistable.py

bzrlib/util/tests

bzrlib/util/tests/__init__.py

bzrlib/util/tests/test_bencode.py

doc/bug_trackers.txt

doc/developers/bundle-format4.txt

doc/developers/diff.txt

doc/developers/indices.txt

doc/developers/repository.txt

doc/developers/status.txt

files renamed:
bzrlib/tests/blackbox/test_submit.py => bzrlib/tests/blackbox/test_bundle.py

files modified:
.bzrignore

Makefile

NEWS

bzrlib/__init__.py

bzrlib/add.py

bzrlib/annotate.py

bzrlib/benchmarks/__init__.py

bzrlib/benchmarks/bench_add.py

bzrlib/benchmarks/bench_bench.py

bzrlib/benchmarks/bench_bundle.py

bzrlib/benchmarks/bench_cache_utf8.py

bzrlib/benchmarks/bench_checkout.py

bzrlib/benchmarks/bench_commit.py

bzrlib/benchmarks/bench_inventory.py

bzrlib/benchmarks/bench_log.py

bzrlib/benchmarks/bench_osutils.py

bzrlib/benchmarks/bench_rocks.py

bzrlib/benchmarks/bench_sftp.py

bzrlib/benchmarks/bench_startup.py

bzrlib/benchmarks/bench_status.py

bzrlib/benchmarks/bench_transform.py

bzrlib/benchmarks/bench_workingtree.py

bzrlib/benchmarks/bench_xml.py

bzrlib/benchmarks/tree_creator/kernel_like.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bundle/__init__.py

bzrlib/bundle/apply_bundle.py

bzrlib/bundle/bundle_data.py

bzrlib/bundle/commands.py

bzrlib/bundle/serializer/__init__.py

bzrlib/bundle/serializer/v08.py

bzrlib/bundle/serializer/v09.py

bzrlib/bzrdir.py

bzrlib/cmd_version_info.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/debug.py

bzrlib/delta.py

bzrlib/deprecated_graph.py

bzrlib/dirstate.py

bzrlib/errors.py

bzrlib/fetch.py

bzrlib/generate_ids.py

bzrlib/graph.py

bzrlib/help_topics.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/knit.py

bzrlib/lock.py

bzrlib/lockdir.py

bzrlib/log.py

bzrlib/memorytree.py

bzrlib/merge.py

bzrlib/merge_directive.py

bzrlib/mutabletree.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/plugin.py

bzrlib/plugins/launchpad/__init__.py

bzrlib/plugins/launchpad/test_register.py

bzrlib/progress.py

bzrlib/remote.py

bzrlib/repofmt/knitrepo.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisiontree.py

bzrlib/sign_my_commits.py

bzrlib/smart/client.py

bzrlib/smart/protocol.py

bzrlib/smart/repository.py

bzrlib/smart/server.py

bzrlib/smart/vfs.py

bzrlib/smtp_connection.py

bzrlib/status.py

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/strace.py

bzrlib/symbol_versioning.py

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_annotate.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_cat_revision.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_command_encoding.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_debug.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_exceptions.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_ignore.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_inventory.py

bzrlib/tests/blackbox/test_join.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_ls.py

bzrlib/tests/blackbox/test_lsprof.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_merge_directive.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_mv.py

bzrlib/tests/blackbox/test_nick.py

bzrlib/tests/blackbox/test_non_ascii.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_remerge.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_remove_tree.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_history.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_serve.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_split.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_tags.py

bzrlib/tests/blackbox/test_testament.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_version.py

bzrlib/tests/blackbox/test_version_info.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/blackbox/test_whoami.py

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_revision_id_to_revno.py

bzrlib/tests/branch_implementations/test_sprout.py

bzrlib/tests/branch_implementations/test_uncommit.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/intertree_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_annotate.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_dirstate.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_help.py

bzrlib/tests/test_http.py

bzrlib/tests/test_info.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_merge_directive.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_read_bundle.py

bzrlib/tests/test_remote.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_smart_transport.py

bzrlib/tests/test_smtp_connection.py

bzrlib/tests/test_source.py

bzrlib/tests/test_strace.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_xml.py

bzrlib/tests/tree_implementations/__init__.py

bzrlib/tests/tree_implementations/test_tree.py

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_commit.py

bzrlib/tests/workingtree_implementations/test_parents.py

bzrlib/tests/workingtree_implementations/test_remove.py

bzrlib/tests/workingtree_implementations/test_smart_add.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/trace.py

bzrlib/transform.py

bzrlib/transport/__init__.py

bzrlib/transport/chroot.py

bzrlib/transport/decorator.py

bzrlib/transport/ftp.py

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/http/_urllib2_wrappers.py

bzrlib/transport/http/response.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/remote.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/uncommit.py

bzrlib/util/bencode.py

bzrlib/version.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/win32utils.py

bzrlib/workingtree.py

bzrlib/workingtree_4.py

bzrlib/xml5.py

bzrlib/xml_serializer.py

doc/README.1st

doc/configuration.txt

doc/developers/HACKING

doc/developers/bundles.txt

doc/developers/container-format.txt

doc/developers/index.txt

doc/developers/performance-roadmap.txt

doc/developers/performance.dot

doc/developers/profiling.txt

doc/developers/scratch.txt

doc/http_smart_server.txt

doc/plugins.txt

setup.py

tools/capture_tree.py

tools/rst2prettyhtml.py

tools/trace-revisions

tools/win32/bzr.iss.cog

Show diffs side-by-side

added added

removed removed

bzrlib/index.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Indexing facilities."""

__all__ = [

'CombinedGraphIndex',

'GraphIndex',

'GraphIndexBuilder',

'GraphIndexPrefixAdapter',

'InMemoryGraphIndex',

]

from cStringIO import StringIO

import re

from bzrlib import errors

# Prefix of the header line that records how many elements each key has.
_OPTION_KEY_ELEMENTS = "key_elements="

# Prefix of the header line that records how many reference lists each node has.
_OPTION_NODE_REFS = "node_ref_lists="

# First line of every serialised graph index.
_SIGNATURE = "Bazaar Graph Index 1\n"

# Matches any character that is not permitted inside a key element.
_whitespace_re = re.compile('[\t\n\x0b\x0c\r\x00 ]')

# Matches any character that is not permitted inside a node value.
_newline_null_re = re.compile('[\n\0]')

class GraphIndexBuilder(object):
    """A builder that can build a GraphIndex.

    The resulting graph has the structure:

    _SIGNATURE OPTIONS NODES NEWLINE
    _SIGNATURE     := 'Bazaar Graph Index 1' NEWLINE
    OPTIONS        := NODE_REF_LISTS KEY_ELEMENTS
    NODE_REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
    KEY_ELEMENTS   := 'key_elements=' DIGITS NEWLINE
    NODES          := NODE*
    NODE           := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
    KEY            := KEY_ELEMENT (NULL KEY_ELEMENT){key_elements - 1}
    KEY_ELEMENT    := Not-whitespace-utf8
    ABSENT         := 'a'
    REFERENCES     := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
    REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
    REFERENCE      := DIGITS  ; digits is the byte offset in the index of the
                              ; referenced key.
    VALUE          := no-newline-no-null-bytes
    """

    def __init__(self, reference_lists=0, key_elements=1):
        """Create a GraphIndex builder.

        :param reference_lists: The number of node references lists for each
            entry.
        :param key_elements: The number of bytestrings in each key.
        """
        self.reference_lists = reference_lists
        # key -> (absent marker, reference lists, value)
        self._nodes = {}
        # nested dicts keyed by successive key elements; only populated when
        # keys have more than one element.
        self._nodes_by_key = {}
        self._key_length = key_elements

    def _check_key(self, key):
        """Raise BadIndexKey if key is not a valid key for this index."""
        if type(key) != tuple:
            raise errors.BadIndexKey(key)
        if self._key_length != len(key):
            raise errors.BadIndexKey(key)
        for element in key:
            if not element or _whitespace_re.search(element) is not None:
                raise errors.BadIndexKey(element)

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        Nothing is stored unless every check passes, so a rejected node
        leaves the builder exactly as it was.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        :raises BadIndexKey: If key, or any referenced key, is not valid.
        :raises BadIndexValue: If value contains a forbidden byte, or the
            number of reference lists differs from the builder's setting.
        :raises BadIndexDuplicateKey: If key has already been added.
        """
        self._check_key(key)
        if _newline_null_re.search(value) is not None:
            raise errors.BadIndexValue(value)
        if len(references) != self.reference_lists:
            raise errors.BadIndexValue(references)
        # Materialise each reference list exactly once: a one-shot iterator
        # would otherwise be exhausted by validation and recorded as empty.
        node_refs = []
        for reference_list in references:
            reference_list = tuple(reference_list)
            for reference in reference_list:
                self._check_key(reference)
            node_refs.append(reference_list)
        node_refs = tuple(node_refs)
        if key in self._nodes and self._nodes[key][0] == '':
            raise errors.BadIndexDuplicateKey(key, self)
        # All validation has passed; only now mutate the builder.
        for reference_list in node_refs:
            for reference in reference_list:
                if reference not in self._nodes:
                    # placeholder for a key that is referenced but not (yet)
                    # added; replaced if the key is added later.
                    self._nodes[reference] = ('a', (), '')
        self._nodes[key] = ('', node_refs, value)
        if self._key_length > 1:
            key_dict = self._nodes_by_key
            if self.reference_lists:
                key_value = key, value, node_refs
            else:
                key_value = key, value
            # possibly should do this on-demand, but it seems likely it is
            # always wanted
            # For a key of (foo, bar, baz) create
            # _nodes_by_key[foo][bar][baz] = key_value
            for subkey in key[:-1]:
                key_dict = key_dict.setdefault(subkey, {})
            key_dict[key[-1]] = key_value

    def finish(self):
        """Serialise every node added so far.

        :return: A StringIO, positioned at its start, holding the signature,
            the two option lines, one line per node in sorted key order and
            a terminating newline.
        :raises BzrError: If reference lists are in use and the serialised
            length differs from the length computed while sizing the
            reference offsets.
        """
        lines = [_SIGNATURE]
        lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
        lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
        prefix_length = sum(len(x) for x in lines)
        # references are byte offsets. To avoid having to do nasty
        # polynomial work to resolve offsets (references to later in the
        # file cannot be determined until all the inbetween references have
        # been calculated too) we pad the offsets with 0's to make them be
        # of consistent length. Using binary offsets would break the trivial
        # file parsing.
        # to calculate the width of zero's needed we do three passes:
        # one to gather all the non-reference data and the number of references.
        # one to pad all the data with reference-length and determine entry
        # addresses.
        # One to serialise.

        # forward sorted by key. In future we may consider topological sorting,
        # at the cost of table scans for direct lookup, or a second index for
        # direct lookup
        nodes = sorted(self._nodes.items())
        # if we do not prepass, we don't know how long it will be up front.
        expected_bytes = None
        # we only need to pre-pass if we have reference lists at all.
        if self.reference_lists:
            key_offset_info = []
            non_ref_bytes = prefix_length
            total_references = 0
            # TODO use simple multiplication for the constants in this loop.
            for key, (absent, references, value) in nodes:
                # record the offset known *so far* for this key:
                # the non reference bytes to date, and the total references to
                # date - saves reaccumulating on the second pass
                key_offset_info.append((key, non_ref_bytes, total_references))
                # key is variable length tuple, \x00 between elements
                non_ref_bytes += sum(len(element) for element in key)
                if self._key_length > 1:
                    non_ref_bytes += self._key_length - 1
                # value is literal bytes, there are 3 null's, 1 NL.
                non_ref_bytes += len(value) + 3 + 1
                if absent:
                    # one byte for the absent marker; absent nodes carry no
                    # reference lists and therefore no tabs.
                    non_ref_bytes += 1
                else:
                    # (ref_lists -1) tabs
                    non_ref_bytes += self.reference_lists - 1
                    # (ref-1 cr's per ref_list)
                    for ref_list in references:
                        # how many references across the whole file?
                        total_references += len(ref_list)
                        # accrue reference separators
                        if ref_list:
                            non_ref_bytes += len(ref_list) - 1
            # how many digits are needed to represent the total byte count?
            digits = 1
            possible_total_bytes = non_ref_bytes + total_references*digits
            while 10 ** digits < possible_total_bytes:
                digits += 1
                possible_total_bytes = non_ref_bytes + total_references*digits
            expected_bytes = possible_total_bytes + 1 # terminating newline
            # resolve key addresses.
            key_addresses = {}
            for key, non_ref_bytes, total_references in key_offset_info:
                key_addresses[key] = non_ref_bytes + total_references*digits
            # serialise
            format_string = '%%0%sd' % digits
        for key, (absent, references, value) in nodes:
            flattened_references = []
            for ref_list in references:
                ref_addresses = []
                for reference in ref_list:
                    ref_addresses.append(format_string % key_addresses[reference])
                flattened_references.append('\r'.join(ref_addresses))
            string_key = '\x00'.join(key)
            lines.append("%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
                '\t'.join(flattened_references), value))
        lines.append('\n')
        result = StringIO(''.join(lines))
        if expected_bytes and len(result.getvalue()) != expected_bytes:
            raise errors.BzrError('Failed index creation. Internal error:'
                ' mismatched output length and expected length: %d %d' %
                (len(result.getvalue()), expected_bytes))
        # hand back the buffer already built rather than joining the whole
        # index a second time; nothing has been read from it, so it is still
        # positioned at the start.
        return result

206

207

208

class GraphIndex(object):
    """An index for data with embedded graphs.

    The index maps keys to a list of key reference lists, and a value.
    Each node has the same number of key reference lists. Each key reference
    list can be empty or an arbitrary length. The value is an opaque NULL
    terminated string without any newlines. The storage of the index is
    hidden in the interface: keys and key references are always tuples of
    bytestrings, never the internal representation (e.g. dictionary offsets).

    It is presumed that the index will not be mutated - it is static data.

    The first query of any kind reads and parses the entire index and keeps
    the result in memory; later queries are answered from that copy.
    XXX: Reading the whole index up front must be fixed before the index is
    suitable for production use. :XXX
    """

    def __init__(self, transport, name):
        """Open an index called name on transport.

        :param transport: A bzrlib.transport.Transport.
        :param name: A path to provide to transport API calls.
        """
        self._transport = transport
        self._name = name
        # All three stay None until _buffer_all has parsed the index.
        self._nodes = None
        self._keys_by_offset = None
        self._nodes_by_key = None

    def _buffer_all(self):
        """Buffer all the index data.

        Mutates self._nodes, self._keys_by_offset, self._nodes_by_key and
        self._keys. They are only assigned once the whole index has been
        parsed and checked, so a failed load leaves self._nodes as None and
        the next query attempts the load again.

        :raises BadIndexData: If a node line does not have the expected
            number of NULL separated fields, or the index does not contain
            exactly one empty trailer line.
        """
        stream = self._transport.get(self._name)
        self._read_prefix(stream)
        expected_elements = 3 + self._key_length
        # raw data keyed by offset
        keys_by_offset = {}
        # ready-to-return key:value or key:value, node_ref_lists
        nodes = {}
        nodes_by_key = {}
        trailers = 0
        pos = stream.tell()
        for line in stream.readlines():
            if line == '\n':
                trailers += 1
                continue
            elements = line.split('\0')
            if len(elements) != expected_elements:
                raise errors.BadIndexData(self)
            # keys are tuples
            key = tuple(elements[:self._key_length])
            absent, references, value = elements[-3:]
            value = value[:-1] # remove the newline
            ref_lists = []
            for ref_string in references.split('\t'):
                ref_lists.append(tuple([
                    int(ref) for ref in ref_string.split('\r') if ref
                    ]))
            ref_lists = tuple(ref_lists)
            keys_by_offset[pos] = (key, absent, ref_lists, value)
            pos += len(line)
        for key, absent, references, value in keys_by_offset.itervalues():
            if absent:
                continue
            # resolve references:
            if self.node_ref_lists:
                node_refs = []
                for ref_list in references:
                    node_refs.append(
                        tuple([keys_by_offset[ref][0] for ref in ref_list]))
                node_value = (value, tuple(node_refs))
            else:
                node_value = value
            nodes[key] = node_value
            if self._key_length > 1:
                key_dict = nodes_by_key
                if self.node_ref_lists:
                    key_value = key, node_value[0], node_value[1]
                else:
                    key_value = key, node_value
                # possibly should do this on-demand, but it seems likely it is
                # always wanted
                # For a key of (foo, bar, baz) create
                # _nodes_by_key[foo][bar][baz] = key_value
                for subkey in key[:-1]:
                    key_dict = key_dict.setdefault(subkey, {})
                key_dict[key[-1]] = key_value
        if trailers != 1:
            # there must be one line - the empty trailer line.
            raise errors.BadIndexData(self)
        # Everything checked out: publish the parsed data.
        self._keys_by_offset = keys_by_offset
        self._nodes = nodes
        self._nodes_by_key = nodes_by_key
        self._keys = set(nodes)

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value) or
            (index, key, value, reference_lists), where index is this
            object. The former tuple is used when there are no reference
            lists in the index, making the API compatible with simple
            key:value index types. There is no defined order for the result
            iteration - it will be in the most efficient order for the index.
        """
        if self._nodes is None:
            self._buffer_all()
        if self.node_ref_lists:
            for key, (value, node_ref_lists) in self._nodes.iteritems():
                yield self, key, value, node_ref_lists
        else:
            for key, value in self._nodes.iteritems():
                yield self, key, value

    def _read_prefix(self, stream):
        """Read the signature and option lines from the start of stream.

        Sets self.node_ref_lists and self._key_length from the option lines.

        :param stream: A file-like object positioned at the start of the
            index.
        :raises BadIndexFormatSignature: If the signature does not match.
        :raises BadIndexOptions: If an option line is missing or its value
            is not an integer.
        """
        signature = stream.read(len(self._signature()))
        if signature != self._signature():
            raise errors.BadIndexFormatSignature(self._name, GraphIndex)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise errors.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise errors.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
        except ValueError:
            raise errors.BadIndexOptions(self)

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys supplied. No additional keys will be returned, and every
            key supplied that is in the index will be returned.
        """
        keys = set(keys)
        if not keys:
            return
        if self._nodes is None:
            self._buffer_all()
        keys = keys.intersection(self._keys)
        if self.node_ref_lists:
            for key in keys:
                value, node_refs = self._nodes[key]
                yield self, key, value, node_refs
        else:
            for key in keys:
                yield self, key, self._nodes[key]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        :raises BadIndexKey: If a prefix starts with None or is not the same
            length as the keys of this index.
        """
        keys = set(keys)
        if not keys:
            return
        # load data - also finds key lengths
        if self._nodes is None:
            self._buffer_all()
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                if key not in self._nodes:
                    # a non-existant lookup matches nothing, exactly as in
                    # the multi-element case below.
                    continue
                if self.node_ref_lists:
                    value, node_refs = self._nodes[key]
                    yield self, key, value, node_refs
                else:
                    yield self, key, self._nodes[key]
            return
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = self._nodes_by_key
            elements = list(key)
            # find the subdict whose contents should be returned.
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existant lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    value = key_dict.itervalues().next()
                    if type(value) == dict:
                        # push keys
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            # each value is the key:value:node refs tuple
                            # ready to yield.
                            yield (self, ) + value
            else:
                # the last thing looked up was a terminal element
                yield (self, ) + key_dict

    def _signature(self):
        """The file signature for this index type."""
        return _SIGNATURE

    def validate(self):
        """Validate that everything in the index can be accessed."""
        # iter_all validates completely at the moment, so just do that.
        for node in self.iter_all_entries():
            pass

444

445

446

class CombinedGraphIndex(object):
    """Present several graph indices as though they were one.

    The child indices must offer the GraphIndex query methods and are
    presumed to hold static data.

    Every query walks the children in list order: first the first index,
    then the second, and so on. List order can therefore matter a great deal
    for performance - e.g. an index on local disk should be listed ahead of
    one on a remote server.
    """

    def __init__(self, indices):
        """Create a CombinedGraphIndex backed by indices.

        :param indices: An ordered list of indices to query for data.
        """
        self._indices = indices

    def insert_index(self, pos, index):
        """Add another index to the list that queries are run against.

        :param pos: The list position at which to insert the index.
        :param index: The index to insert.
        """
        self._indices.insert(pos, index)

    def iter_all_entries(self):
        """Iterate over every entry held by any child index.

        A key present in more than one child is presumed to carry the same
        value everywhere and is reported once only, from the first child
        that yields it.

        :return: An iterable of the node tuples yielded by the children,
            passed through unchanged; element 1 of each tuple is taken to be
            the key. There is no defined order beyond children being
            consulted in list order.
        """
        already_yielded = set()
        for child in self._indices:
            for entry in child.iter_all_entries():
                entry_key = entry[1]
                if entry_key in already_yielded:
                    continue
                yield entry
                already_yielded.add(entry_key)

    def iter_entries(self, keys):
        """Iterate over the entries for the requested keys.

        A key present in more than one child is presumed to carry the same
        value everywhere and is reported once only: once a child has
        supplied a key, later children are not asked for it.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of the node tuples yielded by the children,
            passed through unchanged; element 1 of each tuple is taken to be
            the key. There is no defined order beyond children being
            consulted in list order.
        """
        outstanding = set(keys)
        for child in self._indices:
            if not outstanding:
                # everything asked for has been found (or nothing was asked)
                break
            for entry in child.iter_entries(outstanding):
                outstanding.remove(entry[1])
                yield entry

    def iter_entries_prefix(self, keys):
        """Iterate over the entries whose keys match the given prefixes.

        A key present in more than one child is presumed to carry the same
        value everywhere and is reported once only.

        Matching works on whole key elements, never on part of an element's
        bytestring. e.g. with the keys ('foo', 'bar') and ('foobar', 'gam'),
        a prefix search for ('foo', None) returns only the former.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        prefixes = set(keys)
        if len(prefixes) == 0:
            return
        already_yielded = set()
        for child in self._indices:
            for entry in child.iter_entries_prefix(prefixes):
                entry_key = entry[1]
                if entry_key not in already_yielded:
                    already_yielded.add(entry_key)
                    yield entry

    def validate(self):
        """Validate that everything in every child index can be accessed."""
        for child in self._indices:
            child.validate()

545

546

547

class InMemoryGraphIndex(GraphIndexBuilder):

548

"""A GraphIndex which operates entirely out of memory and is mutable.

549

550

This is designed to allow the accumulation of GraphIndex entries during a

551

single write operation, where the accumulated entries need to be immediately

552

available - for example via a CombinedGraphIndex.

553

"""

554

555

def add_nodes(self, nodes):
    """Add a batch of nodes to the index, one add_node call per entry.

    :param nodes: An iterable of (key, value, node_refs) entries when this
        index has reference lists, otherwise of (key, value) entries.
    """
    if not self.reference_lists:
        for entry_key, entry_value in nodes:
            self.add_node(entry_key, entry_value)
        return
    for entry_key, entry_value, entry_refs in nodes:
        self.add_node(entry_key, entry_value, entry_refs)

566

567

def iter_all_entries(self):
    """Iterate over every node that has actually been added.

    Keys that are only known because another node references them (absent
    placeholders) are skipped.

    :return: An iterable of (index, key, value, reference_lists) when this
        index has reference lists, otherwise of (index, key, value); index
        is this object. There is no defined order for the result iteration -
        it will be in the most efficient order for the index (in this case
        dictionary hash order).
    """
    include_refs = bool(self.reference_lists)
    for key, node in self._nodes.iteritems():
        absent, references, value = node
        if absent:
            continue
        if include_refs:
            yield self, key, value, references
        else:
            yield self, key, value

582

583

def iter_entries(self, keys):

584

"""Iterate over keys within the index.

585

586

:param keys: An iterable providing the keys to be retrieved.

587

:return: An iterable of (key, reference_lists, value). There is no

588

defined order for the result iteration - it will be in the most

589

efficient order for the index (keys iteration order in this case).

590

"""

591

keys = set(keys)

592

if self.reference_lists:

593

for key in keys.intersection(self._nodes):

594

node = self._nodes[key]

595

if not node[0]:

596

yield self, key, node[2], node[1]

597

else:

598

for key in keys.intersection(self._nodes):

599

node = self._nodes[key]

600

if not node[0]:

601

yield self, key, node[2]

602

603

def iter_entries_prefix(self, keys):

604

"""Iterate over keys within the index using prefix matching.

605

606

Prefix matching is applied within the tuple of a key, not to within

607

the bytestring of each key element. e.g. if you have the keys ('foo',

608

'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then

609

only the former key is returned.

610

611

:param keys: An iterable providing the key prefixes to be retrieved.

612

Each key prefix takes the form of a tuple the length of a key, but

613

with the last N elements 'None' rather than a regular bytestring.

614

The first element cannot be 'None'.

615

:return: An iterable as per iter_all_entries, but restricted to the

616

keys with a matching prefix to those supplied. No additional keys

617

will be returned, and every match that is in the index will be

618

returned.

619

"""

620

# XXX: To much duplication with the GraphIndex class; consider finding

621

# a good place to pull out the actual common logic.

622

keys = set(keys)

623

if not keys:

624

return

625

if self._key_length == 1:

626

for key in keys:

627

# sanity check

628

if key[0] is None:

629

raise errors.BadIndexKey(key)

630

if len(key) != self._key_length:

631

raise errors.BadIndexKey(key)

632

node = self._nodes[key]

633

if node[0]:

634

continue

635

if self.reference_lists:

636

yield self, key, node[2], node[1]

637

else:

638

yield self ,key, node[2]

639

return

640

for key in keys:

641

# sanity check

642

if key[0] is None:

643

raise errors.BadIndexKey(key)

644

if len(key) != self._key_length:

645

raise errors.BadIndexKey(key)

646

# find what it refers to:

647

key_dict = self._nodes_by_key

648

elements = list(key)

649

# find the subdict to return

650

try:

651

while len(elements) and elements[0] is not None:

652

key_dict = key_dict[elements[0]]

653

elements.pop(0)

654

except KeyError:

655

# a non-existant lookup.

656

continue

657

if len(elements):

658

dicts = [key_dict]

659

while dicts:

660

key_dict = dicts.pop(-1)

661

# can't be empty or would not exist

662

item, value = key_dict.iteritems().next()

663

if type(value) == dict:

664

# push keys

665

dicts.extend(key_dict.itervalues())

666

else:

667

# yield keys

668

for value in key_dict.itervalues():

669

yield (self, ) + value

670

else:

671

yield (self, ) + key_dict

672

673

def validate(self):

674

"""In memory index's have no known corruption at the moment."""

675

676

677

class GraphIndexPrefixAdapter(object):
    """An adapter between GraphIndex with different key lengths.

    Queries against this will emit queries against the adapted Graph with the
    prefix added, queries for all items use iter_entries_prefix. The returned
    nodes will have their keys and node references adjusted to remove the
    prefix. Finally, an add_nodes_callback can be supplied - when called the
    nodes and references being added will have prefix prepended.
    """

    def __init__(self, adapted, prefix, missing_key_length, add_nodes_callback=None):
        """Construct an adapter against adapted with prefix.

        :param adapted: The index that queries are forwarded to.
        :param prefix: A tuple of key elements prepended to every key and
            reference on the way in and stripped on the way out.
        :param missing_key_length: How many None elements to append to
            prefix to build the prefix query used by iter_all_entries.
        :param add_nodes_callback: Callable given the list of translated
            nodes by add_nodes. It must be supplied for add_nodes and
            add_node to work.
        """
        self.adapted = adapted
        # prefix padded with None wildcards, for iter_entries_prefix queries
        self.prefix = prefix + (None,)*missing_key_length
        self.prefix_key = prefix
        self.prefix_len = len(prefix)
        self.add_nodes_callback = add_nodes_callback

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, value, node_refs) entries to add,
            or of (key, value) entries when there are no reference lists.
            The prefix is prepended to every key and to every referenced key
            before the translated list is handed to add_nodes_callback.
        """
        # save nodes in case its an iterator
        nodes = tuple(nodes)
        translated_nodes = []
        try:
            for (key, value, node_refs) in nodes:
                adjusted_references = (
                    tuple(tuple(self.prefix_key + ref_node for ref_node in ref_list)
                        for ref_list in node_refs))
                translated_nodes.append((self.prefix_key + key, value,
                    adjusted_references))
        except ValueError:
            # XXX: TODO add an explicit interface for getting the reference list
            # status, to handle this bit of user-friendliness in the API more
            # explicitly.
            # Unpacking into three names failed, so treat the input as
            # (key, value) pairs. Start from a clean list so nothing
            # translated by the first attempt can be emitted twice.
            translated_nodes = []
            for (key, value) in nodes:
                translated_nodes.append((self.prefix_key + key, value))
        self.add_nodes_callback(translated_nodes)

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain NUL or newline.
        """
        self.add_nodes(((key, value, references), ))

    def _strip_prefix(self, an_iter):
        """Strip prefix data from nodes and return it.

        :param an_iter: An iterable of nodes from the adapted index, either
            (index, key, value, reference_lists) or, for an index without
            reference lists, (index, key, value).
        :return: A generator of nodes of the same shape with the prefix
            removed from the key and from every referenced key.
        :raises errors.BadIndexData: If a key or a referenced key does not
            start with the prefix.
        """
        for node in an_iter:
            # cross checks
            if node[1][:self.prefix_len] != self.prefix_key:
                raise errors.BadIndexData(self)
            stripped_key = node[1][self.prefix_len:]
            if len(node) == 3:
                # No reference lists on this node: only the key needs
                # adjusting.
                yield node[0], stripped_key, node[2]
                continue
            stripped_lists = []
            for ref_list in node[3]:
                stripped_refs = []
                for ref_node in ref_list:
                    if ref_node[:self.prefix_len] != self.prefix_key:
                        raise errors.BadIndexData(self)
                    stripped_refs.append(ref_node[self.prefix_len:])
                stripped_lists.append(tuple(stripped_refs))
            yield node[0], stripped_key, node[2], tuple(stripped_lists)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        iter_all_entries is implemented against the adapted index using
        iter_entries_prefix.

        :return: An iterable of the adapted index's nodes, (index, key,
            value, reference_lists) or (index, key, value), with the prefix
            stripped from keys and references. Iteration order is whatever
            the adapted index produces.
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix([self.prefix]))

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved, without
            the prefix; it is added before querying the adapted index.
        :return: An iterable of the adapted index's nodes, (index, key,
            value, reference_lists) or (index, key, value), with the prefix
            stripped from keys and references. Iteration order is whatever
            the adapted index produces.
        """
        return self._strip_prefix(self.adapted.iter_entries(
            self.prefix_key + key for key in keys))

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix(
            self.prefix_key + key for key in keys))

    def validate(self):
        """Call the adapted's validate."""
        self.adapted.validate()

Older »