# Source: ~bzr-pqm/bzr/bzr.dev : revision 2624

1

2

#

3

# This program is free software; you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License as published by

5

# the Free Software Foundation; either version 2 of the License, or

6

# (at your option) any later version.

7

#

8

# This program is distributed in the hope that it will be useful,

9

# but WITHOUT ANY WARRANTY; without even the implied warranty of

10

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

# GNU General Public License for more details.

12

#

13

# You should have received a copy of the GNU General Public License

14

# along with this program; if not, write to the Free Software

15

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

16

17

"""Indexing facilities."""

18

19

__all__ = [

20

'CombinedGraphIndex',

21

'GraphIndex',

22

'GraphIndexBuilder',

23

'InMemoryGraphIndex',

24

]

25

26

from cStringIO import StringIO

27

import re

28

29

from bzrlib import errors

30

31

# Prefix of the options line; the text after it is the number of reference
# lists stored for every node.
_OPTION_NODE_REFS = "node_ref_lists="
# First line of a serialised index.
_SIGNATURE = "Bazaar Graph Index 1\n"


# Matches any single character that may not appear in a key: tab, newline,
# vertical tab, form feed, carriage return, NUL or space.
_whitespace_re = re.compile('[\t\n\x0b\x0c\r\x00 ]')
# Matches the two characters that may not appear in a value: newline and NUL.
_newline_null_re = re.compile('[\n\0]')

37

38

39

class GraphIndexBuilder(object):
    """A builder that can build a GraphIndex.

    The resulting graph has the structure:

    _SIGNATURE OPTIONS NODES NEWLINE
    _SIGNATURE     := 'Bazaar Graph Index 1' NEWLINE
    OPTIONS        := 'node_ref_lists=' DIGITS NEWLINE
    NODES          := NODE*
    NODE           := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
    KEY            := Not-whitespace-utf8
    ABSENT         := 'a'
    REFERENCES     := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
    REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
    REFERENCE      := DIGITS  ; digits is the byte offset in the index of the
                              ; referenced key.
    VALUE          := no-newline-no-null-bytes
    """

    def __init__(self, reference_lists=0):
        """Create a GraphIndex builder.

        :param reference_lists: The number of node references lists for each
            entry.
        """
        self.reference_lists = reference_lists
        # Maps key -> (absent marker, reference lists, value).  The absent
        # marker is 'a' for keys that have only been referenced so far and
        # '' for keys that were really added.
        self._nodes = {}

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        All validation is done before the builder is modified, so a call
        that raises leaves the builder exactly as it was.

        :param key: The key. keys must be non-empty, whitespace-free utf8.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain NUL or newline.
        :param references: A sized collection (len() is taken) of iterables
            of keys. Each is a reference to another key. Referenced keys
            that have not been added yet are recorded as absent nodes.
        :raises BadIndexKey: If key, or any referenced key, is empty or
            contains whitespace.
        :raises BadIndexValue: If value contains NUL or newline, or if the
            number of reference lists does not match the builder.
        :raises BadIndexDuplicateKey: If key was already added as a present
            (non-absent) node.
        """
        if not key or _whitespace_re.search(key) is not None:
            raise errors.BadIndexKey(key)
        if _newline_null_re.search(value) is not None:
            raise errors.BadIndexValue(value)
        if len(references) != self.reference_lists:
            raise errors.BadIndexValue(references)
        node_refs = []
        absent_references = []
        for reference_list in references:
            # Materialise first: a one-shot iterator would otherwise be
            # exhausted by the validation loop and stored as empty.
            reference_list = tuple(reference_list)
            for reference in reference_list:
                # References name other keys, so the key rules apply.
                if (not reference
                    or _whitespace_re.search(reference) is not None):
                    raise errors.BadIndexKey(reference)
                if reference not in self._nodes:
                    absent_references.append(reference)
            node_refs.append(reference_list)
        if key in self._nodes and self._nodes[key][0] == '':
            raise errors.BadIndexDuplicateKey(key, self)
        # Only now, with every check passed, touch the node table.
        for reference in absent_references:
            self._nodes[reference] = ('a', (), '')
        self._nodes[key] = ('', tuple(node_refs), value)

    def finish(self):
        """Serialise all nodes added so far.

        :return: A StringIO holding the serialised index.
        :raises BzrError: If reference lists are in use and the serialised
            length differs from the length computed while resolving the
            reference offsets (an internal consistency check).
        """
        lines = [_SIGNATURE]
        lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
        prefix_length = len(lines[0]) + len(lines[1])
        # references are byte offsets. To avoid having to do nasty
        # polynomial work to resolve offsets (references to later in the
        # file cannot be determined until all the inbetween references have
        # been calculated too) we pad the offsets with 0's to make them be
        # of consistent length. Using binary offsets would break the trivial
        # file parsing.
        # to calculate the width of zero's needed we do three passes:
        # one to gather all the non-reference data and the number of
        # references.
        # one to pad all the data with reference-length and determine entry
        # addresses.
        # One to serialise.

        # forward sorted by key. In future we may consider topological
        # sorting, at the cost of table scans for direct lookup, or a second
        # index for direct lookup
        nodes = sorted(self._nodes.items())
        # if we do not prepass, we don't know how long it will be up front.
        expected_bytes = None
        # Only filled in (and only consulted) when reference lists exist.
        key_addresses = {}
        format_string = None
        # we only need to pre-pass if we have reference lists at all.
        if self.reference_lists:
            key_offset_info = []
            non_ref_bytes = prefix_length
            total_references = 0
            # TODO use simple multiplication for the constants in this loop.
            for key, (absent, references, value) in nodes:
                # record the offset known *so far* for this key:
                # the non reference bytes to date, and the total references
                # to date - saves reaccumulating on the second pass
                key_offset_info.append((key, non_ref_bytes, total_references))
                # key is literal, value is literal, there are 3 null's, 1 NL
                non_ref_bytes += len(key) + len(value) + 3 + 1
                if absent:
                    # one byte for the absent marker; absent nodes carry no
                    # reference lists and therefore no separators.
                    non_ref_bytes += 1
                else:
                    # (ref_lists -1) tabs
                    non_ref_bytes += self.reference_lists - 1
                    # (ref-1 cr's per ref_list)
                    for ref_list in references:
                        # how many references across the whole file?
                        total_references += len(ref_list)
                        # accrue reference separators
                        if ref_list:
                            non_ref_bytes += len(ref_list) - 1
            # how many digits are needed to represent the total byte count?
            digits = 1
            possible_total_bytes = non_ref_bytes + total_references*digits
            while 10 ** digits < possible_total_bytes:
                digits += 1
                possible_total_bytes = non_ref_bytes + total_references*digits
            expected_bytes = possible_total_bytes + 1 # terminating newline
            # resolve key addresses.
            for key, non_ref_bytes, total_references in key_offset_info:
                key_addresses[key] = non_ref_bytes + total_references*digits
            # zero padded so every reference is exactly `digits` wide.
            format_string = '%%0%sd' % digits
        # serialise
        for key, (absent, references, value) in nodes:
            flattened_references = []
            for ref_list in references:
                ref_addresses = [format_string % key_addresses[reference]
                                 for reference in ref_list]
                flattened_references.append('\r'.join(ref_addresses))
            lines.append("%s\0%s\0%s\0%s\n" % (key, absent,
                '\t'.join(flattened_references), value))
        lines.append('\n')
        # Join once and reuse the text for both the check and the result.
        serialised = ''.join(lines)
        if expected_bytes and len(serialised) != expected_bytes:
            raise errors.BzrError('Failed index creation. Internal error:'
                ' mismatched output length and expected length: %d %d' %
                (len(serialised), expected_bytes))
        return StringIO(serialised)

171

172

173

class GraphIndex(object):
    """An index for data with embedded graphs.

    The index maps keys to a list of key reference lists, and a value.
    Each node has the same number of key reference lists. Each key reference
    list can be empty or an arbitrary length. The value is an opaque NULL
    terminated string without any newlines. The storage of the index is
    hidden in the interface: keys and key references are always bytestrings,
    never the internal representation (e.g. dictionary offsets).

    It is presumed that the index will not be mutated - it is static data.

    Successive iter_all_entries calls will read the entire index each time.
    Additionally, iter_entries calls will read the index linearly until the
    desired keys are found. XXX: This must be fixed before the index is
    suitable for production use. :XXX
    """

    def __init__(self, transport, name):
        """Open an index called name on transport.

        :param transport: A bzrlib.transport.Transport.
        :param name: A path to provide to transport API calls.
        """
        self._transport = transport
        self._name = name

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        The whole index is read and parsed before the first entry is
        produced. As a side effect self.keys_by_offset is rebuilt, mapping
        the byte offset of every node line to its parsed
        (key, absent, reference offsets, value) tuple.

        :return: An iterable of (key, value) or (key, value, reference_lists).
            The former tuple is used when there are no reference lists in the
            index, making the API compatible with simple key:value index types.
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        :raises BadIndexData: If a node line does not have four NUL separated
            fields, a reference is not a number, or a reference points at an
            offset where no node starts. Also raised, after every entry has
            been yielded, when the index does not contain exactly one empty
            trailer line.
        """
        stream = self._transport.get(self._name)
        self._read_prefix(stream)
        self.keys_by_offset = {}
        trailers = 0
        # byte offset of the line being parsed; references use these offsets.
        pos = stream.tell()
        for line in stream.readlines():
            if line == '\n':
                trailers += 1
                continue
            try:
                key, absent, references, value = line.split('\0')
                ref_lists = tuple([
                    tuple([int(ref) for ref in ref_string.split('\r') if ref])
                    for ref_string in references.split('\t')])
            except ValueError:
                # wrong number of fields, or a non-numeric reference.
                raise errors.BadIndexData(self)
            value = value[:-1] # remove the newline
            self.keys_by_offset[pos] = (key, absent, ref_lists, value)
            pos += len(line)
        for key, absent, references, value in self.keys_by_offset.itervalues():
            if absent:
                # absent nodes only exist as reference targets.
                continue
            if self.node_ref_lists:
                # resolve references: turn byte offsets back into keys.
                try:
                    node_refs = tuple([
                        tuple([self.keys_by_offset[ref][0]
                               for ref in ref_list])
                        for ref_list in references])
                except KeyError:
                    # a reference that does not land on the start of a node.
                    raise errors.BadIndexData(self)
                yield (key, value, node_refs)
            else:
                yield (key, value)
        if trailers != 1:
            # there must be one line - the empty trailer line.
            raise errors.BadIndexData(self)

    def _read_prefix(self, stream):
        """Consume and check the signature and options lines of stream.

        Sets self.node_ref_lists to the number of reference lists declared
        by the options line.

        :param stream: A file-like object positioned at the start of the
            index.
        :raises BadIndexFormatSignature: If the stream does not start with
            this index type's signature.
        :raises BadIndexOptions: If the options line is missing or its
            reference list count is not a number.
        """
        signature = stream.read(len(self._signature()))
        if not signature == self._signature():
            raise errors.BadIndexFormatSignature(self._name, GraphIndex)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise errors.BadIndexOptions(self)
        try:
            # [:-1] drops the newline that ends the options line.
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
        except ValueError:
            raise errors.BadIndexOptions(self)

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys supplied. No additional keys will be returned, and every
            key supplied that is in the index will be returned.
        """
        keys = set(keys)
        if not keys:
            # nothing requested: do not touch the transport at all.
            return
        for node in self.iter_all_entries():
            if not keys:
                # everything requested has been found; stop early.
                return
            if node[0] in keys:
                yield node
                keys.remove(node[0])

    def _signature(self):
        """The file signature for this index type."""
        return _SIGNATURE

    def validate(self):
        """Validate that everything in the index can be accessed."""
        # iter_all validates completely at the moment, so just do that.
        for node in self.iter_all_entries():
            pass

283

284

285

class CombinedGraphIndex(object):
    """A GraphIndex made up from smaller GraphIndices.

    The backing indices must implement GraphIndex, and are presumed to be
    static data.

    Queries against the combined index will be made against the first index,
    and then the second and so on. The order of indices can thus influence
    performance significantly. For example, if one index is on local disk and
    a second on a remote server, the local disk index should be before the
    other in the index list.
    """

    def __init__(self, indices):
        """Create a CombinedGraphIndex backed by indices.

        :param indices: An ordered list of indices to query for data. The
            list object itself is kept, not copied.
        """
        self._indices = indices

    def insert_index(self, pos, index):
        """Insert a new index in the list of indices to query.

        :param pos: The position to insert the index.
        :param index: The index to insert.
        """
        self._indices.insert(pos, index)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :return: An iterable of the entries produced by the child indices,
            each with the key as its first element. There is no defined
            order for the result iteration - it will be in the most
            efficient order for the index.
        """
        reported = set()
        for child in self._indices:
            for entry in child.iter_all_entries():
                entry_key = entry[0]
                if entry_key in reported:
                    # an earlier index already supplied this key.
                    continue
                yield entry
                reported.add(entry_key)

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of the entries produced by the child indices,
            each with the key as its first element. There is no defined
            order for the result iteration - it will be in the most
            efficient order for the index.
        """
        outstanding = set(keys)
        for child in self._indices:
            if not outstanding:
                # every requested key has been found already.
                return
            for entry in child.iter_entries(outstanding):
                # later indices are only asked for what is still missing.
                outstanding.remove(entry[0])
                yield entry

    def validate(self):
        """Validate that everything in the index can be accessed."""
        for child in self._indices:
            child.validate()

353

354

355

class InMemoryGraphIndex(GraphIndexBuilder):
    """A GraphIndex which operates entirely out of memory and is mutable.

    This is designed to allow the accumulation of GraphIndex entries during a
    single write operation, where the accumulated entries need to be
    immediately available - for example via a CombinedGraphIndex.
    """

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, value, node_refs) entries to add.
        """
        for entry in nodes:
            (key, value, node_refs) = entry
            self.add_node(key, value, node_refs)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        Keys that are only known as reference targets (absent nodes) are
        skipped.

        :return: An iterable of (key, value, reference_lists) when the index
            has reference lists, otherwise of (key, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash
            order).
        """
        # read once, up front, which entry shape to produce.
        with_references = self.reference_lists
        for key, node in self._nodes.iteritems():
            absent, references, value = node
            if absent:
                continue
            if with_references:
                yield key, value, references
            else:
                yield key, value

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        Keys that are unknown, or only known as reference targets (absent
        nodes), produce no entry.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (key, value, reference_lists) when the index
            has reference lists, otherwise of (key, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this
            case).
        """
        requested = set(keys)
        with_references = self.reference_lists
        for key in requested.intersection(self._nodes):
            absent, references, value = self._nodes[key]
            if absent:
                continue
            if with_references:
                yield key, value, references
            else:
                yield key, value

    def validate(self):
        """In memory index's have no known corruption at the moment."""