~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/index.py

Committer: Canonical.com Patch Queue Manager
Date: 2007-10-15 09:04:41 UTC
mfrom: (2890.2.18 index)
Revision ID: pqm@pqm.ubuntu.com-20071015090441-ud1o5gta7klf0nn8

(robertc) Partial GraphIndex reads via new bzrlib.bisect_multi module. (Robert Collins)

files added:
bzrlib/bisect_multi.py

bzrlib/tests/test_bisect_multi.py

files modified:
NEWS

bzrlib/index.py

bzrlib/tests/__init__.py

bzrlib/tests/test_index.py

bzrlib/tests/test_knit.py

Show diffs side-by-side

added added

removed removed

bzrlib/index.py

'InMemoryGraphIndex',

]

from bisect import bisect_right

from cStringIO import StringIO

import re

from bzrlib.lazy_import import lazy_import

lazy_import(globals(), """

from bzrlib import trace

from bzrlib.bisect_multi import bisect_multi_bytes

from bzrlib.trace import mutter

""")

from bzrlib import debug, errors

232

234

suitable for production use. :XXX

233

235

"""

234

236

235

def __init__(self, transport, name):

237

def __init__(self, transport, name, size):

236

238

"""Open an index called name on transport.

237

239

238

240

:param transport: A bzrlib.transport.Transport.

239

241

:param name: A path to provide to transport API calls.

242

:param size: The size of the index in bytes. This is used for bisection

243

logic to perform partial index reads. While the size could be

244

obtained by statting the file this introduced an additional round

245

trip as well as requiring stat'able transports, both of which are

246

avoided by having it supplied. If size is None, then bisection

247

support will be disabled and accessing the index will just stream

248

all the data.

240

249

"""

241

250

self._transport = transport

242

251

self._name = name

252

# Becomes a dict of key:(value, reference-list-byte-locations) used by

253

# the bisection interface to store parsed but not resolved keys.

254

self._bisect_nodes = None

255

# Becomes a dict of key:(value, reference-list-keys) which are ready to

256

# be returned directly to callers.

243

257

self._nodes = None

258

# a sorted list of slice-addresses for the parsed bytes of the file.

259

# e.g. (0,1) would mean that byte 0 is parsed.

260

self._parsed_byte_map = []

261

# a sorted list of keys matching each slice address for parsed bytes

262

# e.g. (None, 'foo@bar') would mean that the first byte contained no

263

# key, and the end byte of the slice is the end of the data for 'foo@bar'

264

self._parsed_key_map = []

244

265

self._key_count = None

245

266

self._keys_by_offset = None

246

267

self._nodes_by_key = None

268

self._size = size

247

269

248

270

def _buffer_all(self):

249

271

"""Buffer all the index data.

254

276

mutter('Reading entire index %s', self._transport.abspath(self._name))

255

277

stream = self._transport.get(self._name)

256

278

self._read_prefix(stream)

257

expected_elements = 3 + self._key_length

279

self._expected_elements = 3 + self._key_length

258

280

line_count = 0

259

281

# raw data keyed by offset

260

282

self._keys_by_offset = {}

263

285

self._nodes_by_key = {}

264

286

trailers = 0

265

287

pos = stream.tell()

266

for line in stream.readlines():

267

if line == '\n':

268

trailers += 1

269

continue

270

elements = line.split('\0')

271

if len(elements) != expected_elements:

272

raise errors.BadIndexData(self)

273

# keys are tuples

274

key = tuple(elements[:self._key_length])

275

absent, references, value = elements[-3:]

276

value = value[:-1] # remove the newline

277

ref_lists = []

278

for ref_string in references.split('\t'):

279

ref_lists.append(tuple([

280

int(ref) for ref in ref_string.split('\r') if ref

281

]))

282

ref_lists = tuple(ref_lists)

283

self._keys_by_offset[pos] = (key, absent, ref_lists, value)

284

pos += len(line)

288

lines = stream.read().split('\n')

289

del lines[-1]

290

_, _, _, trailers = self._parse_lines(lines, pos)

285

291

for key, absent, references, value in self._keys_by_offset.itervalues():

286

292

if absent:

287

293

continue

288

294

# resolve references:

289

295

if self.node_ref_lists:

290

node_refs = []

291

for ref_list in references:

292

node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))

293

node_value = (value, tuple(node_refs))

296

node_value = (value, self._resolve_references(references))

294

297

else:

295

298

node_value = value

296

299

self._nodes[key] = node_value

361

364

except ValueError:

362

365

raise errors.BadIndexOptions(self)

363

366

364

def iter_entries(self, keys):

365

"""Iterate over keys within the index.

366

367

:param keys: An iterable providing the keys to be retrieved.

368

:return: An iterable as per iter_all_entries, but restricted to the

369

keys supplied. No additional keys will be returned, and every

370

key supplied that is in the index will be returned.

371

"""

372

keys = set(keys)

373

if not keys:

374

return

375

if self._nodes is None:

376

self._buffer_all()

367

def _resolve_references(self, references):

368

"""Return the resolved key references for references.

369

370

References are resolved by looking up the location of the key in the

371

_keys_by_offset map and substituting the key name, preserving ordering.

372

373

:param references: An iterable of iterables of key locations. e.g.

374

[[123, 456], [123]]

375

:return: A tuple of tuples of keys.

376

"""

377

node_refs = []

378

for ref_list in references:

379

node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))

380

return tuple(node_refs)

381

382

def _find_index(self, range_map, key):

383

"""Helper for the _parsed_*_index calls.

384

385

Given a range map - [(start, end), ...], finds the index of the range

386

in the map for key if it is in the map, and if it is not there, the

387

immediately preceding range in the map.

388

"""

389

result = bisect_right(range_map, key) - 1

390

if result + 1 < len(range_map):

391

# check the border condition, it may be in result + 1

392

if range_map[result + 1][0] == key[0]:

393

return result + 1

394

return result

395

396

def _parsed_byte_index(self, offset):

397

"""Return the index of the entry immediately before offset.

398

399

e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that

400

there is one unparsed byte (the 11th, addressed as [10]), then:

401

asking for 0 will return 0

402

asking for 10 will return 0

403

asking for 11 will return 1

404

asking for 12 will return 1

405

"""

406

key = (offset, 0)

407

return self._find_index(self._parsed_byte_map, key)

408

409

def _parsed_key_index(self, key):

410

"""Return the index of the entry immediately before key.

411

412

e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,

413

meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive

414

have been parsed, then:

415

asking for '' will return 0

416

asking for 'a' will return 0

417

asking for 'b' will return 1

418

asking for 'e' will return 1

419

"""

420

search_key = (key, None)

421

return self._find_index(self._parsed_key_map, search_key)

422

423

def _is_parsed(self, offset):

424

"""Returns True if offset has been parsed."""

425

index = self._parsed_byte_index(offset)

426

if index == len(self._parsed_byte_map):

427

return offset < self._parsed_byte_map[index - 1][1]

428

start, end = self._parsed_byte_map[index]

429

return offset >= start and offset < end

430

431

def _iter_entries_from_total_buffer(self, keys):

432

"""Iterate over keys when the entire index is parsed."""

377

433

keys = keys.intersection(self._keys)

378

434

if self.node_ref_lists:

379

435

for key in keys:

383

439

for key in keys:

384

440

yield self, key, self._nodes[key]

385

441

442

def iter_entries(self, keys):

443

"""Iterate over keys within the index.

444

445

:param keys: An iterable providing the keys to be retrieved.

446

:return: An iterable as per iter_all_entries, but restricted to the

447

keys supplied. No additional keys will be returned, and every

448

key supplied that is in the index will be returned.

449

"""

450

# PERFORMANCE TODO: parse and bisect all remaining data at some

451

# threshold of total-index processing/get calling layers that expect to

452

# read the entire index to use the iter_all_entries method instead.

453

keys = set(keys)

454

if not keys:

455

return []

456

if self._size is None and self._nodes is None:

457

self._buffer_all()

458

if self._nodes is not None:

459

return self._iter_entries_from_total_buffer(keys)

460

else:

461

return (result[1] for result in bisect_multi_bytes(

462

self._lookup_keys_via_location, self._size, keys))

463

386

464

def iter_entries_prefix(self, keys):

387

465

"""Iterate over keys within the index using prefix matching.

388

466

391

469

'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then

392

470

only the former key is returned.

393

471

472

WARNING: Note that this method currently causes a full index parse

473

unconditionally (which is reasonably appropriate as it is a means for

474

thunking many small indices into one larger one and still supplies

475

iter_all_entries at the thunk layer).

476

394

477

:param keys: An iterable providing the key prefixes to be retrieved.

395

478

Each key prefix takes the form of a tuple the length of a key, but

396

479

with the last N elements 'None' rather than a regular bytestring.

465

548

self._buffer_all()

466

549

return self._key_count

467

550

551

def _lookup_keys_via_location(self, location_keys):

552

"""Public interface for implementing bisection.

553

554

If _buffer_all has been called, then all the data for the index is in

555

memory, and this method should not be called, as it uses a separate

556

cache because it cannot pre-resolve all indices, which buffer_all does

557

for performance.

558

559

:param location_keys: A list of (location (byte offset), key) tuples.

560

:return: A list of (location_key, result) tuples as expected by

561

bzrlib.bisect_multi.bisect_multi_bytes.

562

"""

563

# Possible improvements:

564

# - only bisect lookup each key once

565

# - sort the keys first, and use that to reduce the bisection window

566

# -----

567

# this progresses in three parts:

568

# read data

569

# parse it

570

# attempt to answer the question from the now in memory data.

571

# build the readv request

572

# for each location, ask for 800 bytes - much more than rows we've seen

573

# anywhere.

574

readv_ranges = []

575

for location, key in location_keys:

576

# can we answer from cache?

577

# - if we know the answer - yes

578

index = self._parsed_key_index(key)

579

if (len(self._parsed_key_map) and

580

self._parsed_key_map[index][0] <= key and

581

(self._parsed_key_map[index][1] > key or

582

# end of the file has been parsed

583

self._parsed_byte_map[index][1] == self._size)):

584

# the key has been parsed, so no lookup is needed

585

continue

586

# - if we have examined this part of the file already - yes

587

index = self._parsed_byte_index(location)

588

if (len(self._parsed_byte_map) and

589

self._parsed_byte_map[index][0] <= location and

590

self._parsed_byte_map[index][1] > location):

591

# the byte region has been parsed, so no read is needed.

592

continue

593

length = 800

594

if location + length > self._size:

595

length = self._size - location

596

# todo, trim out parsed locations.

597

if length > 0:

598

readv_ranges.append((location, length))

599

# read the header if needed

600

if self._bisect_nodes is None:

601

readv_ranges.append((0, 200))

602

self._read_and_parse(readv_ranges)

603

# generate results:

604

# - figure out <, >, missing, present

605

# - resolve present references so we can return them.

606

result = []

607

# keys that we cannot answer until we resolve references

608

pending_references = []

609

pending_locations = set()

610

for location, key in location_keys:

611

# can we answer from cache?

612

index = self._parsed_key_index(key)

613

if (self._parsed_key_map[index][0] <= key and

614

(self._parsed_key_map[index][1] > key or

615

# end of the file has been parsed

616

self._parsed_byte_map[index][1] == self._size)):

617

# the key has been parsed, so no lookup is needed

618

if key in self._bisect_nodes:

619

if self.node_ref_lists:

620

# the references may not have been all parsed.

621

value, refs = self._bisect_nodes[key]

622

wanted_locations = []

623

for ref_list in refs:

624

for ref in ref_list:

625

if ref not in self._keys_by_offset:

626

wanted_locations.append(ref)

627

if wanted_locations:

628

pending_locations.update(wanted_locations)

629

pending_references.append((location, key))

630

continue

631

result.append(((location, key), (self, key,

632

value, self._resolve_references(refs))))

633

else:

634

result.append(((location, key),

635

(self, key, self._bisect_nodes[key])))

636

else:

637

result.append(((location, key), False))

638

continue

639

# no, is the key above or below the probed location:

640

# get the range of the probed & parsed location

641

index = self._parsed_byte_index(location)

642

# if the key is below the start of the range, it's below

643

if key < self._parsed_key_map[index][0]:

644

direction = -1

645

else:

646

direction = +1

647

result.append(((location, key), direction))

648

readv_ranges = []

649

# lookup data to resolve references

650

for location in pending_locations:

651

length = 800

652

if location + length > self._size:

653

length = self._size - location

654

# TODO: trim out parsed locations (e.g. if the 800 is into the

655

# parsed region trim it, and don't use the adjust_for_latency

656

# facility)

657

if length > 0:

658

readv_ranges.append((location, length))

659

self._read_and_parse(readv_ranges)

660

for location, key in pending_references:

661

# answer key references we had to look-up-late.

662

index = self._parsed_key_index(key)

663

value, refs = self._bisect_nodes[key]

664

result.append(((location, key), (self, key,

665

value, self._resolve_references(refs))))

666

return result

667

668

def _parse_header_from_bytes(self, bytes):

669

"""Parse the header from a region of bytes.

670

671

:param bytes: The data to parse.

672

:return: An offset, data tuple such as readv yields, for the unparsed

673

data (which may be of length 0).

674

"""

675

signature = bytes[0:len(self._signature())]

676

if not signature == self._signature():

677

raise errors.BadIndexFormatSignature(self._name, GraphIndex)

678

lines = bytes[len(self._signature()):].splitlines()

679

options_line = lines[0]

680

if not options_line.startswith(_OPTION_NODE_REFS):

681

raise errors.BadIndexOptions(self)

682

try:

683

self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])

684

except ValueError:

685

raise errors.BadIndexOptions(self)

686

options_line = lines[1]

687

if not options_line.startswith(_OPTION_KEY_ELEMENTS):

688

raise errors.BadIndexOptions(self)

689

try:

690

self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])

691

except ValueError:

692

raise errors.BadIndexOptions(self)

693

options_line = lines[2]

694

if not options_line.startswith(_OPTION_LEN):

695

raise errors.BadIndexOptions(self)

696

try:

697

self._key_count = int(options_line[len(_OPTION_LEN):])

698

except ValueError:

699

raise errors.BadIndexOptions(self)

700

# calculate the bytes we have processed

701

header_end = (len(signature) + len(lines[0]) + len(lines[1]) +

702

len(lines[2]) + 3)

703

self._parsed_bytes(0, None, header_end, None)

704

# setup parsing state

705

self._expected_elements = 3 + self._key_length

706

# raw data keyed by offset

707

self._keys_by_offset = {}

708

# keys with the value and node references

709

self._bisect_nodes = {}

710

return header_end, bytes[header_end:]

711

712

def _parse_region(self, offset, data):

713

"""Parse node data returned from a readv operation.

714

715

:param offset: The byte offset the data starts at.

716

:param data: The data to parse.

717

"""

718

# trim the data.

719

# end first:

720

end = offset + len(data)

721

high_parsed = offset

722

while True:

723

# Trivial test - if the current index's end is within the

724

# low-matching parsed range, we're done.

725

index = self._parsed_byte_index(high_parsed)

726

if end < self._parsed_byte_map[index][1]:

727

return

728

# print "[%d:%d]" % (offset, end), \

729

# self._parsed_byte_map[index:index + 2]

730

high_parsed, last_segment = self._parse_segment(

731

offset, data, end, index)

732

if last_segment:

733

return

734

735

def _parse_segment(self, offset, data, end, index):

736

"""Parse one segment of data.

737

738

:param offset: Where 'data' begins in the file.

739

:param data: Some data to parse a segment of.

740

:param end: Where data ends

741

:param index: The current index into the parsed bytes map.

742

:return: True if the parsed segment is the last possible one in the

743

range of data.

744

:return: high_parsed_byte, last_segment.

745

high_parsed_byte is the location of the highest parsed byte in this

746

segment, last_segment is True if the parsed segment is the last

747

possible one in the data block.

748

"""

749

# default is to use all data

750

trim_end = None

751

# accommodate overlap with data before this.

752

if offset < self._parsed_byte_map[index][1]:

753

# overlaps the lower parsed region

754

# skip the parsed data

755

trim_start = self._parsed_byte_map[index][1] - offset

756

# don't trim the start for \n

757

start_adjacent = True

758

elif offset == self._parsed_byte_map[index][1]:

759

# abuts the lower parsed region

760

# use all data

761

trim_start = None

762

# do not trim anything

763

start_adjacent = True

764

else:

765

# does not overlap the lower parsed region

766

# use all data

767

trim_start = None

768

# but trim the leading \n

769

start_adjacent = False

770

if end == self._size:

771

# lines up to the end of all data:

772

# use it all

773

trim_end = None

774

# do not strip to the last \n

775

end_adjacent = True

776

last_segment = True

777

elif index + 1 == len(self._parsed_byte_map):

778

# at the end of the parsed data

779

# use it all

780

trim_end = None

781

# but strip to the last \n

782

end_adjacent = False

783

last_segment = True

784

elif end == self._parsed_byte_map[index + 1][0]:

785

# abuts the next parsed region

786

# use it all

787

trim_end = None

788

# do not strip to the last \n

789

end_adjacent = True

790

last_segment = True

791

elif end > self._parsed_byte_map[index + 1][0]:

792

# overlaps into the next parsed region

793

# only consider the unparsed data

794

trim_end = self._parsed_byte_map[index + 1][0] - offset

795

# do not strip to the last \n as we know it's an entire record

796

end_adjacent = True

797

last_segment = end < self._parsed_byte_map[index + 1][1]

798

else:

799

# does not overlap into the next region

800

# use it all

801

trim_end = None

802

# but strip to the last \n

803

end_adjacent = False

804

last_segment = True

805

# now find bytes to discard if needed

806

if not start_adjacent:

807

# work around python bug in rfind

808

if trim_start is None:

809

trim_start = data.find('\n') + 1

810

else:

811

trim_start = data.find('\n', trim_start) + 1

812

assert trim_start != 0, 'no \n was present'

813

# print 'removing start', offset, trim_start, repr(data[:trim_start])

814

if not end_adjacent:

815

# work around python bug in rfind

816

if trim_end is None:

817

trim_end = data.rfind('\n') + 1

818

else:

819

trim_end = data.rfind('\n', None, trim_end) + 1

820

assert trim_end != 0, 'no \n was present'

821

# print 'removing end', offset, trim_end, repr(data[trim_end:])

822

# adjust offset and data to the parseable data.

823

trimmed_data = data[trim_start:trim_end]

824

assert trimmed_data, 'read unneeded data [%d:%d] from [%d:%d]' % (

825

trim_start, trim_end, offset, offset + len(data))

826

if trim_start:

827

offset += trim_start

828

# print "parsing", repr(trimmed_data)

829

# splitlines mangles the \r delimiters.. don't use it.

830

lines = trimmed_data.split('\n')

831

del lines[-1]

832

pos = offset

833

first_key, last_key, nodes, _ = self._parse_lines(lines, pos)

834

for key, value in nodes:

835

self._bisect_nodes[key] = value

836

self._parsed_bytes(offset, first_key,

837

offset + len(trimmed_data), last_key)

838

return offset + len(trimmed_data), last_segment

839

840

def _parse_lines(self, lines, pos):

841

key = None

842

first_key = None

843

trailers = 0

844

nodes = []

845

for line in lines:

846

if line == '':

847

# must be at the end

848

if self._size:

849

assert self._size == pos + 1, "%s %s" % (self._size, pos)

850

trailers += 1

851

continue

852

elements = line.split('\0')

853

if len(elements) != self._expected_elements:

854

raise errors.BadIndexData(self)

855

# keys are tuples

856

key = tuple(elements[:self._key_length])

857

if first_key is None:

858

first_key = key

859

absent, references, value = elements[-3:]

860

ref_lists = []

861

for ref_string in references.split('\t'):

862

ref_lists.append(tuple([

863

int(ref) for ref in ref_string.split('\r') if ref

864

]))

865

ref_lists = tuple(ref_lists)

866

self._keys_by_offset[pos] = (key, absent, ref_lists, value)

867

pos += len(line) + 1 # +1 for the \n

868

if absent:

869

continue

870

if self.node_ref_lists:

871

node_value = (value, ref_lists)

872

else:

873

node_value = value

874

nodes.append((key, node_value))

875

# print "parsed ", key

876

return first_key, key, nodes, trailers

877

878

def _parsed_bytes(self, start, start_key, end, end_key):

879

"""Mark the bytes from start to end as parsed.

880

881

Calling self._parsed_bytes(1,2) will mark one byte (the one at offset

882

1) as parsed.

883

884

:param start: The start of the parsed region.

885

:param end: The end of the parsed region.

886

"""

887

index = self._parsed_byte_index(start)

888

new_value = (start, end)

889

new_key = (start_key, end_key)

890

if index == -1:

891

# first range parsed is always the beginning.

892

self._parsed_byte_map.insert(index, new_value)

893

self._parsed_key_map.insert(index, new_key)

894

return

895

# four cases:

896

# new region

897

# extend lower region

898

# extend higher region

899

# combine two regions

900

if (index + 1 < len(self._parsed_byte_map) and

901

self._parsed_byte_map[index][1] == start and

902

self._parsed_byte_map[index + 1][0] == end):

903

# combine two regions

904

self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],

905

self._parsed_byte_map[index + 1][1])

906

self._parsed_key_map[index] = (self._parsed_key_map[index][0],

907

self._parsed_key_map[index + 1][1])

908

del self._parsed_byte_map[index + 1]

909

del self._parsed_key_map[index + 1]

910

elif self._parsed_byte_map[index][1] == start:

911

# extend the lower entry

912

self._parsed_byte_map[index] = (

913

self._parsed_byte_map[index][0], end)

914

self._parsed_key_map[index] = (

915

self._parsed_key_map[index][0], end_key)

916

elif (index + 1 < len(self._parsed_byte_map) and

917

self._parsed_byte_map[index + 1][0] == end):

918

# extend the higher entry

919

self._parsed_byte_map[index + 1] = (

920

start, self._parsed_byte_map[index + 1][1])

921

self._parsed_key_map[index + 1] = (

922

start_key, self._parsed_key_map[index + 1][1])

923

else:

924

# new entry

925

self._parsed_byte_map.insert(index + 1, new_value)

926

self._parsed_key_map.insert(index + 1, new_key)

927

928

def _read_and_parse(self, readv_ranges):

929

"""Read the ranges and parse the resulting data.

930

931

:param readv_ranges: A prepared readv range list.

932

"""

933

if readv_ranges:

934

readv_data = self._transport.readv(self._name, readv_ranges, True,

935

self._size)

936

# parse

937

for offset, data in readv_data:

938

if self._bisect_nodes is None:

939

# this must be the start

940

assert offset == 0

941

offset, data = self._parse_header_from_bytes(data)

942

# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))

943

self._parse_region(offset, data)

944

468

945

def _signature(self):

469

946

"""The file signature for this index type."""

470

947

return _SIGNATURE

Older »