~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/index.py

Committer: Robert Collins
Date: 2007-10-07 22:04:49 UTC
mto: (2592.3.168 repository)
mto: This revision was merged to the branch mainline in revision 2908.
Revision ID: robertc@robertcollins.net-20071007220449-stt24xz9eaaj703l

Create a content lookup function for bisection in GraphIndex.

files modified:
bzrlib/index.py

bzrlib/tests/test_index.py

Show diffs side-by-side

added added

removed removed

bzrlib/index.py

'InMemoryGraphIndex',

]

from bisect import bisect_right

from cStringIO import StringIO

import re

245

246

self._transport = transport

246

247

self._name = name

247

248

self._nodes = None

249

# a sorted list of slice-addresses for the parsed bytes of the file.

250

# e.g. (0,1) would mean that byte 0 is parsed.

248

251

self._parsed_byte_map = []

252

# a sorted list of keys matching each slice address for parsed bytes

253

# e.g. (None, 'foo@bar') would mean that the first byte contained no

254

# key, and the end byte of the slice is the of the data for 'foo@bar'

255

self._parsed_key_map = []

249

256

self._key_count = None

250

257

self._keys_by_offset = None

251

258

self._nodes_by_key = None

367

374

except ValueError:

368

375

raise errors.BadIndexOptions(self)

369

376

377

def _parsed_byte_index(self, offset):

378

"""Return the index of the entry immediately before offset.

379

380

e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that

381

there is one unparsed byte (the 11th, addressed as[10]). then:

382

asking for 0 will return 0

383

asking for 10 will return 0

384

asking for 11 will return 1

385

asking for 12 will return 1

386

"""

387

key = (offset, 0)

388

return bisect_right(self._parsed_byte_map, key) - 1

389

390

def _parsed_key_index(self, key):

391

"""Return the index of the entry immediately before key.

392

393

e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,

394

meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive

395

have been parsed, then:

396

asking for '' will return 0

397

asking for 'a' will return 0

398

asking for 'b' will return 1

399

asking for 'e' will return 1

400

"""

401

search_key = (key, 0)

402

return bisect_right(self._parsed_key_map, search_key) - 1

403

404

def _is_parsed(self, offset):

405

"""Returns True if offset has been parsed."""

406

index = self._parsed_byte_index(offset)

407

if index == len(self._parsed_byte_map):

408

return offset < self._parsed_byte_map[index - 1][1]

409

start, end = self._parsed_byte_map[index]

410

return offset >= start and offset < end

411

370

412

def iter_entries(self, keys):

371

413

"""Iterate over keys within the index.

372

414

471

513

self._buffer_all()

472

514

return self._key_count

473

515

516

def lookup_keys_via_location(self, location_keys):

517

"""Public interface for implementing bisection.

518

519

:param location_keys: A list of location, key tuples.

520

:return: A list of (location_key, result) tuples as expected by

521

bzrlib.bisect_multi.bisect_multi_bytes.

522

"""

523

# Possible improvements:

524

# - only bisect lookup each key once

525

# - sort the keys first, and use that to reduce the bisection window

526

# -----

527

# this progresses in three parts:

528

# read data

529

# parse it

530

# attempt to answer the question from the now in memory data.

531

# build the readv request

532

# for each location, ask for 800 bytes - much more than rows we've seen

533

# anywhere.

534

readv_ranges = []

535

for location, key in location_keys:

536

# can we answer from cache?

537

index = self._parsed_key_index(key)

538

if (len(self._parsed_key_map) and

539

self._parsed_key_map[index][0] <= key and

540

self._parsed_key_map[index][1] > key):

541

# the key has been parsed, so no lookup is needed

542

continue

543

length = 800

544

if location + length > self._size:

545

length = self._size - location

546

# todo, trim out parsed locations.

547

if length > 0:

548

readv_ranges.append((location, length))

549

# read the header if needed

550

if self._nodes is None:

551

readv_ranges.append((0, 200))

552

if readv_ranges:

553

readv_data = self._transport.readv(self._name, readv_ranges, True,

554

self._size)

555

# parse

556

for offset, data in readv_data:

557

if self._nodes is None:

558

# this must be the start

559

assert offset == 0

560

offset, data = self._parse_header_from_bytes(data)

561

self._parse_region(offset, data)

562

# print offset, len(data), data

563

# generate results:

564

# - figure out <, >, missing, present

565

# - result present references so we can return them.

566

result = []

567

for location, key in location_keys:

568

# can we answer from cache?

569

index = self._parsed_key_index(key)

570

if (self._parsed_key_map[index][0] <= key and

571

(self._parsed_key_map[index][1] > key or

572

# end of the file has been parsed

573

self._parsed_byte_map[index][1] == self._size)):

574

# the key has been parsed, so no lookup is needed

575

if key in self._nodes:

576

if self.node_ref_lists:

577

value, refs = self._nodes[key]

578

result.append(((location, key), (self, key, value, refs)))

579

else:

580

result.append(((location, key), (self, key, self._nodes[key])))

581

else:

582

result.append(((location, key), False))

583

continue

584

# no, is the key above or below the probed location:

585

# get the range of the probed & parsed location

586

index = self._parsed_byte_index(location)

587

# if the key is below the start of the range, its below

588

if key < self._parsed_key_map[index][0]:

589

direction = -1

590

else:

591

direction = +1

592

result.append(((location, key), direction))

593

return result

594

595

def _parse_header_from_bytes(self, bytes):

596

"""Parse the header from a region of bytes.

597

598

:param bytes: The data to parse.

599

:return: An offset, data tuple such as readv yields, for the unparsed

600

data. (which may length 0).

601

"""

602

signature = bytes[0:len(self._signature())]

603

if not signature == self._signature():

604

raise errors.BadIndexFormatSignature(self._name, GraphIndex)

605

lines = bytes[len(self._signature()):].splitlines()

606

options_line = lines[0]

607

if not options_line.startswith(_OPTION_NODE_REFS):

608

raise errors.BadIndexOptions(self)

609

try:

610

self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])

611

except ValueError:

612

raise errors.BadIndexOptions(self)

613

options_line = lines[1]

614

if not options_line.startswith(_OPTION_KEY_ELEMENTS):

615

raise errors.BadIndexOptions(self)

616

try:

617

self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])

618

except ValueError:

619

raise errors.BadIndexOptions(self)

620

options_line = lines[2]

621

if not options_line.startswith(_OPTION_LEN):

622

raise errors.BadIndexOptions(self)

623

try:

624

self._key_count = int(options_line[len(_OPTION_LEN):])

625

except ValueError:

626

raise errors.BadIndexOptions(self)

627

# calculate the bytes we have processed

628

header_end = (len(signature) + len(lines[0]) + len(lines[1]) +

629

len(lines[2]) + 3)

630

self._parsed_bytes(0, None, header_end, None)

631

# setup parsing state

632

self._expected_elements = 3 + self._key_length

633

# raw data keyed by offset

634

self._keys_by_offset = {}

635

# ready-to-return key:value or key:value, node_ref_lists

636

self._nodes = {}

637

self._nodes_by_key = {}

638

return header_end, bytes[header_end:]

639

640

def _parse_region(self, offset, data):

641

"""Parse node data returned from a readv operation.

642

643

:param offset: The byte offset the data starts at.

644

:param data: The data to parse.

645

"""

646

# trim the data.

647

# end first:

648

end = offset + len(data)

649

index = self._parsed_byte_index(offset)

650

# default is to use all data

651

trim_end = None

652

# trivial check for entirely parsed data:

653

if end < self._parsed_byte_map[index][1]:

654

return

655

# accomodate overlap with data before this.

656

if offset < self._parsed_byte_map[index][1]:

657

# overlaps the lower parsed region

658

# skip the parsed data

659

trim_start = self._parsed_byte_map[index][1] - offset

660

# don't trim the start for \n

661

start_adjacent = True

662

elif offset == self._parsed_byte_map[index][1]:

663

# abuts the lower parsed region

664

# use all data

665

trim_start = None

666

# do not trim anything

667

start_adjacent = True

668

else:

669

# does not overlap the lower parsed region

670

# use all data

671

trim_start = None

672

# but trim the leading \n

673

start_adjacent = False

674

if end == self._size:

675

# lines up to the end of all data:

676

# use it all

677

trim_end = None

678

# do not strip to the last \n

679

end_adjacent = True

680

elif index + 1 == len(self._parsed_byte_map):

681

# at the end of the parsed data

682

# use it all

683

trim_end = None

684

# but strip to the last \n

685

end_adjacent = False

686

elif end == self._parsed_byte_map[index + 1][0]:

687

# buts up against the next parsed region

688

# use it all

689

trim_end = None

690

# do not strip to the last \n

691

end_adjacent = True

692

elif end > self._parsed_byte_map[index + 1][0]:

693

# overlaps into the next parsed region

694

# only consider the unparsed data

695

trim_end = self._parsed_byte_map[index + 1][0] - offset

696

# do not strip to the last \n as we know its an entire record

697

end_adjacent = True

698

else:

699

# does not overlap into the next region

700

# use it all

701

trim_end = None

702

# but strip to the last \n

703

end_adjacent = False

704

# now find bytes to discard if needed

705

if not start_adjacent:

706

# work around python bug in rfind

707

if trim_start is None:

708

trim_start = data.find('\n') + 1

709

else:

710

trim_start = data.find('\n', trim_start) + 1

711

assert trim_start != 0, 'no \n was present'

712

# print 'removing start', offset, trim_start, repr(data[:trim_start])

713

if not end_adjacent:

714

# work around python bug in rfind

715

if trim_end is None:

716

trim_end = data.rfind('\n') + 1

717

else:

718

trim_end = data.rfind('\n', None, trim_end) + 1

719

assert trim_end != 0, 'no \n was present'

720

# print 'removing end', offset, trim_end, repr(data[trim_end:])

721

# adjust offset and data to the parseable data.

722

data = data[trim_start:trim_end]

723

if trim_start:

724

offset += trim_start

725

# print "parsing", repr(data)

726

lines = data.splitlines()

727

pos = offset

728

first_key = None

729

key = None

730

for line in lines:

731

if line == '':

732

# must be at the end

733

assert self._size == pos + 1, "%s %s" % (self._size, pos)

734

continue

735

elements = line.split('\0')

736

if len(elements) != self._expected_elements:

737

raise errors.BadIndexData(self)

738

# keys are tuples

739

key = tuple(elements[:self._key_length])

740

if first_key is None:

741

first_key = key

742

absent, references, value = elements[-3:]

743

ref_lists = []

744

for ref_string in references.split('\t'):

745

ref_lists.append(tuple([

746

int(ref) for ref in ref_string.split('\r') if ref

747

]))

748

ref_lists = tuple(ref_lists)

749

self._keys_by_offset[pos] = (key, absent, ref_lists, value)

750

pos += len(line) + 1 # +1 for the \n

751

self._parsed_bytes(offset, first_key, offset + len(data), key)

752

# XXXXXX repeated work here.

753

for key, absent, references, value in self._keys_by_offset.itervalues():

754

if absent:

755

continue

756

# resolve references:

757

if self.node_ref_lists:

758

node_refs = []

759

for ref_list in references:

760

node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))

761

node_value = (value, tuple(node_refs))

762

else:

763

node_value = value

764

self._nodes[key] = node_value

765

if self._key_length > 1:

766

subkey = list(reversed(key[:-1]))

767

key_dict = self._nodes_by_key

768

if self.node_ref_lists:

769

key_value = key, node_value[0], node_value[1]

770

else:

771

key_value = key, node_value

772

# possibly should do this on-demand, but it seems likely it is

773

# always wanted

774

# For a key of (foo, bar, baz) create

775

# _nodes_by_key[foo][bar][baz] = key_value

776

for subkey in key[:-1]:

777

key_dict = key_dict.setdefault(subkey, {})

778

key_dict[key[-1]] = key_value

779

780

def _parsed_bytes(self, start, start_key, end, end_key):

781

"""Mark the bytes from start to end as parsed.

782

783

Calling self._parsed_bytes(1,2) will mark one byte (the one at offset

784

1) as parsed.

785

786

:param start: The start of the parsed region.

787

:param end: The end of the parsed region.

788

"""

789

key = (start, 0)

790

index = bisect_right(self._parsed_byte_map, key)

791

new_value = (start, end)

792

new_key = (start_key, end_key)

793

if index == 0:

794

# first range parsed is always the beginning.

795

self._parsed_byte_map.insert(index, new_value)

796

self._parsed_key_map.insert(index, new_key)

797

elif index == len(self._parsed_byte_map):

798

# may be adjacent to the prior entry

799

if start == self._parsed_byte_map[index - 1][1]:

800

self._parsed_byte_map[index - 1] = (

801

self._parsed_byte_map[index - 1][0], end)

802

self._parsed_key_map[index - 1] = (

803

self._parsed_key_map[index - 1][0], end_key)

804

else:

805

#no, new entry

806

self._parsed_byte_map.insert(index, new_value)

807

self._parsed_key_map.insert(index, new_key)

808

else:

809

# may be adjacent to the next entry

810

if end == self._parsed_byte_map[index][0]:

811

# move the next entry down to this range

812

self._parsed_byte_map[index] = (

813

start, self._parsed_byte_map[index][1])

814

self._parsed_key_map[index] = (

815

start_key, self._parsed_key_map[index][1])

816

else:

817

# no, new entry

818

self._parsed_byte_map.insert(index, new_value)

819

self._parsed_key_map.insert(index, new_key)

820

474

821

def _signature(self):

475

822

"""The file signature for this index type."""

476

823

return _SIGNATURE

Older »