~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/xml8.py

Committer: Canonical.com Patch Queue Manager
Date: 2011-02-21 22:11:57 UTC
mfrom: (5671.2.4 move-xml-fns)
Revision ID: pqm@pqm.ubuntu.com-20110221221157-3dtpgotjz8ktfviu

(jelmer) Move Repository._find_text_key_references_from_xml_inventory_lines
onto the serializer. (Jelmer Vernooij)

files modified:
bzrlib/repofmt/pack_repo.py

bzrlib/repository.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_xml.py

bzrlib/xml8.py

Show diffs side-by-side

added added

removed removed

bzrlib/xml8.py

cache_utf8,

errors,

inventory,

lazy_regex,

revision as _mod_revision,

trace,

)

">":">",

}

# Names of the predefined XML entities (the text between '&' and ';')
# mapped to the characters they stand for.  Bound as the default lookup
# table of _unescaper.
_xml_unescape_map = {
    'apos':"'",
    'quot':'"',
    'amp':'&',
    'lt':'<',
    'gt':'>'
}

def _unescaper(match, _map=_xml_unescape_map):
    """Return the replacement text for one matched XML reference.

    Used as the replacement callback of a regex substitution whose first
    group captures the text between '&' and ';'.

    :param match: A regex match object; group(1) is either an entity name
        (e.g. 'amp') or the body of a numeric character reference
        (e.g. '#233' or '#xE9').
    :param _map: Lookup table of named entities, bound as a default
        argument so the lookup does not go through module globals.
    :return: The character for a named entity, or the utf8 encoding of
        the referenced code point for a numeric character reference.
    :raises KeyError: If the reference is neither a known entity name nor
        a numeric character reference.
    :raises ValueError: If a numeric character reference does not hold a
        valid number.
    """
    code = match.group(1)
    try:
        return _map[code]
    except KeyError:
        if not code.startswith('#'):
            # Not a numeric reference either: let the KeyError propagate.
            raise
    number = code[1:]
    # XML allows both the decimal (&#233;) and the hexadecimal (&#xE9;)
    # spelling of a character reference; the hex marker is a lowercase 'x'.
    if number[:1] == 'x':
        codepoint = int(number[1:], 16)
    else:
        codepoint = int(number)
    return unichr(codepoint).encode('utf8')

# Compiled lazily by _unescape_xml on first use.
_unescape_re = None


def _unescape_xml(data):
    """Unescape predefined XML entities in a string of data.

    Every '&...;' sequence in data is handed to _unescaper, which resolves
    named entities and numeric character references.

    :param data: The escaped text.
    :return: data with each '&...;' sequence replaced by the text that
        _unescaper returns for it.
    """
    global _unescape_re
    if _unescape_re is None:
        # Raw literal: same pattern text as before, without depending on
        # Python leaving the unrecognised '\&' escape untouched.
        _unescape_re = re.compile(r'\&([^;]*);')
    return _unescape_re.sub(_unescaper, data)

def _ensure_utf8_re():

"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""

161

191

format_num = '8'

162

192

revision_format_num = None

163

193

194

# The search regex used by xml based repositories to determine what things

195

# were changed in a single commit.

196

_file_ids_altered_regex = lazy_regex.lazy_compile(

197

r'file_id="(?P<file_id>[^"]+)"'

198

r'.* revision="(?P<revision_id>[^"]+)"'

199

)

200

164

201

def _check_revisions(self, inv):

165

202

"""Extension point for subclasses to check during serialisation.

166

203

532

569

raise AssertionError("repeated property %r" % name)

533

570

rev.properties[name] = value

534

571

572

def _find_text_key_references(self, line_iterator):
    """Extract the text keys referenced by a stream of inventory xml lines.

    Each line is searched for a file_id/revision attribute pair, which is
    unescaped and recorded as a text key.

    :param line_iterator: An iterator of (line, line_key) pairs; the last
        element of line_key is the origin version id the line came from.
    :return: A dictionary mapping text keys ((fileid, revision_id) tuples)
        to whether they were referred to by the inventory of the
        revision_id that they contain. Note that if that revision_id was
        not part of the line_iterator's output then False will be given -
        even though it may actually refer to that key.
    :raises AssertionError: If self.support_altered_by_hack is false.
    """
    if not self.support_altered_by_hack:
        message = (
            "_find_text_key_references only "
            "supported for branches which store inventory as unnested xml"
            ", not on %r" % self)
        raise AssertionError(message)

    # Every new line of every inventory of interest has to be read.
    # Reading a line twice is fine, and reading a line that belongs to
    # none of those inventories is wasted work but harmless: the result is
    # filtered by the revision id marker inside the line, so only file ids
    # altered in one of those revisions are flagged. Not every line of an
    # inventory is needed either, because only lines added by the
    # inventory of rev X can carry a revision=X attribute.
    references = {}
    revid_cache = {}
    fileid_cache = {}

    # jam 20061218 A big fetch pushes hundreds of thousands of lines
    # through this loop, so it has been inlined and tuned by hand, at some
    # cost in tidiness. Callables are bound to locals up front because the
    # loop runs for a long time.
    find_ids = self._file_ids_altered_regex.search
    unescape = _unescape_xml
    mark_seen = references.setdefault
    for line, line_key in line_iterator:
        found = find_ids(line)
        if found is None:
            continue
        # A single group() call returning both values is noticeably
        # faster than two calls returning one value each.
        raw_file_id, raw_revision_id = found.group('file_id', 'revision_id')

        # The cache lookups are inlined because the number of lines and id
        # lookups is far larger than the number of unique ids. Caching
        # pays off twice:
        #  1) unnecessary decoding calls are avoided;
        #  2) the cached strings are re-used, which helps later set and
        #     equality checks.
        # (2) matters enough that dropping the decoding together with the
        # cache (i.e. working on plain strings) gives no speed-up.
        try:
            revision_id = revid_cache[raw_revision_id]
        except KeyError:
            revision_id = revid_cache[raw_revision_id] = unescape(
                raw_revision_id)

        # Unescaping unconditionally means every fileid is deserialised,
        # which is not great for a general 'pull', but we do not really
        # want so many fulltexts that this matters anyway.
        # RBC 20071114.
        try:
            file_id = fileid_cache[raw_file_id]
        except KeyError:
            file_id = fileid_cache[raw_file_id] = unescape(raw_file_id)

        key = (file_id, revision_id)
        if revision_id == line_key[-1]:
            # The line comes from the inventory of the revision it names.
            references[key] = True
        else:
            # Record the key, but never downgrade an earlier True.
            mark_seen(key, False)
    return references

650

535

651

536

652

# Module-level Serializer_v8 instance.
serializer_v8 = Serializer_v8()

Older »