2063
2056
w = self.inventories
2064
2057
pb = ui.ui_factory.nested_progress_bar()
2066
return self._find_text_key_references_from_xml_inventory_lines(
2059
return self._serializer._find_text_key_references(
2067
2060
w.iter_lines_added_or_present_in_keys(revision_keys, pb=pb))
2071
def _find_text_key_references_from_xml_inventory_lines(self,
2073
"""Core routine for extracting references to texts from inventories.
2075
This performs the translation of xml lines to revision ids.
2077
:param line_iterator: An iterator of lines, origin_version_id
2078
:return: A dictionary mapping text keys ((fileid, revision_id) tuples)
2079
to whether they were referred to by the inventory of the
2080
revision_id that they contain. Note that if that revision_id was
2081
not part of the line_iterator's output then False will be given -
2082
even though it may actually refer to that key.
2084
if not self._serializer.support_altered_by_hack:
2085
raise AssertionError(
2086
"_find_text_key_references_from_xml_inventory_lines only "
2087
"supported for branches which store inventory as unnested xml"
2088
", not on %r" % self)
2091
# this code needs to read every new line in every inventory for the
2092
# inventories [revision_ids]. Seeing a line twice is ok. Seeing a line
2093
# not present in one of those inventories is unnecessary but not
2094
# harmful because we are filtering by the revision id marker in the
2095
# inventory lines : we only select file ids altered in one of those
2096
# revisions. We don't need to see all lines in the inventory because
2097
# only those added in an inventory in rev X can contain a revision=X
2099
unescape_revid_cache = {}
2100
unescape_fileid_cache = {}
2102
# jam 20061218 In a big fetch, this handles hundreds of thousands
2103
# of lines, so it has had a lot of inlining and optimizing done.
2104
# Sorry that it is a little bit messy.
2105
# Move several functions to be local variables, since this is a long
2107
search = self._file_ids_altered_regex.search
2108
unescape = _unescape_xml
2109
setdefault = result.setdefault
2110
for line, line_key in line_iterator:
2111
match = search(line)
2114
# One call to match.group() returning multiple items is quite a
2115
# bit faster than 2 calls to match.group() each returning 1
2116
file_id, revision_id = match.group('file_id', 'revision_id')
2118
# Inlining the cache lookups helps a lot when you make 170,000
2119
# lines and 350k ids, versus 8.4 unique ids.
2120
# Using a cache helps in 2 ways:
2121
# 1) Avoids unnecessary decoding calls
2122
# 2) Re-uses cached strings, which helps in future set and
2124
# (2) is enough that removing encoding entirely along with
2125
# the cache (so we are using plain strings) results in no
2126
# performance improvement.
2128
revision_id = unescape_revid_cache[revision_id]
2130
unescaped = unescape(revision_id)
2131
unescape_revid_cache[revision_id] = unescaped
2132
revision_id = unescaped
2134
# Note that unconditionally unescaping means that we deserialise
2135
# every fileid, which for general 'pull' is not great, but we don't
2136
# really want to have so many fulltexts that this matters anyway.
2139
file_id = unescape_fileid_cache[file_id]
2141
unescaped = unescape(file_id)
2142
unescape_fileid_cache[file_id] = unescaped
2145
key = (file_id, revision_id)
2146
setdefault(key, False)
2147
if revision_id == line_key[-1]:
2151
2064
def _inventory_xml_lines_for_keys(self, keys):
2152
2065
"""Get a line iterator of the sort needed for finding references.
2183
2096
revision_ids. Each altered file-id has the exact revision_ids that
2184
2097
altered it listed explicitly.
2186
seen = set(self._find_text_key_references_from_xml_inventory_lines(
2099
seen = set(self._serializer._find_text_key_references(
2187
2100
line_iterator).iterkeys())
2188
2101
parent_keys = self._find_parent_keys_of_revisions(revision_keys)
2189
parent_seen = set(self._find_text_key_references_from_xml_inventory_lines(
2102
parent_seen = set(self._serializer._find_text_key_references(
2190
2103
self._inventory_xml_lines_for_keys(parent_keys)))
2191
2104
new_keys = seen - parent_seen
4063
def _unescaper(match, _map=_unescape_map):
4064
code = match.group(1)
4068
if not code.startswith('#'):
4070
return unichr(int(code[1:])).encode('utf8')
4076
def _unescape_xml(data):
4077
"""Unescape predefined XML entities in a string of data."""
4079
if _unescape_re is None:
4080
_unescape_re = re.compile('\&([^;]*);')
4081
return _unescape_re.sub(_unescaper, data)
4084
3977
class _VersionedFileChecker(object):
4086
3979
def __init__(self, repository, text_key_references=None, ancestors=None):