~bzr-pqm/bzr/bzr.dev

6341.1.3 by Jelmer Vernooij
Move search result code to vf_search module.
1
# Copyright (C) 2007-2011 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17
"""Searching in versioned file repositories."""
18
19
from bzrlib import (
20
    debug,
21
    revision,
22
    trace,
23
    )
24
25
from bzrlib.graph import (
26
    DictParentsProvider,
27
    Graph,
28
    invert_parent_map,
29
    )
30
31
32
class AbstractSearchResult(object):
    """The result of a search, describing a set of keys.

    Search results are typically used as the 'fetch_spec' parameter when
    fetching revisions.

    :seealso: AbstractSearch
    """

    def get_recipe(self):
        """Return a recipe that can be used to replay this search.

        The recipe makes it possible to reconstruct the same result at a
        later date.

        :return: A tuple of `(search_kind_str, *details)`.  The details vary by
            kind of search result.
        """
        raise NotImplementedError(self.get_recipe)

    def get_network_struct(self):
        """Return a tuple that can be transmitted via the HPSS protocol."""
        raise NotImplementedError(self.get_network_struct)

    def get_keys(self):
        """Return the keys found in this search.

        :return: A set of keys.
        """
        raise NotImplementedError(self.get_keys)

    def is_empty(self):
        """Return false if the search lists 1 or more revisions."""
        raise NotImplementedError(self.is_empty)

    def refine(self, seen, referenced):
        """Create a new search by refining this search.

        :param seen: Revisions that have been satisfied.
        :param referenced: Revision references observed while satisfying some
            of this search.
        :return: A search result.
        """
        raise NotImplementedError(self.refine)
75
76
77
class AbstractSearch(object):
    """A search that can be executed, producing a search result.

    :seealso: AbstractSearchResult
    """

    def execute(self):
        """Construct a network-ready search result from this search description.

        Executing may take some time: repositories may need to be searched,
        etc.

        :return: A search result (an object that implements
            AbstractSearchResult's API).
        """
        raise NotImplementedError(self.execute)
92
93
94
class SearchResult(AbstractSearchResult):
    """The result of a breadth first search.

    A SearchResult provides the ability to reconstruct the search or access a
    set of the keys the search found.
    """

    def __init__(self, start_keys, exclude_keys, key_count, keys):
        """Create a SearchResult.

        :param start_keys: The keys the search started at.
        :param exclude_keys: The keys the search excludes.
        :param key_count: The total number of keys (from start to but not
            including exclude).
        :param keys: The keys the search found. Note that in future we may get
            a SearchResult from a smart server, in which case the keys list is
            not necessarily immediately available.
        """
        self._recipe = ('search', start_keys, exclude_keys, key_count)
        self._keys = frozenset(keys)

    def __repr__(self):
        kind, start_keys, exclude_keys, key_count = self._recipe
        # Only show the first five keys of each set so huge results stay
        # readable in logs and debuggers.
        if len(start_keys) > 5:
            start_keys_repr = repr(list(start_keys)[:5])[:-1] + ', ...]'
        else:
            start_keys_repr = repr(start_keys)
        if len(exclude_keys) > 5:
            exclude_keys_repr = repr(list(exclude_keys)[:5])[:-1] + ', ...]'
        else:
            exclude_keys_repr = repr(exclude_keys)
        return '<%s %s:(%s, %s, %d)>' % (self.__class__.__name__,
            kind, start_keys_repr, exclude_keys_repr, key_count)

    def get_recipe(self):
        """Return a recipe that can be used to replay this search.

        The recipe allows reconstruction of the same results at a later date
        without knowing all the found keys. The essential elements are a list
        of keys to start and to stop at. In order to give reproducible
        results when ghosts are encountered by a search they are automatically
        added to the exclude list (or else ghost filling may alter the
        results).

        :return: A tuple ('search', start_keys_set, exclude_keys_set,
            revision_count). To recreate the results of this search, create a
            breadth first searcher on the same graph starting at start_keys.
            Then call next() (or next_with_ghosts()) repeatedly, and on every
            result, call stop_searching_any on any keys from the exclude_keys
            set. The revision_count value acts as a trivial cross-check - the
            found revisions of the new search should have as many elements as
            revision_count. If it does not, then additional revisions have been
            ghosted since the search was executed the first time and the second
            time.
        """
        return self._recipe

    def get_network_struct(self):
        """Return a tuple that can be transmitted via the HPSS protocol.

        The recipe's keys must be strings for the joins below to work.
        """
        start_keys = ' '.join(self._recipe[1])
        stop_keys = ' '.join(self._recipe[2])
        count = str(self._recipe[3])
        return (self._recipe[0], '\n'.join((start_keys, stop_keys, count)))

    def get_keys(self):
        """Return the keys found in this search.

        :return: A set of keys.
        """
        return self._keys

    def is_empty(self):
        """Return false if the search lists 1 or more revisions."""
        return self._recipe[3] == 0

    def refine(self, seen, referenced):
        """Create a new search by refining this search.

        :param seen: Revisions that have been satisfied.
        :param referenced: Revision references observed while satisfying some
            of this search.
        :return: A new SearchResult; this result is left unmodified.
        """
        start = self._recipe[1]
        exclude = self._recipe[2]
        count = self._recipe[3]
        keys = self.get_keys()
        # New heads = referenced + old heads - seen things - exclude
        pending_refs = set(referenced)
        pending_refs.update(start)
        pending_refs.difference_update(seen)
        pending_refs.difference_update(exclude)
        # New exclude = old exclude + satisfied heads.  Build a fresh set
        # rather than calling exclude.update() in place: the exclude set is
        # aliased inside self._recipe, so mutating it would silently corrupt
        # this (still live) result's own recipe.
        seen_heads = start.intersection(seen)
        new_exclude = set(exclude)
        new_exclude.update(seen_heads)
        # keys gets seen removed
        keys = keys - seen
        # length is reduced by len(seen)
        count -= len(seen)
        return SearchResult(pending_refs, new_exclude, count, keys)
192
193
194
class PendingAncestryResult(AbstractSearchResult):
    """A search result that will reconstruct the ancestry for some graph heads.

    Unlike SearchResult, this doesn't hold the complete search result in
    memory, it just holds a description of how to generate it.
    """

    def __init__(self, heads, repo):
        """Constructor.

        :param heads: an iterable of graph heads.
        :param repo: a repository to use to generate the ancestry for the given
            heads.
        """
        self.heads = frozenset(heads)
        self.repo = repo

    def __repr__(self):
        # Cap the repr at five heads so huge results stay readable.
        extra = len(self.heads) - 5
        if extra > 0:
            heads_repr = repr(list(self.heads)[:5])[:-1]
            heads_repr += ', <%d more>...]' % (extra,)
        else:
            heads_repr = repr(self.heads)
        return '<%s heads:%s repo:%r>' % (
            self.__class__.__name__, heads_repr, self.repo)

    def get_recipe(self):
        """Return a recipe that can be used to replay this search.

        The recipe allows reconstruction of the same results at a later date.

        :seealso SearchResult.get_recipe:

        :return: A tuple ('proxy-search', start_keys_set, set(), -1)
            To recreate this result, create a PendingAncestryResult with the
            start_keys_set.
        """
        return ('proxy-search', self.heads, set(), -1)

    def get_network_struct(self):
        """Return a list ready for transmission via the HPSS protocol."""
        struct = ['ancestry-of']
        struct.extend(self.heads)
        return struct

    def get_keys(self):
        """See SearchResult.get_keys.

        Returns all the keys for the ancestry of the heads, excluding
        NULL_REVISION.
        """
        return self._get_keys(self.repo.get_graph())

    def _get_keys(self, graph):
        # Walk the whole ancestry; skip NULL_REVISION and ghosts (which
        # iter_ancestry reports with parents of None).
        null = revision.NULL_REVISION
        found = []
        for key, parents in graph.iter_ancestry(self.heads):
            if parents is not None and key != null:
                found.append(key)
        return found

    def is_empty(self):
        """Return false if the search lists 1 or more revisions."""
        # NULL_REVISION describes no revisions, so a heads set that is
        # exactly {NULL_REVISION} is still empty.
        if revision.NULL_REVISION in self.heads:
            return len(self.heads) == 1
        return len(self.heads) == 0

    def refine(self, seen, referenced):
        """Create a new search by refining this search.

        :param seen: Revisions that have been satisfied.
        :param referenced: Revision references observed while satisfying some
            of this search.
        """
        new_heads = self.heads.union(referenced) - seen
        return PendingAncestryResult(new_heads, self.repo)
268
269
270
class EmptySearchResult(AbstractSearchResult):
    """A search result describing no revisions at all."""

    def is_empty(self):
        # By construction this result never contains anything.
        return True
275
276
277
class EverythingResult(AbstractSearchResult):
    """A search result that simply requests everything in the repository."""

    def __init__(self, repo):
        # Repository whose whole revision set this result describes.
        self._repo = repo

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self._repo)

    def get_recipe(self):
        """No recipe exists for 'everything'; replaying is unsupported."""
        raise NotImplementedError(self.get_recipe)

    def get_network_struct(self):
        """Return the HPSS wire form of this result."""
        return ('everything',)

    def get_keys(self):
        """Return every revision id in the repository."""
        if 'evil' in debug.debug_flags:
            from bzrlib import remote
            if isinstance(self._repo, remote.RemoteRepository):
                # warn developers (not users) not to do this
                trace.mutter_callsite(
                    2, "EverythingResult(RemoteRepository).get_keys() is slow.")
        return self._repo.all_revision_ids()

    def is_empty(self):
        # It's ok for this to wrongly return False: the worst that can happen
        # is that RemoteStreamSource will initiate a get_stream on an empty
        # repository.  And almost all repositories are non-empty.
        return False

    def refine(self, seen, referenced):
        """Narrow 'everything' down to an explicit ancestry search."""
        remaining = set(self._repo.all_revision_ids())
        remaining.difference_update(seen)
        remaining.update(referenced)
        return PendingAncestryResult(remaining, self._repo)
312
313
314
class EverythingNotInOther(AbstractSearch):
    """Find all revisions that are in one repo but not the other."""

    def __init__(self, to_repo, from_repo, find_ghosts=False):
        """Constructor.

        :param to_repo: repository to search for missing revisions in
        :param from_repo: repository the revisions would come from
        :param find_ghosts: whether the search should also look for ghosts
        """
        self.to_repo = to_repo
        self.from_repo = from_repo
        self.find_ghosts = find_ghosts

    def execute(self):
        """Delegate to to_repo's search_missing_revision_ids."""
        return self.to_repo.search_missing_revision_ids(
            self.from_repo, find_ghosts=self.find_ghosts)
325
326
327
class NotInOtherForRevs(AbstractSearch):
    """Find all revisions missing in one repo for some specific heads."""

    def __init__(self, to_repo, from_repo, required_ids, if_present_ids=None,
            find_ghosts=False, limit=None):
        """Constructor.

        :param to_repo: repository to search for the missing revisions
        :param from_repo: repository the revisions would be fetched from
        :param required_ids: revision IDs of heads that must be found, or else
            the search will fail with NoSuchRevision.  All revisions in their
            ancestry not already in the other repository will be included in
            the search result.
        :param if_present_ids: revision IDs of heads that may be absent in the
            source repository.  If present, then their ancestry not already
            found in other will be included in the search result.
        :param find_ghosts: whether the search should also look for ghosts
        :param limit: maximum number of revisions to fetch
        """
        self.to_repo = to_repo
        self.from_repo = from_repo
        self.find_ghosts = find_ghosts
        self.required_ids = required_ids
        self.if_present_ids = if_present_ids
        self.limit = limit

    def __repr__(self):
        # Cap each key list at five entries so the repr stays readable.
        if len(self.required_ids) > 5:
            reqd_revs_repr = repr(list(self.required_ids)[:5])[:-1] + ', ...]'
        else:
            reqd_revs_repr = repr(self.required_ids)
        if self.if_present_ids and len(self.if_present_ids) > 5:
            ifp_revs_repr = repr(list(self.if_present_ids)[:5])[:-1] + ', ...]'
        else:
            ifp_revs_repr = repr(self.if_present_ids)

        # Note the trailing space before "limit:": without it the two
        # adjacent string literals rendered "if-present:<X>limit:<Y>".
        return ("<%s from:%r to:%r find_ghosts:%r req'd:%r if-present:%r "
                "limit:%r>") % (
                self.__class__.__name__, self.from_repo, self.to_repo,
                self.find_ghosts, reqd_revs_repr, ifp_revs_repr,
                self.limit)

    def execute(self):
        """Delegate to to_repo's search_missing_revision_ids."""
        return self.to_repo.search_missing_revision_ids(
            self.from_repo, revision_ids=self.required_ids,
            if_present_ids=self.if_present_ids, find_ghosts=self.find_ghosts,
            limit=self.limit)
371
372
373
def search_result_from_parent_map(parent_map, missing_keys):
    """Transform a parent_map into SearchResult information.

    :param parent_map: A map of {child_id: (parent_ids,)} covering the
        revisions searched so far.
    :param missing_keys: parent_ids known to be unavailable (ghosts).
    :return: A (start_keys, stop_keys, key_count) triple equivalent to a
        'search' recipe for the searched region.
    """
    if not parent_map:
        # parent_map is empty or None, simple search result
        return [], [], 0
    # start_set is all the keys in the cache
    start_set = set(parent_map)
    # result set is all the references to keys in the cache
    result_parents = set()
    # values() (not the Python-2-only itervalues()) so this also runs under
    # Python 3; iteration results are identical.
    for parents in parent_map.values():
        result_parents.update(parents)
    stop_keys = result_parents.difference(start_set)
    # We don't need to send ghosts back to the server as a position to
    # stop either.
    stop_keys.difference_update(missing_keys)
    key_count = len(parent_map)
    if (revision.NULL_REVISION in result_parents
        and revision.NULL_REVISION in missing_keys):
        # If we pruned NULL_REVISION from the stop_keys because it's also
        # in our cache of "missing" keys we need to increment our key count
        # by 1, because the reconstituted SearchResult on the server will
        # still consider NULL_REVISION to be an included key.
        key_count += 1
    included_keys = start_set.intersection(result_parents)
    start_set.difference_update(included_keys)
    return start_set, stop_keys, key_count
399
400
401
def _run_search(parent_map, heads, exclude_keys):
    """Given a parent map, run a _BreadthFirstSearcher on it.

    Start at heads, walk until you hit exclude_keys. As a further improvement,
    watch for any heads that you encounter while walking, which means they were
    not heads of the search.

    This is mostly used to generate a succinct recipe for how to walk through
    most of parent_map.

    :param parent_map: A map of {child_id: (parent_ids,)}.
    :param heads: keys to start the search from.
    :param exclude_keys: keys at which to stop searching.
    :return: (_BreadthFirstSearcher, set(heads_encountered_by_walking))
    """
    g = Graph(DictParentsProvider(parent_map))
    s = g._make_breadth_first_searcher(heads)
    found_heads = set()

    def note_found_heads():
        # Any 'head' reached as a parent of another walked node was not
        # really a head of the search.
        for parents in s._current_parents.values():
            f_heads = heads.intersection(parents)
            if f_heads:
                found_heads.update(f_heads)

    while True:
        try:
            # next(s), not the Python-2-only s.next(), so this helper also
            # runs under Python 3.
            next_revs = next(s)
        except StopIteration:
            break
        note_found_heads()
        stop_keys = exclude_keys.intersection(next_revs)
        if stop_keys:
            s.stop_searching_any(stop_keys)
    # Check the final set of parents seen by the searcher as well.
    note_found_heads()
    return s, found_heads
433
434
435
def _find_possible_heads(parent_map, tip_keys, depth):
    """Walk backwards (towards children) through the parent_map.

    This finds 'heads' that will hopefully succinctly describe our search
    graph.

    :param parent_map: A map of {child_id: (parent_ids,)}.
    :param tip_keys: keys the walk starts from.
    :param depth: maximum number of child-ward steps to take.
    :return: a set of candidate head keys.
    """
    child_map = invert_parent_map(parent_map)
    heads = set()
    frontier = tip_keys
    walked = set(frontier)
    for _ in range(depth):
        if not frontier:
            break
        next_frontier = set()
        for key in frontier:
            children = child_map.get(key)
            if children is None:
                # Nothing points at this key, so it really is a head.
                heads.add(key)
            else:
                next_frontier.update(children)
        # If we've seen a key before, we don't want to walk it again. Note
        # that 'next_frontier' stays relatively small while 'walked' grows
        # large, so use '.difference' (which only walks next_frontier)
        # rather than 'difference_update' (which walks all of 'walked').
        next_frontier = next_frontier.difference(walked)
        walked.update(next_frontier)
        frontier = next_frontier
    if frontier:
        # We walked to the end of depth, so these are the new tips.
        heads.update(frontier)
    return heads
467
468
469
def limited_search_result_from_parent_map(parent_map, missing_keys, tip_keys,
                                          depth):
    """Transform a parent_map that is searching 'tip_keys' into an
    approximate SearchResult.

    We should be able to generate a SearchResult from a given set of starting
    keys, that covers a subset of parent_map that has the last step pointing at
    tip_keys. This is to handle the case that really-long-searches shouldn't be
    started from scratch on each get_parent_map request, but we *do* want to
    filter out some of the keys that we've already seen, so we don't get
    information that we already know about on every request.

    The server will validate the search (that starting at start_keys and
    stopping at stop_keys yields the exact key_count), so we have to be careful
    to give an exact recipe.

    Basic algorithm is:
        1) Invert parent_map to get child_map (todo: have it cached and pass it
           in)
        2) Starting at tip_keys, walk towards children for 'depth' steps.
        3) At that point, we have the 'start' keys.
        4) Start walking parent_map from 'start' keys, counting how many keys
           are seen, and generating stop_keys for anything that would walk
           outside of the parent_map.

    :param parent_map: A map from {child_id: (parent_ids,)}
    :param missing_keys: parent_ids that we know are unavailable
    :param tip_keys: the revision_ids that we are searching
    :param depth: How far back to walk.
    """
    if not parent_map:
        # Nothing searched yet, so there is no recipe to send.
        return [], [], 0
    heads = _find_possible_heads(parent_map, tip_keys, depth)
    searcher, found_heads = _run_search(parent_map, heads, set(tip_keys))
    start_keys, exclude_keys, keys = searcher.get_state()
    if found_heads:
        # Anything in found_heads was reached while walking from another
        # start key, so it is a redundant start key and can be dropped.
        start_keys = set(start_keys).difference(found_heads)
    return start_keys, exclude_keys, len(keys)