~bzr-pqm/bzr/bzr.dev

1570.1.2 by Robert Collins
Import bzrtools' 'fix' command as 'bzr reconcile.'
1
# (C) 2005, 2006 Canonical Limited.
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
1570.1.7 by Robert Collins
Replace the slow topo_sort routine with a much faster one for non trivial datasets.
17
"""Reconcilers are able to fix some potential data errors in a branch."""
1570.1.2 by Robert Collins
Import bzrtools' 'fix' command as 'bzr reconcile.'
18
19
1692.1.1 by Robert Collins
* Repository.reconcile now takes a thorough keyword parameter to allow
20
__all__ = ['reconcile', 'Reconciler', 'RepoReconciler', 'KnitReconciler']
1570.1.14 by Robert Collins
Enforce repository consistency during 'fetch' operations.
21
22
1773.4.1 by Martin Pool
Add pyflakes makefile target; fix many warnings
23
from bzrlib import ui
1570.1.2 by Robert Collins
Import bzrtools' 'fix' command as 'bzr reconcile.'
24
from bzrlib.trace import mutter
1570.1.7 by Robert Collins
Replace the slow topo_sort routine with a much faster one for non trivial datasets.
25
from bzrlib.tsort import TopoSorter
1570.1.2 by Robert Collins
Import bzrtools' 'fix' command as 'bzr reconcile.'
26
27
1692.1.1 by Robert Collins
* Repository.reconcile now takes a thorough keyword parameter to allow
28
def reconcile(dir, other=None):
1570.1.2 by Robert Collins
Import bzrtools' 'fix' command as 'bzr reconcile.'
29
    """Reconcile the data in dir.
30
31
    Currently this is limited to a inventory 'reweave'.
32
1570.1.8 by Robert Collins
Only reconcile if doing so will perform gc or correct ancestry.
33
    This is a convenience method, for using a Reconciler object.
34
35
    Directly using Reconciler is recommended for library users that
36
    desire fine grained control or analysis of the found issues.
1692.1.1 by Robert Collins
* Repository.reconcile now takes a thorough keyword parameter to allow
37
38
    :param other: another bzrdir to reconcile against.
1570.1.2 by Robert Collins
Import bzrtools' 'fix' command as 'bzr reconcile.'
39
    """
1692.1.1 by Robert Collins
* Repository.reconcile now takes a thorough keyword parameter to allow
40
    reconciler = Reconciler(dir, other=other)
1570.1.2 by Robert Collins
Import bzrtools' 'fix' command as 'bzr reconcile.'
41
    reconciler.reconcile()
42
43
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
44
class Reconciler(object):
1570.1.14 by Robert Collins
Enforce repository consistency during 'fetch' operations.
45
    """Reconcilers are used to reconcile existing data."""
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
46
1692.1.1 by Robert Collins
* Repository.reconcile now takes a thorough keyword parameter to allow
47
    def __init__(self, dir, other=None):
48
        """Create a Reconciler."""
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
49
        self.bzrdir = dir
50
51
    def reconcile(self):
1570.1.8 by Robert Collins
Only reconcile if doing so will perform gc or correct ancestry.
52
        """Perform reconciliation.
53
        
54
        After reconciliation the following attributes document found issues:
55
        inconsistent_parents: The number of revisions in the repository whose
56
                              ancestry was being reported incorrectly.
57
        garbage_inventories: The number of inventory objects without revisions
58
                             that were garbage collected.
59
        """
1594.1.3 by Robert Collins
Fixup pb usage to use nested_progress_bar.
60
        self.pb = ui.ui_factory.nested_progress_bar()
61
        try:
62
            self._reconcile()
63
        finally:
64
            self.pb.finished()
65
66
    def _reconcile(self):
67
        """Helper function for performing reconciliation."""
1570.1.11 by Robert Collins
Make reconcile work with shared repositories.
68
        self.repo = self.bzrdir.find_repository()
1570.1.14 by Robert Collins
Enforce repository consistency during 'fetch' operations.
69
        self.pb.note('Reconciling repository %s',
70
                     self.repo.bzrdir.root_transport.base)
1692.1.1 by Robert Collins
* Repository.reconcile now takes a thorough keyword parameter to allow
71
        repo_reconciler = self.repo.reconcile(thorough=True)
1570.1.14 by Robert Collins
Enforce repository consistency during 'fetch' operations.
72
        self.inconsistent_parents = repo_reconciler.inconsistent_parents
73
        self.garbage_inventories = repo_reconciler.garbage_inventories
74
        self.pb.note('Reconciliation complete.')
75
76
77
class RepoReconciler(object):
78
    """Reconciler that reconciles a repository.
79
80
    Currently this consists of an inventory reweave with revision cross-checks.
81
    """
82
1692.1.1 by Robert Collins
* Repository.reconcile now takes a thorough keyword parameter to allow
83
    def __init__(self, repo, other=None, thorough=False):
84
        """Construct a RepoReconciler.
85
86
        :param thorough: perform a thorough check which may take longer but
87
                         will correct non-data loss issues such as incorrect
88
                         cached data.
89
        """
90
        self.garbage_inventories = 0
91
        self.inconsistent_parents = 0
1570.1.14 by Robert Collins
Enforce repository consistency during 'fetch' operations.
92
        self.repo = repo
1692.1.1 by Robert Collins
* Repository.reconcile now takes a thorough keyword parameter to allow
93
        self.thorough = thorough
1570.1.14 by Robert Collins
Enforce repository consistency during 'fetch' operations.
94
95
    def reconcile(self):
96
        """Perform reconciliation.
97
        
98
        After reconciliation the following attributes document found issues:
99
        inconsistent_parents: The number of revisions in the repository whose
100
                              ancestry was being reported incorrectly.
101
        garbage_inventories: The number of inventory objects without revisions
102
                             that were garbage collected.
103
        """
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
104
        self.repo.lock_write()
105
        try:
1594.1.3 by Robert Collins
Fixup pb usage to use nested_progress_bar.
106
            self.pb = ui.ui_factory.nested_progress_bar()
107
            try:
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
108
                self._reconcile_steps()
1594.1.3 by Robert Collins
Fixup pb usage to use nested_progress_bar.
109
            finally:
110
                self.pb.finished()
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
111
        finally:
112
            self.repo.unlock()
113
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
114
    def _reconcile_steps(self):
115
        """Perform the steps to reconcile this repository."""
1692.1.3 by Robert Collins
Finish the reconcile tweak: filled in ghosts are a data loss issue and need to be checked during fast reconciles.
116
        self._reweave_inventory()
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
117
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
118
    def _reweave_inventory(self):
1692.1.3 by Robert Collins
Finish the reconcile tweak: filled in ghosts are a data loss issue and need to be checked during fast reconciles.
119
        """Regenerate the inventory weave for the repository from scratch.
120
        
121
        This is a smart function: it will only do the reweave if doing it 
122
        will correct data issues. The self.thorough flag controls whether
123
        only data-loss causing issues (!self.thorough) or all issues
124
        (self.thorough) are treated as requiring the reweave.
125
        """
126
        # local because needing to know about WeaveFile is a wart we want to hide
1563.2.42 by Robert Collins
Stop reconcile on weaves being quadratic.
127
        from bzrlib.weave import WeaveFile, Weave
1563.2.29 by Robert Collins
Remove all but fetch references to repository.revision_store.
128
        transaction = self.repo.get_transaction()
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
129
        self.pb.update('Reading inventory data.')
130
        self.inventory = self.repo.get_inventory_weave()
131
        # the total set of revisions to process
1563.2.29 by Robert Collins
Remove all but fetch references to repository.revision_store.
132
        self.pending = set([rev_id for rev_id in self.repo._revision_store.all_revision_ids(transaction)])
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
133
134
        # mapping from revision_id to parents
135
        self._rev_graph = {}
1570.1.8 by Robert Collins
Only reconcile if doing so will perform gc or correct ancestry.
136
        # errors that we detect
137
        self.inconsistent_parents = 0
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
138
        # we need the revision id of each revision and its available parents list
1570.1.10 by Robert Collins
UI tweaks to reconcile - show progress for inventory backup.
139
        self._setup_steps(len(self.pending))
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
140
        for rev_id in self.pending:
141
            # put a revision into the graph.
142
            self._graph_revision(rev_id)
1594.2.2 by Robert Collins
Trivial change to reconcile to mutter the cause of reconciliation to bzr.log
143
        self._check_garbage_inventories()
1692.1.3 by Robert Collins
Finish the reconcile tweak: filled in ghosts are a data loss issue and need to be checked during fast reconciles.
144
        # if there are no inconsistent_parents and 
145
        # (no garbage inventories or we are not doing a thorough check)
146
        if (not self.inconsistent_parents and 
147
            (not self.garbage_inventories or not self.thorough)):
1570.1.8 by Robert Collins
Only reconcile if doing so will perform gc or correct ancestry.
148
            self.pb.note('Inventory ok.')
149
            return
1570.1.10 by Robert Collins
UI tweaks to reconcile - show progress for inventory backup.
150
        self.pb.update('Backing up inventory...', 0, 0)
1563.2.25 by Robert Collins
Merge in upstream.
151
        self.repo.control_weaves.copy(self.inventory, 'inventory.backup', self.repo.get_transaction())
1570.1.8 by Robert Collins
Only reconcile if doing so will perform gc or correct ancestry.
152
        self.pb.note('Backup Inventory created.')
153
        # asking for '' should never return a non-empty weave
1616.1.1 by Martin Pool
[merge] robertc
154
        new_inventory_vf = self.repo.control_weaves.get_empty('inventory.new',
1570.1.8 by Robert Collins
Only reconcile if doing so will perform gc or correct ancestry.
155
            self.repo.get_transaction())
1570.1.6 by Robert Collins
Update fast topological_sort to be a function and to have the topo_sort tests run against it.
156
1570.1.4 by Robert Collins
Somewhat optimised version of reconciler.
157
        # we have topological order of revisions and non ghost parents ready.
1570.1.10 by Robert Collins
UI tweaks to reconcile - show progress for inventory backup.
158
        self._setup_steps(len(self._rev_graph))
1570.1.7 by Robert Collins
Replace the slow topo_sort routine with a much faster one for non trivial datasets.
159
        for rev_id in TopoSorter(self._rev_graph.items()).iter_topo_order():
160
            parents = self._rev_graph[rev_id]
1570.1.4 by Robert Collins
Somewhat optimised version of reconciler.
161
            # double check this really is in topological order.
1616.1.1 by Martin Pool
[merge] robertc
162
            unavailable = [p for p in parents if p not in new_inventory_vf]
1570.1.4 by Robert Collins
Somewhat optimised version of reconciler.
163
            assert len(unavailable) == 0
164
            # this entry has all the non ghost parents in the inventory
165
            # file already.
166
            self._reweave_step('adding inventories')
1616.1.1 by Martin Pool
[merge] robertc
167
            if isinstance(new_inventory_vf, WeaveFile):
168
                # It's really a WeaveFile, but we call straight into the
169
                # Weave's add method to disable the auto-write-out behaviour.
1607.1.11 by Robert Collins
Merge from bzr.dev
170
                # This is done to avoid a revision_count * time-to-write additional overhead on 
171
                # reconcile.
1616.1.1 by Martin Pool
[merge] robertc
172
                new_inventory_vf._check_write_ok()
173
                Weave._add_lines(new_inventory_vf, rev_id, parents, self.inventory.get_lines(rev_id),
174
                                 None)
1563.2.42 by Robert Collins
Stop reconcile on weaves being quadratic.
175
            else:
1616.1.1 by Martin Pool
[merge] robertc
176
                new_inventory_vf.add_lines(rev_id, parents, self.inventory.get_lines(rev_id))
1570.1.4 by Robert Collins
Somewhat optimised version of reconciler.
177
1616.1.1 by Martin Pool
[merge] robertc
178
        if isinstance(new_inventory_vf, WeaveFile):
179
            new_inventory_vf._save()
180
        # if this worked, the set of new_inventory_vf.names should equal
1570.1.4 by Robert Collins
Somewhat optimised version of reconciler.
181
        # self.pending
1616.1.1 by Martin Pool
[merge] robertc
182
        assert set(new_inventory_vf.versions()) == self.pending
1570.1.2 by Robert Collins
Import bzrtools' 'fix' command as 'bzr reconcile.'
183
        self.pb.update('Writing weave')
1616.1.1 by Martin Pool
[merge] robertc
184
        self.repo.control_weaves.copy(new_inventory_vf, 'inventory', self.repo.get_transaction())
1563.2.25 by Robert Collins
Merge in upstream.
185
        self.repo.control_weaves.delete('inventory.new', self.repo.get_transaction())
1570.1.3 by Robert Collins
Optimise reconcilation to only hit each revision once.
186
        self.inventory = None
1570.1.2 by Robert Collins
Import bzrtools' 'fix' command as 'bzr reconcile.'
187
        self.pb.note('Inventory regenerated.')
1570.1.3 by Robert Collins
Optimise reconcilation to only hit each revision once.
188
1570.1.10 by Robert Collins
UI tweaks to reconcile - show progress for inventory backup.
189
    def _setup_steps(self, new_total):
190
        """Setup the markers we need to control the progress bar."""
191
        self.total = new_total
192
        self.count = 0
193
1570.1.4 by Robert Collins
Somewhat optimised version of reconciler.
194
    def _graph_revision(self, rev_id):
195
        """Load a revision into the revision graph."""
196
        # pick a random revision
197
        # analyse revision id rev_id and put it in the stack.
198
        self._reweave_step('loading revisions')
1570.1.13 by Robert Collins
Check for incorrect revision parentage in the weave during revision access.
199
        rev = self.repo.get_revision_reconcile(rev_id)
1570.1.3 by Robert Collins
Optimise reconcilation to only hit each revision once.
200
        assert rev.revision_id == rev_id
201
        parents = []
202
        for parent in rev.parent_ids:
1570.1.14 by Robert Collins
Enforce repository consistency during 'fetch' operations.
203
            if self._parent_is_available(parent):
1570.1.3 by Robert Collins
Optimise reconcilation to only hit each revision once.
204
                parents.append(parent)
205
            else:
206
                mutter('found ghost %s', parent)
1570.1.4 by Robert Collins
Somewhat optimised version of reconciler.
207
        self._rev_graph[rev_id] = parents   
1692.1.3 by Robert Collins
Finish the reconcile tweak: filled in ghosts are a data loss issue and need to be checked during fast reconciles.
208
        if self._parents_are_inconsistent(rev_id, parents):
1570.1.8 by Robert Collins
Only reconcile if doing so will perform gc or correct ancestry.
209
            self.inconsistent_parents += 1
1594.2.2 by Robert Collins
Trivial change to reconcile to mutter the cause of reconciliation to bzr.log
210
            mutter('Inconsistent inventory parents: id {%s} '
211
                   'inventory claims %r, '
212
                   'available parents are %r, '
213
                   'unavailable parents are %r',
214
                   rev_id, 
1563.2.39 by Robert Collins
Merge from integration.
215
                   set(self.inventory.get_parents(rev_id)),
1594.2.2 by Robert Collins
Trivial change to reconcile to mutter the cause of reconciliation to bzr.log
216
                   set(parents),
217
                   set(rev.parent_ids).difference(set(parents)))
218
1692.1.3 by Robert Collins
Finish the reconcile tweak: filled in ghosts are a data loss issue and need to be checked during fast reconciles.
219
    def _parents_are_inconsistent(self, rev_id, parents):
220
        """Return True if the parents list of rev_id does not match the weave.
221
1759.2.2 by Jelmer Vernooij
Revert some of my spelling fixes and fix some typos after review by Aaron.
222
        This detects inconsistencies based on the self.thorough value:
1692.1.3 by Robert Collins
Finish the reconcile tweak: filled in ghosts are a data loss issue and need to be checked during fast reconciles.
223
        if thorough is on, the first parent value is checked as well as ghost
224
        differences.
225
        Otherwise only the ghost differences are evaluated.
226
        """
227
        weave_parents = self.inventory.get_parents(rev_id)
228
        weave_missing_old_ghosts = set(weave_parents) != set(parents)
229
        first_parent_is_wrong = (
230
            len(weave_parents) and len(parents) and
231
            parents[0] != weave_parents[0])
232
        if self.thorough:
233
            return weave_missing_old_ghosts or first_parent_is_wrong
234
        else:
235
            return weave_missing_old_ghosts
236
1594.2.2 by Robert Collins
Trivial change to reconcile to mutter the cause of reconciliation to bzr.log
237
    def _check_garbage_inventories(self):
238
        """Check for garbage inventories which we cannot trust
239
240
        We cant trust them because their pre-requisite file data may not
241
        be present - all we know is that their revision was not installed.
242
        """
1692.1.3 by Robert Collins
Finish the reconcile tweak: filled in ghosts are a data loss issue and need to be checked during fast reconciles.
243
        if not self.thorough:
244
            return
1563.2.39 by Robert Collins
Merge from integration.
245
        inventories = set(self.inventory.versions())
1594.2.2 by Robert Collins
Trivial change to reconcile to mutter the cause of reconciliation to bzr.log
246
        revisions = set(self._rev_graph.keys())
247
        garbage = inventories.difference(revisions)
248
        self.garbage_inventories = len(garbage)
249
        for revision_id in garbage:
250
            mutter('Garbage inventory {%s} found.', revision_id)
1570.1.4 by Robert Collins
Somewhat optimised version of reconciler.
251
1570.1.14 by Robert Collins
Enforce repository consistency during 'fetch' operations.
252
    def _parent_is_available(self, parent):
253
        """True if parent is a fully available revision
254
255
        A fully available revision has a inventory and a revision object in the
256
        repository.
257
        """
258
        return (parent in self._rev_graph or 
259
                (parent in self.inventory and self.repo.has_revision(parent)))
260
1570.1.4 by Robert Collins
Somewhat optimised version of reconciler.
261
    def _reweave_step(self, message):
262
        """Mark a single step of regeneration complete."""
263
        self.pb.update(message, self.count, self.total)
264
        self.count += 1
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
265
266
267
class KnitReconciler(RepoReconciler):
268
    """Reconciler that reconciles a knit format repository.
269
270
    This will detect garbage inventories and remove them.
271
272
    Inconsistent parentage is checked for in the revision weave.
273
    """
274
275
    def _reconcile_steps(self):
276
        """Perform the steps to reconcile this repository."""
1692.1.1 by Robert Collins
* Repository.reconcile now takes a thorough keyword parameter to allow
277
        if self.thorough:
278
            self._load_indexes()
279
            # knits never suffer this
280
            self._gc_inventory()
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
281
282
    def _load_indexes(self):
283
        """Load indexes for the reconciliation."""
284
        self.transaction = self.repo.get_transaction()
285
        self.pb.update('Reading indexes.', 0, 2)
286
        self.inventory = self.repo.get_inventory_weave()
287
        self.pb.update('Reading indexes.', 1, 2)
288
        self.revisions = self.repo._revision_store.get_revision_file(self.transaction)
289
        self.pb.update('Reading indexes.', 2, 2)
290
291
    def _gc_inventory(self):
292
        """Remove inventories that are not referenced from the revision store."""
293
        self.pb.update('Checking unused inventories.', 0, 1)
294
        self._check_garbage_inventories()
295
        self.pb.update('Checking unused inventories.', 1, 3)
296
        if not self.garbage_inventories:
297
            self.pb.note('Inventory ok.')
298
            return
299
        self.pb.update('Backing up inventory...', 0, 0)
300
        self.repo.control_weaves.copy(self.inventory, 'inventory.backup', self.transaction)
301
        self.pb.note('Backup Inventory created.')
302
        # asking for '' should never return a non-empty weave
1616.1.1 by Martin Pool
[merge] robertc
303
        new_inventory_vf = self.repo.control_weaves.get_empty('inventory.new',
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
304
            self.transaction)
305
306
        # we have topological order of revisions and non ghost parents ready.
1594.2.9 by Robert Collins
Teach Knit repositories how to handle ghosts without corrupting at all.
307
        self._setup_steps(len(self.revisions))
308
        for rev_id in TopoSorter(self.revisions.get_graph().items()).iter_topo_order():
309
            parents = self.revisions.get_parents(rev_id)
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
310
            # double check this really is in topological order.
1616.1.1 by Martin Pool
[merge] robertc
311
            unavailable = [p for p in parents if p not in new_inventory_vf]
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
312
            assert len(unavailable) == 0
313
            # this entry has all the non ghost parents in the inventory
314
            # file already.
315
            self._reweave_step('adding inventories')
316
            # ugly but needed, weaves are just way tooooo slow else.
1616.1.1 by Martin Pool
[merge] robertc
317
            new_inventory_vf.add_lines(rev_id, parents, self.inventory.get_lines(rev_id))
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
318
1616.1.1 by Martin Pool
[merge] robertc
319
        # if this worked, the set of new_inventory_vf.names should equal
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
320
        # self.pending
1616.1.1 by Martin Pool
[merge] robertc
321
        assert set(new_inventory_vf.versions()) == set(self.revisions.versions())
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
322
        self.pb.update('Writing weave')
1616.1.1 by Martin Pool
[merge] robertc
323
        self.repo.control_weaves.copy(new_inventory_vf, 'inventory', self.transaction)
1594.2.7 by Robert Collins
Add versionedfile.fix_parents api for correcting data post hoc.
324
        self.repo.control_weaves.delete('inventory.new', self.transaction)
325
        self.inventory = None
326
        self.pb.note('Inventory regenerated.')
327
328
    def _check_garbage_inventories(self):
329
        """Check for garbage inventories which we cannot trust
330
331
        We cant trust them because their pre-requisite file data may not
332
        be present - all we know is that their revision was not installed.
333
        """
334
        inventories = set(self.inventory.versions())
335
        revisions = set(self.revisions.versions())
336
        garbage = inventories.difference(revisions)
337
        self.garbage_inventories = len(garbage)
338
        for revision_id in garbage:
339
            mutter('Garbage inventory {%s} found.', revision_id)