    which views a particular line of development through that history.

    The Repository builds on top of some byte storage facilities (the
    revisions, signatures, inventories, texts and chk_bytes attributes) and a
    Transport, which respectively provide byte storage and a means to access
    the (possibly remote) disk.

    The byte storage facilities are addressed via tuples, which we refer to
    as 'keys' throughout the code base. Revision_keys, inventory_keys and
    signature_keys are all 1-tuples: (revision_id,). text_keys are two-tuples:
    (file_id, revision_id). chk_bytes uses CHK keys - a 1-tuple with a single
    byte string made up of a hash identifier and a hash value.
    We use this interface because it allows low friction with the underlying
    code that implements disk indices, network encoding and other parts of
    bzrlib.
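
    For example, keys have the following shapes (the identifiers shown here
    are invented purely for illustration; real ids come from the repository)::

        revision_key = ('rev-id-1',)          # revisions, signatures, inventories
        text_key = ('file-id-1', 'rev-id-1')  # texts
        chk_key = ('sha1:2aae6c35c94fcfb415dbe95f408b9ce91ee846ed',)  # chk_bytes
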
    :ivar revisions: A bzrlib.versionedfile.VersionedFiles instance containing
        the serialised revisions for the repository. This can be used to obtain

        return deltas[0][1:]

    def _get_parent_keys(self, root_key, parent_map):
        """Get the parent keys for a given root id."""
        root_id, rev_id = root_key
        # Include direct parents of the revision, but only if they used
        # the same root_id and are heads.
        parent_keys = []
        for parent_id in parent_map[rev_id]:
            if parent_id == _mod_revision.NULL_REVISION:
                continue
            if parent_id not in self._revision_id_to_root_id:
                # We probably didn't read this revision, go spend the
                # extra effort to actually check
                try:
                    tree = self.source.revision_tree(parent_id)
                except errors.NoSuchRevision:
                    # Ghost, fill out _revision_id_to_root_id in case we
                    # encounter this again.
                    # But set parent_root_id to None since we don't really know
                    parent_root_id = None
                else:
                    parent_root_id = tree.get_root_id()
                self._revision_id_to_root_id[parent_id] = None
            else:
                parent_root_id = self._revision_id_to_root_id[parent_id]
            if root_id == parent_root_id or parent_root_id is None:
                parent_keys.append((root_id, parent_id))
        return tuple(parent_keys)

    def _new_root_data_stream(self, root_keys_to_create, parent_map):
        for root_key in root_keys_to_create:
            parent_keys = self._get_parent_keys(root_key, parent_map)
            yield versionedfile.FulltextContentFactory(root_key,
                parent_keys, None, '')

    def _fetch_batch(self, revision_ids, basis_id, cache):
        """Fetch across a few revisions.

        return (not self.from_repository._format.rich_root_data and
            self.to_format.rich_root_data)

    def _get_inventory_stream(self, revision_ids):
        from_format = self.from_repository._format
        if (from_format.supports_chks and self.to_format.supports_chks
            and (from_format._serializer == self.to_format._serializer)):
            # Both sides support chks, and they use the same serializer, so it
            # is safe to transmit the chk pages and inventory pages across
            # as-is.
            return self._get_chk_inventory_stream(revision_ids)
        elif (not from_format.supports_chks):
            # Source repository doesn't support chks. So we can transmit the
            # inventories 'as-is' and either they are just accepted on the
            # target, or the Sink will properly convert it.
            return self._get_simple_inventory_stream(revision_ids)
        else:
            # XXX: Hack to make not-chk->chk fetch: copy the inventories as
            #      inventories. Note that this should probably be done somehow
            #      as part of bzrlib.repository.StreamSink. Except JAM couldn't
            #      figure out how a non-chk repository could possibly handle
            #      deserializing an inventory stream from a chk repo, as it
            #      doesn't have a way to understand individual pages.
            return self._get_convertable_inventory_stream(revision_ids)

    def _get_simple_inventory_stream(self, revision_ids):
        from_weave = self.from_repository.inventories
        yield ('inventories', from_weave.get_record_stream(
            [(rev_id,) for rev_id in revision_ids],
            self.inventory_fetch_order(),
            not self.delta_on_metadata()))

    def _get_chk_inventory_stream(self, revision_ids):
        """Fetch the inventory texts, along with the associated chk maps."""
        # We want an inventory outside of the search set, so that we can filter
        # out uninteresting chk pages. For now we use
        # _find_revision_outside_set, but if we had a Search with cut_revs, we
        # could use that instead.
        start_rev_id = self.from_repository._find_revision_outside_set(
            revision_ids)
        start_rev_key = (start_rev_id,)
        inv_keys_to_fetch = [(rev_id,) for rev_id in revision_ids]
        if start_rev_id != _mod_revision.NULL_REVISION:
            inv_keys_to_fetch.append((start_rev_id,))
        # Any repo that supports chk_bytes must also support out-of-order
        # insertion. At least, that is how we expect it to work
        # We use get_record_stream instead of iter_inventories because we want
        # to be able to insert the stream as well. We could instead fetch
        # allowing deltas, and then iter_inventories, but we don't know whether
        # source or target is more 'local' anyway.
        inv_stream = self.from_repository.inventories.get_record_stream(
            inv_keys_to_fetch, 'unordered',
            True) # We need them as full-texts so we can find their references
        uninteresting_chk_roots = set()
        interesting_chk_roots = set()
        def filter_inv_stream(inv_stream):
            for idx, record in enumerate(inv_stream):
                ### child_pb.update('fetch inv', idx, len(inv_keys_to_fetch))
                bytes = record.get_bytes_as('fulltext')
                chk_inv = inventory.CHKInventory.deserialise(
                    self.from_repository.chk_bytes, bytes, record.key)
                if record.key == start_rev_key:
                    uninteresting_chk_roots.add(chk_inv.id_to_entry.key())
                    p_id_map = chk_inv.parent_id_basename_to_file_id
                    if p_id_map is not None:
                        uninteresting_chk_roots.add(p_id_map.key())
                else:
                    yield record
                    interesting_chk_roots.add(chk_inv.id_to_entry.key())
                    p_id_map = chk_inv.parent_id_basename_to_file_id
                    if p_id_map is not None:
                        interesting_chk_roots.add(p_id_map.key())
        ### pb.update('fetch inventory', 0, 2)
        yield ('inventories', filter_inv_stream(inv_stream))
        # Now that we have worked out all of the interesting root nodes, grab
        # all of the interesting pages and insert them
        ### pb.update('fetch inventory', 1, 2)
        interesting = chk_map.iter_interesting_nodes(
            self.from_repository.chk_bytes, interesting_chk_roots,
            uninteresting_chk_roots)
        def to_stream_adapter():
            """Adapt the iter_interesting_nodes result to a single stream.

            iter_interesting_nodes returns records as it processes them, along
            with keys. However, we only want to return the records themselves.
            """
            for record, items in interesting:
                if record is not None:
                    yield record
        # XXX: We could instead call get_record_stream(records.keys())
        #      ATM, this will always insert the records as fulltexts, and
        #      requires that you can hang on to records once you have gone
        #      on to the next one. Further, it causes the target to
        #      recompress the data. Testing shows it to be faster than
        #      requesting the records again, though.
        yield ('chk_bytes', to_stream_adapter())
        ### pb.update('fetch inventory', 2, 2)

    def _get_convertable_inventory_stream(self, revision_ids):
        # XXX: One of source or target is using chks, and they don't have
        #      compatible serializations. The StreamSink code expects to be
        #      able to convert on the target, so we need to put
        #      bytes-on-the-wire that can be converted
        yield ('inventories', self._stream_invs_as_fulltexts(revision_ids))

    def _stream_invs_as_fulltexts(self, revision_ids):
        from_repo = self.from_repository
        from_serializer = from_repo._format._serializer
        revision_keys = [(rev_id,) for rev_id in revision_ids]
        parent_map = from_repo.inventories.get_parent_map(revision_keys)
        for inv in self.from_repository.iter_inventories(revision_ids):
            # XXX: This is a bit hackish, but it works. Basically,
            #      CHKSerializer 'accidentally' supports
            #      read/write_inventory_to_string, even though that is never
            #      the format that is stored on disk. It *does* give us a
            #      single string representation for an inventory, so live with
            #      it for now.
            #      This would be far better if we had a 'serialized inventory
            #      delta' form. Then we could use 'inventory._make_delta', and
            #      transmit that. This would both be faster to generate, and
            #      result in fewer bytes-on-the-wire.
            as_bytes = from_serializer.write_inventory_to_string(inv)
            key = (inv.revision_id,)
            parent_keys = parent_map.get(key, ())
            yield versionedfile.FulltextContentFactory(
                key, parent_keys, None, as_bytes)