        # time (self._block._content) is a little expensive.
        self._block._ensure_content(self._last_byte)

    def _check_rebuild_action(self):
        """Check to see if our block should be repacked."""
        total_bytes_used = 0
        last_byte_used = 0
        for factory in self._factories:
            total_bytes_used += factory._end - factory._start
            if last_byte_used < factory._end:
                last_byte_used = factory._end
        # If we are using more than half of the bytes from the block, we have
        # nothing else to check
        if total_bytes_used * 2 >= self._block._content_length:
            return None, last_byte_used, total_bytes_used
        # We are using less than 50% of the content. Is the content we are
        # using at the beginning of the block? If so, we can just trim the
        # tail, rather than rebuilding from scratch.
        if total_bytes_used * 2 > last_byte_used:
            return 'trim', last_byte_used, total_bytes_used

        # We are using a small amount of the data, and it isn't just packed
        # nicely at the front, so rebuild the content.
        # ...
        # expanding many deltas into fulltexts, as well.
        # If we build a cheap enough 'strip', then we could try a strip,
        # if that expands the content, we then rebuild.
        return 'rebuild', last_byte_used, total_bytes_used
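
    # A worked illustration (hypothetical numbers, not part of the original
    # source): for a block whose _content_length is 1000 bytes,
    #   - factories covering 600 bytes in total -> (None, ...): leave as-is
    #   - 300 bytes used, all within the first 500 bytes -> 'trim' to 500
    #   - 300 bytes used, but the last used byte at 900 -> 'rebuild'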

    def check_is_well_utilized(self):
        """Is the current block considered 'well utilized'?

        This heuristic asks if the current block considers itself to be a fully
        developed group, rather than just a loose collection of data.
        """
        if len(self._factories) == 1:
            # A block of length 1 could be improved by combining with other
            # groups - don't look deeper. Even larger than max size groups
            # could compress well with adjacent versions of the same thing.
            return False
        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
        block_size = self._block._content_length
        if total_bytes_used < block_size * self._max_cut_fraction:
            # This block wants to trim itself small enough that we want to
            # consider it under-utilized.
            return False
        # TODO: This code is meant to be the twin of _insert_record_stream's
        #       'start_new_block' logic. It would probably be better to factor
        #       out that logic into a shared location, so that it stays
        #       together.
        # We currently assume a block is properly utilized whenever it is >75%
        # of the size of a 'full' block. In normal operation, a block is
        # considered full when it hits 4MB of same-file content. So any block
        # >3MB is 'full enough'.
        # The only time this isn't true is when a given block has large-object
        # content. (a single file >4MB, etc.)
        # Under these circumstances, we allow a block to grow to
        # 2 x largest_content. Which means that if a given block had a large
        # object, it may actually be under-utilized. However, given that this
        # is 'pack-on-the-fly' it is probably reasonable to not repack large
        # content blobs on-the-fly. Note that because we return False for all
        # 1-item blobs, we will repack them; we may wish to reevaluate our
        # treatment of large object blobs in the future.
        if block_size >= self._full_enough_block_size:
            return True
        # If a block is <3MB, it still may be considered 'full' if it contains
        # mixed content. The current rule is 2MB of mixed content is considered
        # full. So check to see if this block contains mixed content, and
        # set the threshold appropriately.
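        # Illustration (hypothetical sizes, not part of the original source):
        # a 2.5MB block holding texts from two different files counts as full
        # under the mixed-content rule, while a 2.5MB block of a single file's
        # texts does not, since it falls short of the 3MB threshold above.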
        common_prefix = None
        for factory in self._factories:
            prefix = factory.key[:-1]
            if common_prefix is None:
                common_prefix = prefix
            elif prefix != common_prefix:
                # Mixed content, check the size appropriately
                if block_size >= self._full_enough_mixed_block_size:
                    return True
                break
        # The content failed both the mixed check and the single-content check
        # so obviously it is not fully utilized
        # TODO: there is one other constraint that isn't being checked
        #       namely, that the entries in the block are in the appropriate
        #       order. For example, you could insert the entries in exactly
        #       reverse groupcompress order, and we would think that is ok.
        #       (all the right objects are in one group, and it is fully
        #       utilized, etc.) For now, we assume that case is rare,
        #       especially since we should always fetch in 'groupcompress'
        #       order.
        return False
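
    # Summarizing the heuristic with hypothetical numbers (not part of the
    # original source): single-entry blocks are never 'well utilized'; a block
    # whose used bytes fall below _max_cut_fraction of its size is not; a
    # 3.5MB block is; and a 2.5MB block is only if it holds mixed content.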

    def _check_rebuild_block(self):
        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
        if action is None:
            return
        if action == 'trim':
            self._trim_block(last_byte_used)
        elif action == 'rebuild':
            self._rebuild_block()
        else:
            raise ValueError('unknown rebuild action: %r' % (action,))

    def _wire_bytes(self):
        """Return a byte stream suitable for transmitting over the wire."""

# ... (later in the same file, in the record-insertion path) ...

            if reuse_blocks:
                # If the reuse_blocks flag is set, check to see if we can just
                # copy a groupcompress block as-is.
                # We only check on the first record (groupcompress-block) not
                # on all of the (groupcompress-block-ref) entries.
                # The reuse_this_block flag is then kept for as long as
                # the matching (groupcompress-block-ref) records are handled.
                if record.storage_kind == 'groupcompress-block':
                    # Check to see if we really want to re-use this block
                    insert_manager = record._manager
                    reuse_this_block = insert_manager.check_is_well_utilized()
            else:
                reuse_this_block = False
            if reuse_this_block:
                # We still want to reuse this block
                if record.storage_kind == 'groupcompress-block':
                    # Insert the raw block into the target repo
                    insert_manager = record._manager
                    bytes = record._manager._block.to_bytes()
                    _, start, length = self._access.add_raw_records(
                        [(None, len(bytes))], bytes)[0]
                # ...
                if record.storage_kind in ('groupcompress-block',
                                           'groupcompress-block-ref'):
                    if insert_manager is None:
                        raise AssertionError('No insert_manager set')
                    if insert_manager is not record._manager:
                        raise AssertionError('insert_manager does not match'
                            ' the current record, we cannot be positive'
                            ' that the appropriate content was inserted.'
                            )
                    value = "%d %d %d %d" % (block_start, block_length,
                                             record._start, record._end)
                    nodes = [(record.key, value, (record.parents,))]
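                    # For illustration (hypothetical offsets, not part of the
                    # original source): a record occupying bytes 10..250 of a
                    # raw block written at offset 0 with length 4096 would get
                    # the index value "0 4096 10 250".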