~bzr-pqm/bzr/bzr.dev

Viewing changes to groupcompress.py

new encoder, allows non-monotonically-increasing sequence matches for more compression.

@@ -39,9 +39,14 @@
     next = lines.next
     print next(), next()
     for header in lines:
-        start, end, count = [int(n) for n in header.split(',')]
-        contents = [next() for i in xrange(count)]
-        result.append((start, end, count, contents))
+        op = header[0]
+        numbers = header[2:]
+        numbers = [int(n) for n in header[2:].split(',')]
+        if op == 'c':
+            result.append((op, numbers[0], numbers[1], None))
+        else:
+            contents = [next() for i in xrange(numbers[0])]
+            result.append((op, None, numbers[0], contents))
     return result
 
 def apply_delta(basis, delta):
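
For reference, the two header forms the new parse loop accepts, and the tuples they become (example values invented; the apply_delta hunk below consumes these tuples in order):

# 'c,<start>,<count>' copies <count> lines from the basis, starting at <start>:
#     'c,0,3'  ->  ('c', 0, 3, None)
# 'i,<count>' inserts the next <count> literal lines carried in the delta itself:
#     'i,2'    ->  ('i', None, 2, ['first inserted line\n', 'second inserted line\n'])
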
@@ -50,13 +55,11 @@
     last_offset = 0
     # eq ranges occur where gaps occur
     # start, end refer to offsets in basis
-    for start, end, count, delta_lines in delta:
-        if last_offset != start: # copy an eq range
-            lines.extend(basis[last_offset:start])
-        lines[start:end] = delta_lines
-        last_offset = end
-    if last_offset != len(basis):
-        lines.extend(basis[last_offset:])
+    for op, start, count, delta_lines in delta:
+        if op == 'c':
+            lines.extend(basis[start:start+count])
+        else:
+            lines.extend(delta_lines)
     trim_encoding_newline(lines)
     return lines
 
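
A small hand-worked sketch of the new apply_delta (the basis and delta are invented, and trim_encoding_newline is ignored). Because each copy op names its own basis offset, a later copy may jump backwards in the basis, which is the non-monotonically-increasing matching the commit message mentions:

basis = ['a\n', 'b\n', 'c\n', 'd\n']
delta = [
    ('c', 2, 2, None),        # copy basis[2:4] -> 'c\n', 'd\n'
    ('i', None, 1, ['x\n']),  # insert one literal line
    ('c', 0, 1, None),        # copy basis[0:1] -> 'a\n' (jumps backwards)
]
# Walking the ops in order, as the loop above does, builds:
# ['c\n', 'd\n', 'x\n', 'a\n']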
 
@@ -69,7 +72,20 @@
 
 
 class GroupCompressor(object):
-    """Produce a serialised group of compressed texts."""
+    """Produce a serialised group of compressed texts.
+
+    It contains code very similar to SequenceMatcher because it performs a
+    similar task. However some key differences apply:
+     - there is no junk, we want a minimal edit not a human readable diff.
+     - we don't filter very common lines (because we don't know where a good
+       range will start, and after the first text we want to be emitting
+       minimal edits only).
+     - we chain the left side, not the right side.
+     - we incrementally update the adjacency matrix as new lines are provided.
+     - we look for matches in all of the left side, so the routine which does
+       the analogous task of find_longest_match does not need to filter on the
+       left side.
+    """
 
     def __init__(self, delta=True):
         """Create a GroupCompressor.
@@ -80,6 +96,8 @@
         self.lines = []
         self.endpoint = 0
         self.input_bytes = 0
+        # line: set(locations it appears at), set(N+1 for N in locations)
+        self.line_locations = {}
 
     def compress(self, key, lines, expected_sha):
         """Compress lines with label key.
@@ -104,26 +122,72 @@
         new_lines = []
         new_lines.append('label: %s\n' % label)
         new_lines.append('sha1: %s\n' % sha1)
-        if 0:
-            delta_seq = diff.difflib.SequenceMatcher(
-                None, self.lines, lines)
-        else:
-            delta_seq = patiencediff.PatienceSequenceMatcher(
-                None, self.lines, lines)
-        diff_hunks = []
-        for op in delta_seq.get_opcodes():
-            if op[0] == 'equal':
-                continue
-            diff_hunks.append((op[1], op[2], op[4]-op[3], lines[op[3]:op[4]]))
-        for start, end, count, new in diff_hunks:
-            new_lines.append('%d,%d,%d\n' % (start, end, count))
-            new_lines.extend(new)
-        self.endpoint += sum(map(len, new_lines))
-        self.lines.extend(new_lines)
+        pos = 0
+        line_locations = self.line_locations
+        accumulator = []
+        copying = False
+        new_len = 0
+        new_start = 0
+        # We either copy a range (while there are reusable lines) or we
+        # insert new lines. To find reusable lines we traverse the new text,
+        # looking each line up in self.line_locations.
+        while pos < len(lines):
+            line = lines[pos]
+            if line not in line_locations:
+                if copying:
+                    # flush the copy
+                    copy_start = min(copy_ends) - copy_len
+                    new_lines.append("c,%d,%d\n" % (copy_start, copy_len))
+                    copying = False
+                    new_start = pos
+                    new_len = 1
+                else:
+                    new_len += 1
+            else:
+                if copying:
+                    locations, next = line_locations[line]
+                    next_locations = locations.intersection(copy_ends)
+                    if len(next_locations):
+                        # range continues
+                        copy_len += 1
+                        copy_ends = set(loc + 1 for loc in next_locations)
+                    else:
+                        # range stops, flush and start a new copy range
+                        copy_start = min(copy_ends) - copy_len
+                        new_lines.append("c,%d,%d\n" % (copy_start, copy_len))
+                        copy_len = 1
+                        copy_ends = next
+                else:
+                    # Flush
+                    if new_len:
+                        new_lines.append("i,%d\n" % new_len)
+                        new_lines.extend(lines[new_start:new_start+new_len])
+                    # setup a copy
+                    copy_len = 1
+                    copy_ends = line_locations[line][1]
+                    copying = True
+            pos += 1
+        if copying:
+            copy_start = min(copy_ends) - copy_len
+            new_lines.append("c,%d,%d\n" % (copy_start, copy_len))
+        elif new_len:
+            new_lines.append("i,%d\n" % new_len)
+            new_lines.extend(lines[new_start:new_start+new_len])
+
+        self.output_lines(new_lines)
         trim_encoding_newline(lines)
         self.input_bytes += sum(map(len, lines))
         return sha1, self.endpoint
 
+    def output_lines(self, new_lines):
+        self.endpoint += sum(map(len, new_lines))
+        offset = len(self.lines)
+        self.lines.extend(new_lines)
+        for pos, line in enumerate(new_lines):
+            indices, next_lines = self.line_locations.setdefault(line,
+                (set(), set()))
+            indices.add(pos + offset)
+            next_lines.add(pos + offset + 1)
+
     def ratio(self):
         """Return the overall compression ratio."""
         return float(self.input_bytes) / float(self.endpoint)
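
A hedged walk-through of the greedy loop above (the group contents are invented and the sha1 header values elided): with the group already holding one text, compressing a second text that shares its first two lines produces one copy op followed by one insert op:

# Suppose self.lines currently holds (offsets 0-3):
#   ['label: a\n', 'sha1: ...\n', 'one\n', 'two\n']
# Compressing ['one\n', 'two\n', 'three\n'] appends roughly:
#   'label: b\n'
#   'sha1: ...\n'
#   'c,2,2\n'    # copy 2 lines starting at group offset 2 ('one\n', 'two\n')
#   'i,1\n'      # insert 1 literal line
#   'three\n'

Unlike the removed SequenceMatcher path, the copy offsets refer to the whole group built so far, and successive copies are free to move backwards through it.
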
@@ -224,7 +288,7 @@
         # double handling for now. Make it work until then.
         bytes = ''.join(lines)
         record = FulltextContentFactory(key, parents, None, bytes)
-        sha1 = self._insert_record_stream([record])
+        sha1 = self._insert_record_stream([record]).next()
         return sha1, len(bytes), None
 
     def _check_add(self, key, lines, random_id, check_content):

@@ -267,6 +331,7 @@
         for record in stream:
            found_sha1, end_point = compressor.compress(record.key,
                 split_lines(record.get_bytes_as('fulltext')), record.sha1)
+            yield found_sha1
 
 class _GCGraphIndex(object):
     """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""