~bzr-pqm/bzr/bzr.dev : revision 4261

1

2

#

3

# This program is free software; you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License as published by

5

# the Free Software Foundation; either version 2 of the License, or

6

# (at your option) any later version.

7

#

8

# This program is distributed in the hope that it will be useful,

9

# but WITHOUT ANY WARRANTY; without even the implied warranty of

10

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

# GNU General Public License for more details.

12

#

13

# You should have received a copy of the GNU General Public License

14

# along with this program; if not, write to the Free Software

15

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

16

17

"""Python version of compiled extensions for doing compression.

18

19

We separate the implementation from the groupcompress.py to avoid importing

20

useless stuff.

21

"""

22

23

from bzrlib import osutils

24

25

26

class LinesDeltaIndex(object):

27

"""This class indexes matches between strings.

28

29

:ivar lines: The 'static' lines that will be preserved between runs.

30

:ivar _matching_lines: A dict of {line:[matching offsets]}

31

:ivar line_offsets: The byte offset for the end of each line, used to

32

quickly map between a matching line number and the byte location

33

:ivar endpoint: The total number of bytes in self.line_offsets

34

"""

35

36

def __init__(self, lines):

37

self.lines = []

38

self.line_offsets = []

39

self.endpoint = 0

40

self._matching_lines = {}

41

self.extend_lines(lines, [True]*len(lines))

42

43

def _update_matching_lines(self, new_lines, index):

44

matches = self._matching_lines

45

start_idx = len(self.lines)

46

if len(new_lines) != len(index):

47

raise AssertionError('The number of lines to be indexed does'

48

' not match the index/don\'t index flags: %d != %d'

49

% (len(new_lines), len(index)))

50

for idx, do_index in enumerate(index):

51

if not do_index:

52

continue

53

matches.setdefault(new_lines[idx], []).append(start_idx + idx)

54

55

def get_matches(self, line):

56

"""Return the lines which match the line in right."""

57

try:

58

return self._matching_lines[line]

59

except KeyError:

60

return None

61

62

def _get_longest_match(self, lines, pos, locations):

63

"""Look at all matches for the current line, return the longest.

64

65

:param lines: The lines we are matching against

66

:param pos: The current location we care about

67

:param locations: A list of lines that matched the current location.

68

This may be None, but often we'll have already found matches for

69

this line.

70

:return: (start_in_self, start_in_lines, num_lines)

71

All values are the offset in the list (aka the line number)

72

If start_in_self is None, then we have no matches, and this line

73

should be inserted in the target.

74

"""

75

range_start = pos

76

range_len = 0

77

copy_ends = None

78

max_pos = len(lines)

79

while pos < max_pos:

80

if locations is None:

81

# TODO: is try/except better than get(..., None)?

82

try:

83

locations = self._matching_lines[lines[pos]]

84

except KeyError:

85

locations = None

86

if locations is None:

87

# No more matches, just return whatever we have, but we know

88

# that this last position is not going to match anything

89

pos += 1

90

break

91

else:

92

# We have a match

93

if copy_ends is None:

94

# This is the first match in a range

95

copy_ends = [loc + 1 for loc in locations]

96

range_len = 1

97

locations = None # Consumed

98

else:

99

# We have a match started, compare to see if any of the

100

# current matches can be continued

101

next_locations = set(copy_ends).intersection(locations)

102

if next_locations:

103

# At least one of the regions continues to match

104

copy_ends = [loc + 1 for loc in next_locations]

105

range_len += 1

106

locations = None # Consumed

107

else:

108

# All current regions no longer match.

109

# This line does still match something, just not at the

110

# end of the previous matches. We will return locations

111

# so that we can avoid another _matching_lines lookup.

112

break

113

pos += 1

114

if copy_ends is None:

115

# We have no matches, this is a pure insert

116

return None, pos, locations

117

return (((min(copy_ends) - range_len, range_start, range_len)),

118

pos, locations)

119

120

def get_matching_blocks(self, lines, soft=False):

121

"""Return the ranges in lines which match self.lines.

122

123

:param lines: lines to compress

124

:return: A list of (old_start, new_start, length) tuples which reflect

125

a region in self.lines that is present in lines. The last element

126

of the list is always (old_len, new_len, 0) to provide a end point

127

for generating instructions from the matching blocks list.

128

"""

129

# In this code, we iterate over multiple _get_longest_match calls, to

130

# find the next longest copy, and possible insert regions. We then

131

# convert that to the simple matching_blocks representation, since

132

# otherwise inserting 10 lines in a row would show up as 10

133

# instructions.

134

result = []

135

pos = 0

136

locations = None

137

max_pos = len(lines)

138

result_append = result.append

139

min_match_bytes = 10

140

if soft:

141

min_match_bytes = 200

142

while pos < max_pos:

143

block, pos, locations = self._get_longest_match(lines, pos,

144

locations)

145

if block is not None:

146

# Check to see if we match fewer than min_match_bytes. As we

147

# will turn this into a pure 'insert', rather than a copy.

148

# block[-1] is the number of lines. A quick check says if we

149

# have more lines than min_match_bytes, then we know we have

150

# enough bytes.

151

if block[-1] < min_match_bytes:

152

# This block may be a 'short' block, check

153

old_start, new_start, range_len = block

154

matched_bytes = sum(map(len,

155

lines[new_start:new_start + range_len]))

156

if matched_bytes < min_match_bytes:

157

block = None

158

if block is not None:

159

result_append(block)

160

result_append((len(self.lines), len(lines), 0))

161

return result

162

163

def extend_lines(self, lines, index):

164

"""Add more lines to the left-lines list.

165

166

:param lines: A list of lines to add

167

:param index: A True/False for each node to define if it should be

168

indexed.

169

"""

170

self._update_matching_lines(lines, index)

171

self.lines.extend(lines)

172

endpoint = self.endpoint

173

for line in lines:

174

endpoint += len(line)

175

self.line_offsets.append(endpoint)

176

if len(self.line_offsets) != len(self.lines):

177

raise AssertionError('Somehow the line offset indicator'

178

' got out of sync with the line counter.')

179

self.endpoint = endpoint

180

181

def _flush_insert(self, start_linenum, end_linenum,

182

new_lines, out_lines, index_lines):

183

"""Add an 'insert' request to the data stream."""

184

bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])

185

insert_length = len(bytes_to_insert)

186

# Each insert instruction is at most 127 bytes long

187

for start_byte in xrange(0, insert_length, 127):

188

insert_count = min(insert_length - start_byte, 127)

189

out_lines.append(chr(insert_count))

190

# Don't index the 'insert' instruction

191

index_lines.append(False)

192

insert = bytes_to_insert[start_byte:start_byte+insert_count]

193

as_lines = osutils.split_lines(insert)

194

out_lines.extend(as_lines)

195

index_lines.extend([True]*len(as_lines))

196

197

def _flush_copy(self, old_start_linenum, num_lines,

198

out_lines, index_lines):

199

if old_start_linenum == 0:

200

first_byte = 0

201

else:

202

first_byte = self.line_offsets[old_start_linenum - 1]

203

stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]

204

num_bytes = stop_byte - first_byte

205

# The data stream allows >64kB in a copy, but to match the compiled

206

# code, we will also limit it to a 64kB copy

207

for start_byte in xrange(first_byte, stop_byte, 64*1024):

208

num_bytes = min(64*1024, stop_byte - first_byte)

209

copy_bytes = encode_copy_instruction(start_byte, num_bytes)

210

out_lines.append(copy_bytes)

211

index_lines.append(False)

212

213

def make_delta(self, new_lines, bytes_length=None, soft=False):

214

"""Compute the delta for this content versus the original content."""

215

if bytes_length is None:

216

bytes_length = sum(map(len, new_lines))

217

# reserved for content type, content length

218

out_lines = ['', '', encode_base128_int(bytes_length)]

219

index_lines = [False, False, False]

220

blocks = self.get_matching_blocks(new_lines, soft=soft)

221

current_line_num = 0

222

# We either copy a range (while there are reusable lines) or we

223

# insert new lines. To find reusable lines we traverse

224

for old_start, new_start, range_len in blocks:

225

if new_start != current_line_num:

226

# non-matching region, insert the content

227

self._flush_insert(current_line_num, new_start,

228

new_lines, out_lines, index_lines)

229

current_line_num = new_start + range_len

230

if range_len:

231

self._flush_copy(old_start, range_len, out_lines, index_lines)

232

return out_lines, index_lines

233

234

235

def encode_base128_int(val):

236

"""Convert an integer into a 7-bit lsb encoding."""

237

bytes = []

238

count = 0

239

while val >= 0x80:

240

bytes.append(chr((val | 0x80) & 0xFF))

241

val >>= 7

242

bytes.append(chr(val))

243

return ''.join(bytes)

244

245

246

def decode_base128_int(bytes):

247

"""Decode an integer from a 7-bit lsb encoding."""

248

offset = 0

249

val = 0

250

shift = 0

251

bval = ord(bytes[offset])

252

while bval >= 0x80:

253

val |= (bval & 0x7F) << shift

254

shift += 7

255

offset += 1

256

bval = ord(bytes[offset])

257

val |= bval << shift

258

offset += 1

259

return val, offset

260

261

262

def encode_copy_instruction(offset, length):

263

"""Convert this offset into a control code and bytes."""

264

copy_command = 0x80

265

copy_bytes = [None]

266

267

for copy_bit in (0x01, 0x02, 0x04, 0x08):

268

base_byte = offset & 0xff

269

if base_byte:

270

copy_command |= copy_bit

271

copy_bytes.append(chr(base_byte))

272

offset >>= 8

273

if length is None:

274

# None is used by the test suite

275

copy_bytes[0] = chr(copy_command)

276

return ''.join(copy_bytes)

277

if length > 0x10000:

278

raise ValueError("we don't emit copy records for lengths > 64KiB")

279

if length == 0:

280

raise ValueError("We cannot emit a copy of length 0")

281

if length != 0x10000:

282

# A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,

283

# since that saves bytes for large chained copies

284

for copy_bit in (0x10, 0x20):

285

base_byte = length & 0xff

286

if base_byte:

287

copy_command |= copy_bit

288

copy_bytes.append(chr(base_byte))

289

length >>= 8

290

copy_bytes[0] = chr(copy_command)

291

return ''.join(copy_bytes)

292

293

294

def decode_copy_instruction(bytes, cmd, pos):

295

"""Decode a copy instruction from the next few bytes.

296

297

A copy instruction is a variable number of bytes, so we will parse the

298

bytes we care about, and return the new position, as well as the offset and

299

length referred to in the bytes.

300

301

:param bytes: A string of bytes

302

:param cmd: The command code

303

:param pos: The position in bytes right after the copy command

304

:return: (offset, length, newpos)

305

The offset of the copy start, the number of bytes to copy, and the

306

position after the last byte of the copy

307

"""

308

if cmd & 0x80 != 0x80:

309

raise ValueError('copy instructions must have bit 0x80 set')

310

offset = 0

311

length = 0

312

if (cmd & 0x01):

313

offset = ord(bytes[pos])

314

pos += 1

315

if (cmd & 0x02):

316

offset = offset | (ord(bytes[pos]) << 8)

317

pos += 1

318

if (cmd & 0x04):

319

offset = offset | (ord(bytes[pos]) << 16)

320

pos += 1

321

if (cmd & 0x08):

322

offset = offset | (ord(bytes[pos]) << 24)

323

pos += 1

324

if (cmd & 0x10):

325

length = ord(bytes[pos])

326

pos += 1

327

if (cmd & 0x20):

328

length = length | (ord(bytes[pos]) << 8)

329

pos += 1

330

if (cmd & 0x40):

331

length = length | (ord(bytes[pos]) << 16)

332

pos += 1

333

if length == 0:

334

length = 65536

335

return (offset, length, pos)

336

337

338

def make_delta(source_bytes, target_bytes):

339

"""Create a delta from source to target."""

340

# TODO: The checks below may not be a the right place yet.

341

if type(source_bytes) is not str:

342

raise TypeError('source is not a str')

343

if type(target_bytes) is not str:

344

raise TypeError('target is not a str')

345

line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))

346

delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),

347

bytes_length=len(target_bytes))

348

return ''.join(delta)

349

350

351

def apply_delta(basis, delta):

352

"""Apply delta to this object to become new_version_id."""

353

if type(basis) is not str:

354

raise TypeError('basis is not a str')

355

if type(delta) is not str:

356

raise TypeError('delta is not a str')

357

target_length, pos = decode_base128_int(delta)

358

lines = []

359

len_delta = len(delta)

360

while pos < len_delta:

361

cmd = ord(delta[pos])

362

pos += 1

363

if cmd & 0x80:

364

offset, length, pos = decode_copy_instruction(delta, cmd, pos)

365

last = offset + length

366

if last > len(basis):

367

raise ValueError('data would copy bytes past the'

368

'end of source')

369

lines.append(basis[offset:last])

370

else: # Insert of 'cmd' bytes

371

if cmd == 0:

372

raise ValueError('Command == 0 not supported yet')

373

lines.append(delta[pos:pos+cmd])

374

pos += cmd

375

bytes = ''.join(lines)

376

if len(bytes) != target_length:

377

raise ValueError('Delta claimed to be %d long, but ended up'

378

' %d long' % (target_length, len(bytes)))

379

return bytes

380

381

382

def apply_delta_to_source(source, delta_start, delta_end):

383

"""Extract a delta from source bytes, and apply it."""

384

source_size = len(source)

385

if delta_start >= source_size:

386

raise ValueError('delta starts after source')

387

if delta_end > source_size:

388

raise ValueError('delta ends after source')

389

if delta_start >= delta_end:

390

raise ValueError('delta starts after it ends')

391

delta_bytes = source[delta_start:delta_end]

392

return apply_delta(source, delta_bytes)