~bzr-pqm/bzr/bzr.dev : revision 1852.13.24

49

because we won't want to re-stat all files on disk just because a lock was

50

acquired, yet we cannot trust the data after the previous lock was released.

51

52

DirState State machines? Strategy needed ?

53

We have a number of key states:

54

A memory object exists, disk data untouched.

55

B have read just the parents details to answer common queries

56

C have read the entire dirstate, so can answer questions about the tree

57

from memory

58

D have altered some part of the directory data, can incrementally save.

59

i.e. have refreshed a stat hit for a single file.

60

E (full-dirty) have altered some part of the directory data, cannot

61

incrementally save. I.e. have added or deleted a file, or added or deleted

62

parents to the dirstate itself.

63

64

currently _header_read = True means the header is read, can be in B through E

65

_clean = True means the in-memory representation is exactly what's

66

on disk - C

67

_clean = False -> A or B or D or E

68

_header_read = False means A

69

70

52

Memory representation:

71

Each row will be a tuple that has:

72

current_row_data_tuple, parent_list

73

current_row_data = [dirname, basename, fullkind, fileid, size, packed-stat, linkvalue]

74

parents_list = [(revision, kind, dirname, basename, size, executable_bool, sha1) ...]

75

row = (current_data, parents_list)

76

77

Still need to address how rows are managed:

78

53

open questions:

79

vector of all rows or one vector per directory ?

80

vector of all rows allows trivial bisection to find paths

81

54

vector of all directories, and vector of the children ?

82

55

i.e.

83

(data for root),

84

[

56

root_row = (direntry for root, [parent_direntries_for_root]),

57

dirblocks = [

85

58

('', ['data for achild', 'data for bchild', 'data for cchild'])

86

59

('dir', ['achild', 'cchild', 'echild'])

87

60

]

158

131

# of using int conversion rather than a dict here. AND BLAME ANDREW IF

159

132

# it is faster.

160

133

134

NOT_IN_MEMORY = 0

135

IN_MEMORY_UNMODIFIED = 1

136

IN_MEMORY_MODIFIED = 2

137

161

138

def __init__(self):

162

self._header_read = False

163

self._clean = False

139

# _header_state and _dirblock_state represent the current state

140

# of the dirstate metadata and the per-row data respectively.

141

# NOT_IN_MEMORY indicates that no data is in memory

142

# IN_MEMORY_UNMODIFIED indicates that what we have in memory

143

# is the same as is on disk

144

# IN_MEMORY_MODIFIED indicates that we have a modified version

145

# of what is on disk.

146

# In future we will add more granularity, for instance _dirblock_state

147

# will probably support partially-in-memory as a separate variable,

148

# allowing for partially-in-memory unmodified and partially-in-memory

149

# modified states.

150

self._header_state = DirState.NOT_IN_MEMORY

151

self._dirblock_state = DirState.NOT_IN_MEMORY

152

self._dirblocks = []

153

self._ghosts = []

164

154

self._parents = []

155

self._state_file=None

165

156

166

157

def add_parent_tree(self, tree_id, tree):

167

158

"""Add tree as a parent to this dirstate."""

159

self._read_dirblocks_if_needed()

168

160

self._parents.append(tree_id)

169

self._clean = False

161

self._header_state = DirState.IN_MEMORY_MODIFIED

162

if tree is None:

163

self._ghosts.append(tree_id)

170

164

171

165

@staticmethod

172

def from_tree(tree):

166

def from_tree(tree, dir_state_filename):

173

167

"""Create a dirstate from a bzr Tree.

174

168

175

169

:param tree: The tree which should provide parent information and

176

170

inventory ids.

177

171

"""

172

# XXX: aka the big ugly.

178

173

result = DirState()

179

180

lines = []

174

result._state_file = open(dir_state_filename, 'wb+')

181

175

182

176

_encode = base64.encodestring

183

177

190

184

for parent_id in parent_ids:

191

185

parent_trees.append(tree.branch.repository.revision_tree(parent_id))

192

186

193

lines.append(result._get_parents_line(parent_ids))

194

187

# FIXME: is this utf8 safe?

195

188

196

189

to_minikind = DirState._kind_to_minikind

197

190

to_yesno = DirState._to_yesno

198

191

199

192

st = os.lstat(tree.basedir)

200

null_parent_info = '\0'.join((

201

'null:'

202

, '', ''

203

, ''

204

, ''

205

, ''

206

, ''

207

))

208

#, 'd', gen_root_id().encode('utf8')

209

193

root_info = [

210

194

'', '' # No path

211

, 'd', tree.inventory.root.file_id.encode('utf8')

212

, str(st.st_size)

195

, 'directory', tree.inventory.root.file_id.encode('utf8')

196

, 0 # no point having a size for dirs.

213

197

, pack_stat(st)

214

198

, '' # No sha

215

] + [null_parent_info]*num_parents

216

# disabled because the root entry has no revision attribute set.

217

# for parent_tree in parent_trees:

218

# root_info.append('\0'.join((

219

# parent_tree.inventory.root.revision.encode('utf8'),

220

# '', '',

221

# '',

222

# '',

223

# '',

224

# '',

225

# )))

199

]

200

root_parents = []

201

for parent_tree in parent_trees:

202

root_parents.append((

203

parent_tree.inventory.root.revision.encode('utf8'),

204

'directory', '',

205

'',

206

'',

207

False,

208

'',

209

))

226

210

227

lines.append('\0'.join(root_info))

228

211

root_row = (root_info, root_parents)

212

dirblocks = []

229

213

for dirinfo, block in tree.walkdirs():

230

214

# dirinfo is path, id

231

215

to_remove = []

216

# add the row for this block

217

block_row = []

218

dirblocks.append((dirinfo[0], block_row))

232

219

for relpath, name, kind, st, fileid, versionedkind in block:

233

220

if fileid is None:

234

221

# unversioned file, skip

264

251

row_data = (dirname.encode('utf8'), basename.encode('utf8'),

265

252

kind, fileid.encode('utf8'), st.st_size, pack_stat(st),

266

253

s)

267

row_tuple = (row_data, parent_info)

268

lines.append(result._row_to_line(row_tuple))

254

block_row.append((row_data, parent_info))

269

255

270

256

# It isn't safe to remove entries while we are iterating

271

257

# over the same list, so remove them now

272

258

for entry in to_remove:

273

259

block.remove(entry)

274

260

275

result.lines = result._get_output_lines(lines)

276

result._header_read = True

277

result._clean = True

261

#lines.append(result._get_parents_line(parent_ids))

262

#lines.append(result._get_ghosts_line([]))

263

result._set_data(parent_ids, root_row, dirblocks)

264

result.save()

278

265

return result

279

266

267

def get_ghosts(self):

268

"""Return a list of the parent tree revision ids that are ghosts."""

269

self._read_header_if_needed()

270

return self._ghosts

271

280

272

def get_lines(self):

281

273

"""Serialise the entire dirstate to a sequence of lines."""

282

if self._clean:

283

return self.lines

274

if (self._header_state == DirState.IN_MEMORY_UNMODIFIED and

275

self._dirblock_state == DirState.IN_MEMORY_UNMODIFIED):

276

# read what's on disk.

277

self._state_file.seek(0)

278

return self._state_file.readlines()

284

279

lines = []

285

280

lines.append(self._get_parents_line(self.get_parent_ids()))

281

lines.append(self._get_ghosts_line(self._ghosts))

286

282

# append the root line which is special cased

287

lines.append(self._row_to_line(self._root_row))

288

self.lines = self._get_output_lines(lines)

289

return self.lines

283

lines.extend(map(self._row_to_line, self._iter_rows()))

284

return self._get_output_lines(lines)

290

285

286

def _get_ghosts_line(self, ghost_ids):

287

"""Create a line for the state file for ghost information."""

288

return '\0'.join([str(len(ghost_ids))] + ghost_ids)

289

291

290

def _get_parents_line(self, parent_ids):

292

291

"""Create a line for the state file for parents information."""

293

292

return '\0'.join([str(len(parent_ids))] + parent_ids)

307

306

:param path: The name of the file for the dirstate.

308

307

:return: A DirState object.

309

308

"""

310

# This constructs a new DirState object on a path, sets the state_file

309

# This constructs a new DirState object on a path, sets the _state_file

311

310

# to a new empty file for that path. It then calls _set_data() with our

312

311

# stock empty dirstate information - a root with ROOT_ID, no children,

313

312

# and no parents. Finally it calls save() to ensure that this data will

314

313

# persist.

315

314

result = DirState()

316

result.state_file = open(path, 'wb+')

317

# a new root directory, with a pack_stat that is just noise and will

315

result._state_file = open(path, 'wb+')

316

# a new root directory, with a pack_stat (the x's) that is just noise and will

318

317

# never match the output of base64 encode.

319

root_row_data = ('', '', 'directory', bzrlib.inventory.ROOT_ID, 0, 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', '')

318

root_row_data = ('', '', 'directory', bzrlib.inventory.ROOT_ID, 0, 'x'*32, '')

320

319

root_parents = []

321

320

root_row = (root_row_data, root_parents)

322

empty_tree_data = [('', [])] # root dir contents - no entries.

323

result._set_data(root_row, empty_tree_data)

321

empty_tree_dirblocks = [('', [])] # root dir contents - no entries.

322

result._set_data([], root_row, empty_tree_dirblocks)

324

323

try:

325

324

result.save()

326

325

except:

327

result.state_file.close()

326

result._state_file.close()

328

327

raise

329

328

return result

330

329

330

def _iter_rows(self):

331

"""Iterate over all the row data in the dirstate.

332

333

Each yielded item is a tuple of (row_data, parent_data_list).

334

"""

335

self._read_dirblocks_if_needed()

336

yield self._root_row

337

for directory in self._dirblocks:

338

for row in directory[1]:

339

yield row

340

331

341

def _get_output_lines(self, lines):

332

342

"""format lines for final output.

333

343

338

348

lines.append('') # a final newline

339

349

inventory_text = '\0\n\0'.join(lines)

340

350

output_lines.append('adler32: %s\n' % (zlib.adler32(inventory_text),))

341

# -2, 1 for num parents, 1 for final newline

342

num_entries = len(lines)-2

351

# -3, 1 for num parents, 1 for ghosts, 1 for final newline

352

num_entries = len(lines)-3

343

353

output_lines.append('num_entries: %s\n' % (num_entries,))

344

354

output_lines.append(inventory_text)

345

355

return output_lines

348

358

def on_file(path):

349

359

"""Construct a DirState on the file at path path."""

350

360

result = DirState()

351

result.state_file = open(path, 'rb+')

361

result._state_file = open(path, 'rb+')

352

362

return result

353

363

354

def _read_all(self):

355

"""Read the entire state."""

356

self._read_header()

357

364

def _read_dirblocks_if_needed(self):

365

"""Read in all the dirblocks from the file if they are not in memory."""

366

self._read_header_if_needed()

367

if self._dirblock_state == DirState.NOT_IN_MEMORY:

368

# the _state_file pointer will be positioned at the start of the

369

# dirblocks.

370

text = self._state_file.read()

371

# TODO: check the adler checksums. adler_measured = zlib.adler32(text)

372

373

fields = text.split('\0')

374

# Remove the last blank entry

375

trailing = fields.pop()

376

assert trailing == ''

377

# consider turning fields into a tuple.

378

379

# skip the first field which is the trailing null from the header.

380

cur = 1

381

field_count = len(fields)

382

# Each line now has an extra '\n' field which is not used

383

# so we just skip over it

384

# number of fields per dir_entry + number of fields per parent_entry + newline

385

num_parents = len(self._parents)

386

entry_size = 7 + (7 * num_parents) + 1

387

expected_field_count = entry_size * self._num_entries

388

# is the file too short ?

389

assert field_count - cur == expected_field_count, \

390

'field count incorrect %s != %s' % (expected_field_count, field_count)

391

392

# Fast path the cases where there are 0, 1 or 2 parents

393

if num_parents == 0:

394

entries = [(fields[pos:pos+7], []) for pos in xrange(cur, field_count, entry_size)]

395

elif num_parents == 1:

396

entries = [(fields[pos:pos+7], [fields[pos+7:pos+14],])

397

for pos in xrange(cur, field_count, entry_size)]

398

elif num_parents == 2:

399

entries = [(fields[pos:pos+7], [

400

fields[pos+7:pos+14],

401

fields[pos+14:pos+21],])

402

for pos in xrange(cur, field_count, entry_size)]

403

else:

404

raise NotImplementedError(self._read_dirblocks_if_needed)

405

entries = [tuple(

406

[fields[chunk:chunk+7] for chunk in xrange(pos, pos+entry_size-1, 7)])

407

for pos in xrange(cur, field_count, entry_size)

408

]

409

410

assert len(entries) == self._num_entries, '%s != %s entries' % (len(entries),

411

self._num_entries)

412

entry_iter = iter(entries)

413

self._root_row = entry_iter.next()

414

# convert the minikind to kind

415

self._root_row[0][2] = self._minikind_to_kind[self._root_row[0][2]]

416

# convert the size to an int

417

self._root_row[0][4] = int(self._root_row[0][4])

418

# TODO parent conversion

419

# TODO dirblock population

420

for entry in entry_iter:

421

# do something here

422

pass

423

358

424

def _read_header(self):

359

425

"""This reads in the metadata header, and the parent ids.

360

426

364

430

:return: (expected adler checksum, number of entries, parent list)

365

431

"""

366

432

self._read_prelude()

367

parent_line = self.state_file.readline()

433

parent_line = self._state_file.readline()

368

434

info = parent_line.split('\0')

369

435

num_parents = int(info[0])

370

436

assert num_parents == len(info)-2, 'incorrect parent info line'

371

372

437

self._parents = [p.decode('utf8') for p in info[1:-1]]

373

438

439

ghost_line = self._state_file.readline()

440

info = ghost_line.split('\0')

441

num_ghosts = int(info[1])

442

assert num_ghosts == len(info)-3, 'incorrect ghost info line'

443

self._ghosts = [p.decode('utf8') for p in info[2:-1]]

444

self._header_state = DirState.IN_MEMORY_UNMODIFIED

445

374

446

def _read_header_if_needed(self):

375

447

"""Read the header of the dirstate file if needed."""

376

if self._header_read is False:

448

if self._header_state == DirState.NOT_IN_MEMORY:

377

449

self._read_header()

378

450

379

451

def _read_prelude(self):

385

457

The next entry in the file should be the number of parents,

386

458

and their ids. Followed by a newline.

387

459

"""

388

header = self.state_file.readline()

460

header = self._state_file.readline()

389

461

assert header == '#bazaar dirstate flat format 1\n', \

390

462

'invalid header line: %r' % (header,)

391

adler_line = self.state_file.readline()

463

adler_line = self._state_file.readline()

392

464

assert adler_line.startswith('adler32: '), 'missing adler32 checksum'

393

465

self.adler_expected = int(adler_line[len('adler32: '):-1])

394

num_entries_line = self.state_file.readline()

466

num_entries_line = self._state_file.readline()

395

467

assert num_entries_line.startswith('num_entries: '), 'missing num_entries line'

396

self.num_entries = int(num_entries_line[len('num_entries: '):-1])

468

self._num_entries = int(num_entries_line[len('num_entries: '):-1])

397

469

398

470

def _row_to_line(self, row):

399

471

"""Serialize row to a NULL delimited line ready for _get_output_lines.

418

490

419

491

def save(self):

420

492

"""Save any pending changes created during this session."""

421

self.state_file.seek(0)

422

self.state_file.writelines(self.get_lines())

423

self.state_file.flush()

424

self._clean = True

493

if (self._header_state == DirState.IN_MEMORY_MODIFIED or

494

self._dirblock_state == DirState.IN_MEMORY_MODIFIED):

495

self._state_file.seek(0)

496

self._state_file.writelines(self.get_lines())

497

self._state_file.flush()

498

self._header_state = DirState.IN_MEMORY_UNMODIFIED

499

self._dirblock_state = DirState.IN_MEMORY_UNMODIFIED

425

500

426

def _set_data(self, root_row, tree_data):

427

"""Set the full dirstate data to root_row and tree_data.

501

def _set_data(self, parent_ids, root_row, dirblocks):

502

"""Set the full dirstate data in memory.

428

503

429

504

This is an internal function used to completely replace the object's

430

505

in memory state. It puts the dirstate into state 'full-dirty'.

506

507

:param parent_ids: A list of parent tree revision ids.

508

:param root_row: The root row - a tuple of the root direntry and the

509

list of matching direntries from the parent_ids trees.

510

:param dirblocks: A list containing one tuple for each directory in the

511

tree. Each tuple contains the directory path and a list of

512

row data in the same format as root_row.

431

513

"""

432

514

# our memory copy is now authoritative.

433

self._header_read = True

434

self._clean = False

515

self._dirblocks = dirblocks

435

516

self._root_row = root_row

436

# should save tree_data.

517

self._header_state = DirState.IN_MEMORY_MODIFIED

518

self._dirblock_state = DirState.IN_MEMORY_MODIFIED

519

self._parents = list(parent_ids)

520

521

def set_parent_trees(self, trees, ghosts):

522

"""Set the parent trees for the dirstate.

523

524

:param trees: A list of revision_id, tree tuples. tree must be provided

525

even if the revision_id refers to a ghost: supply an empty tree in

526

this case.

527

:param ghosts: A list of the revision_ids that are ghosts at the time

528

of setting.

529

"""

530

# TODO regenerate self._dirblocks and self._root_row

531

self._read_dirblocks_if_needed()

532

self._parents = [rev_id for rev_id, tree in trees]

533

self._ghosts = list(ghosts)

534

self._header_state = DirState.IN_MEMORY_MODIFIED

535

self._dirblock_state = DirState.IN_MEMORY_MODIFIED

437

536

438

537

439

538

def pack_stat(st, _encode=base64.encodestring, _pack=struct.pack):