~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to doc/developers/repository.txt

Committer: Aaron Bentley
Date: 2007-07-17 13:27:14 UTC
mfrom: (2624 +trunk)
mto: This revision was merged to the branch mainline in revision 2631.
Revision ID: abentley@panoramicfeedback.com-20070717132714-tmzx9khmg9501k51

Merge from bzr.dev

files added:
bzrlib/_knit_load_data_c.pyx

bzrlib/_knit_load_data_py.py

bzrlib/benchmarks/bench_knit.py

bzrlib/index.py

bzrlib/tests/blackbox/test_pack.py

bzrlib/tests/repository_implementations/test_pack.py

bzrlib/tests/test_index.py

bzrlib/tests/workingtree_implementations/test_uncommit.py

bzrlib/util/tests

bzrlib/util/tests/__init__.py

bzrlib/util/tests/test_bencode.py

doc/bug_trackers.txt

doc/developers/diff.txt

doc/developers/indices.txt

doc/developers/repository.txt

files modified:
.bzrignore

NEWS

bzrlib/__init__.py

bzrlib/annotate.py

bzrlib/benchmarks/__init__.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/cmd_version_info.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/debug.py

bzrlib/errors.py

bzrlib/help_topics.py

bzrlib/info.py

bzrlib/knit.py

bzrlib/memorytree.py

bzrlib/merge.py

bzrlib/option.py

bzrlib/plugin.py

bzrlib/plugins/launchpad/__init__.py

bzrlib/plugins/launchpad/test_register.py

bzrlib/progress.py

bzrlib/remote.py

bzrlib/repofmt/knitrepo.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/sign_my_commits.py

bzrlib/smart/protocol.py

bzrlib/smart/vfs.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/symbol_versioning.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_annotate.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_cat_revision.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_command_encoding.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_debug.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_exceptions.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_ignore.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_inventory.py

bzrlib/tests/blackbox/test_join.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_ls.py

bzrlib/tests/blackbox/test_lsprof.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_merge_directive.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_mv.py

bzrlib/tests/blackbox/test_nick.py

bzrlib/tests/blackbox/test_non_ascii.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_remerge.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_remove_tree.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_history.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_serve.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_split.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_submit.py

bzrlib/tests/blackbox/test_tags.py

bzrlib/tests/blackbox/test_testament.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_version.py

bzrlib/tests/blackbox/test_version_info.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/blackbox/test_whoami.py

bzrlib/tests/branch_implementations/test_revision_id_to_revno.py

bzrlib/tests/branch_implementations/test_sprout.py

bzrlib/tests/branch_implementations/test_uncommit.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_annotate.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_config.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_help.py

bzrlib/tests/test_info.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_options.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_read_bundle.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_source.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_commit.py

bzrlib/tests/workingtree_implementations/test_parents.py

bzrlib/tests/workingtree_implementations/test_remove.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/transform.py

bzrlib/transport/__init__.py

bzrlib/transport/ftp.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/remote.py

bzrlib/transport/sftp.py

bzrlib/uncommit.py

bzrlib/util/bencode.py

bzrlib/version.py

bzrlib/win32utils.py

bzrlib/workingtree.py

bzrlib/workingtree_4.py

bzrlib/xml5.py

doc/README.1st

doc/configuration.txt

doc/developers/HACKING

doc/developers/index.txt

doc/developers/performance-roadmap.txt

doc/developers/performance.dot

doc/developers/scratch.txt

doc/http_smart_server.txt

doc/plugins.txt

setup.py

Show diffs side-by-side

added added

removed removed

doc/developers/repository.txt

============

Repositories

============

Status

======

:Date: 2007-07-08

This document describes the services repositories offer and need to offer

within bzrlib.

.. contents::

Motivation

==========

To provide clarity to API and performance tradeoff decisions by

centralising the requirements placed upon repositories.

Terminology

===========

A **repository** is a store of historical data for bzr.

Command Requirements

====================

================== ====================

Command Needed services

================== ====================

Add None

Annotate Annotated file texts, revision details

Branch Fetch, Revision parents, Inventory contents, All file texts

Bundle Maximally compact diffs (file and inventory), Revision graph

difference, Revision texts.

Commit Insert new texts, insert new inventory via delta, insert

revision, insert signature

Fetching Revision graph difference, ghost identification, stream data

introduced by a set of revisions in some cheap form, insert

data from a stream, validate data during insertion.

Garbage Collection Exclusive lock the repository preventing readers.

Revert Delta from working tree to historical tree, and then

arbitrary file access to obtain the texts of differing

files.

Uncommit Revision graph access.

Status Revision graph access, revision text access, file

fingerprint information, inventory differencing.

Diff As status but also file text access.

Merge As diff but needs up to twice as many file texts -

base and other for each changed file. Also an initial

fetch is needed.

Log Revision graph (entire at the moment) access,

sometimes status between adjacent revisions. Log of a

file needs per-file-graph. Dominator caching or

similar tools may be needed to prevent entire graph

access.

Missing Revision graph access, and revision texts to show

output.

Update As for merge, but twice.

================== ====================

Data access patterns

====================

Ideally we can make our data access for commands such as branch to

dovetail well with the native storage in the repository, in the common

case. Doing this may require choosing the behaviour of some commands to

allow us to have a smaller range of access patterns which we can optimise

more heavily. Alternatively if each command is very predictable in its

data access pattern we may be able to hint to the low level layers which

pattern is needed on a per command basis to get efficient behaviour.

=================== ===================================================

Command Data access pattern

=================== ===================================================

Annotate-cached Find text name in an inventory, Recreate one text,

recreate annotation regions

Annotate-on demand Find file id from name, then breadth-first pre-order

traversal of versions-of-the-file until the annotation

is complete.

Branch Fetch, possibly taking a copy of any file present in a

nominated revision when it is validated during fetch.

Bundle Revision-graph as for fetch; then inventories for

selected revision_ids to determine file texts, then

mp-parent deltas for all determined file texts.

Commit Something like basis-inventories read to determine

per-file graphs, insertion of new texts (which may

be delta compressed), generation of annotation

regions if the repository is configured to do so,

finalisation of the inventory pointing at all the new

texts and finally a revision and possibly signature.

Fetching Revision-graph searching to find the graph difference.

Scan the inventory data introduced during the selected

revisions, and grab the on disk data for the found

100

file texts, annotation region data, per-file-graph

101

data, piling all this into a stream.

102

Garbage Collection Basically a mass fetch of all the revisions which

103

branches point at, then a bait and switch with the old

104

repository thus removing unreferenced data.

105

Revert Revision graph access for the revision being reverted

106

to, inventory extraction of that revision,

107

dirblock-order file text extract for files that were

108

different.

109

Uncommit Revision graph access to synthesise pending-merges

110

linear access down left-hand-side, with is_ancestor

111

checks between all the found non-left-hand-side

112

parents.

113

Status Lookup the revisions added by pending merges and their

114

commit messages. Then an inventory difference between

115

the trees involved, which may include a working tree.

116

If there is a working tree involved then the file

117

fingerprint for cache-misses on files will be needed.

118

Note that dirstate caches most of this making

119

repository performance largely irrelevant: but if it

120

was fast enough dirstate might be able to be simpler.

121

Diff As status but also file text access for every file

122

that is different - either one text (working tree

123

diff) or a diff of two (revision to revision diff).

124

Merge As diff but needs up to twice as many file texts -

125

base and other for each changed file. Also an initial

126

fetch is needed. Note that the access pattern is

127

probably id-based at the moment, but that may be

128

'fixed' with the iter_changes based merge. Also note

129

that while the texts from OTHER are the ones accessed,

130

this is equivalent to the **newest** form of each text

131

changed from BASE to OTHER. And as the repository

132

looks at when data is introduced, this should be the

133

pattern we focus on for merge.

134

Log Revision graph (entire at the moment) access, log of a

135

file wants a per-file-graph. Log -v will want

136

newest-first inventory deltas between revisions.

137

Missing Revision graph access, breadth-first pre-order.

138

Update As for merge, but twice.

139

=================== ===================================================

140

141

Patterns used

142

-------------

143

144

Note that these are able to be changed by changing what we store. For

145

instance if the repository satisfies mpdiff requests, then bundle can be

146

defined in terms of mpdiff lookups rather than file text lookups

147

appropriate to create mpdiffs. If the repository satisfies full text

148

requests only, then you need the topological access to build up the

149

desired mpdiffs.

150

151

=========================================== =========

152

Pattern Commands

153

=========================================== =========

154

Single file text annotate, diff

155

Files present in one revision branch

156

Newest form of files altered by revisions merge, update?

157

Topological access to file versions/deltas annotate-uncached

158

Stream all data required to recreate revs branch (lightweight)

159

Stream file texts in topological order bundle

160

Write full versions of files, inv, rev, sig commit

161

Write deltas of files, inv for one tree commit

162

Stream all data introduced by revs fetch

163

Regenerate/combine deltas of many trees fetch, pack

164

Reconstruct all texts and validate trees check, fetch

165

Revision graph walk fetch, pack, uncommit,

166

annotate-uncached,

167

merge, log, missing

168

Top down access multiple invs concurrently status, diff, merge?, update?

169

Concurrent access to N file texts diff, merge

170

Iteration of inventory deltas log -v, fetch?

171

=========================================== =========

172

173

Facilities to scale well

174

========================

175

176

Indices

177

-------

178

179

We want < linear access to all data in the repository. This suggests

180

everything is indexed to some degree.

181

182

Often we know the kind of data we are accessing; which allows us to

183

partition our indices if that will help (e.g. by reducing the total index

184

size for queries that only care about the revision graph).

185

186

Indices that support our data access patterns will usually display

187

increased locality of reference, reducing the impact of large indices

188

without needing careful page size management or other tricks.

189

190

We need repository wide indices. For the current repositories this is

191

achieved by dividing the keyspace (revisions, signatures, inventories,

192

per-fileid) and then having an append only index within each keyspace.

193

For pack based repositories we will want some means to query the index of

194

each component pack, presumably as a single logical index.

195

196

It would be nice if indexing was made cleanly separate from storage. So

197

that suggests indices don't know the meaning of the lookup; indices which

198

offer particular ordering, or graph walking facilities will clearly need

199

that information, but perhaps they don't need to know the semantics?

200

201

Index size

202

~~~~~~~~~~

203

204

Smaller indexes are good. We could go with one big index, or a different

205

index for different operation styles. As multiple indices will occupy more

206

space in total we should think carefully before adding indices.

207

208

Index ordering

209

~~~~~~~~~~~~~~

210

211

Looking at the data access patterns some operations such as graph walking

212

can clearly be made more efficient by offering direct iteration rather

213

than repeated reentry into the index - so having indices that support

214

iteration in such a style would be useful eventually.

215

216

Changing our current indexes

217

~~~~~~~~~~~~~~~~~~~~~~~~~~~~

218

219

We can consider introducing cleaner indices in advance of a full pack

220

based repository.

221

222

There are many possibilities for this, but I've chosen one that seems ok

223

to me for illustration.

224

225

A key element is to consider when indices are updated. I think that the

226

update style proposed for pack based repositories - write once, then when

227

we group data again rewrite a new single index - is sufficient.

228

229

Replace .kndx

230

^^^^^^^^^^^^^

231

232

We could discard the per-knit .kndx by writing a new index at the end of

233

every bzr transaction indexing the new data introduced by the bzr

234

operation. e.g. at the end of fetch. This can be based on the new

235

``GraphIndex`` index type.

236

237

Encoding a knit entry into a ``GraphIndex`` can be done as follows:

238

239

* Change the key to include a prefix of the knit name, to allow filtering

240

out of data from different knits.

241

* Encode the parents from the knit as the zeroth node reference list.

242

* If the knit hunk was delta compressed encode the node it was delta

243

compressed against as the 1st node reference list (otherwise the 1st

244

node reference list will be empty to indicate no compression parents).

245

* For the value encode similarly to the current knit format the byte

246

offset for the data record in the knit, the byte length for the data

247

record in the knit and the no-end-of-line flag.

248

249

It's important to note that knit repositories cannot be regenerated by

250

scanning .knits, so a mapped index is still irreplaceable and must be

251

transmitted on push/pull.

252

253

A potential improvement exists by specialising this further to not record

254

data that is not needed - e.g. an index of revisions does not need to

255

support a pointer to a parent compressed text as revisions.knit is not

256

delta-compressed ever. Likewise signatures do not need the parent pointers

257

at all as there is no 'signature graph'.

258

259

Data

260

----

261

262

Moving to pack based repositories

263

---------------------------------

264

265

We have a number of challenges to solve.

266

267

Naming of files

268

~~~~~~~~~~~~~~~

269

270

As long as the file name is unique it does not really matter. It might be

271

interesting to have it be deterministic based on content, but there are no

272

specific problems we have solved by doing that, and doing so would require

273

hashing the full file. OTOH hashing the full file is a cheap way to detect

274

bit-errors in transfer (such as Windows corruption).

275

276

Discovery of files

277

~~~~~~~~~~~~~~~~~~

278

279

With non-listable transports how should the collection of pack/index files

280

be found? Initially record a list of all the pack/index files from

281

write actions. (Require writable transports to be listable). We can then

282

use a heuristic to statically combine pack/index files later.

283

284

Housing files

285

~~~~~~~~~~~~~

286

287

Combining indices on demand

288

~~~~~~~~~~~~~~~~~~~~~~~~~~~

289

290

Merging data on push

291

~~~~~~~~~~~~~~~~~~~~

292

293

A trivial implementation would be to make a pack which has just the data

294

needed for the push, then send that. More sophisticated things would be

295

streaming single-pass creation, and also using this as an opportunity to

296

increase the packedness of the local repo.

297

298

Choosing compression/delta support

299

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

300

301

302

303

304

305

vim: ft=rst tw=74 ai

306

Older »