~bzr-pqm/bzr/bzr.dev : contents of bzrlib/diff-delta.c at revision 5672.1.7

~bzr-pqm/bzr/bzr.dev : (revision 5672.1.7)

0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	1	/*
	2	* diff-delta.c: generate a delta between two buffers
	3	*
	4	* This code was greatly inspired by parts of LibXDiff from Davide Libenzi
	5	* http://www.xmailserver.org/xdiff-lib.html
	6	*
	7	* Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
0.17.31 by John Arbash Meinel Bring in the 'rabin' experiment.	8	* Adapted for Bazaar by John Arbash Meinel <john@arbash-meinel.com> (C) 2009
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	9	*
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil Groupcompress from brisbane-core.	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* NB: The version in GIT is 'version 2 of the Licence only', however Nicolas
	16	* has granted permission for use under 'version 2 or later' in private email
	17	* to Robert Collins and Karl Fogel on the 6th April 2009.
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	18	*/
	19
3735.33.4 by John Arbash Meinel The new layout is working.	20	#include <stdio.h>
3735.33.4 by John Arbash Meinel The new layout is working.	21
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	22	#include "delta.h"
0.17.31 by John Arbash Meinel Bring in the 'rabin' experiment.	23	#include <stdlib.h>
	24	#include <string.h>
0.23.5 by John Arbash Meinel Minor changes to get diff-delta.c and patch-delta.c to compile.	25	#include <assert.h>
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	26
	27	/* maximum hash entry list for the same hash bucket */
	28	#define HASH_LIMIT 64
	29
	30	#define RABIN_SHIFT 23
	31	#define RABIN_WINDOW 16
	32
3735.33.13 by John Arbash Meinel Tweak the number of blank spaces up just a tad.	33	/* The hash map is sized to put 4 entries per bucket, this gives us ~even room
	34	* for more data. Tweaking this number above 4 doesn't seem to help much,
	35	* anyway.
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	36	*/
3735.33.13 by John Arbash Meinel Tweak the number of blank spaces up just a tad.	37	#define EXTRA_NULLS 4
3735.33.3 by John Arbash Meinel include_entries_from_hash wasn't properly skipping NULL records.	38
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	39	static const unsigned int T[256] = {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	40	0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
	41	0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
	42	0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
	43	0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
	44	0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
	45	0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
	46	0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
	47	0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
	48	0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
	49	0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
	50	0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
	51	0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
	52	0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
	53	0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
	54	0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
	55	0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
	56	0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
	57	0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
	58	0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
	59	0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
	60	0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
	61	0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
	62	0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
	63	0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
	64	0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
	65	0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
	66	0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
	67	0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
	68	0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
	69	0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
	70	0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
	71	0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
	72	0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
	73	0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
	74	0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
	75	0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
	76	0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
	77	0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
	78	0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
	79	0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
	80	0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
	81	0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
	82	0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	83	};
	84
	85	static const unsigned int U[256] = {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	86	0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
	87	0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
	88	0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
	89	0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
	90	0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
	91	0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
	92	0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
	93	0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
	94	0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
	95	0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
	96	0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
	97	0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
	98	0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
	99	0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
	100	0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
	101	0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
	102	0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
	103	0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
	104	0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
	105	0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
	106	0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
	107	0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
	108	0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
	109	0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
	110	0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
	111	0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
	112	0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
	113	0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
	114	0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
	115	0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
	116	0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
	117	0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
	118	0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
	119	0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
	120	0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
	121	0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
	122	0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
	123	0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
	124	0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
	125	0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
	126	0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
	127	0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
	128	0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	129	};
	130
	131	struct index_entry {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	132	const unsigned char *ptr;
	133	const struct source_info *src;
	134	unsigned int val;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	135	};
	136
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	137	struct index_entry_linked_list {
	138	struct index_entry *p_entry;
	139	struct index_entry_linked_list *next;
	140	};
	141
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	142	struct unpacked_index_entry {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	143	struct index_entry entry;
	144	struct unpacked_index_entry *next;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	145	};
	146
	147	struct delta_index {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	148	unsigned long memsize; /* Total bytes pointed to by this index */
	149	const struct source_info last_src; / Information about the referenced source */
	150	unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
	151	entry */
	152	unsigned int num_entries; /* The total number of entries in this index */
	153	struct index_entry last_entry; / Pointer to the last valid entry */
	154	struct index_entry *hash[];
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	155	};
	156
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	157	static unsigned int
	158	limit_hash_buckets(struct unpacked_index_entry **hash,
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	159	unsigned int *hash_count, unsigned int hsize,
	160	unsigned int entries)
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	161	{
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	162	struct unpacked_index_entry *entry;
	163	unsigned int i;
	164	/*
	165	* Determine a limit on the number of entries in the same hash
	166	* bucket. This guards us against pathological data sets causing
	167	* really bad hash distribution with most entries in the same hash
	168	* bucket that would bring us to O(m*n) computing costs (m and n
	169	* corresponding to reference and target buffer sizes).
	170	*
	171	* Make sure none of the hash buckets has more entries than
	172	* we're willing to test. Otherwise we cull the entry list
	173	* uniformly to still preserve a good repartition across
	174	* the reference buffer.
	175	*/
	176	for (i = 0; i < hsize; i++) {
	177	int acc;
	178
	179	if (hash_count[i] <= HASH_LIMIT)
	180	continue;
	181
	182	/* We leave exactly HASH_LIMIT entries in the bucket */
	183	entries -= hash_count[i] - HASH_LIMIT;
	184
	185	entry = hash[i];
	186	acc = 0;
	187
	188	/*
	189	* Assume that this loop is gone through exactly
	190	* HASH_LIMIT times and is entered and left with
	191	* acc==0. So the first statement in the loop
	192	* contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
	193	* to the accumulator, and the inner loop consequently
	194	* is run (hash_count[i]-HASH_LIMIT) times, removing
	195	* one element from the list each time. Since acc
	196	* balances out to 0 at the final run, the inner loop
	197	* body can't be left with entry==NULL. So we indeed
	198	* encounter entry==NULL in the outer loop only.
	199	*/
	200	do {
	201	acc += hash_count[i] - HASH_LIMIT;
	202	if (acc > 0) {
	203	struct unpacked_index_entry *keep = entry;
	204	do {
	205	entry = entry->next;
	206	acc -= HASH_LIMIT;
	207	} while (acc > 0);
	208	keep->next = entry->next;
	209	}
	210	entry = entry->next;
	211	} while (entry);
	212	}
	213	return entries;
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	214	}
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	215
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	216	static struct delta_index *
	217	pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	218	unsigned int num_entries, struct delta_index *old_index)
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	219	{
3735.33.14 by John Arbash Meinel Simplify the code a bit. We don't repack as often, so go with a	220	unsigned int i, j, hmask, memsize, fit_in_old, copied_count;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	221	struct unpacked_index_entry *entry;
	222	struct delta_index *index;
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	223	struct index_entry packed_entry, packed_hash, old_entry, *copy_from;
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	224	struct index_entry null_entry = {0};
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	225	void *mem;
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	226
	227	hmask = hsize - 1;
	228
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	229	// if (old_index) {
	230	// fprintf(stderr, "Packing %d entries into %d for total of %d entries"
	231	// " %x => %x\n",
	232	// num_entries - old_index->num_entries,
	233	// old_index->num_entries, num_entries,
	234	// old_index->hash_mask, hmask);
	235	// } else {
	236	// fprintf(stderr, "Packing %d entries into a new index\n",
	237	// num_entries);
	238	// }
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	239	/* First, see if we can squeeze the new items into the existing structure.
	240	*/
	241	fit_in_old = 0;
	242	copied_count = 0;
	243	if (old_index && old_index->hash_mask == hmask) {
	244	fit_in_old = 1;
	245	for (i = 0; i < hsize; ++i) {
	246	packed_entry = NULL;
	247	for (entry = hash[i]; entry; entry = entry->next) {
	248	if (packed_entry == NULL) {
	249	/* Find the last open spot */
	250	packed_entry = old_index->hash[i + 1];
	251	--packed_entry;
	252	while (packed_entry >= old_index->hash[i]
	253	&& packed_entry->ptr == NULL) {
	254	--packed_entry;
	255	}
	256	++packed_entry;
	257	}
	258	if (packed_entry >= old_index->hash[i+1]
	259	\|\| packed_entry->ptr != NULL) {
	260	/* There are no free spots here :( */
	261	fit_in_old = 0;
	262	break;
	263	}
	264	/* We found an empty spot to put this entry
	265	* Copy it over, and remove it from the linked list, just in
	266	* case we end up running out of room later.
	267	*/
	268	*packed_entry++ = entry->entry;
	269	assert(entry == hash[i]);
	270	hash[i] = entry->next;
	271	copied_count += 1;
	272	old_index->num_entries++;
	273	}
	274	if (!fit_in_old) {
	275	break;
	276	}
	277	}
	278	}
	279	if (old_index) {
	280	if (fit_in_old) {
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	281	// fprintf(stderr, "Fit all %d entries into old index\n",
	282	// copied_count);
3735.33.14 by John Arbash Meinel Simplify the code a bit. We don't repack as often, so go with a	283	/* No need to allocate a new buffer */
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	284	return NULL;
	285	} else {
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	286	// fprintf(stderr, "Fit only %d entries into old index,"
	287	// " reallocating\n", copied_count);
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	288	}
	289	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	290	/*
	291	* Now create the packed index in array form
	292	* rather than linked lists.
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	293	* Leave a 2-entry gap for inserting more entries between the groups
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	294	*/
	295	memsize = sizeof(*index)
	296	+ sizeof(packed_hash) (hsize+1)
3735.33.3 by John Arbash Meinel include_entries_from_hash wasn't properly skipping NULL records.	297	+ sizeof(packed_entry) (num_entries + hsize * EXTRA_NULLS);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	298	mem = malloc(memsize);
	299	if (!mem) {
	300	return NULL;
	301	}
	302
	303	index = mem;
	304	index->memsize = memsize;
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	305	index->hash_mask = hmask;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	306	index->num_entries = num_entries;
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	307	if (old_index) {
3735.33.16 by John Arbash Meinel Handle when our current packing is sub-optimal.	308	if (hmask < old_index->hash_mask) {
	309	fprintf(stderr, "hash mask was shrunk %x => %x\n",
	310	old_index->hash_mask, hmask);
	311	}
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	312	assert(hmask >= old_index->hash_mask);
	313	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	314
	315	mem = index->hash;
	316	packed_hash = mem;
	317	mem = packed_hash + (hsize+1);
	318	packed_entry = mem;
	319
	320	for (i = 0; i < hsize; i++) {
	321	/*
	322	* Coalesce all entries belonging to one linked list
	323	* into consecutive array entries.
	324	*/
	325	packed_hash[i] = packed_entry;
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	326	/* Old comes earlier as a source, so it always comes first in a given
	327	* hash bucket.
	328	*/
	329	if (old_index) {
	330	/* Could we optimize this to use memcpy when hmask ==
	331	* old_index->hash_mask? Would it make any real difference?
	332	*/
	333	j = i & old_index->hash_mask;
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	334	copy_from = old_index->hash[j];
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	335	for (old_entry = old_index->hash[j];
	336	old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;
	337	old_entry++) {
	338	if ((old_entry->val & hmask) == i) {
3735.33.14 by John Arbash Meinel Simplify the code a bit. We don't repack as often, so go with a	339	packed_entry++ = old_entry;
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	340	}
	341	}
	342	}
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	343	for (entry = hash[i]; entry; entry = entry->next) {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	344	*packed_entry++ = entry->entry;
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	345	}
3735.33.14 by John Arbash Meinel Simplify the code a bit. We don't repack as often, so go with a	346	/* TODO: At this point packed_entry - packed_hash[i] is the number of
	347	* records that we have inserted into this hash bucket.
	348	* We should really consider doing some limiting along the
	349	* lines of limit_hash_buckets() to avoid pathological behavior.
	350	*/
3735.33.3 by John Arbash Meinel include_entries_from_hash wasn't properly skipping NULL records.	351	/* Now add extra 'NULL' entries that we can use for future expansion. */
	352	for (j = 0; j < EXTRA_NULLS; ++j ) {
	353	*packed_entry++ = null_entry;
	354	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	355	}
	356
	357	/* Sentinel value to indicate the length of the last hash bucket */
	358	packed_hash[hsize] = packed_entry;
	359
3735.33.4 by John Arbash Meinel The new layout is working.	360	if (packed_entry - (struct index_entry *)mem
3735.33.4 by John Arbash Meinel The new layout is working.	361	!= num_entries + hsize*EXTRA_NULLS) {
3735.33.5 by John Arbash Meinel Restoring the debugging for now.	362	fprintf(stderr, "We expected %d entries, but created %d\n",
	363	num_entries + hsize*EXTRA_NULLS,
	364	(int)(packed_entry - (struct index_entry*)mem));
3735.33.4 by John Arbash Meinel The new layout is working.	365	}
3735.33.3 by John Arbash Meinel include_entries_from_hash wasn't properly skipping NULL records.	366	assert(packed_entry - (struct index_entry *)mem
	367	== num_entries + hsize*EXTRA_NULLS);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	368	index->last_entry = (packed_entry - 1);
	369	return index;
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	370	}
	371
3735.33.4 by John Arbash Meinel The new layout is working.	372
0.23.49 by John Arbash Meinel When adding new entries to the delta index, use memcpy	373	struct delta_index *
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	374	create_delta_index(const struct source_info *src,
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	375	struct delta_index *old)
0.23.43 by John Arbash Meinel Change the internals to allow delta indexes to be expanded with new source data.	376	{
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	377	unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
	378	unsigned int total_num_entries;
	379	const unsigned char data, buffer;
	380	struct delta_index *index;
	381	struct unpacked_index_entry entry, *hash;
	382	void *mem;
	383	unsigned long memsize;
	384
	385	if (!src->buf \|\| !src->size)
	386	return NULL;
	387	buffer = src->buf;
	388
	389	/* Determine index hash size. Note that indexing skips the
	390	first byte to allow for optimizing the Rabin's polynomial
	391	initialization in create_delta(). */
	392	num_entries = (src->size - 1) / RABIN_WINDOW;
	393	if (old != NULL)
	394	total_num_entries = num_entries + old->num_entries;
	395	else
	396	total_num_entries = num_entries;
3735.33.8 by John Arbash Meinel Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.	397	hsize = total_num_entries / 4;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	398	for (i = 4; (1u << i) < hsize && i < 31; i++);
	399	hsize = 1 << i;
	400	hmask = hsize - 1;
3735.33.18 by John Arbash Meinel grow the local hmask if it is smaller than expected, don't shrink it.	401	if (old && old->hash_mask > hmask) {
3735.33.16 by John Arbash Meinel Handle when our current packing is sub-optimal.	402	hmask = old->hash_mask;
	403	hsize = hmask + 1;
	404	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	405
	406	/* allocate lookup index */
	407	memsize = sizeof(hash) hsize +
	408	sizeof(entry) total_num_entries;
	409	mem = malloc(memsize);
	410	if (!mem)
	411	return NULL;
	412	hash = mem;
	413	mem = hash + hsize;
	414	entry = mem;
	415
	416	memset(hash, 0, hsize * sizeof(*hash));
	417
	418	/* allocate an array to count hash num_entries */
	419	hash_count = calloc(hsize, sizeof(*hash_count));
	420	if (!hash_count) {
	421	free(hash);
	422	return NULL;
	423	}
	424
	425	/* then populate the index for the new data */
	426	prev_val = ~0;
	427	for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;
	428	data >= buffer;
	429	data -= RABIN_WINDOW) {
	430	unsigned int val = 0;
	431	for (i = 1; i <= RABIN_WINDOW; i++)
	432	val = ((val << 8) \| data[i]) ^ T[val >> RABIN_SHIFT];
	433	if (val == prev_val) {
	434	/* keep the lowest of consecutive identical blocks */
	435	entry[-1].entry.ptr = data + RABIN_WINDOW;
	436	--num_entries;
	437	--total_num_entries;
	438	} else {
	439	prev_val = val;
	440	i = val & hmask;
	441	entry->entry.ptr = data + RABIN_WINDOW;
	442	entry->entry.val = val;
	443	entry->entry.src = src;
	444	entry->next = hash[i];
	445	hash[i] = entry++;
	446	hash_count[i]++;
	447	}
	448	}
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	449	/* TODO: It would be nice to limit_hash_buckets at a better time. */
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	450	total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
	451	total_num_entries);
	452	free(hash_count);
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	453	if (old) {
	454	old->last_src = src;
	455	}
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	456	index = pack_delta_index(hash, hsize, total_num_entries, old);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	457	free(hash);
	458	if (!index) {
	459	return NULL;
	460	}
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	461	index->last_src = src;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	462	return index;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	463	}
	464
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	465	/* Take some entries, and put them into a custom hash.
	466	* @param entries A list of entries, sorted by position in file
	467	* @param num_entries Length of entries
	468	* @param out_hsize The maximum size of the hash, the final size will be
	469	* returned here
	470	*/
	471	struct index_entry_linked_list **
	472	_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	473	unsigned int hsize)
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	474	{
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	475	unsigned int hash_offset, hmask, memsize;
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	476	struct index_entry entry, last_entry;
	477	struct index_entry_linked_list out_entry, *hash;
	478	void *mem;
	479
	480	hmask = hsize - 1;
	481
	482	memsize = sizeof(hash) hsize +
	483	sizeof(out_entry) num_entries;
	484	mem = malloc(memsize);
	485	if (!mem)
	486	return NULL;
	487	hash = mem;
	488	mem = hash + hsize;
	489	out_entry = mem;
	490
	491	memset(hash, 0, sizeof(hash)(hsize+1));
	492
	493	/* We know that entries are in the order we want in the output, but they
	494	* aren't "grouped" by hash bucket yet.
	495	*/
	496	last_entry = entries + num_entries;
	497	for (entry = entries + num_entries - 1; entry >= entries; --entry) {
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	498	hash_offset = entry->val & hmask;
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	499	out_entry->p_entry = entry;
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	500	out_entry->next = hash[hash_offset];
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	501	/* TODO: Remove entries that have identical vals, or at least filter
	502	* the map a little bit.
	503	* if (hash[i] != NULL) {
	504	* }
	505	*/
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	506	hash[hash_offset] = out_entry;
	507	++out_entry;
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	508	}
	509	return hash;
	510	}
	511
3735.33.4 by John Arbash Meinel The new layout is working.	512
	513	struct delta_index *
	514	create_index_from_old_and_new_entries(const struct delta_index *old_index,
	515	struct index_entry *entries,
	516	unsigned int num_entries)
	517	{
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	518	unsigned int i, j, hsize, hmask, total_num_entries;
3735.33.4 by John Arbash Meinel The new layout is working.	519	struct delta_index *index;
	520	struct index_entry entry, packed_entry, **packed_hash;
	521	struct index_entry *last_entry, null_entry = {0};
	522	void *mem;
	523	unsigned long memsize;
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	524	struct index_entry_linked_list unpacked_entry, *mini_hash;
3735.33.4 by John Arbash Meinel The new layout is working.	525
	526	/* Determine index hash size. Note that indexing skips the
	527	first byte to allow for optimizing the Rabin's polynomial
	528	initialization in create_delta(). */
	529	total_num_entries = num_entries + old_index->num_entries;
3735.33.8 by John Arbash Meinel Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.	530	hsize = total_num_entries / 4;
3735.33.4 by John Arbash Meinel The new layout is working.	531	for (i = 4; (1u << i) < hsize && i < 31; i++);
	532	hsize = 1 << i;
	533	if (hsize < old_index->hash_mask) {
	534	/* For some reason, there was a code path that would actually shrink
	535	* the hash size. This screws with some later code, and in general, I
	536	* think it better to make the hash bigger, rather than smaller. So
	537	* we'll just force the size here.
	538	* Possibly done by create_delta_index running into a
	539	* limit_hash_buckets call, that ended up transitioning across a
	540	* power-of-2. The cause isn't 100% clear, though.
	541	*/
	542	hsize = old_index->hash_mask + 1;
	543	}
	544	hmask = hsize - 1;
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	545	// fprintf(stderr, "resizing index to insert %d entries into array"
	546	// " with %d entries: %x => %x\n",
	547	// num_entries, old_index->num_entries, old_index->hash_mask, hmask);
3735.33.4 by John Arbash Meinel The new layout is working.	548
	549	memsize = sizeof(*index)
	550	+ sizeof(packed_hash) (hsize+1)
	551	+ sizeof(packed_entry) (total_num_entries + hsize*EXTRA_NULLS);
	552	mem = malloc(memsize);
	553	if (!mem) {
	554	return NULL;
	555	}
	556	index = mem;
	557	index->memsize = memsize;
	558	index->hash_mask = hmask;
	559	index->num_entries = total_num_entries;
	560	index->last_src = old_index->last_src;
	561
	562	mem = index->hash;
	563	packed_hash = mem;
	564	mem = packed_hash + (hsize+1);
	565	packed_entry = mem;
	566
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	567	mini_hash = _put_entries_into_hash(entries, num_entries, hsize);
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	568	if (mini_hash == NULL) {
	569	free(index);
	570	return NULL;
	571	}
3735.33.4 by John Arbash Meinel The new layout is working.	572	last_entry = entries + num_entries;
	573	for (i = 0; i < hsize; i++) {
	574	/*
	575	* Coalesce all entries belonging in one hash bucket
	576	* into consecutive array entries.
	577	* The entries in old_index all come before 'entries'.
	578	*/
	579	packed_hash[i] = packed_entry;
	580	/* Copy any of the old entries across */
	581	/* Would we rather use memcpy? */
	582	if (hmask == old_index->hash_mask) {
	583	for (entry = old_index->hash[i];
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	584	entry < old_index->hash[i+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel The new layout is working.	585	++entry) {
	586	assert((entry->val & hmask) == i);
	587	packed_entry++ = entry;
	588	}
	589	} else {
	590	/* If we resized the index from this action, all of the old values
	591	* will be found in the previous location, but they will end up
	592	* spread across the new locations.
	593	*/
	594	j = i & old_index->hash_mask;
	595	for (entry = old_index->hash[j];
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	596	entry < old_index->hash[j+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel The new layout is working.	597	++entry) {
	598	assert((entry->val & old_index->hash_mask) == j);
	599	if ((entry->val & hmask) == i) {
	600	/* Any entries not picked up here will be picked up on the
	601	* next pass.
	602	*/
	603	packed_entry++ = entry;
	604	}
	605	}
	606	}
	607	/* Now see if we need to insert any of the new entries.
	608	* Note that loop ends up O(hsize*num_entries), so we expect that
	609	* num_entries is always small.
	610	* We also help a little bit by collapsing the entry range when the
	611	* endpoints are inserted. However, an alternative would be to build a
	612	* quick hash lookup for just the new entries.
	613	* Testing shows that this list can easily get up to about 100
	614	* entries, the tradeoff is a malloc, 1 pass over the entries, copying
	615	* them into a sorted buffer, and a free() when done,
	616	*/
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	617	for (unpacked_entry = mini_hash[i];
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	618	unpacked_entry;
	619	unpacked_entry = unpacked_entry->next) {
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	620	assert((unpacked_entry->p_entry->val & hmask) == i);
	621	packed_entry++ = (unpacked_entry->p_entry);
3735.33.4 by John Arbash Meinel The new layout is working.	622	}
	623	/* Now insert some extra nulls */
	624	for (j = 0; j < EXTRA_NULLS; ++j) {
	625	*packed_entry++ = null_entry;
	626	}
	627	}
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	628	free(mini_hash);
3735.33.4 by John Arbash Meinel The new layout is working.	629
	630	/* Sentinel value to indicate the length of the last hash bucket */
	631	packed_hash[hsize] = packed_entry;
	632
	633	if ((packed_entry - (struct index_entry *)mem)
	634	!= (total_num_entries + hsize*EXTRA_NULLS)) {
	635	fprintf(stderr, "We expected %d entries, but created %d\n",
	636	total_num_entries + hsize*EXTRA_NULLS,
	637	(int)(packed_entry - (struct index_entry*)mem));
	638	fflush(stderr);
	639	}
	640	assert((packed_entry - (struct index_entry *)mem)
	641	== (total_num_entries + hsize * EXTRA_NULLS));
	642	index->last_entry = (packed_entry - 1);
	643	return index;
	644	}
	645
	646
	647	void
	648	get_text(char buff[128], const unsigned char *ptr)
	649	{
	650	unsigned int i;
	651	const unsigned char *start;
	652	unsigned char cmd;
	653	start = (ptr-RABIN_WINDOW-1);
	654	cmd = *(start);
	655	if (cmd < 0x80) {// This is likely to be an insert instruction
	656	if (cmd < RABIN_WINDOW) {
	657	cmd = RABIN_WINDOW;
	658	}
	659	} else {
	660	/* This was either a copy [should never be] or it
	661	* was a longer insert so the insert start happened at 16 more
	662	* bytes back.
	663	*/
	664	cmd = RABIN_WINDOW + 1;
	665	}
	666	if (cmd > 60) {
	667	cmd = 60; /* Be friendly to 80char terms */
	668	}
	669	/* Copy the 1 byte command, and 4 bytes after the insert */
	670	cmd += 5;
	671	memcpy(buff, start, cmd);
	672	buff[cmd] = 0;
	673	for (i = 0; i < cmd; ++i) {
	674	if (buff[i] == '\n') {
	675	buff[i] = 'N';
	676	} else if (buff[i] == '\t') {
	677	buff[i] = 'T';
	678	}
	679	}
	680	}
	681
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	682	struct delta_index *
	683	create_delta_index_from_delta(const struct source_info *src,
3735.33.4 by John Arbash Meinel The new layout is working.	684	struct delta_index *old_index)
0.23.45 by John Arbash Meinel Add a function that updates the index for delta bytes.	685	{
3735.33.4 by John Arbash Meinel The new layout is working.	686	unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
3735.33.4 by John Arbash Meinel The new layout is working.	687	unsigned int hash_offset;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	688	const unsigned char data, buffer, *top;
	689	unsigned char cmd;
3735.33.4 by John Arbash Meinel The new layout is working.	690	struct delta_index *new_index;
4634.106.2 by John Arbash Meinel Restructure the internals to be a bit more understandable.	691	struct index_entry entry, entries;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	692
	693	if (!src->buf \|\| !src->size)
	694	return NULL;
	695	buffer = src->buf;
	696	top = buffer + src->size;
	697
	698	/* Determine index hash size. Note that indexing skips the
	699	first byte to allow for optimizing the Rabin's polynomial
	700	initialization in create_delta().
	701	This computes the maximum number of entries that could be held. The
	702	actual number will be recomputed during processing.
	703	*/
3735.31.2 by John Arbash Meinel Cleanup trailing whitespace, get test_source to pass by removing asserts.	704
3735.33.4 by John Arbash Meinel The new layout is working.	705	max_num_entries = (src->size - 1) / RABIN_WINDOW;
	706
	707	/* allocate an array to hold whatever entries we find */
	708	entries = malloc(sizeof(entry) max_num_entries);
	709	if (!entries) /* malloc failure */
	710	return NULL;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	711
	712	/* then populate the index for the new data */
	713	prev_val = ~0;
	714	data = buffer;
	715	/* target size */
4582.2.1 by John Arbash Meinel Use a cast to improve compiler conformance.	716	/* get_delta_hdr_size doesn't mutate the content, just moves the
	717	* start-of-data pointer, so it is safe to do the cast.
	718	*/
	719	get_delta_hdr_size((unsigned char**)&data, top);
3735.33.4 by John Arbash Meinel The new layout is working.	720	entry = entries; /* start at the first slot */
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	721	num_entries = 0; /* calculate the real number of entries */
	722	while (data < top) {
	723	cmd = *data++;
	724	if (cmd & 0x80) {
	725	/* Copy instruction, skip it */
	726	if (cmd & 0x01) data++;
	727	if (cmd & 0x02) data++;
	728	if (cmd & 0x04) data++;
	729	if (cmd & 0x08) data++;
	730	if (cmd & 0x10) data++;
	731	if (cmd & 0x20) data++;
	732	if (cmd & 0x40) data++;
	733	} else if (cmd) {
	734	/* Insert instruction, we want to index these bytes */
	735	if (data + cmd > top) {
	736	/* Invalid insert, not enough bytes in the delta */
	737	break;
	738	}
3735.33.5 by John Arbash Meinel Restoring the debugging for now.	739	/* The create_delta code requires a match at least 4 characters
	740	* (including only the last char of the RABIN_WINDOW) before it
	741	* will consider it something worth copying rather than inserting.
	742	* So we don't want to index anything that we know won't ever be a
	743	* match.
	744	*/
	745	for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	746	data += RABIN_WINDOW) {
	747	unsigned int val = 0;
	748	for (i = 1; i <= RABIN_WINDOW; i++)
	749	val = ((val << 8) \| data[i]) ^ T[val >> RABIN_SHIFT];
	750	if (val != prev_val) {
	751	/* Only keep the first of consecutive data */
	752	prev_val = val;
3735.33.4 by John Arbash Meinel The new layout is working.	753	num_entries++;
	754	entry->ptr = data + RABIN_WINDOW;
	755	entry->val = val;
	756	entry->src = src;
3735.33.2 by John Arbash Meinel Revert some of the previous work.	757	entry++;
3735.33.4 by John Arbash Meinel The new layout is working.	758	if (num_entries > max_num_entries) {
	759	/* We ran out of entry room, something is really wrong
	760	*/
	761	break;
	762	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	763	}
	764	}
	765	/* Move the data pointer by whatever remainder is left */
	766	data += cmd;
	767	} else {
	768	/*
	769	* cmd == 0 is reserved for future encoding
	770	* extensions. In the mean time we must fail when
	771	* encountering them (might be data corruption).
	772	*/
	773	break;
	774	}
	775	}
	776	if (data != top) {
	777	/* Something was wrong with this delta */
3735.33.4 by John Arbash Meinel The new layout is working.	778	free(entries);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	779	return NULL;
	780	}
	781	if (num_entries == 0) {
	782	/ Nothing to index /
3735.33.4 by John Arbash Meinel The new layout is working.	783	free(entries);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	784	return NULL;
	785	}
3735.33.4 by John Arbash Meinel The new layout is working.	786	assert(old_index != NULL);
	787	old_index->last_src = src;
	788	/* See if we can fill in these values into the holes in the array */
	789	entry = entries;
	790	num_inserted = 0;
	791	for (; num_entries > 0; --num_entries, ++entry) {
4634.106.2 by John Arbash Meinel Restructure the internals to be a bit more understandable.	792	struct index_entry next_bucket_entry, cur_entry, *bucket_first_entry;
3735.33.4 by John Arbash Meinel The new layout is working.	793	hash_offset = (entry->val & old_index->hash_mask);
	794	/* The basic structure is a hash => packed_entries that fit in that
	795	* hash bucket. Things are structured such that the hash-pointers are
	796	* strictly ordered. So we start by pointing to the next pointer, and
	797	* walk back until we stop getting NULL targets, and then go back
	798	* forward. If there are no NULL targets, then we know because
	799	* entry->ptr will not be NULL.
	800	*/
4634.106.2 by John Arbash Meinel Restructure the internals to be a bit more understandable.	801	// The start of the next bucket, this may point past the end of the
	802	// entry table if hash_offset is the last bucket.
	803	next_bucket_entry = old_index->hash[hash_offset + 1];
	804	// First entry in this bucket
	805	bucket_first_entry = old_index->hash[hash_offset];
	806	cur_entry = next_bucket_entry - 1;
	807	while (cur_entry->ptr == NULL && cur_entry >= bucket_first_entry) {
	808	cur_entry--;
3735.33.4 by John Arbash Meinel The new layout is working.	809	}
4634.106.2 by John Arbash Meinel Restructure the internals to be a bit more understandable.	810	// cur_entry now either points at the first NULL, or it points to
	811	// next_bucket_entry if there were no blank spots.
	812	cur_entry++;
	813	if (cur_entry >= next_bucket_entry \|\| cur_entry->ptr != NULL) {
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	814	/* There is no room for this entry, we have to resize */
	815	// char buff[128];
	816	// get_text(buff, entry->ptr);
	817	// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
	818	// hash_offset, entry->val, buff);
	819	// for (old_entry = old_index->hash[hash_offset];
	820	// old_entry < old_index->hash[hash_offset+1];
	821	// ++old_entry) {
	822	// get_text(buff, old_entry->ptr);
	823	// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",
	824	// (int)(old_entry - old_index->hash[hash_offset]),
	825	// old_entry->val, old_entry->ptr, buff);
	826	// }
3735.33.4 by John Arbash Meinel The new layout is working.	827	break;
	828	}
	829	num_inserted++;
4634.106.2 by John Arbash Meinel Restructure the internals to be a bit more understandable.	830	cur_entry = entry;
3735.33.4 by John Arbash Meinel The new layout is working.	831	/* For entries which we do manage to insert into old_index, we don't
	832	* want them double copied into the final output.
	833	*/
	834	old_index->num_entries++;
	835	}
	836	if (num_entries > 0) {
	837	/* We couldn't fit the new entries into the old index, so allocate a
	838	* new one, and fill it with stuff.
	839	*/
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	840	// fprintf(stderr, "inserted %d before resize\n", num_inserted);
3735.33.4 by John Arbash Meinel The new layout is working.	841	new_index = create_index_from_old_and_new_entries(old_index,
3735.33.4 by John Arbash Meinel The new layout is working.	842	entry, num_entries);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	843	} else {
3735.33.4 by John Arbash Meinel The new layout is working.	844	new_index = NULL;
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	845	// fprintf(stderr, "inserted %d without resizing\n", num_inserted);
3735.33.4 by John Arbash Meinel The new layout is working.	846	}
	847	free(entries);
	848	return new_index;
0.23.45 by John Arbash Meinel Add a function that updates the index for delta bytes.	849	}
	850
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	851	void free_delta_index(struct delta_index *index)
	852	{
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	853	free(index);
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	854	}
	855
5361.2.2 by John Arbash Meinel Fix some signed integer warnings in diff-delta.c	856	unsigned long
	857	sizeof_delta_index(struct delta_index *index)
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	858	{
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	859	if (index)
	860	return index->memsize;
	861	else
	862	return 0;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	863	}
	864
	865	/*
	866	* The maximum size for any opcode sequence, including the initial header
	867	* plus Rabin window plus biggest copy.
	868	*/
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	869	#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	870
	871	void *
0.23.44 by John Arbash Meinel Remove the multi-index handling now that we have index combining instead.	872	create_delta(const struct delta_index *index,
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	873	const void *trg_buf, unsigned long trg_size,
	874	unsigned long *delta_size, unsigned long max_size)
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	875	{
5361.2.4 by John Arbash Meinel Use signed ints to avoid the compiler warning.	876	unsigned int i, outpos, outsize, moff, val;
	877	int msize;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	878	const struct source_info *msource;
	879	int inscnt;
	880	const unsigned char ref_data, ref_top, data, top;
	881	unsigned char *out;
	882	unsigned long source_size;
	883
	884	if (!trg_buf \|\| !trg_size)
	885	return NULL;
	886	if (index == NULL)
	887	return NULL;
	888
	889	outpos = 0;
	890	outsize = 8192;
	891	if (max_size && outsize >= max_size)
	892	outsize = max_size + MAX_OP_SIZE + 1;
	893	out = malloc(outsize);
	894	if (!out)
	895	return NULL;
	896
	897	source_size = index->last_src->size + index->last_src->agg_offset;
	898
	899	/* store target buffer size */
	900	i = trg_size;
	901	while (i >= 0x80) {
	902	out[outpos++] = i \| 0x80;
	903	i >>= 7;
	904	}
	905	out[outpos++] = i;
	906
	907	data = trg_buf;
	908	top = (const unsigned char *) trg_buf + trg_size;
	909
	910	/* Start the matching by filling out with a simple 'insert' instruction, of
	911	* the first RABIN_WINDOW bytes of the input.
	912	*/
	913	outpos++; /* leave a byte for the insert command */
	914	val = 0;
	915	for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
	916	out[outpos++] = *data;
	917	val = ((val << 8) \| *data) ^ T[val >> RABIN_SHIFT];
	918	}
	919	/* we are now setup with an insert of 'i' bytes and val contains the RABIN
	920	* hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
	921	* input.
	922	*/
	923	inscnt = i;
	924
	925	moff = 0;
	926	msize = 0;
	927	msource = NULL;
	928	while (data < top) {
	929	if (msize < 4096) {
	930	/* we don't have a 'worthy enough' match yet, so let's look for
	931	* one.
	932	*/
	933	struct index_entry *entry;
	934	/* Shift the window by one byte. */
	935	val ^= U[data[-RABIN_WINDOW]];
	936	val = ((val << 8) \| *data) ^ T[val >> RABIN_SHIFT];
	937	i = val & index->hash_mask;
	938	/* TODO: When using multiple indexes like this, the hash tables
	939	* mapping val => index_entry become less efficient.
	940	* You end up getting a lot more collisions in the hash,
	941	* which doesn't actually lead to a entry->val match.
942	*/
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	943	for (entry = index->hash[i];
	944	entry < index->hash[i+1] && entry->src != NULL;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	945	entry++) {
	946	const unsigned char *ref;
	947	const unsigned char *src;
5361.2.4 by John Arbash Meinel Use signed ints to avoid the compiler warning.	948	int ref_size;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	949	if (entry->val != val)
	950	continue;
	951	ref = entry->ptr;
	952	src = data;
	953	ref_data = entry->src->buf;
	954	ref_top = ref_data + entry->src->size;
	955	ref_size = ref_top - ref;
	956	/* ref_size is the longest possible match that we could make
	957	* here. If ref_size <= msize, then we know that we cannot
	958	* match more bytes with this location that we have already
	959	* matched.
	960	*/
5361.2.4 by John Arbash Meinel Use signed ints to avoid the compiler warning.	961	if (ref_size > (top - src))
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	962	ref_size = top - src;
	963	if (ref_size <= msize)
	964	break;
	965	/* See how many bytes actually match at this location. */
	966	while (ref_size-- && src++ == ref)
	967	ref++;
5361.2.4 by John Arbash Meinel Use signed ints to avoid the compiler warning.	968	if (msize < (ref - entry->ptr)) {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	969	/* this is our best match so far */
	970	msize = ref - entry->ptr;
	971	msource = entry->src;
	972	moff = entry->ptr - ref_data;
	973	if (msize >= 4096) /* good enough */
	974	break;
	975	}
	976	}
	977	}
	978
	979	if (msize < 4) {
	980	/* The best match right now is less than 4 bytes long. So just add
	981	* the current byte to the insert instruction. Increment the insert
	982	* counter, and copy the byte of data into the output buffer.
	983	*/
	984	if (!inscnt)
	985	outpos++;
	986	out[outpos++] = *data++;
	987	inscnt++;
	988	if (inscnt == 0x7f) {
	989	/* We have a max length insert instruction, finalize it in the
	990	* output.
	991	*/
	992	out[outpos - inscnt - 1] = inscnt;
	993	inscnt = 0;
	994	}
	995	msize = 0;
	996	} else {
	997	unsigned int left;
	998	unsigned char *op;
	999
	1000	if (inscnt) {
	1001	ref_data = msource->buf;
	1002	while (moff && ref_data[moff-1] == data[-1]) {
	1003	/* we can match one byte back */
	1004	msize++;
	1005	moff--;
	1006	data--;
	1007	outpos--;
	1008	if (--inscnt)
	1009	continue;
	1010	outpos--; /* remove count slot */
	1011	inscnt--; /* make it -1 */
	1012	break;
	1013	}
	1014	out[outpos - inscnt - 1] = inscnt;
	1015	inscnt = 0;
	1016	}
	1017
	1018	/* A copy op is currently limited to 64KB (pack v2) */
	1019	left = (msize < 0x10000) ? 0 : (msize - 0x10000);
	1020	msize -= left;
	1021
	1022	op = out + outpos++;
	1023	i = 0x80;
	1024
	1025	/* moff is the offset in the local structure, for encoding, we need
	1026	* to push it into the global offset
	1027	*/
	1028	assert(moff < msource->size);
	1029	moff += msource->agg_offset;
	1030	assert(moff + msize <= source_size);
	1031	if (moff & 0x000000ff)
	1032	out[outpos++] = moff >> 0, i \|= 0x01;
1033	if (moff & 0x0000ff00)
1034	out[outpos++] = moff >> 8, i \|= 0x02;
1035	if (moff & 0x00ff0000)
1036	out[outpos++] = moff >> 16, i \|= 0x04;
1037	if (moff & 0xff000000)
1038	out[outpos++] = moff >> 24, i \|= 0x08;
1039	/* Put it back into local coordinates, in case we have multiple
1040	* copies in a row.
1041	*/
1042	moff -= msource->agg_offset;
1043
1044	if (msize & 0x00ff)
1045	out[outpos++] = msize >> 0, i \|= 0x10;
1046	if (msize & 0xff00)
1047	out[outpos++] = msize >> 8, i \|= 0x20;
1048
1049	*op = i;
1050
1051	data += msize;
1052	moff += msize;
1053	msize = left;
1054
1055	if (msize < 4096) {
1056	int j;
1057	val = 0;
1058	for (j = -RABIN_WINDOW; j < 0; j++)
1059	val = ((val << 8) \| data[j])
1060	^ T[val >> RABIN_SHIFT];
1061	}
1062	}
1063
1064	if (outpos >= outsize - MAX_OP_SIZE) {
1065	void *tmp = out;
1066	outsize = outsize * 3 / 2;
1067	if (max_size && outsize >= max_size)
1068	outsize = max_size + MAX_OP_SIZE + 1;
1069	if (max_size && outpos > max_size)
1070	break;
1071	out = realloc(out, outsize);
1072	if (!out) {
1073	free(tmp);
1074	return NULL;
1075	}
1076	}
1077	}
1078
1079	if (inscnt)
1080	out[outpos - inscnt - 1] = inscnt;
1081
1082	if (max_size && outpos > max_size) {
1083	free(out);
1084	return NULL;
1085	}
1086
1087	*delta_size = outpos;
1088	return out;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	1089	}
0.23.24 by John Arbash Meinel Change the code so that we can pass in multiple sources to match against.	1090
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	1091	/* vim: et ts=4 sw=4 sts=4
0.23.36 by John Arbash Meinel Track down a memory leak in the refactored diff-delta.c code.	1092	*/