~bzr-pqm/bzr/bzr.dev : contents of bzrlib/diff-delta.c at revision 4304.2.1

~bzr-pqm/bzr/bzr.dev : (revision 4304.2.1)

0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	1	/*
	2	* diff-delta.c: generate a delta between two buffers
	3	*
	4	* This code was greatly inspired by parts of LibXDiff from Davide Libenzi
	5	* http://www.xmailserver.org/xdiff-lib.html
	6	*
	7	* Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
0.17.31 by John Arbash Meinel Bring in the 'rabin' experiment.	8	* Adapted for Bazaar by John Arbash Meinel <john@arbash-meinel.com> (C) 2009
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	9	*
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil Groupcompress from brisbane-core.	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* NB: The version in GIT is 'version 2 of the Licence only', however Nicolas
	16	* has granted permission for use under 'version 2 or later' in private email
	17	* to Robert Collins and Karl Fogel on the 6th April 2009.
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	18	*/
	19
3735.33.4 by John Arbash Meinel The new layout is working.	20	#include <stdio.h>
3735.33.4 by John Arbash Meinel The new layout is working.	21
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	22	#include "delta.h"
0.17.31 by John Arbash Meinel Bring in the 'rabin' experiment.	23	#include <stdlib.h>
	24	#include <string.h>
0.23.5 by John Arbash Meinel Minor changes to get diff-delta.c and patch-delta.c to compile.	25	#include <assert.h>
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	26
	27	/* maximum hash entry list for the same hash bucket */
	28	#define HASH_LIMIT 64
	29
	30	#define RABIN_SHIFT 23
	31	#define RABIN_WINDOW 16
	32
3735.33.13 by John Arbash Meinel Tweak the number of blank spaces up just a tad.	33	/* The hash map is sized to put 4 entries per bucket, this gives us ~even room
	34	* for more data. Tweaking this number above 4 doesn't seem to help much,
	35	* anyway.
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	36	*/
3735.33.13 by John Arbash Meinel Tweak the number of blank spaces up just a tad.	37	#define EXTRA_NULLS 4
3735.33.3 by John Arbash Meinel include_entries_from_hash wasn't properly skipping NULL records.	38
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	39	static const unsigned int T[256] = {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	40	0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
	41	0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
	42	0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
	43	0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
	44	0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
	45	0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
	46	0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
	47	0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
	48	0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
	49	0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
	50	0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
	51	0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
	52	0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
	53	0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
	54	0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
	55	0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
	56	0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
	57	0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
	58	0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
	59	0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
	60	0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
	61	0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
	62	0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
	63	0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
	64	0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
	65	0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
	66	0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
	67	0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
	68	0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
	69	0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
	70	0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
	71	0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
	72	0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
	73	0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
	74	0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
	75	0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
	76	0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
	77	0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
	78	0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
	79	0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
	80	0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
	81	0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
	82	0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	83	};
	84
	85	static const unsigned int U[256] = {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	86	0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
	87	0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
	88	0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
	89	0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
	90	0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
	91	0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
	92	0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
	93	0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
	94	0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
	95	0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
	96	0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
	97	0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
	98	0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
	99	0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
	100	0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
	101	0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
	102	0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
	103	0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
	104	0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
	105	0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
	106	0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
	107	0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
	108	0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
	109	0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
	110	0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
	111	0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
	112	0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
	113	0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
	114	0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
	115	0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
	116	0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
	117	0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
	118	0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
	119	0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
	120	0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
	121	0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
	122	0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
	123	0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
	124	0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
	125	0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
	126	0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
	127	0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
	128	0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	129	};
	130
	131	struct index_entry {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	132	const unsigned char *ptr;
	133	const struct source_info *src;
	134	unsigned int val;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	135	};
	136
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	137	struct index_entry_linked_list {
	138	struct index_entry *p_entry;
	139	struct index_entry_linked_list *next;
	140	};
	141
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	142	struct unpacked_index_entry {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	143	struct index_entry entry;
	144	struct unpacked_index_entry *next;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	145	};
	146
	147	struct delta_index {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	148	unsigned long memsize; /* Total bytes pointed to by this index */
	149	const struct source_info last_src; / Information about the referenced source */
	150	unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
	151	entry */
	152	unsigned int num_entries; /* The total number of entries in this index */
	153	struct index_entry last_entry; / Pointer to the last valid entry */
	154	struct index_entry *hash[];
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	155	};
	156
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	157	static unsigned int
	158	limit_hash_buckets(struct unpacked_index_entry **hash,
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	159	unsigned int *hash_count, unsigned int hsize,
	160	unsigned int entries)
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	161	{
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	162	struct unpacked_index_entry *entry;
	163	unsigned int i;
	164	/*
	165	* Determine a limit on the number of entries in the same hash
	166	* bucket. This guards us against pathological data sets causing
	167	* really bad hash distribution with most entries in the same hash
	168	* bucket that would bring us to O(m*n) computing costs (m and n
	169	* corresponding to reference and target buffer sizes).
	170	*
	171	* Make sure none of the hash buckets has more entries than
	172	* we're willing to test. Otherwise we cull the entry list
	173	* uniformly to still preserve a good repartition across
	174	* the reference buffer.
	175	*/
	176	for (i = 0; i < hsize; i++) {
	177	int acc;
	178
	179	if (hash_count[i] <= HASH_LIMIT)
	180	continue;
	181
	182	/* We leave exactly HASH_LIMIT entries in the bucket */
	183	entries -= hash_count[i] - HASH_LIMIT;
	184
	185	entry = hash[i];
	186	acc = 0;
	187
	188	/*
	189	* Assume that this loop is gone through exactly
	190	* HASH_LIMIT times and is entered and left with
	191	* acc==0. So the first statement in the loop
	192	* contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
	193	* to the accumulator, and the inner loop consequently
	194	* is run (hash_count[i]-HASH_LIMIT) times, removing
	195	* one element from the list each time. Since acc
	196	* balances out to 0 at the final run, the inner loop
	197	* body can't be left with entry==NULL. So we indeed
	198	* encounter entry==NULL in the outer loop only.
	199	*/
	200	do {
	201	acc += hash_count[i] - HASH_LIMIT;
	202	if (acc > 0) {
	203	struct unpacked_index_entry *keep = entry;
	204	do {
	205	entry = entry->next;
	206	acc -= HASH_LIMIT;
	207	} while (acc > 0);
	208	keep->next = entry->next;
	209	}
	210	entry = entry->next;
	211	} while (entry);
	212	}
	213	return entries;
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	214	}
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	215
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	216	static struct delta_index *
	217	pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	218	unsigned int num_entries, struct delta_index *old_index)
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	219	{
3735.33.14 by John Arbash Meinel Simplify the code a bit. We don't repack as often, so go with a	220	unsigned int i, j, hmask, memsize, fit_in_old, copied_count;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	221	struct unpacked_index_entry *entry;
	222	struct delta_index *index;
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	223	struct index_entry packed_entry, packed_hash, old_entry, *copy_from;
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	224	struct index_entry null_entry = {0};
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	225	void *mem;
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	226
	227	hmask = hsize - 1;
	228
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	229	// if (old_index) {
	230	// fprintf(stderr, "Packing %d entries into %d for total of %d entries"
	231	// " %x => %x\n",
	232	// num_entries - old_index->num_entries,
	233	// old_index->num_entries, num_entries,
	234	// old_index->hash_mask, hmask);
	235	// } else {
	236	// fprintf(stderr, "Packing %d entries into a new index\n",
	237	// num_entries);
	238	// }
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	239	/* First, see if we can squeeze the new items into the existing structure.
	240	*/
	241	fit_in_old = 0;
	242	copied_count = 0;
	243	if (old_index && old_index->hash_mask == hmask) {
	244	fit_in_old = 1;
	245	for (i = 0; i < hsize; ++i) {
	246	packed_entry = NULL;
	247	for (entry = hash[i]; entry; entry = entry->next) {
	248	if (packed_entry == NULL) {
	249	/* Find the last open spot */
	250	packed_entry = old_index->hash[i + 1];
	251	--packed_entry;
	252	while (packed_entry >= old_index->hash[i]
	253	&& packed_entry->ptr == NULL) {
	254	--packed_entry;
	255	}
	256	++packed_entry;
	257	}
	258	if (packed_entry >= old_index->hash[i+1]
	259	\|\| packed_entry->ptr != NULL) {
	260	/* There are no free spots here :( */
	261	fit_in_old = 0;
	262	break;
	263	}
	264	/* We found an empty spot to put this entry
	265	* Copy it over, and remove it from the linked list, just in
	266	* case we end up running out of room later.
	267	*/
	268	*packed_entry++ = entry->entry;
	269	assert(entry == hash[i]);
	270	hash[i] = entry->next;
	271	copied_count += 1;
	272	old_index->num_entries++;
	273	}
	274	if (!fit_in_old) {
	275	break;
	276	}
	277	}
	278	}
	279	if (old_index) {
	280	if (fit_in_old) {
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	281	// fprintf(stderr, "Fit all %d entries into old index\n",
	282	// copied_count);
3735.33.14 by John Arbash Meinel Simplify the code a bit. We don't repack as often, so go with a	283	/* No need to allocate a new buffer */
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	284	return NULL;
	285	} else {
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	286	// fprintf(stderr, "Fit only %d entries into old index,"
	287	// " reallocating\n", copied_count);
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	288	}
	289	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	290	/*
	291	* Now create the packed index in array form
	292	* rather than linked lists.
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	293	* Leave a 2-entry gap for inserting more entries between the groups
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	294	*/
	295	memsize = sizeof(*index)
	296	+ sizeof(packed_hash) (hsize+1)
3735.33.3 by John Arbash Meinel include_entries_from_hash wasn't properly skipping NULL records.	297	+ sizeof(packed_entry) (num_entries + hsize * EXTRA_NULLS);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	298	mem = malloc(memsize);
	299	if (!mem) {
	300	return NULL;
	301	}
	302
	303	index = mem;
	304	index->memsize = memsize;
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	305	index->hash_mask = hmask;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	306	index->num_entries = num_entries;
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	307	if (old_index) {
3735.33.16 by John Arbash Meinel Handle when our current packing is sub-optimal.	308	if (hmask < old_index->hash_mask) {
	309	fprintf(stderr, "hash mask was shrunk %x => %x\n",
	310	old_index->hash_mask, hmask);
	311	}
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	312	assert(hmask >= old_index->hash_mask);
	313	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	314
	315	mem = index->hash;
	316	packed_hash = mem;
	317	mem = packed_hash + (hsize+1);
	318	packed_entry = mem;
	319
	320	for (i = 0; i < hsize; i++) {
	321	/*
	322	* Coalesce all entries belonging to one linked list
	323	* into consecutive array entries.
	324	*/
	325	packed_hash[i] = packed_entry;
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	326	/* Old comes earlier as a source, so it always comes first in a given
	327	* hash bucket.
	328	*/
	329	if (old_index) {
	330	/* Could we optimize this to use memcpy when hmask ==
	331	* old_index->hash_mask? Would it make any real difference?
	332	*/
	333	j = i & old_index->hash_mask;
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	334	copy_from = old_index->hash[j];
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	335	for (old_entry = old_index->hash[j];
	336	old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;
	337	old_entry++) {
	338	if ((old_entry->val & hmask) == i) {
3735.33.14 by John Arbash Meinel Simplify the code a bit. We don't repack as often, so go with a	339	packed_entry++ = old_entry;
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	340	}
	341	}
	342	}
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	343	for (entry = hash[i]; entry; entry = entry->next) {
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	344	*packed_entry++ = entry->entry;
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	345	}
3735.33.14 by John Arbash Meinel Simplify the code a bit. We don't repack as often, so go with a	346	/* TODO: At this point packed_entry - packed_hash[i] is the number of
	347	* records that we have inserted into this hash bucket.
	348	* We should really consider doing some limiting along the
	349	* lines of limit_hash_buckets() to avoid pathological behavior.
	350	*/
3735.33.3 by John Arbash Meinel include_entries_from_hash wasn't properly skipping NULL records.	351	/* Now add extra 'NULL' entries that we can use for future expansion. */
	352	for (j = 0; j < EXTRA_NULLS; ++j ) {
	353	*packed_entry++ = null_entry;
	354	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	355	}
	356
	357	/* Sentinel value to indicate the length of the last hash bucket */
	358	packed_hash[hsize] = packed_entry;
	359
3735.33.4 by John Arbash Meinel The new layout is working.	360	if (packed_entry - (struct index_entry *)mem
3735.33.4 by John Arbash Meinel The new layout is working.	361	!= num_entries + hsize*EXTRA_NULLS) {
3735.33.5 by John Arbash Meinel Restoring the debugging for now.	362	fprintf(stderr, "We expected %d entries, but created %d\n",
	363	num_entries + hsize*EXTRA_NULLS,
	364	(int)(packed_entry - (struct index_entry*)mem));
3735.33.4 by John Arbash Meinel The new layout is working.	365	}
3735.33.3 by John Arbash Meinel include_entries_from_hash wasn't properly skipping NULL records.	366	assert(packed_entry - (struct index_entry *)mem
	367	== num_entries + hsize*EXTRA_NULLS);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	368	index->last_entry = (packed_entry - 1);
	369	return index;
0.23.32 by John Arbash Meinel Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.	370	}
	371
3735.33.4 by John Arbash Meinel The new layout is working.	372
0.23.49 by John Arbash Meinel When adding new entries to the delta index, use memcpy	373	struct delta_index *
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	374	create_delta_index(const struct source_info *src,
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	375	struct delta_index *old)
0.23.43 by John Arbash Meinel Change the internals to allow delta indexes to be expanded with new source data.	376	{
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	377	unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
	378	unsigned int total_num_entries;
	379	const unsigned char data, buffer;
	380	struct delta_index *index;
	381	struct unpacked_index_entry entry, *hash;
	382	void *mem;
	383	unsigned long memsize;
	384
	385	if (!src->buf \|\| !src->size)
	386	return NULL;
	387	buffer = src->buf;
	388
	389	/* Determine index hash size. Note that indexing skips the
	390	first byte to allow for optimizing the Rabin's polynomial
	391	initialization in create_delta(). */
	392	num_entries = (src->size - 1) / RABIN_WINDOW;
	393	if (old != NULL)
	394	total_num_entries = num_entries + old->num_entries;
	395	else
	396	total_num_entries = num_entries;
3735.33.8 by John Arbash Meinel Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.	397	hsize = total_num_entries / 4;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	398	for (i = 4; (1u << i) < hsize && i < 31; i++);
	399	hsize = 1 << i;
	400	hmask = hsize - 1;
3735.33.18 by John Arbash Meinel grow the local hmask if it is smaller than expected, don't shrink it.	401	if (old && old->hash_mask > hmask) {
3735.33.16 by John Arbash Meinel Handle when our current packing is sub-optimal.	402	hmask = old->hash_mask;
	403	hsize = hmask + 1;
	404	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	405
	406	/* allocate lookup index */
	407	memsize = sizeof(hash) hsize +
	408	sizeof(entry) total_num_entries;
	409	mem = malloc(memsize);
	410	if (!mem)
	411	return NULL;
	412	hash = mem;
	413	mem = hash + hsize;
	414	entry = mem;
	415
	416	memset(hash, 0, hsize * sizeof(*hash));
	417
	418	/* allocate an array to count hash num_entries */
	419	hash_count = calloc(hsize, sizeof(*hash_count));
	420	if (!hash_count) {
	421	free(hash);
	422	return NULL;
	423	}
	424
	425	/* then populate the index for the new data */
	426	prev_val = ~0;
	427	for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;
	428	data >= buffer;
	429	data -= RABIN_WINDOW) {
	430	unsigned int val = 0;
	431	for (i = 1; i <= RABIN_WINDOW; i++)
	432	val = ((val << 8) \| data[i]) ^ T[val >> RABIN_SHIFT];
	433	if (val == prev_val) {
	434	/* keep the lowest of consecutive identical blocks */
	435	entry[-1].entry.ptr = data + RABIN_WINDOW;
	436	--num_entries;
	437	--total_num_entries;
	438	} else {
	439	prev_val = val;
	440	i = val & hmask;
	441	entry->entry.ptr = data + RABIN_WINDOW;
	442	entry->entry.val = val;
	443	entry->entry.src = src;
	444	entry->next = hash[i];
	445	hash[i] = entry++;
	446	hash_count[i]++;
	447	}
	448	}
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	449	/* TODO: It would be nice to limit_hash_buckets at a better time. */
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	450	total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
	451	total_num_entries);
	452	free(hash_count);
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	453	if (old) {
	454	old->last_src = src;
	455	}
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	456	index = pack_delta_index(hash, hsize, total_num_entries, old);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	457	free(hash);
	458	if (!index) {
	459	return NULL;
	460	}
3735.33.12 by John Arbash Meinel Now we are able to weave 'add_source' into the existing index.	461	index->last_src = src;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	462	return index;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	463	}
	464
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	465	/* Take some entries, and put them into a custom hash.
	466	* @param entries A list of entries, sorted by position in file
	467	* @param num_entries Length of entries
	468	* @param out_hsize The maximum size of the hash, the final size will be
	469	* returned here
	470	*/
	471	struct index_entry_linked_list **
	472	_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	473	unsigned int hsize)
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	474	{
3735.33.11 by John Arbash Meinel Shave 5s->3.3s in add_source by copying old entries across directly.	475	unsigned int hash_offset, hmask, memsize;
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	476	struct index_entry entry, last_entry;
	477	struct index_entry_linked_list out_entry, *hash;
	478	void *mem;
	479
	480	hmask = hsize - 1;
	481
	482	memsize = sizeof(hash) hsize +
	483	sizeof(out_entry) num_entries;
	484	mem = malloc(memsize);
	485	if (!mem)
	486	return NULL;
	487	hash = mem;
	488	mem = hash + hsize;
	489	out_entry = mem;
	490
	491	memset(hash, 0, sizeof(hash)(hsize+1));
	492
	493	/* We know that entries are in the order we want in the output, but they
	494	* aren't "grouped" by hash bucket yet.
	495	*/
	496	last_entry = entries + num_entries;
	497	for (entry = entries + num_entries - 1; entry >= entries; --entry) {
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	498	hash_offset = entry->val & hmask;
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	499	out_entry->p_entry = entry;
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	500	out_entry->next = hash[hash_offset];
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	501	/* TODO: Remove entries that have identical vals, or at least filter
	502	* the map a little bit.
	503	* if (hash[i] != NULL) {
	504	* }
	505	*/
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	506	hash[hash_offset] = out_entry;
	507	++out_entry;
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	508	}
	509	return hash;
	510	}
	511
3735.33.4 by John Arbash Meinel The new layout is working.	512
	513	struct delta_index *
	514	create_index_from_old_and_new_entries(const struct delta_index *old_index,
	515	struct index_entry *entries,
	516	unsigned int num_entries)
	517	{
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	518	unsigned int i, j, hsize, hmask, total_num_entries;
3735.33.4 by John Arbash Meinel The new layout is working.	519	struct delta_index *index;
	520	struct index_entry entry, packed_entry, **packed_hash;
	521	struct index_entry *last_entry, null_entry = {0};
	522	void *mem;
	523	unsigned long memsize;
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	524	struct index_entry_linked_list unpacked_entry, *mini_hash;
3735.33.4 by John Arbash Meinel The new layout is working.	525
	526	/* Determine index hash size. Note that indexing skips the
	527	first byte to allow for optimizing the Rabin's polynomial
	528	initialization in create_delta(). */
	529	total_num_entries = num_entries + old_index->num_entries;
3735.33.8 by John Arbash Meinel Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.	530	hsize = total_num_entries / 4;
3735.33.4 by John Arbash Meinel The new layout is working.	531	for (i = 4; (1u << i) < hsize && i < 31; i++);
	532	hsize = 1 << i;
	533	if (hsize < old_index->hash_mask) {
	534	/* For some reason, there was a code path that would actually shrink
	535	* the hash size. This screws with some later code, and in general, I
	536	* think it better to make the hash bigger, rather than smaller. So
	537	* we'll just force the size here.
	538	* Possibly done by create_delta_index running into a
	539	* limit_hash_buckets call, that ended up transitioning across a
	540	* power-of-2. The cause isn't 100% clear, though.
	541	*/
	542	hsize = old_index->hash_mask + 1;
	543	}
	544	hmask = hsize - 1;
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	545	// fprintf(stderr, "resizing index to insert %d entries into array"
	546	// " with %d entries: %x => %x\n",
	547	// num_entries, old_index->num_entries, old_index->hash_mask, hmask);
3735.33.4 by John Arbash Meinel The new layout is working.	548
	549	memsize = sizeof(*index)
	550	+ sizeof(packed_hash) (hsize+1)
	551	+ sizeof(packed_entry) (total_num_entries + hsize*EXTRA_NULLS);
	552	mem = malloc(memsize);
	553	if (!mem) {
	554	return NULL;
	555	}
	556	index = mem;
	557	index->memsize = memsize;
	558	index->hash_mask = hmask;
	559	index->num_entries = total_num_entries;
	560	index->last_src = old_index->last_src;
	561
	562	mem = index->hash;
	563	packed_hash = mem;
	564	mem = packed_hash + (hsize+1);
	565	packed_entry = mem;
	566
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	567	mini_hash = _put_entries_into_hash(entries, num_entries, hsize);
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	568	if (mini_hash == NULL) {
	569	free(index);
	570	return NULL;
	571	}
3735.33.4 by John Arbash Meinel The new layout is working.	572	last_entry = entries + num_entries;
	573	for (i = 0; i < hsize; i++) {
	574	/*
	575	* Coalesce all entries belonging in one hash bucket
	576	* into consecutive array entries.
	577	* The entries in old_index all come before 'entries'.
	578	*/
	579	packed_hash[i] = packed_entry;
	580	/* Copy any of the old entries across */
	581	/* Would we rather use memcpy? */
	582	if (hmask == old_index->hash_mask) {
	583	for (entry = old_index->hash[i];
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	584	entry < old_index->hash[i+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel The new layout is working.	585	++entry) {
	586	assert((entry->val & hmask) == i);
	587	packed_entry++ = entry;
	588	}
	589	} else {
	590	/* If we resized the index from this action, all of the old values
	591	* will be found in the previous location, but they will end up
	592	* spread across the new locations.
	593	*/
	594	j = i & old_index->hash_mask;
	595	for (entry = old_index->hash[j];
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	596	entry < old_index->hash[j+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel The new layout is working.	597	++entry) {
	598	assert((entry->val & old_index->hash_mask) == j);
	599	if ((entry->val & hmask) == i) {
	600	/* Any entries not picked up here will be picked up on the
	601	* next pass.
	602	*/
	603	packed_entry++ = entry;
	604	}
	605	}
	606	}
	607	/* Now see if we need to insert any of the new entries.
	608	* Note that loop ends up O(hsize*num_entries), so we expect that
	609	* num_entries is always small.
	610	* We also help a little bit by collapsing the entry range when the
	611	* endpoints are inserted. However, an alternative would be to build a
	612	* quick hash lookup for just the new entries.
	613	* Testing shows that this list can easily get up to about 100
	614	* entries, the tradeoff is a malloc, 1 pass over the entries, copying
	615	* them into a sorted buffer, and a free() when done,
	616	*/
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	617	for (unpacked_entry = mini_hash[i];
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	618	unpacked_entry;
	619	unpacked_entry = unpacked_entry->next) {
3735.33.10 by John Arbash Meinel Fix a bug when there is more than one entry (increment out_entry).	620	assert((unpacked_entry->p_entry->val & hmask) == i);
	621	packed_entry++ = (unpacked_entry->p_entry);
3735.33.4 by John Arbash Meinel The new layout is working.	622	}
	623	/* Now insert some extra nulls */
	624	for (j = 0; j < EXTRA_NULLS; ++j) {
	625	*packed_entry++ = null_entry;
	626	}
	627	}
3735.33.9 by John Arbash Meinel When expanding an index put the entries into a hash.	628	free(mini_hash);
3735.33.4 by John Arbash Meinel The new layout is working.	629
	630	/* Sentinel value to indicate the length of the last hash bucket */
	631	packed_hash[hsize] = packed_entry;
	632
	633	if ((packed_entry - (struct index_entry *)mem)
	634	!= (total_num_entries + hsize*EXTRA_NULLS)) {
	635	fprintf(stderr, "We expected %d entries, but created %d\n",
	636	total_num_entries + hsize*EXTRA_NULLS,
	637	(int)(packed_entry - (struct index_entry*)mem));
	638	fflush(stderr);
	639	}
	640	assert((packed_entry - (struct index_entry *)mem)
	641	== (total_num_entries + hsize * EXTRA_NULLS));
	642	index->last_entry = (packed_entry - 1);
	643	return index;
	644	}
	645
	646
	647	void
	648	get_text(char buff[128], const unsigned char *ptr)
	649	{
	650	unsigned int i;
	651	const unsigned char *start;
	652	unsigned char cmd;
	653	start = (ptr-RABIN_WINDOW-1);
	654	cmd = *(start);
	655	if (cmd < 0x80) {// This is likely to be an insert instruction
	656	if (cmd < RABIN_WINDOW) {
	657	cmd = RABIN_WINDOW;
	658	}
	659	} else {
	660	/* This was either a copy [should never be] or it
	661	* was a longer insert so the insert start happened at 16 more
	662	* bytes back.
	663	*/
	664	cmd = RABIN_WINDOW + 1;
	665	}
	666	if (cmd > 60) {
	667	cmd = 60; /* Be friendly to 80char terms */
	668	}
	669	/* Copy the 1 byte command, and 4 bytes after the insert */
	670	cmd += 5;
	671	memcpy(buff, start, cmd);
	672	buff[cmd] = 0;
	673	for (i = 0; i < cmd; ++i) {
	674	if (buff[i] == '\n') {
	675	buff[i] = 'N';
	676	} else if (buff[i] == '\t') {
	677	buff[i] = 'T';
	678	}
	679	}
	680	}
	681
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	682	struct delta_index *
	683	create_delta_index_from_delta(const struct source_info *src,
3735.33.4 by John Arbash Meinel The new layout is working.	684	struct delta_index *old_index)
0.23.45 by John Arbash Meinel Add a function that updates the index for delta bytes.	685	{
3735.33.4 by John Arbash Meinel The new layout is working.	686	unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
3735.33.4 by John Arbash Meinel The new layout is working.	687	unsigned int hash_offset;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	688	const unsigned char data, buffer, *top;
	689	unsigned char cmd;
3735.33.4 by John Arbash Meinel The new layout is working.	690	struct delta_index *new_index;
3735.33.4 by John Arbash Meinel The new layout is working.	691	struct index_entry entry, entries, *old_entry;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	692
	693	if (!src->buf \|\| !src->size)
	694	return NULL;
	695	buffer = src->buf;
	696	top = buffer + src->size;
	697
	698	/* Determine index hash size. Note that indexing skips the
	699	first byte to allow for optimizing the Rabin's polynomial
	700	initialization in create_delta().
	701	This computes the maximum number of entries that could be held. The
	702	actual number will be recomputed during processing.
	703	*/
3735.31.2 by John Arbash Meinel Cleanup trailing whitespace, get test_source to pass by removing asserts.	704
3735.33.4 by John Arbash Meinel The new layout is working.	705	max_num_entries = (src->size - 1) / RABIN_WINDOW;
	706
	707	/* allocate an array to hold whatever entries we find */
	708	entries = malloc(sizeof(entry) max_num_entries);
	709	if (!entries) /* malloc failure */
	710	return NULL;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	711
	712	/* then populate the index for the new data */
	713	prev_val = ~0;
	714	data = buffer;
	715	/* target size */
	716	get_delta_hdr_size(&data, top);
3735.33.4 by John Arbash Meinel The new layout is working.	717	entry = entries; /* start at the first slot */
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	718	num_entries = 0; /* calculate the real number of entries */
	719	while (data < top) {
	720	cmd = *data++;
	721	if (cmd & 0x80) {
	722	/* Copy instruction, skip it */
	723	if (cmd & 0x01) data++;
	724	if (cmd & 0x02) data++;
	725	if (cmd & 0x04) data++;
	726	if (cmd & 0x08) data++;
	727	if (cmd & 0x10) data++;
	728	if (cmd & 0x20) data++;
	729	if (cmd & 0x40) data++;
	730	} else if (cmd) {
	731	/* Insert instruction, we want to index these bytes */
	732	if (data + cmd > top) {
	733	/* Invalid insert, not enough bytes in the delta */
	734	break;
	735	}
3735.33.5 by John Arbash Meinel Restoring the debugging for now.	736	/* The create_delta code requires a match at least 4 characters
	737	* (including only the last char of the RABIN_WINDOW) before it
	738	* will consider it something worth copying rather than inserting.
	739	* So we don't want to index anything that we know won't ever be a
	740	* match.
	741	*/
	742	for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	743	data += RABIN_WINDOW) {
	744	unsigned int val = 0;
	745	for (i = 1; i <= RABIN_WINDOW; i++)
	746	val = ((val << 8) \| data[i]) ^ T[val >> RABIN_SHIFT];
	747	if (val != prev_val) {
	748	/* Only keep the first of consecutive data */
	749	prev_val = val;
3735.33.4 by John Arbash Meinel The new layout is working.	750	num_entries++;
	751	entry->ptr = data + RABIN_WINDOW;
	752	entry->val = val;
	753	entry->src = src;
3735.33.2 by John Arbash Meinel Revert some of the previous work.	754	entry++;
3735.33.4 by John Arbash Meinel The new layout is working.	755	if (num_entries > max_num_entries) {
	756	/* We ran out of entry room, something is really wrong
	757	*/
	758	break;
	759	}
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	760	}
	761	}
	762	/* Move the data pointer by whatever remainder is left */
	763	data += cmd;
	764	} else {
	765	/*
	766	* cmd == 0 is reserved for future encoding
	767	* extensions. In the mean time we must fail when
	768	* encountering them (might be data corruption).
	769	*/
	770	break;
	771	}
	772	}
	773	if (data != top) {
	774	/* Something was wrong with this delta */
3735.33.4 by John Arbash Meinel The new layout is working.	775	free(entries);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	776	return NULL;
	777	}
	778	if (num_entries == 0) {
	779	/ Nothing to index /
3735.33.4 by John Arbash Meinel The new layout is working.	780	free(entries);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	781	return NULL;
	782	}
3735.33.4 by John Arbash Meinel The new layout is working.	783	assert(old_index != NULL);
	784	old_index->last_src = src;
	785	/* See if we can fill in these values into the holes in the array */
	786	entry = entries;
	787	num_inserted = 0;
	788	for (; num_entries > 0; --num_entries, ++entry) {
	789	hash_offset = (entry->val & old_index->hash_mask);
	790	/* The basic structure is a hash => packed_entries that fit in that
	791	* hash bucket. Things are structured such that the hash-pointers are
	792	* strictly ordered. So we start by pointing to the next pointer, and
	793	* walk back until we stop getting NULL targets, and then go back
	794	* forward. If there are no NULL targets, then we know because
	795	* entry->ptr will not be NULL.
	796	*/
	797	old_entry = old_index->hash[hash_offset + 1];
	798	old_entry--;
	799	while (old_entry->ptr == NULL
	800	&& old_entry >= old_index->hash[hash_offset]) {
	801	old_entry--;
	802	}
	803	old_entry++;
	804	if (old_entry->ptr != NULL
	805	\|\| old_entry >= old_index->hash[hash_offset + 1]) {
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	806	/* There is no room for this entry, we have to resize */
	807	// char buff[128];
	808	// get_text(buff, entry->ptr);
	809	// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
	810	// hash_offset, entry->val, buff);
	811	// for (old_entry = old_index->hash[hash_offset];
	812	// old_entry < old_index->hash[hash_offset+1];
	813	// ++old_entry) {
	814	// get_text(buff, old_entry->ptr);
	815	// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",
	816	// (int)(old_entry - old_index->hash[hash_offset]),
	817	// old_entry->val, old_entry->ptr, buff);
	818	// }
3735.33.4 by John Arbash Meinel The new layout is working.	819	break;
	820	}
	821	num_inserted++;
	822	old_entry = entry;
	823	/* For entries which we do manage to insert into old_index, we don't
	824	* want them double copied into the final output.
	825	*/
	826	old_index->num_entries++;
	827	}
	828	if (num_entries > 0) {
	829	/* We couldn't fit the new entries into the old index, so allocate a
	830	* new one, and fill it with stuff.
	831	*/
3735.33.6 by John Arbash Meinel Increasing EXTRA_NULLS to 2 from 1 ups the hit rate	832	// fprintf(stderr, "inserted %d before resize\n", num_inserted);
3735.33.4 by John Arbash Meinel The new layout is working.	833	new_index = create_index_from_old_and_new_entries(old_index,
3735.33.4 by John Arbash Meinel The new layout is working.	834	entry, num_entries);
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	835	} else {
3735.33.4 by John Arbash Meinel The new layout is working.	836	new_index = NULL;
3735.33.15 by John Arbash Meinel Remove the noisy debugging code. (down to 23.1s)	837	// fprintf(stderr, "inserted %d without resizing\n", num_inserted);
3735.33.4 by John Arbash Meinel The new layout is working.	838	}
	839	free(entries);
	840	return new_index;
0.23.45 by John Arbash Meinel Add a function that updates the index for delta bytes.	841	}
	842
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	843	void free_delta_index(struct delta_index *index)
	844	{
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	845	free(index);
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	846	}
	847
	848	unsigned long sizeof_delta_index(struct delta_index *index)
	849	{
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	850	if (index)
	851	return index->memsize;
	852	else
	853	return 0;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	854	}
	855
	856	/*
	857	* The maximum size for any opcode sequence, including the initial header
	858	* plus Rabin window plus biggest copy.
	859	*/
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	860	#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	861
	862	void *
0.23.44 by John Arbash Meinel Remove the multi-index handling now that we have index combining instead.	863	create_delta(const struct delta_index *index,
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	864	const void *trg_buf, unsigned long trg_size,
	865	unsigned long *delta_size, unsigned long max_size)
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	866	{
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	867	unsigned int i, outpos, outsize, moff, msize, val;
	868	const struct source_info *msource;
	869	int inscnt;
	870	const unsigned char ref_data, ref_top, data, top;
	871	unsigned char *out;
	872	unsigned long source_size;
	873
	874	if (!trg_buf \|\| !trg_size)
	875	return NULL;
	876	if (index == NULL)
	877	return NULL;
	878
	879	outpos = 0;
	880	outsize = 8192;
	881	if (max_size && outsize >= max_size)
	882	outsize = max_size + MAX_OP_SIZE + 1;
	883	out = malloc(outsize);
	884	if (!out)
	885	return NULL;
	886
	887	source_size = index->last_src->size + index->last_src->agg_offset;
	888
	889	/* store target buffer size */
	890	i = trg_size;
	891	while (i >= 0x80) {
	892	out[outpos++] = i \| 0x80;
	893	i >>= 7;
	894	}
	895	out[outpos++] = i;
	896
	897	data = trg_buf;
	898	top = (const unsigned char *) trg_buf + trg_size;
	899
	900	/* Start the matching by filling out with a simple 'insert' instruction, of
	901	* the first RABIN_WINDOW bytes of the input.
	902	*/
	903	outpos++; /* leave a byte for the insert command */
	904	val = 0;
	905	for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
	906	out[outpos++] = *data;
	907	val = ((val << 8) \| *data) ^ T[val >> RABIN_SHIFT];
	908	}
	909	/* we are now setup with an insert of 'i' bytes and val contains the RABIN
	910	* hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
	911	* input.
	912	*/
	913	inscnt = i;
	914
	915	moff = 0;
	916	msize = 0;
	917	msource = NULL;
	918	while (data < top) {
	919	if (msize < 4096) {
	920	/* we don't have a 'worthy enough' match yet, so let's look for
	921	* one.
	922	*/
	923	struct index_entry *entry;
	924	/* Shift the window by one byte. */
	925	val ^= U[data[-RABIN_WINDOW]];
	926	val = ((val << 8) \| *data) ^ T[val >> RABIN_SHIFT];
	927	i = val & index->hash_mask;
	928	/* TODO: When using multiple indexes like this, the hash tables
	929	* mapping val => index_entry become less efficient.
	930	* You end up getting a lot more collisions in the hash,
931	* which doesn't actually lead to a entry->val match.
932	*/
3735.33.1 by John Arbash Meinel (broken, in progress), change the data structures to allow for inserting small deltas.	933	for (entry = index->hash[i];
	934	entry < index->hash[i+1] && entry->src != NULL;
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	935	entry++) {
	936	const unsigned char *ref;
	937	const unsigned char *src;
	938	unsigned int ref_size;
	939	if (entry->val != val)
	940	continue;
	941	ref = entry->ptr;
	942	src = data;
	943	ref_data = entry->src->buf;
	944	ref_top = ref_data + entry->src->size;
	945	ref_size = ref_top - ref;
	946	/* ref_size is the longest possible match that we could make
	947	* here. If ref_size <= msize, then we know that we cannot
	948	* match more bytes with this location that we have already
	949	* matched.
	950	*/
	951	if (ref_size > top - src)
	952	ref_size = top - src;
	953	if (ref_size <= msize)
	954	break;
	955	/* See how many bytes actually match at this location. */
	956	while (ref_size-- && src++ == ref)
	957	ref++;
	958	if (msize < ref - entry->ptr) {
	959	/* this is our best match so far */
	960	msize = ref - entry->ptr;
	961	msource = entry->src;
	962	moff = entry->ptr - ref_data;
	963	if (msize >= 4096) /* good enough */
	964	break;
	965	}
	966	}
	967	}
	968
	969	if (msize < 4) {
	970	/* The best match right now is less than 4 bytes long. So just add
	971	* the current byte to the insert instruction. Increment the insert
	972	* counter, and copy the byte of data into the output buffer.
	973	*/
	974	if (!inscnt)
	975	outpos++;
	976	out[outpos++] = *data++;
	977	inscnt++;
	978	if (inscnt == 0x7f) {
	979	/* We have a max length insert instruction, finalize it in the
	980	* output.
	981	*/
	982	out[outpos - inscnt - 1] = inscnt;
	983	inscnt = 0;
	984	}
	985	msize = 0;
	986	} else {
	987	unsigned int left;
	988	unsigned char *op;
	989
	990	if (inscnt) {
	991	ref_data = msource->buf;
	992	while (moff && ref_data[moff-1] == data[-1]) {
	993	/* we can match one byte back */
	994	msize++;
	995	moff--;
	996	data--;
	997	outpos--;
	998	if (--inscnt)
999	continue;
1000	outpos--; /* remove count slot */
1001	inscnt--; /* make it -1 */
1002	break;
1003	}
1004	out[outpos - inscnt - 1] = inscnt;
1005	inscnt = 0;
1006	}
1007
1008	/* A copy op is currently limited to 64KB (pack v2) */
1009	left = (msize < 0x10000) ? 0 : (msize - 0x10000);
1010	msize -= left;
1011
1012	op = out + outpos++;
1013	i = 0x80;
1014
1015	/* moff is the offset in the local structure, for encoding, we need
1016	* to push it into the global offset
1017	*/
1018	assert(moff < msource->size);
1019	moff += msource->agg_offset;
1020	assert(moff + msize <= source_size);
1021	if (moff & 0x000000ff)
1022	out[outpos++] = moff >> 0, i \|= 0x01;
1023	if (moff & 0x0000ff00)
1024	out[outpos++] = moff >> 8, i \|= 0x02;
1025	if (moff & 0x00ff0000)
1026	out[outpos++] = moff >> 16, i \|= 0x04;
1027	if (moff & 0xff000000)
1028	out[outpos++] = moff >> 24, i \|= 0x08;
1029	/* Put it back into local coordinates, in case we have multiple
1030	* copies in a row.
1031	*/
1032	moff -= msource->agg_offset;
1033
1034	if (msize & 0x00ff)
1035	out[outpos++] = msize >> 0, i \|= 0x10;
1036	if (msize & 0xff00)
1037	out[outpos++] = msize >> 8, i \|= 0x20;
1038
1039	*op = i;
1040
1041	data += msize;
1042	moff += msize;
1043	msize = left;
1044
1045	if (msize < 4096) {
1046	int j;
1047	val = 0;
1048	for (j = -RABIN_WINDOW; j < 0; j++)
1049	val = ((val << 8) \| data[j])
1050	^ T[val >> RABIN_SHIFT];
1051	}
1052	}
1053
1054	if (outpos >= outsize - MAX_OP_SIZE) {
1055	void *tmp = out;
1056	outsize = outsize * 3 / 2;
1057	if (max_size && outsize >= max_size)
1058	outsize = max_size + MAX_OP_SIZE + 1;
1059	if (max_size && outpos > max_size)
1060	break;
1061	out = realloc(out, outsize);
1062	if (!out) {
1063	free(tmp);
1064	return NULL;
1065	}
1066	}
1067	}
1068
1069	if (inscnt)
1070	out[outpos - inscnt - 1] = inscnt;
1071
1072	if (max_size && outpos > max_size) {
1073	free(out);
1074	return NULL;
1075	}
1076
1077	*delta_size = outpos;
1078	return out;
0.23.2 by Nicolas Pitre Add the diff-delta.c and patch-delta.c files.	1079	}
0.23.24 by John Arbash Meinel Change the code so that we can pass in multiple sources to match against.	1080
0.23.57 by John Arbash Meinel Change the formatting, replace \t with spaces to be consistent with bzr coding.	1081	/* vim: et ts=4 sw=4 sts=4
0.23.36 by John Arbash Meinel Track down a memory leak in the refactored diff-delta.c code.	1082	*/