Added some stat stuff.

2000-10-12 Not Zed <NotZed@HelixCode.com> * index.h: Added some stat stuff. * wordindex.c (struct _wordcache): Changed files[] to be a pointer to an allocated block/or an individual item. (find): Fix for changes to struct. (find_name): " (sync_cache_entry): " (add): " (add_list): " (add_index_cache): Free the cache file array if it was created. (word_flush): And here. (word_close): And here too. (ibex_create_word_index): Double the size of the hashtables. (word_flush): Make sure we reset the wordcount to 0 if we remove the list items. DOH. (add_index_cache): Use a slightly more sohpisticated aging algorithm to remove expired nodes. 2000-10-10 Not Zed <NotZed@HelixCode.com> * hash.c (hash_find): (hash_remove): (hash_insert): Truncate key if it is too big to fit in a single block to MAX_KEYLEN bytes. svn path=/trunk/; revision=5882
2000-10-12 13:40:55 +00:00
parent 6c337e9ea0
commit 1deca02b41
5 changed files with 187 additions and 26 deletions
--- a/libibex/ChangeLog
+++ b/libibex/ChangeLog
@ -1,3 +1,23 @@
+2000-10-12  Not Zed  <NotZed@HelixCode.com>
+
+	* index.h: Added some stat stuff.
+
+	* wordindex.c (struct _wordcache): Changed files[] to be a pointer
+	to an allocated block/or an individual item.
+	(find): Fix for changes to struct.
+	(find_name):  "
+	(sync_cache_entry): "
+	(add): "
+	(add_list): "
+	(add_index_cache): Free the cache file array if it was created.
+	(word_flush): And here.
+	(word_close): And here too.
+	(ibex_create_word_index): Double the size of the hashtables.
+	(word_flush): Make sure we reset the wordcount to 0 if we remove
+	the list items.  DOH.
+	(add_index_cache): Use a slightly more sohpisticated aging
+	algorithm to remove expired nodes.
+
 2000-10-10  Not Zed  <NotZed@HelixCode.com>

 	* hash.c (hash_find):
--- a/libibex/block.c
+++ b/libibex/block.c
@ -38,6 +38,7 @@
 #define d(x)
 /*#define DEBUG*/

+int block_log;

 #ifdef IBEX_STATS
 static void
@ -202,6 +203,9 @@ ibex_block_dirty(struct _block *block)
 static void
 sync_block(struct _memcache *block_cache, struct _memblock *memblock)
 {
+	if (block_log)
+		printf("writing block %d\n", memblock->block);
+
 	lseek(block_cache->fd, memblock->block, SEEK_SET);
 	if (write(block_cache->fd, &memblock->data, sizeof(memblock->data)) != -1) {
 		memblock->flags &= ~BLOCK_DIRTY;
@ -324,6 +328,9 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
 	add_miss(block_cache, blockid);
 	add_read(block_cache, blockid);
 #endif
+	if (block_log)
+		printf("miss block %d\n", blockid);
+
 	d(printf("loading blockid from disk %d\n", blockid));
 	memblock = g_malloc(sizeof(*memblock));
 	memblock->block = blockid;
@ -345,6 +352,8 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
 				struct _memblock *rootblock = g_hash_table_lookup(block_cache->index, (void *)0);
 				struct _root *root = (struct _root *)&rootblock->data;

+				printf("Unsyncing root block\n");
+
 				g_assert(rootblock != NULL);
 				root->flags &= ~IBEX_ROOT_SYNCF;
 				sync_block(block_cache, rootblock);
--- a/libibex/hash.c
+++ b/libibex/hash.c
@ -190,6 +190,9 @@ static int hash_sync(struct _IBEXIndex *index)

 static int hash_close(struct _IBEXIndex *index)
 {
+#ifdef INDEX_STAT
+	printf("Performed %d lookups, average %f depth\n", index->lookups, (double)index->lookup_total/index->lookups);
+#endif
 	g_free(index);
 	return 0;
 }
@ -259,12 +262,19 @@ hash_find(struct _IBEXIndex *index, const char *key, int keylen)
 	/* and its bucket */
 	hashbucket = table->buckets[hashentry];

+#ifdef INDEX_STAT
+	index->lookups++;
+#endif
 	/* go down the bucket chain, reading each entry till we are done ... */
 	while (hashbucket != 0) {
 		struct _hashblock *bucket;
 		char *start, *end;
 		int ind;

+#ifdef INDEX_STAT
+		index->lookup_total++;
+#endif
+
 		d(printf(" checking bucket %d\n", hashbucket));

 		/* get the real bucket id from the hashbucket id */
--- a/libibex/index.h
+++ b/libibex/index.h
@ -25,10 +25,16 @@

 /* an indexing 'class' maps a key to 1 piece of info */

+#define INDEX_STAT
+
 struct _IBEXIndex {
 	struct _IBEXIndexClass *klass;
 	struct _memcache *blocks;
 	blockid_t root;		/* root block of ondisk index data */
+#ifdef INDEX_STAT
+	int lookups;		/* how many lookups */
+	int lookup_total;	/* how many blocks loaded for all lookups (hash chain depth) */
+#endif
 };

 struct _IBEXIndexClass {
--- a/libibex/wordindex.c
+++ b/libibex/wordindex.c
@ -45,7 +45,7 @@ of words, and could then be discarded (:flush()).
 #define d(x)

 /*#define WORDCACHE_SIZE (256)*/
-#define WORDCACHE_SIZE (10240)
+#define WORDCACHE_SIZE (4096)

 extern struct _IBEXStoreClass ibex_diskarray_class;
 extern struct _IBEXIndexClass ibex_hash_class;
@ -56,15 +56,20 @@ extern struct _IBEXIndexClass ibex_hash_class;
 */


+#define CACHE_FILE_COUNT (62)
+
 struct _wordcache {
 	struct _wordcache *next;
 	struct _wordcache *prev;
 	nameid_t wordid;	/* disk wordid */
 	blockid_t wordblock;	/* head of disk list */
 	blockid_t wordtail;	/* and the tail data */
-	int filecount;		/* how many we have in memory */
-	/*nameid_t files[32];*/	/* memory cache of files */
-	nameid_t files[62];	/* memory cache of files */
+	short filecount;	/* how many valid items in files[] */
+	short filealloc;	/* how much allocated space in files[] */
+	union {
+		nameid_t *files;	/* memory cache of files */
+		nameid_t file0;	/* if filecount == 1 && filealloc == 0, store directly */
+	} file;
 	char word[1];		/* actual word follows */
 };

@ -106,7 +111,7 @@ ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nam
 		printf("opening wordindex root = %d\n", *wordroot);
 		idx->wordindex = ibex_hash_class.open(bc, *wordroot);
 	} else {
-		idx->wordindex = ibex_hash_class.create(bc, 1024);
+		idx->wordindex = ibex_hash_class.create(bc, 2048);
 		*wordroot = idx->wordindex->root;
 		printf("creating wordindex root = %d\n", *wordroot);
 	}
@ -114,7 +119,7 @@ ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nam
 		printf("opening nameindex root = %d\n", *nameroot);
 		idx->nameindex = ibex_hash_class.open(bc, *nameroot);
 	} else {
-		idx->nameindex = ibex_hash_class.create(bc, 1024);
+		idx->nameindex = ibex_hash_class.create(bc, 2048);
 		*nameroot = idx->nameindex->root;
 		printf("creating nameindex root = %d\n", *nameroot);
 	}
@ -126,7 +131,7 @@ static void
 cache_sanity(struct _wordcache *head)
 {
 	while (head->next) {
-		g_assert(head->filecount <= sizeof(head->files)/sizeof(head->files[0]));
+		g_assert(head->filecount <= head->filealloc);
 		g_assert(strlen(head->word) != 0);
 		head = head->next;
 	}
@ -209,7 +214,10 @@ static GPtrArray *find(struct _IBEXWord *idx, const char *word)
 	names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail);
 	/* .. including any memory-only data */
 	if (cache) {
-		g_array_append_vals(names, cache->files, cache->filecount);
+		if (cache->filealloc == 0 && cache->filecount == 1)
+			g_array_append_val(names, cache->file.file0);
+		else
+			g_array_append_vals(names, cache->file.files, cache->filecount);
 	}

 	/* walk it ... converting id's back to strings */
@ -242,9 +250,14 @@ static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *w
 		/* freshen cache entry if we touch it */
 		ibex_list_remove((struct _listnode *)cache);
 		ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
-		for (i=0;i<cache->filecount;i++) {
-			if (cache->files[i] == nameid)
+		if (cache->filecount == 1 && cache->filealloc == 0) {
+			if (cache->file.file0 == nameid)
 				return TRUE;
+		} else {
+			for (i=0;i<cache->filecount;i++) {
+				if (cache->file.files[i] == nameid)
+					return TRUE;
+			}
 		}
 		/* not there?  well we can use the wordid anyway */
 		wordid = cache->wordid;
@ -265,8 +278,11 @@ sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache)
 	GArray array; /* just use this as a header */
 	blockid_t oldblock, oldtail;
 	
-	d(printf("syncing cache entry '%s'\n", cache->word));
-	array.data = (char *)cache->files;
+	d(printf("syncing cache entry '%s' used %d\n", cache->word, cache->filecount));
+	if (cache->filecount == 1 && cache->filealloc == 0)
+		array.data = (char *)&cache->file.file0;
+	else
+		array.data = (char *)cache->file.files;
 	array.len = cache->filecount;
 	oldblock = cache->wordblock;
 	oldtail = cache->wordtail;
@ -305,13 +321,35 @@ add_index_cache(struct _IBEXWord *idx, const char *word)
 	if (cache == 0) {
 		/* see if we have to flush off the last entry */
 		if (idx->wordcount >= WORDCACHE_SIZE) {
+			struct _wordcache *mincache;
+			int min, count=0;
 			/* remove last entry, and flush it */
 			cache = (struct _wordcache *)idx->wordnodes.tailpred;
+			mincache = cache;
+			min = mincache->filecount;
+
 			d(printf("flushing word from cache %s\n", cache->word));
-			ibex_list_remove((struct _listnode *)cache);
-			g_hash_table_remove(idx->wordcache, cache->word);
-			sync_cache_entry(idx, cache);
-			g_free(cache);
+			/* instead of just using the last entry, we try and find an entry with
+			   with only 1 item (failing that, the smallest in the range we look at) */
+			/* this could probably benefit greatly from a more sophisticated aging algorithm */
+			while (cache->next && count < 100) {
+				if (cache->filecount == 1) {
+					mincache = cache;
+					break;
+				}
+				if (cache->filecount > 0 && cache->filecount < min) {
+					mincache = cache;
+					min = cache->filecount;
+				}
+				cache = cache->next;
+				count++;
+			}
+			ibex_list_remove((struct _listnode *)mincache);
+			g_hash_table_remove(idx->wordcache, mincache->word);
+			sync_cache_entry(idx, mincache);
+			if (mincache->filealloc)
+				g_free(mincache->file.files);
+			g_free(mincache);
 			idx->wordcount--;
 		}
 		cache = g_malloc0(sizeof(*cache)+strlen(word));
@ -347,14 +385,34 @@ static void add(struct _IBEXWord *idx, const char *name, const char *word)
 	add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);

 	/* check for repeats of the last name - dont add anything */
-	if (cache->files[cache->filecount] == nameid)
-		return;
+	if (cache->filecount == 1 && cache->filealloc == 0) {
+		if (cache->file.file0 == nameid)
+			return;
+	} else {
+		if (cache->file.files[cache->filecount] == nameid)
+			return;
+	}
+
+	/* see if we are setting the first, drop it in the union */
+	if (cache->filecount == 0 && cache->filealloc == 0) {
+		cache->file.file0 = nameid;
+	} else if (cache->filecount == 1 && cache->filealloc == 0) {
+		nameid_t saveid;
+		/* we need to allocate space for words */
+		saveid = cache->file.file0;
+		cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT);
+		/* this could possibly grow as needed, but i wont for now */
+		cache->filealloc = CACHE_FILE_COUNT;
+		cache->file.files[0] = saveid;
+		cache->file.files[1] = nameid;
+	} else {
+		cache->file.files[cache->filecount] = nameid;
+	}

-	/* yay, now insert the nameindex into the word list, and the word index into the name list */
-	cache->files[cache->filecount] = nameid;
 	cache->filecount++;
+
 	/* if we are full, force a flush now */
-	if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
+	if (cache->filealloc && cache->filecount >= cache->filealloc) {
 		sync_cache_entry(idx, cache);
 	}

@ -391,14 +449,33 @@ static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words)

 		/*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/

-		/* check for duplicates; doesn't catch duplicates over an overflow
-		   boundary.  Watch me care. */
+		/* check for duplicates; doesn't catch duplicates over an overflow boundary.  Watch me care. */
 		if (cache->filecount == 0
-		    || cache->files[cache->filecount-1] != nameid) {
-			cache->files[cache->filecount] = nameid;
+		    /* the 1 item case */
+		    || (cache->filecount == 1 && cache->filealloc == 0 && cache->file.file0 != nameid)
+		    /* the normal case */
+		    || (cache->filealloc > 0 && cache->file.files[cache->filecount-1] != nameid)) {
+
+			/* see if we are setting the first, drop it in the union */
+			if (cache->filecount == 0 && cache->filealloc == 0) {
+				cache->file.file0 = nameid;
+			} else if (cache->filecount == 1 && cache->filealloc == 0) {
+				nameid_t saveid;
+				/* we need to allocate space for words */
+				saveid = cache->file.file0;
+				cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT);
+				/* this could possibly grow as needed, but i wont for now */
+				cache->filealloc = CACHE_FILE_COUNT;
+				cache->file.files[0] = saveid;
+				cache->file.files[1] = nameid;
+			} else {
+				cache->file.files[cache->filecount] = nameid;
+			}
+
 			cache->filecount++;
+
 			/* if we are full, force a flush now */
-			if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
+			if (cache->filealloc && cache->filecount >= cache->filealloc) {
 				sync_cache_entry(idx, cache);
 			}

@ -452,30 +529,69 @@ static int
 word_flush(struct _IBEXWord *idx)
 {
 	struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
+	extern int block_log;
+	int count= 0;
+	int used=0, wasted=0;
+
+	block_log = 0;

 	next = cache->next;
 	while (next) {
+		count++;
+		used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t));
+		if (cache->filealloc)
+			wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t);
+		else
+			wasted += (1-cache->filecount) * sizeof(nameid_t);
+
+		/*printf("syncing word %s\n", cache->word);*/
 		sync_cache_entry(idx, cache);
 		g_hash_table_remove(idx->wordcache, cache->word);
+		if (cache->filealloc)
+			g_free(cache->file.files);
 		g_free(cache);
 		cache = next;
 		next = cache->next;
 	}
+
+	printf("sync cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted);
+
+	block_log = 0;
 	ibex_list_new(&idx->wordnodes);
+	idx->wordcount = 0;
 	return 0;
 }

 static int word_close(struct _IBEXWord *idx)
 {
 	struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
+	extern int block_log;
+	int count= 0;
+	int used=0, wasted=0;
+
+	block_log = 0;

 	next = cache->next;
 	while (next) {
+		count++;
+		used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t));
+		if (cache->filealloc)
+			wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t);
+		else
+			wasted += (1-cache->filecount) * sizeof(nameid_t);
+
+		/*printf("closing word %s\n", cache->word);*/
 		sync_cache_entry(idx, cache);
+		if (cache->filealloc)
+			g_free(cache->file.files);
 		g_free(cache);
 		cache = next;
 		next = cache->next;
 	}
+	block_log = 0;
+
+	printf("cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted);
+
 	idx->namestore->klass->close(idx->namestore);
 	idx->nameindex->klass->close(idx->nameindex);
 	/*same as namestore: