** Merged from IBEX_DISK branch to head.

2000-09-19 Not Zed <NotZed@HelixCode.com> ** Merged from IBEX_DISK branch to head. svn path=/trunk/; revision=5500
2000-09-19 12:22:00 +00:00
parent 122b2426db
commit 288e7bb1fb
16 changed files with 3681 additions and 924 deletions
--- a/libibex/ChangeLog
+++ b/libibex/ChangeLog
@ -1,3 +1,60 @@
+2000-09-19  Not Zed  <NotZed@HelixCode.com>
+
+	** Merged from IBEX_DISK branch to head.
+	
+	* file.c:
+	* find.c:
+	* words.c:
+	* index.c: Removed unused files.
+
+	* block.h: Changed block to use only 24 bits for next and 8 for
+	used, and fixed all relevant code.  Some cleanup.
+	
+	* disktail.c (tail_get): If we use an empty tail node, then make
+	sure we make it dirty.
+
+2000-09-15  Not Zed  <NotZed@HelixCode.com>
+
+	* wordindex.c (word_close): Free hashtable on exit too.
+
+	* disktail.c: Implemented tail-node storage for the end of long
+	lists, or for short lists.  Should save significant disk space
+	(5x?).
+	Implemented special case for 1-item lists, where the tailnode
+	pointer is used to store the index entry.
+
+2000-09-14  Not Zed  <NotZed@HelixCode.com>
+
+	* wordindex.c (add_index_key): Keys also handle tails.
+
+	* hash.c (hash_set_data_block): Added new parameter to keys - a
+	tail block (a full 32 bit block pointer).
+	(hash_get_data_block): And same here.
+
+2000-09-12  Not Zed  <NotZed@HelixCode.com>
+
+	* wordindex.c (word_close): Dont close namestore twice.
+
+2000-09-11  Not Zed  <NotZed@HelixCode.com>
+
+	** Redid almost everything, on-disk hash table to store an index
+	to index records, mroe on the way to modularisation (more to go),
+	now stores reverse indexes for deleting.
+
+2000-08-31  Not Zed  <NotZed@HelixCode.com>
+
+	* block.c (add_key_mem): Initialise a memory based array for newly
+	added index entries.
+	(add_record): Changed to cache updates in memory until we hit a
+	limit, and then flush them to disk.
+	(get_record): Merge in-memory records with disk records.
+	(remove_record): Remove from memory first, and if that fails, goto
+	disk.
+	(find_record): Check memory first, then disk if that fails.
+	(add_datum_list): oops, copy size * sizeof(blockid_t)
+	(add_indexed): Make sure we link in the head node when we create a
+	new one.
+
 2000-08-09  Christopher James Lahey  <clahey@helixcode.com>

 	* file.c, find.c: Fixed some warnings.
--- a/libibex/Makefile.am
+++ b/libibex/Makefile.am
@ -2,20 +2,35 @@

 noinst_LTLIBRARIES = libibex.la

-libibex_la_SOURCES = file.c index.c find.c words.c ibex.h
+libibex_la_SOURCES = \
+	wordindex.c \
+	block.c ibex.h \
+	hash.c \
+	disktail.c \
+	ibex_block.c
+
 libibex_la_LDFLAGS = -static

-noinst_HEADERS = ibex_internal.h
+noinst_HEADERS = \
+	ibex_internal.h \
+	block.h \
+	wordindex.h \
+	index.h

 INCLUDES = -I$(srcdir) $(GLIB_CFLAGS) $(UNICODE_CFLAGS) \
 		-DG_LOG_DOMAIN=\"libibex\"


-noinst_PROGRAMS = mkindex lookup
+#noinst_PROGRAMS = hash

-mkindex_SOURCES = mkindex.c
-mkindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
+#hash_SOURCES = disktail.c
+#hash_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)

-lookup_SOURCES = lookup.c
-lookup_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
+#noinst_PROGRAMS = mkindex lookup
+#
+#mkindex_SOURCES = mkindex.c
+#mkindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
+#
+#lookup_SOURCES = lookup.c
+#lookup_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)

--- a/libibex/block.c
+++ b/libibex/block.c
@ -0,0 +1,481 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+  block file/cache/utility functions
+*/
+
+#include <stdio.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <glib.h>
+
+#include "block.h"
+
+#define d(x)
+/*#define DEBUG*/
+
+
+#ifdef IBEX_STATS
+static void
+init_stats(struct _memcache *index)
+{
+	index->stats = g_hash_table_new(g_direct_hash, g_direct_equal);
+}
+
+static void
+dump_1_stat(int id, struct _stat_info *info, struct _memcache *index)
+{
+	printf("%d %d %d %d %d\n", id, info->read, info->write, info->cache_hit, info->cache_miss);
+}
+
+static void
+dump_stats(struct _memcache *index)
+{
+	printf("Block reads writes hits misses\n");
+	g_hash_table_foreach(index->stats, dump_1_stat, index);
+}
+
+static void
+add_read(struct _memcache *index, int id)
+{
+	struct _stat_info *info;
+
+	info = g_hash_table_lookup(index->stats, (void *)id);
+	if (info == NULL) {
+		info = g_malloc0(sizeof(*info));
+		g_hash_table_insert(index->stats, (void *)id, info);
+	}
+	info->read++;
+}
+
+static void
+add_write(struct _memcache *index, int id)
+{
+	struct _stat_info *info;
+
+	info = g_hash_table_lookup(index->stats, (void *)id);
+	if (info == NULL) {
+		info = g_malloc0(sizeof(*info));
+		g_hash_table_insert(index->stats, (void *)id, info);
+	}
+	info->write++;
+}
+
+static void
+add_hit(struct _memcache *index, int id)
+{
+	struct _stat_info *info;
+
+	info = g_hash_table_lookup(index->stats, (void *)id);
+	if (info == NULL) {
+		info = g_malloc0(sizeof(*info));
+		g_hash_table_insert(index->stats, (void *)id, info);
+	}
+	info->cache_hit++;
+}
+
+static void
+add_miss(struct _memcache *index, int id)
+{
+	struct _stat_info *info;
+
+	info = g_hash_table_lookup(index->stats, (void *)id);
+	if (info == NULL) {
+		info = g_malloc0(sizeof(*info));
+		g_hash_table_insert(index->stats, (void *)id, info);
+	}
+	info->cache_miss++;
+}
+#endif /* IBEX_STATS */
+
+
+/* simple list routines (for simplified memory management of cache/lists) */
+
+/**
+ * ibex_list_new:
+ * @v: 
+ * 
+ * Initialise a list header.  A list header must always be initialised
+ * before use.
+ **/
+void ibex_list_new(struct _list *v)
+{
+	v->head = (struct _listnode *)&v->tail;
+	v->tail = 0;
+	v->tailpred = (struct _listnode *)&v->head;
+}
+
+/**
+ * ibex_list_addhead:
+ * @l: List.
+ * @n: Node to append.
+ * 
+ * Prepend a listnode to the head of the list @l.
+ * 
+ * Return value: Always @n.
+ **/
+struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n)
+{
+	n->next = l->head;
+	n->prev = (struct _listnode *)&l->head;
+	l->head->prev = n;
+	l->head = n;
+	return n;
+}
+
+/**
+ * ibex_list_addtail:
+ * @l: 
+ * @n: 
+ * 
+ * Append a listnode to the end of the list @l.
+ * 
+ * Return value: Always the same as @n.
+ **/
+struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n)
+{
+	n->next = (struct _listnode *)&l->tail;
+	n->prev = l->tailpred;
+	l->tailpred->next = n;
+	l->tailpred = n;
+	return n;
+}
+
+/**
+ * ibex_list_remove:
+ * @n: The node to remove.
+ * 
+ * Remove a listnode from a list.
+ * 
+ * Return value: Always the same as @n.
+ **/
+struct _listnode *ibex_list_remove(struct _listnode *n)
+{
+	n->next->prev = n->prev;
+	n->prev->next = n->next;
+	return n;
+}
+
+static struct _memblock *
+memblock_addr(struct _block *block)
+{
+	return (struct _memblock *)(((char *)block) - G_STRUCT_OFFSET(struct _memblock, data));
+}
+
+/**
+ * ibex_block_dirty:
+ * @block: 
+ * 
+ * Dirty a block.  This will cause it to be written to disk on
+ * a cache sync, or when the block is flushed from the cache.
+ **/
+void
+ibex_block_dirty(struct _block *block)
+{
+	memblock_addr(block)->flags |= BLOCK_DIRTY;
+}
+
+static void
+sync_block(struct _memcache *block_cache, struct _memblock *memblock)
+{
+	lseek(block_cache->fd, memblock->block, SEEK_SET);
+	if (write(block_cache->fd, &memblock->data, sizeof(memblock->data)) != -1) {
+		memblock->flags &= ~BLOCK_DIRTY;
+	}
+#ifdef IBEX_STATS
+	add_write(block_cache, memblock->block);
+#endif
+}
+
+/**
+ * ibex_block_cache_sync:
+ * @block_cache: 
+ * 
+ * Ensure the block cache is fully synced to disk.
+ **/
+void
+ibex_block_cache_sync(struct _memcache *block_cache)
+{
+	struct _memblock *memblock;
+
+	memblock = (struct _memblock *)block_cache->nodes.head;
+	while (memblock->next) {
+		if (memblock->flags & BLOCK_DIRTY) {
+			sync_block(block_cache, memblock);
+		}
+		memblock = memblock->next;
+	}
+	fsync(block_cache->fd);
+
+#ifdef IBEX_STATS
+	dump_stats(block_cache);
+#endif
+
+}
+
+/**
+ * ibex_block_cache_flush:
+ * @block_cache: 
+ * 
+ * Ensure the block cache is fully synced to disk, and then flush
+ * its contents from memory.
+ **/
+void
+ibex_block_cache_flush(struct _memcache *block_cache)
+{
+	struct _memblock *mw, *mn;
+
+	ibex_block_cache_sync(block_cache);
+
+	mw = (struct _memblock *)block_cache->nodes.head;
+	mn = mw->next;
+	while (mn) {
+		g_hash_table_remove(block_cache->index, (void *)mw->block);
+		g_free(mw);
+		mw = mn;
+		mn = mn->next;
+	}
+	ibex_list_new(&block_cache->nodes);
+}
+
+
+/**
+ * ibex_block_read:
+ * @block_cache: 
+ * @blockid: 
+ * 
+ * Read the data of a block by blockid.  The data contents is backed by
+ * the block cache, and should be considered static.
+ *
+ * TODO; should this return a NULL block on error?
+ *
+ * Return value: The address of the block data (which may be cached).
+ **/
+struct _block *
+ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
+{
+	struct _memblock *memblock;
+
+	{
+		/* assert blockid<roof */
+		if (blockid > 0) {
+			struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
+			g_assert(blockid < root->roof);
+		}
+	}
+
+	memblock = g_hash_table_lookup(block_cache->index, (void *)blockid);
+	if (memblock) {
+		d(printf("foudn blockid in cache %d = %p\n", blockid, &memblock->data));
+		/* 'access' page */
+		ibex_list_remove((struct _listnode *)memblock);
+		ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock);
+#ifdef IBEX_STATS
+		add_hit(block_cache, memblock->block);
+#endif
+		return &memblock->data;
+	}
+#ifdef IBEX_STATS
+	add_miss(block_cache, blockid);
+	add_read(block_cache, blockid);
+#endif
+	d(printf("loading blockid from disk %d\n", blockid));
+	memblock = g_malloc(sizeof(*memblock));
+	memblock->block = blockid;
+	memblock->flags = 0;
+	lseek(block_cache->fd, blockid, SEEK_SET);
+	memset(&memblock->data, 0, sizeof(memblock->data));
+	read(block_cache->fd, &memblock->data, sizeof(memblock->data));
+	ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock);
+	g_hash_table_insert(block_cache->index, (void *)blockid, memblock);
+	if (block_cache->count >= CACHE_SIZE) {
+		struct _memblock *old = (struct _memblock *)block_cache->nodes.head;
+		d(printf("discaring cache block %d\n", old->block));
+		g_hash_table_remove(block_cache->index, (void *)old->block);
+		ibex_list_remove((struct _listnode *)old);
+		if (old->flags & BLOCK_DIRTY) {
+			sync_block(block_cache, old);
+		}
+		g_free(old);
+	} else {
+		block_cache->count++;
+	}
+
+	d(printf("  --- cached blocks : %d\n", block_cache->count));
+
+	return &memblock->data;
+}
+
+/**
+ * ibex_block_cache_open:
+ * @name: 
+ * @flags: Flags as to open(2), should use O_RDWR and optionally O_CREAT.
+ * @mode: Mose as to open(2)
+ * 
+ * Open a block file.
+ * 
+ * FIXME; this currently also initialises the word and name indexes
+ * because their pointers are stored in the root block.  Should be
+ * upto the caller to manage these pointers/data.
+ *
+ * Return value: NULL if the backing file could not be opened.
+ **/
+struct _memcache *
+ibex_block_cache_open(const char *name, int flags, int mode)
+{
+	struct _root *root;
+	struct _memcache *block_cache = g_malloc0(sizeof(*block_cache));
+
+	d(printf("opening ibex file: %s", name));
+
+	/* setup cache */
+	ibex_list_new(&block_cache->nodes);
+	block_cache->count = 0;
+	block_cache->index = g_hash_table_new(g_direct_hash, g_direct_equal);
+	block_cache->fd = open(name, flags, mode);
+
+	if (block_cache->fd == -1) {
+		g_hash_table_destroy(block_cache->index);
+		g_free(block_cache);
+		return NULL;
+	}
+
+	root = (struct _root *)ibex_block_read(block_cache, 0);
+	if (root->roof == 0 || memcmp(root->version, "ibx3", 4)) {
+		d(printf("Initialising superblock\n"));
+		/* reset root data */
+		memcpy(root->version, "ibx3", 4);
+		root->roof = 1024;
+		root->free = 0;
+		root->words = 0;
+		root->names = 0;
+		root->tail = 0;	/* list of tail blocks */
+		ibex_block_dirty((struct _block *)root);
+	} else {
+		d(printf("superblock already initialised:\n"
+			 " roof = %d\n free = %d\n words = %d\n names = %d\n",
+			 root->roof, root->free, root->words, root->names));
+	}
+	/* this should be moved higher up */
+	{
+		struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
+
+		block_cache->words = ibex_create_word_index(block_cache, &root->words, &root->names);
+	}
+
+#ifdef IBEX_STATS
+	init_stats(block_cache);
+#endif
+
+	return block_cache;
+}
+
+/**
+ * ibex_block_cache_close:
+ * @block_cache: 
+ * 
+ * Close the block file, sync any remaining cached data
+ * to disk, and free all resources.
+ **/
+void
+ibex_block_cache_close(struct _memcache *block_cache)
+{
+	struct _memblock *mw, *mn;
+
+	ibex_block_cache_sync(block_cache);
+	close(block_cache->fd);
+
+	mw = (struct _memblock *)block_cache->nodes.head;
+	mn = mw->next;
+	while (mn) {
+		g_free(mw);
+		mw = mn;
+		mn = mw->next;
+	}
+
+	g_hash_table_destroy(block_cache->index);
+
+	g_free(block_cache);
+}
+
+/**
+ * ibex_block_free:
+ * @block_cache: 
+ * @blockid: 
+ * 
+ * Return a block to the free pool.
+ **/
+void
+ibex_block_free(struct _memcache *block_cache, blockid_t blockid)
+{
+	struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
+	struct _block *block = ibex_block_read(block_cache, blockid);
+
+	block->next = root->free;
+	root->free = blockid;
+	ibex_block_dirty((struct _block *)root);
+	ibex_block_dirty((struct _block *)block);
+}
+
+/**
+ * ibex_block_get:
+ * @block_cache: 
+ * 
+ * Allocate a new block, or access a previously freed block and return
+ * its block id.  The block will have zeroed contents.
+ * 
+ * Return value: 0 if there are no blocks left (disk full/read only
+ * file, etc).
+ **/
+blockid_t
+ibex_block_get(struct _memcache *block_cache)
+{
+	struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
+	struct _block *block;
+	blockid_t head;
+
+	if (root->free) {
+		head = root->free;
+		block = ibex_block_read(block_cache, head);
+		root->free = block_location(block->next);
+	} else {
+		/* TODO: check the block will fit first */
+		/* TODO: no need to read this block, can allocate it manually (saves a syscall/read) */
+		head = root->roof;
+		root->roof += BLOCK_SIZE;
+		block = ibex_block_read(block_cache, head);
+	}
+
+	g_assert(head != 0);
+
+	d(printf("new block = %d\n", head));
+	block->next = 0;
+	block->used = 0;
+	ibex_block_dirty(block);
+	ibex_block_dirty((struct _block *)root);
+	return head;
+}
--- a/libibex/block.h
+++ b/libibex/block.h
@ -0,0 +1,102 @@
+
+/* public interfaces for block io routines */
+
+#ifndef _BLOCK_H
+#define _BLOCK_H
+
+/*#define IBEX_STATS*/		/* define to get/dump block access stats */
+
+#include <glib.h>
+
+typedef guint32 nameid_t;
+typedef guint32 blockid_t;
+
+#define BLOCK_BITS (8)
+#define BLOCK_SIZE (1<<BLOCK_BITS)
+#define CACHE_SIZE 256		/* blocks in disk cache */
+
+/* root block */
+struct _root {
+	char version[4];
+
+	blockid_t free;		/* list of free blocks */
+	blockid_t roof;		/* top of allocated space, everything below is in a free or used list */
+	blockid_t tail;		/* list of 'tail' blocks */
+
+	blockid_t words;	/* root of words index */
+	blockid_t names;	/* root of names index */
+};
+
+/* basic disk structure for (data) blocks */
+struct _block {
+	unsigned int next:32-BLOCK_BITS;	/* next block */
+	unsigned int used:BLOCK_BITS;		/* number of elements used */
+
+	nameid_t bl_data[(BLOCK_SIZE-4)/4];	/* references */
+};
+
+/* custom list structure, for a simple/efficient cache */
+struct _listnode {
+	struct _listnode *next;
+	struct _listnode *prev;
+};
+struct _list {
+	struct _listnode *head;
+	struct _listnode *tail;
+	struct _listnode *tailpred;
+};
+
+void ibex_list_new(struct _list *v);
+struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n);
+struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n);
+struct _listnode *ibex_list_remove(struct _listnode *n);
+
+/* in-memory structure for block cache */
+struct _memblock {
+	struct _memblock *next;
+	struct _memblock *prev;
+
+	blockid_t block;
+	int flags;
+
+	struct _block data;
+};
+#define BLOCK_DIRTY (1<<0)
+
+struct _memcache {
+	struct _list nodes;
+	int count;		/* nodes in cache */
+
+	GHashTable *index;	/* blockid->memblock mapping */
+	int fd;			/* file fd */
+
+#ifdef IBEX_STATS
+	GHashTable *stats;
+#endif
+	/* temporary here */
+	struct _IBEXWord *words; /* word index */
+};
+
+#ifdef IBEX_STATS
+struct _stat_info {
+	int read;
+	int write;
+	int cache_hit;
+	int cache_miss;
+};
+#endif /* IBEX_STATS */
+
+struct _memcache *ibex_block_cache_open(const char *name, int flags, int mode);
+void ibex_block_cache_close(struct _memcache *block_cache);
+void ibex_block_cache_sync(struct _memcache *block_cache);
+void ibex_block_cache_flush(struct _memcache *block_cache);
+
+blockid_t ibex_block_get(struct _memcache *block_cache);
+void ibex_block_free(struct _memcache *block_cache, blockid_t blockid);
+void ibex_block_dirty(struct _block *block);
+struct _block *ibex_block_read(struct _memcache *block_cache, blockid_t blockid);
+
+#define block_number(x) ((x)>>BLOCK_BITS)
+#define block_location(x) ((x)<<BLOCK_BITS)
+
+#endif /* ! _BLOCK_H */
--- a/libibex/diskarray.c
+++ b/libibex/diskarray.c
@ -0,0 +1,255 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* a disk based array storage class */
+
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <glib.h>
+
+#include "block.h"
+#include "index.h"
+
+#define d(x)
+/*#define DEBUG*/
+
+static struct _IBEXStore *disk_create(struct _memcache *bc);
+static int disk_sync(struct _IBEXStore *store);
+static int disk_close(struct _IBEXStore *store);
+
+static blockid_t disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data);
+static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data);
+static blockid_t disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data);
+static void disk_free(struct _IBEXStore *store, blockid_t head);
+
+static gboolean disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data);
+static GArray *disk_get(struct _IBEXStore *store, blockid_t head);
+
+struct _IBEXStoreClass ibex_diskarray_class = {
+	disk_create, disk_sync, disk_close,
+	disk_add, disk_add_list,
+	disk_remove, disk_free,
+	disk_find, disk_get
+};
+
+static struct _IBEXStore *disk_create(struct _memcache *bc)
+{
+	struct _IBEXStore *store;
+
+	store = g_malloc0(sizeof(*store));
+	store->klass = &ibex_diskarray_class;
+	store->blocks = bc;
+
+	return store;
+}
+
+static int disk_sync(struct _IBEXStore *store)
+{
+	/* no cache, nop */
+	return 0;
+}
+
+static int disk_close(struct _IBEXStore *store)
+{
+	g_free(store);
+	return 0;
+}
+
+static blockid_t
+disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data)
+{
+	struct _block *block;
+	struct _block *newblock;
+	blockid_t new;
+
+	if (head == 0) {
+		head = ibex_block_get(store->blocks);
+	}
+	block = ibex_block_read(store->blocks, head);
+
+	d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next));
+
+	if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) {
+		d(printf("adding record into block %d  %d\n", head, data));
+		block->bl_data[block->used] = data;
+		block->used++;
+		ibex_block_dirty(block);
+		return head;
+	} else {
+		new = ibex_block_get(store->blocks);
+		newblock = ibex_block_read(store->blocks, new);
+		newblock->next = head;
+		newblock->bl_data[0] = data;
+		newblock->used = 1;
+		d(printf("adding record into new %d  %d, next =%d\n", new, data, newblock->next));
+		ibex_block_dirty(newblock);
+		return new;
+	}
+}
+
+static blockid_t
+disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data)
+{
+	struct _block *block;
+	struct _block *newblock;
+	blockid_t new;
+	int copied = 0;
+	int left, space, tocopy;
+
+	if (head == 0) {
+		head = ibex_block_get(store->blocks);
+	}
+	block = ibex_block_read(store->blocks, head);
+
+	while (copied < data->len) {
+		left = data->len - copied;
+		space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used;
+		if (space) {
+			tocopy = MIN(left, space);
+			memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+			block->used += tocopy;
+			ibex_block_dirty(block);
+		} else {
+			new = ibex_block_get(store->blocks);
+			newblock = ibex_block_read(store->blocks, new);
+			newblock->next = head;
+			tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0]));
+			memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+			newblock->used = tocopy;
+			block = newblock;
+			head = new;
+			ibex_block_dirty(newblock);
+		}
+		copied += tocopy;
+	}
+	return head;
+}
+
+static blockid_t
+disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data)
+{
+	blockid_t node = head;
+
+	d(printf("removing %d from %d\n", data, head));
+	while (node) {
+		struct _block *block = ibex_block_read(store->blocks, node);
+		int i;
+
+		for (i=0;i<block->used;i++) {
+			if (block->bl_data[i] == data) {
+				struct _block *start = ibex_block_read(store->blocks, head);
+
+				start->used--;
+				block->bl_data[i] = start->bl_data[start->used];
+				if (start->used == 0) {
+					struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0);
+					blockid_t new;
+
+					d(printf("dropping block %d, new head = %d\n", head, start->next));
+					new = start->next;
+					start->next = root->free;
+					root->free = head;
+					head = new;
+					ibex_block_dirty((struct _block *)root);
+				}
+				ibex_block_dirty(block);
+				ibex_block_dirty(start);
+				return head;
+			}
+		}
+		node = block->next;
+	}
+	return head;
+}
+
+static void disk_free(struct _IBEXStore *store, blockid_t head)
+{
+	blockid_t next;
+	struct _block *block;
+
+	while (head) {
+		block = ibex_block_read(store->blocks, head);
+		next = block->next;
+		ibex_block_free(store->blocks, head);
+		head = next;
+	}
+}
+
+static gboolean
+disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data)
+{
+	blockid_t node = head;
+
+	d(printf("finding %d from %d\n", data, head));
+	while (node) {
+		struct _block *block = ibex_block_read(store->blocks, node);
+		int i;
+
+		for (i=0;i<block->used;i++) {
+			if (block->bl_data[i] == data) {
+				return TRUE;
+			}
+		}
+		node = block->next;
+	}
+	return FALSE;
+}
+
+static GArray *
+disk_get(struct _IBEXStore *store, blockid_t head)
+{
+	GArray *result = g_array_new(0, 0, sizeof(nameid_t));
+
+	while (head) {
+		struct _block *block = ibex_block_read(store->blocks, head);
+
+		d(printf("getting data from block %d\n", head));
+
+		g_array_append_vals(result, block->bl_data, block->used);
+		head = block->next;
+		d(printf("next = %d\n", head));
+	}
+	return result;
+}
+
+void
+ibex_diskarray_dump(struct _memcache *blocks, blockid_t head)
+{
+	blockid_t node = head;
+
+	printf("dumping list %d\n", node);
+	while (node) {
+		struct _block *block = ibex_block_read(blocks, node);
+		int i;
+
+		printf(" block %d used %d\n ", node, block->used);
+		for (i=0;i<block->used;i++) {
+			printf(" %d", block->bl_data[i]);
+		}
+		printf("\n");
+		node = block->next;
+	}
+}
--- a/libibex/disktail.c
+++ b/libibex/disktail.c
@ -0,0 +1,807 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* a disk based array storage class that stores the tails of data lists
+   in common blocks */
+
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <glib.h>
+#include <string.h>
+
+#include <stdio.h>
+
+#include "block.h"
+#include "index.h"
+
+#define d(x)
+/*#define DEBUG*/
+
+/* marker to define which root keys indicate a single length key */
+#define BLOCK_ONE (1<<BLOCK_BITS)
+
+/* tail blocks only contain tail data ... */
+/* and we pack it in, similar to the key data, only more compact */
+struct _tailblock {
+	unsigned int next:32-BLOCK_BITS; /* only needs to point to block numbers */
+	unsigned int used:BLOCK_BITS; /* how many entries are used */
+	union {
+		unsigned char offset[BLOCK_SIZE-4]; /* works upto blocksize of 1024 bytes */
+		nameid_t data[(BLOCK_SIZE-4)/4];
+	} tailblock_u;
+};
+#define tb_offset tailblock_u.offset
+#define tb_data tailblock_u.data
+
+/* map a tail index to a block index */
+#define TAIL_INDEX(b) ((b) & (BLOCK_SIZE-1))
+/* map a tail index to a block number */
+#define TAIL_BLOCK(b) ((b) & ~(BLOCK_SIZE-1))
+/* map a block + index to a tailid */
+#define TAIL_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1)))
+
+#define TAIL_THRESHOLD ((BLOCK_SIZE-8)/6)
+
+static struct _IBEXStore *disk_create(struct _memcache *bc);
+static int disk_sync(struct _IBEXStore *store);
+static int disk_close(struct _IBEXStore *store);
+
+static blockid_t disk_add(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
+static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data);
+static blockid_t disk_remove(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
+static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail);
+
+static gboolean disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data);
+static GArray *disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail);
+
+struct _IBEXStoreClass ibex_diskarray_class = {
+	disk_create, disk_sync, disk_close,
+	disk_add, disk_add_list,
+	disk_remove, disk_free,
+	disk_find, disk_get
+};
+
+
+static int
+tail_info(struct _tailblock *bucket, nameid_t tailid, blockid_t **startptr)
+{
+	blockid_t *start, *end;
+	int index;
+
+	/* get start/end of area to zap */
+	index = TAIL_INDEX(tailid);
+	start = &bucket->tb_data[bucket->tb_offset[index]];
+	if (index == 0) {
+		end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])];
+	} else {
+		end = &bucket->tb_data[bucket->tb_offset[index-1]];
+	}
+	if (startptr)
+		*startptr = start;
+	return end-start;
+}
+
+/* compresses (or expand) the bucket entry, to the new size */
+static void
+tail_compress(struct _tailblock *bucket, int index, int newsize)
+{
+	int i;
+	blockid_t *start, *end, *newstart;
+
+	/* get start/end of area to zap */
+	start = &bucket->tb_data[bucket->tb_offset[index]];
+	if (index == 0) {
+		end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])];
+	} else {
+		end = &bucket->tb_data[bucket->tb_offset[index-1]];
+	}
+
+	if (end-start == newsize)
+		return;
+
+	/*
+	     XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy
+	     0                           20   25
+
+	     newsize = 0
+	     end = 25
+	     newstart = 0
+	     start = 20
+
+	     newstart+(end-start)-newsize = 5
+	     i = start-newstart
+
+	     XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy
+	     0                           20   25
+
+	     newsize = 2
+	     end = 25
+	     newstart = 0
+	     start = 20
+
+	     newstart+(end-start)-newsize = 3
+	     i = start-newstart + MIN(end-start, newsize)) = 22
+
+	     XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+	     5 	                         25
+	     newsize = 5
+	     end = 25
+	     start = 25
+	     newstart = 5
+
+	     newstart+(end-start)-newsize = 0
+	     i = start-newstart = 20
+
+	     XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyy
+	     3 	                         23 25
+	     newsize = 5
+	     end = 25
+	     start = 23
+	     newstart = 3
+
+	     newstart+(end-start)-newsize = 0
+	     i = start-newstart + MIN(end-start, newsize) = 22
+
+	*/
+
+
+	/* fixup data */
+	newstart = &bucket->tb_data[bucket->tb_offset[bucket->used-1]];
+
+	g_assert(newstart+(end-start)-newsize <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]);
+	g_assert(newstart + (start-newstart) + MIN(end-start, newsize) <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]);
+	g_assert(newstart+(end-start)-newsize >= &bucket->tb_offset[bucket->used]);
+	g_assert(newstart + (start-newstart) + MIN(end-start, newsize) >= &bucket->tb_offset[bucket->used]);
+
+	memmove(newstart+(end-start)-newsize, newstart, ((start-newstart)+MIN(end-start, newsize)) * sizeof(blockid_t));
+
+	/* fixup key pointers */
+	for (i=index;i<bucket->used;i++) {
+		bucket->tb_offset[i] += (end-start)-newsize;
+	}
+	ibex_block_dirty((struct _block *)bucket);
+}
+
+/*
+  returns the number of blockid's free
+*/
+static int
+tail_space(struct _tailblock *tail)
+{
+	if (tail->used == 0)
+		return sizeof(tail->tb_data)/sizeof(tail->tb_data[0])-1;
+
+	return  &tail->tb_data[tail->tb_offset[tail->used-1]]
+		- (blockid_t *)&tail->tb_offset[tail->used];
+}
+
+static void
+tail_dump(struct _memcache *blocks, blockid_t tailid)
+{
+	int i;
+	struct _tailblock *tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid));
+
+	printf("Block %d, used %d\n", tailid, tail->used);
+	for (i=0;i<sizeof(struct _tailblock)/sizeof(unsigned int);i++) {
+		printf(" %08x", ((unsigned int *)tail)[i]);
+	}
+	printf("\n");
+}
+
+static blockid_t
+tail_get(struct _memcache *blocks, int size)
+{
+	struct _root *root = (struct _root *)ibex_block_read(blocks, 0);
+	blockid_t tailid;
+	struct _tailblock *tail;
+	int freeindex;
+	blockid_t *end;
+	int i, count = 0;
+
+	d(printf("looking for a tail node with %d items in it\n", size));
+
+	/* look for a node with enough space, if we dont find it fairly
+	   quickly, just quit.  needs a better free algorithm i think ... */
+	tailid = root->tail;
+	while (tailid && count<5) {
+		int space;
+
+		d(printf(" checking tail node %d\n", tailid));
+
+		tail = (struct _tailblock *)ibex_block_read(blocks, tailid);
+		if (tail->used == 0) {
+			/* assume its big enough ... */
+			tail->used = 1;
+			tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size;
+			d(printf("allocated %d (%d), used %d\n", tailid, tailid, tail->used));
+			ibex_block_dirty((struct _block *)tail);
+			return tailid;
+		}
+
+		/* see if we have a free slot first */
+		freeindex = -1;
+		end = &tail->tb_data[sizeof(tail->tb_data)/sizeof(tail->tb_data[0])];
+		for (i=0;i<tail->used;i++) {
+			if (end == &tail->tb_data[tail->tb_offset[i]]) {
+				freeindex = i;
+				break;
+			}
+			end = &tail->tb_data[tail->tb_offset[i]];
+		}
+
+		/* determine how much space we have available - incl any extra header we might need */
+		space =  ((char *)&tail->tb_data[tail->tb_offset[tail->used-1]])
+			- ((char *)&tail->tb_offset[tail->used])
+			/* FIXMEL work out why this is out a little bit */
+			- 8;
+		if (freeindex == -1)
+			space -= sizeof(tail->tb_offset[0]);
+
+		/* if we have enough, set it up, creating a new entry if necessary */
+		/* for some really odd reason the compiler promotes this expression to unsigned, hence
+		   the requirement for the space>0 check ... */
+		if (space>0 && space > size*sizeof(blockid_t)) {
+			d(printf("space = %d, size = %d size*sizeof() = %d truth = %d\n", space, size, size*sizeof(blockid_t), space>size*sizeof(blockid_t)));
+			if (freeindex == -1) {
+				freeindex = tail->used;
+				tail->tb_offset[tail->used] = tail->tb_offset[tail->used-1];
+				tail->used++;
+			}
+			tail_compress(tail, freeindex, size);
+			ibex_block_dirty((struct _block *)tail);
+			d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, freeindex), tail->used));
+			return TAIL_KEY(tailid, freeindex);
+		}
+		count++;
+		tailid = block_location(tail->next);
+	}
+
+	d(printf("allocating new data node for tail data\n"));
+	/* didn't find one, setup a new one */
+	root = (struct _root *)ibex_block_read(blocks, 0);
+	tailid = ibex_block_get(blocks);
+	tail = (struct _tailblock *)ibex_block_read(blocks, tailid);
+	tail->next = block_number(root->tail);
+	root->tail = tailid;
+	tail->used = 1;
+	tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size;
+	ibex_block_dirty((struct _block *)tail);
+	d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, 0), tail->used));
+	return TAIL_KEY(tailid, 0);
+}
+
+static void
+tail_free(struct _memcache *blocks, blockid_t tailid)
+{
+	struct _tailblock *tail;
+
+	d(printf("freeing tail id %d\n", tailid));
+
+	if (tailid == 0)
+		return;
+
+	tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid));
+	d(printf("  tail %d used %d\n", tailid, tail->used));
+	g_assert(tail->used);
+	if (TAIL_INDEX(tailid)  == tail->used - 1) {
+		tail->used --;
+	} else {
+		tail_compress(tail, TAIL_INDEX(tailid), 0);
+	}
+	ibex_block_dirty((struct _block *)tail);
+}
+
+static struct _IBEXStore *disk_create(struct _memcache *bc)
+{
+	struct _IBEXStore *store;
+
+	store = g_malloc0(sizeof(*store));
+	store->klass = &ibex_diskarray_class;
+	store->blocks = bc;
+
+	return store;
+}
+
+static int disk_sync(struct _IBEXStore *store)
+{
+	/* no cache, nop */
+	return 0;
+}
+
+static int disk_close(struct _IBEXStore *store)
+{
+	g_free(store);
+	return 0;
+}
+
+static blockid_t
+disk_add(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data)
+{
+	struct _block *block;
+	struct _block *newblock;
+	blockid_t new, head = *headptr /*, tail = *tailptr*/;
+
+	g_error("Inbimplemented");
+
+	if (head == 0) {
+		head = ibex_block_get(store->blocks);
+	}
+	block = ibex_block_read(store->blocks, head);
+
+	d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next));
+
+	if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) {
+		(printf("adding record into block %d  %d\n", head, data));
+		block->bl_data[block->used] = data;
+		block->used++;
+		ibex_block_dirty(block);
+		return head;
+	} else {
+		new = ibex_block_get(store->blocks);
+		newblock = ibex_block_read(store->blocks, new);
+		newblock->next = block_number(head);
+		newblock->bl_data[0] = data;
+		newblock->used = 1;
+		d(printf("adding record into new %d  %d, next =%d\n", new, data, newblock->next));
+		ibex_block_dirty(newblock);
+		return new;
+	}
+}
+
+static blockid_t
+disk_add_blocks_internal(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data)
+{
+	blockid_t head = *headptr, tail = *tailptr, new;
+	int tocopy;
+	struct _tailblock *tailblock;
+	struct _block *block, *newblock;
+	int space, copied = 0, left;
+
+	/* assumes this funciton is in control of any tail creation */ 
+	g_assert(tail == 0);
+
+	d(printf("Adding %d items to block list\n", data->len));
+
+	if (head == 0) {
+		head = ibex_block_get(store->blocks);
+		d(printf("allocating new head %d\n", head));
+	}
+	block = ibex_block_read(store->blocks, head);
+
+	/* ensure the first block is full before proceeding */
+	space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used;
+	if (space) {
+		tocopy = MIN(data->len, space);
+		memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+		block->used += tocopy;
+		ibex_block_dirty(block);
+		copied = tocopy;
+		d(printf("copied %d items to left over last node\n", tocopy));
+	}
+
+	while (copied < data->len) {
+		left = data->len - copied;
+		/* do we drop the rest in a tail? */
+		if (left < TAIL_THRESHOLD) {
+			d(printf("Storing remaining %d items in tail\n", left));
+			tocopy = left;
+			new = tail_get(store->blocks, tocopy);
+			tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new));
+			memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(new)]],
+			       &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+			*tailptr = new;
+		} else {
+			new = ibex_block_get(store->blocks);
+			newblock = (struct _block *)ibex_block_read(store->blocks, new);
+			newblock->next = block_number(head);
+			tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0]));
+			d(printf("storing %d items in own block\n", tocopy));
+			memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+			newblock->used = tocopy;
+			block = newblock;
+			head = new;
+			ibex_block_dirty(newblock);
+		}
+		copied += tocopy;
+	}
+
+	*headptr = head;
+	return head;
+}
+/*
+  case 1:
+    no head, no tail, adding a lot of data
+      add blocks until the last, which goes in a tail node
+  case 2:
+    no head, no tail, adding a little data
+      just add a tail node
+  case 3:
+    no head, tail, total exceeds a block
+      merge as much as possible into new full blocks, then the remainder in the tail
+  case 4:
+    no head, tail, room in existing tail for data
+      add new data to tail node
+  case 5:
+    no head, tail, no room in existing tail for data
+      add a new tail node, copy data across, free old tail node
+*/
+
+static blockid_t
+disk_add_list(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data)
+{
+	blockid_t new, head = *headptr, tail = *tailptr, *start;
+	struct _tailblock *tailblock, *tailnew;
+	int len;
+	GArray *tmpdata = NULL;
+
+	d(printf("adding %d items head=%d tail=%d\n", data->len, head, tail));
+	if (data->len == 0)
+		return head;
+
+	/* store length=1 data in the tail pointer */
+	if (head == 0 && tail == 0 && data->len == 1) {
+		*headptr = BLOCK_ONE;
+		*tailptr = g_array_index(data, blockid_t, 0);
+		return BLOCK_ONE;
+	}
+	/* if we got length=1 data to append to, upgrade it to a real block list */
+	if (head == BLOCK_ONE) {
+		tmpdata = g_array_new(0, 0, sizeof(blockid_t));
+		g_array_append_vals(tmpdata, data->data, data->len);
+		g_array_append_val(tmpdata, tail);
+		head = *headptr = 0;
+		tail = *tailptr = 0;
+	}
+
+	/* if we have no head, then check the tail */
+	if (head == 0) {
+		if (tail == 0) {
+			if (data->len >= TAIL_THRESHOLD) {
+				/* add normally */
+				head = disk_add_blocks_internal(store, headptr, tailptr, data);
+			} else {
+				/* else add to a tail block */
+				new = tail_get(store->blocks, data->len);
+				d(printf("adding %d items to a tail block %d\n", data->len, new));
+				tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new));
+				memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]],
+				       data->data, data->len*sizeof(blockid_t));
+				*tailptr = new;
+				ibex_block_dirty((struct _block *)tailnew);
+			}
+		} else {
+			tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+			len = tail_info(tailblock, tail, &start);
+			/* case 3 */
+			if (len + data->len >= TAIL_THRESHOLD) {
+				/* this is suboptimal, but should work - merge the tail data with
+				   our new data, and write it out */
+				if (tmpdata == NULL) {
+					tmpdata = g_array_new(0, 0, sizeof(blockid_t));
+					g_array_append_vals(tmpdata, data->data, data->len);
+				}
+				g_array_append_vals(tmpdata, start, len);
+				*tailptr = 0;
+				tail_free(store->blocks, tail);
+				head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata);
+			} else if (tail_space(tailblock) >= data->len) {
+				/* can we just expand this in our node, or do we need a new one? */
+				tail_compress(tailblock, TAIL_INDEX(tail), data->len + len);
+				memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(tail)] + len],
+				       data->data, data->len * sizeof(blockid_t));
+				ibex_block_dirty((struct _block *)tailblock);
+			} else {
+				/* we need to allocate a new tail node */
+				new = tail_get(store->blocks, data->len + len);
+				/* and copy the data across */
+				tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new));
+				memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]],
+				       start, len*sizeof(blockid_t));
+				memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]+len],
+				       data->data, data->len*sizeof(blockid_t));
+				tail_free(store->blocks, tail);
+				ibex_block_dirty((struct _block *)tailnew);
+				*tailptr = new;
+			}
+		}
+	} else {
+		if (tail) {
+			/* read/merge the tail with the new data, rewrite out.
+			   suboptimal, but it should be 'ok' ? */
+			tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+			len = tail_info(tailblock, tail, &start);
+			tmpdata = g_array_new(0, 0, sizeof(blockid_t));
+			g_array_append_vals(tmpdata, start, len);
+			g_array_append_vals(tmpdata, data->data, data->len);
+			*tailptr = 0;
+			tail_free(store->blocks, tail);
+			head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata);
+		} else {
+			head = disk_add_blocks_internal(store, headptr, tailptr, data);
+		}
+	}
+	if (tmpdata)
+		g_array_free(tmpdata, TRUE);
+
+	return head;
+}
+
+static blockid_t
+disk_remove(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data)
+{
+	blockid_t head = *headptr, node = *headptr, tail = *tailptr;
+	int i;
+
+	/* special case for 1-item nodes */
+	if (head == BLOCK_ONE) {
+		if (tail == data) {
+			*tailptr = 0;
+			*headptr = 0;
+			head = 0;
+		}
+		return head;
+	}
+
+	d(printf("removing %d from %d\n", data, head));
+	while (node) {
+		struct _block *block = ibex_block_read(store->blocks, node);
+
+		for (i=0;i<block->used;i++) {
+			if (block->bl_data[i] == data) {
+				struct _block *start = ibex_block_read(store->blocks, head);
+
+				d(printf("removing data from block %d\n ", head));
+
+				start->used--;
+				block->bl_data[i] = start->bl_data[start->used];
+				if (start->used == 0) {
+					struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0);
+					blockid_t new;
+
+					d(printf("dropping block %d, new head = %d\n", head, start->next));
+					new = block_location(start->next);
+					start->next = block_number(root->free);
+					root->free = head;
+					head = new;
+					ibex_block_dirty((struct _block *)root);
+				}
+				ibex_block_dirty(block);
+				ibex_block_dirty(start);
+				*headptr = head;
+				return head;
+			}
+		}
+		node = block_location(block->next);
+	}
+
+	if (tail) {
+		struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+		int len;
+		blockid_t *start;
+
+		len = tail_info(tailblock, tail, &start);
+		for (i=0;i<len;i++) {
+			if (start[i] == data) {
+				for (;i<len-1;i++)
+					start[i] = start[i+1];
+				if (len == 1)
+					*tailptr = 0;
+				if (len == 1 && (tailblock->used-1) == TAIL_INDEX(tail)) {
+					d(printf("dropping/unlinking tailblock %d (%d) used = %d\n",
+						 TAIL_BLOCK(tail), tail, tailblock->used));
+					tailblock->used--;
+					/* drop/unlink block? */
+					ibex_block_dirty((struct _block *)tailblock);
+					*tailptr = 0;
+				} else {
+					tail_compress(tailblock, TAIL_INDEX(tail), len-1);
+					ibex_block_dirty((struct _block *)tailblock);
+				}
+			}
+		}
+		
+	}
+
+	return head;
+}
+
+static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail)
+{
+	blockid_t next;
+	struct _block *block;
+
+	if (head == BLOCK_ONE)
+		return;
+
+	while (head) {
+		d(printf("freeing block %d\n", head));
+		block = ibex_block_read(store->blocks, head);
+		next = block_location(block->next);
+		ibex_block_free(store->blocks, head);
+		head = next;
+	}
+	if (tail) {
+		struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+		d(printf("freeing tail block %d (%d)\n", TAIL_BLOCK(tail), tail));
+		tail_compress(tailblock, TAIL_INDEX(tail), 0);
+		ibex_block_dirty((struct _block *)tailblock);
+	}
+}
+
+static gboolean
+disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data)
+{
+	blockid_t node = head;
+	int i;
+
+	d(printf("finding %d from %d\n", data, head));
+
+	if (head == BLOCK_ONE)
+		return data == tail;
+
+	/* first check full blocks */
+	while (node) {
+		struct _block *block = ibex_block_read(store->blocks, node);
+
+		for (i=0;i<block->used;i++) {
+			if (block->bl_data[i] == data) {
+				return TRUE;
+			}
+		}
+		node = block_location(block->next);
+	}
+
+	/* then check tail */
+	if (tail) {
+		struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+		int len;
+		blockid_t *start;
+
+		len = tail_info(tailblock, tail, &start);
+		for (i=0;i<len;i++) {
+			if (start[i] == data)
+				return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+static GArray *
+disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail)
+{
+	GArray *result = g_array_new(0, 0, sizeof(nameid_t));
+
+	if (head == BLOCK_ONE) {
+		g_array_append_val(result, tail);
+		return result;
+	}
+
+	while (head) {
+		struct _block *block = ibex_block_read(store->blocks, head);
+
+		d(printf("getting data from block %d\n", head));
+
+		g_array_append_vals(result, block->bl_data, block->used);
+		head = block_location(block->next);
+		d(printf("next = %d\n", head));
+	}
+
+	/* chuck on any tail data as well */
+	if (tail) {
+		struct _tailblock *tailblock;
+		int len;
+		blockid_t *start;
+		
+		tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+		len = tail_info(tailblock, tail, &start);
+		g_array_append_vals(result, start, len);
+	}
+	return result;
+}
+
+void
+ibex_diskarray_dump(struct _memcache *blocks, blockid_t head, blockid_t tail)
+{
+	blockid_t node = head;
+
+	printf("dumping list %d tail %d\n", node, tail);
+	if (head == BLOCK_ONE) {
+		printf(" 1 length index: %d\n", tail);
+		return;
+	}
+       
+	while (node) {
+		struct _block *block = ibex_block_read(blocks, node);
+		int i;
+
+		printf(" block %d used %d\n ", node, block->used);
+		for (i=0;i<block->used;i++) {
+			printf(" %d", block->bl_data[i]);
+		}
+		printf("\n");
+		node = block_location(block->next);
+	}
+
+	printf("tail: ");
+	if (tail) {
+		struct _tailblock *tailblock;
+		int len;
+		blockid_t *start;
+		int i;
+
+		tailblock = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tail));
+		len = tail_info(tailblock, tail, &start);
+		for (i=0;i<len;i++)
+			printf(" %d", start[i]);
+	}
+	printf("\n");
+}
+
+#if 0
+int main(int argc, char **argv)
+{
+	struct _memcache *bc;
+	struct _IBEXStore *disk;
+	int i;
+	blockid_t data[100];
+	GArray adata;
+	blockid_t head=0, tail=0;
+
+	for (i=0;i<100;i++) {
+		data[i] = i;
+	}
+
+	bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600);
+	disk = ibex_diskarray_class.create(bc);
+
+	head = 0;
+	tail = 0;
+	adata.data = data;
+	adata.len = 70;
+	for (i=0;i<100;i++) {
+		printf("Loop %d\n", i);
+		ibex_diskarray_class.add_list(disk, &head, &tail, &adata);
+		ibex_diskarray_dump(bc, head, tail);
+	}
+	
+#if 0
+	for (i=1;i<100;i++) {
+		printf("inserting %d items\n", i);
+		adata.data = data;
+		adata.len = i;
+		head=0;
+		tail=0;
+		ibex_diskarray_class.add_list(disk, &head, &tail, &adata);
+		ibex_diskarray_dump(bc, head, tail);
+	}
+#endif
+	ibex_diskarray_class.close(disk);
+	ibex_block_cache_close(bc);
+	return 0;
+}
+
+#endif
--- a/libibex/file.c
+++ b/libibex/file.c
@ -1,481 +0,0 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-/*
- * Copyright (C) 2000 Helix Code, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public License
- * as published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- * 
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with the Gnome Library; see the file COPYING.LIB.  If not,
- * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
-
-/* file.c: index file read/write ops */
-
-#include <ctype.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "ibex_internal.h"
-
-static unsigned long read_number (FILE *f);
-static void write_number (FILE *f, unsigned long n);
-static char *get_compressed_word (FILE *f, char **lastword);
-
-static gint free_file (gpointer key, gpointer value, gpointer data);
-static void free_word (gpointer key, gpointer value, gpointer data);
-
-/* The file format is:
- *
- * version string (currently "ibex1")
- * file count
- * list of compressed filenames, separated by \0
- * word count
- * list of compressed words, each followed by \0, a count, and that
- *   many references.
- *
- * All numbers are stored 7-bit big-endian, with the high bit telling
- * whether or not the number continues to the next byte.
- *
- * compressed text consists of a byte telling how many characters the
- * line has in common with the line before it, followed by the rest of
- * the string. Obviously this only really works if the lists are sorted.
- */
-
-/**
- * ibex_open: open (or possibly create) an ibex index
- * @file: the name of the file
- * @flags: open flags, see open(2).
- * @mode: If O_CREAT is passed in flags, then the file mode
- * to create the new file with.  It will be anded with the current
- * umask.
- *
- * Open and/or create the named ibex file and return a handle to it.
- *
- * Return value: an ibex handle, or NULL if an error occurred.
- **/
-ibex *
-ibex_open (char *file, int flags, int mode)
-{
-	ibex *ib;
-	FILE *f;
-	char vbuf[sizeof (IBEX_VERSION) - 1];
-	char *word, *lastword;
-	unsigned long nfiles, nwords, nrefs, ref;
-	ibex_file **ibfs = NULL;
-	int i;
-	GPtrArray *refs;
-	int fd;
-	char *modestr;
-
-	fd = open(file, flags, mode);
-	if (fd == -1) {
-		return NULL;
-	}
-
-	/* yuck, this is because we use FILE * interface
-	   internally */
-	switch (flags & O_ACCMODE) {
-	case O_RDONLY:
-		modestr = "r";
-		break;
-	case O_RDWR:
-		if (flags & O_APPEND)
-			modestr = "a+";
-		else
-			modestr = "w+";
-		break;
-	case O_WRONLY:
-		if (flags & O_APPEND)
-			modestr = "a";
-		else
-			modestr = "w";
-		break;
-	default:
-		if (flags & O_APPEND)
-			modestr = "a+";
-		else
-			modestr = "r+";
-		break;
-	}
-
-	f = fdopen(fd, modestr);
-	if (f == NULL) {
-		if (errno == 0)
-			errno = ENOMEM;
-		close(fd);
-		return NULL;
-	}
-
-	ib = g_malloc (sizeof (ibex));
-	ib->dirty = FALSE;
-	ib->path = g_strdup (file);
-	ib->files = g_tree_new ((GCompareFunc) strcmp);
-	ib->words = g_hash_table_new (g_str_hash, g_str_equal);
-	ib->oldfiles = g_ptr_array_new ();
-
-	if (!f) {
-		close(fd);
-		return ib;
-	}
-
-	/* Check version.  If its empty, then we have just created it */
-	if (fread (vbuf, 1, sizeof (vbuf), f) != sizeof (vbuf)) {
-		if (feof (f)) {
-			fclose(f);
-			close(fd);
-			return ib;
-		}
-	}
-	if (strncmp (vbuf, IBEX_VERSION, sizeof (vbuf) != 0)) {
-		errno = EINVAL;
-		goto errout;
-	}
-
-	/* Read list of files. */
-	nfiles = read_number (f);
-	ibfs = g_malloc (nfiles * sizeof (ibex_file *));
-	lastword = NULL;
-	for (i = 0; i < nfiles; i++) {
-		ibfs[i] = g_malloc (sizeof (ibex_file));
-		ibfs[i]->name = get_compressed_word (f, &lastword);
-		if (!ibfs[i]->name) {
-			goto errout;
-		}
-		ibfs[i]->index = 0;
-		g_tree_insert (ib->files, ibfs[i]->name, ibfs[i]);
-	}
-
-	/* Read list of words. */
-	nwords = read_number (f);
-	lastword = NULL;
-	for (i = 0; i < nwords; i++) {
-		word = get_compressed_word (f, &lastword);
-		if (!word) {
-			goto errout;
-		}
-
-		nrefs = read_number (f);
-		refs = g_ptr_array_new ();
-		g_ptr_array_set_size (refs, nrefs);
-		while (nrefs--) {
-			ref = read_number (f);
-			if (ref >= nfiles) {
-				goto errout;
-			}
-			refs->pdata[nrefs] = ibfs[ref];
-		}
-
-		g_hash_table_insert (ib->words, word, refs);
-	}
-
-	g_free (ibfs);
-	fclose (f);
-	close(fd);
-	return ib;
-
-errout:
-
-	fclose (f);
-	close(fd);
-	g_tree_traverse (ib->files, free_file, G_IN_ORDER, NULL);
-	g_tree_destroy (ib->files);
-	g_hash_table_foreach (ib->words, free_word, NULL);
-	g_hash_table_destroy (ib->words);
-	g_ptr_array_free (ib->oldfiles, TRUE);
-	if (ibfs)
-		g_free (ibfs);
-	g_free (ib->path);
-	g_free (ib);
-
-	return NULL;
-}
-
-struct ibex_write_data {
-	unsigned long index;
-	FILE *f;
-	char *lastname;
-};
-
-/* This is an internal function to find the longest common initial
- * prefix between the last-written word and the current word.
- */
-static int
-get_prefix (struct ibex_write_data *iwd, char *name)
-{
-	int i = 0;
-	if (iwd->lastname) {
-		while (!strncmp (iwd->lastname, name, i + 1))
-			i++;
-	}
-	iwd->lastname = name;
-	return i;
-}
-
-static gint
-write_file (gpointer key, gpointer value, gpointer data)
-{
-	char *file = key;
-	ibex_file *ibf = value;
-	struct ibex_write_data *iwd = data;
-	int prefix;
-
-	ibf->index = iwd->index++;
-	prefix = get_prefix (iwd, file);
-	fprintf (iwd->f, "%c%s", prefix, file + prefix);
-	fputc (0, iwd->f);
-	return FALSE;
-}
-
-/* scans for words which still exist in the index (after
-   index removals), and adds them to the ordered tree for
-   writing out in order */
-static void
-store_word (gpointer key, gpointer value, gpointer data)
-{
-	GTree *wtree = data;
-	GPtrArray *refs = value;
-	int i;
-	ibex_file *ibf;
-
-	for (i = 0; i < refs->len; i++) {
-		ibf = g_ptr_array_index (refs, i);
-		if (ibf->index == -1) {
-			g_ptr_array_remove_index_fast (refs, i);
-			i--;
-		}
-	}
-
-	if (refs->len > 0) {
-		g_tree_insert (wtree, key, value);
-	}
-}
-
-/* writes a word out, in order */
-static gint
-write_word (gpointer key, gpointer value, gpointer data)
-{
-	char *word = key;
-	GPtrArray *refs = value;
-	struct ibex_write_data *iwd = data;
-	int i, prefix;
-	ibex_file *ibf;
-
-	prefix = get_prefix (iwd, word);
-	fprintf (iwd->f, "%c%s", prefix, word + prefix);
-	fputc (0, iwd->f);
-
-	write_number (iwd->f, refs->len);
-
-	for (i = 0; i < refs->len; i++) {
-		ibf = g_ptr_array_index (refs, i);
-		write_number (iwd->f, ibf->index);
-	}
-	return FALSE;
-}
-
-/**
- * ibex_write: Write an ibex out to disk.
- * @ib: the ibex
- *
- * This writes an ibex to disk.
- *
- * Return value: 0 for success, -1 for failure (in which case errno
- * is set).
- **/
-int
-ibex_write (ibex *ib)
-{
-	struct ibex_write_data iwd;
-	GTree *wtree;
-	char *tmpfile;
-
-	tmpfile = g_strdup_printf ("%s~", ib->path);
-	iwd.f = fopen (tmpfile, "w");
-	if (!iwd.f) {
-		if (errno == 0)
-			errno = ENOMEM;
-		g_free (tmpfile);
-		return -1;
-	}
-
-	fputs (IBEX_VERSION, iwd.f);
-	if (ferror (iwd.f))
-		goto lose;
-
-	iwd.index = 0;
-	iwd.lastname = NULL;
-	write_number (iwd.f, g_tree_nnodes (ib->files));
-	if (ferror (iwd.f))
-		goto lose;
-	g_tree_traverse (ib->files, write_file, G_IN_ORDER, &iwd);
-	if (ferror (iwd.f))
-		goto lose;
-
-	iwd.lastname = NULL;
-	wtree = g_tree_new ((GCompareFunc) strcmp);
-	g_hash_table_foreach (ib->words, store_word, wtree);
-	write_number (iwd.f, g_tree_nnodes(wtree));
-	if (ferror (iwd.f))
-		goto lose;
-	g_tree_traverse (wtree, write_word, G_IN_ORDER, &iwd);
-	g_tree_destroy (wtree);
-	if (ferror (iwd.f))
-		goto lose;
-
-	if (fclose (iwd.f) == 0 && rename (tmpfile, ib->path) == 0) {
-		g_free (tmpfile);
-		ib->dirty = FALSE;
-		return 0;
-	}
-
-lose:
-	unlink (tmpfile);
-	g_free (tmpfile);
-	return -1;
-}
-
-/**
- * ibex_save:
- * @ib: 
- * 
- * Only write out an ibex if it is dirty.
- * 
- * Return value: Same as ibex_write.
- **/
-int
-ibex_save (ibex *ib)
-{
-	if (ib->dirty)
-		return ibex_write(ib);
-	return 0;
-}
-
-/**
- * ibex_close: Write out the ibex file (if it has changed) and free
- * the data associated with it.
- * @ib: the ibex
- *
- * If this ibex file has been modified since it was opened, this will
- * call ibex_write() to write it out to disk. It will then free all data
- * associated with the ibex. After calling ibex_close(), @ib will no
- * longer be a valid ibex.
- *
- * Return value: 0 on success, -1 on an ibex_write() failure (in which
- * case @ib will not be destroyed).
- **/
-int
-ibex_close (ibex *ib)
-{
-	ibex_file *ibf;
-
-	if (ib->dirty && ibex_write (ib) == -1)
-		return -1;
-
-	g_tree_traverse (ib->files, free_file, G_IN_ORDER, NULL);
-	g_tree_destroy (ib->files);
-	g_hash_table_foreach (ib->words, free_word, NULL);
-	g_hash_table_destroy (ib->words);
-
-	while (ib->oldfiles->len) {
-		ibf = g_ptr_array_remove_index (ib->oldfiles, 0);
-		g_free (ibf->name);
-		g_free (ibf);
-	}
-	g_ptr_array_free (ib->oldfiles, TRUE);
-	g_free (ib->path);
-	g_free (ib);
-
-	return 0;
-}
-
-static gint
-free_file (gpointer key, gpointer value, gpointer data)
-{
-	ibex_file *ibf = value;
-
-	g_free (ibf->name);
-	g_free (ibf);
-	return FALSE;
-}
-
-static void
-free_word (gpointer key, gpointer value, gpointer data)
-{
-	g_free (key);
-	g_ptr_array_free (value, TRUE);
-}
-
-static char *
-get_compressed_word (FILE *f, char **lastword)
-{
-	char *buf, *p;
-	int c, size;
-
-	c = getc (f);
-	if (c == EOF)
-		return NULL;
-
-	size = c + 10;
-	buf = g_malloc (size);
-	if (*lastword)
-		strncpy (buf, *lastword, c);
-	p = buf + c;
-	do {
-		c = getc (f);
-		if (c == EOF)
-			return NULL;
-		if (p == buf + size) {
-			buf = g_realloc (buf, size + 10);
-			p = buf + size;
-			size += 10;
-		}
-		*p++ = c;
-	} while (c != 0);
-
-	*lastword = buf;
-	return buf;
-}
-
-static void
-write_number (FILE *f, unsigned long number)
-{
-	int i, flag = 0;
-	char buf[4];
-
-	i = 4;
-	do {
-		buf[--i] = (number & 0x7F) | flag;
-		number = number >> 7;
-		flag = 0x80;
-	} while (number != 0);
-
-	fwrite (buf + i, 1, 4 - i, f);
-}
-
-static unsigned long
-read_number (FILE *f)
-{
-	int byte;
-	unsigned long num;
-
-	num = 0;
-	do {
-		byte = getc (f);
-		num = num << 7 | (byte & 0x7F);
-	} while (byte & 0x80);
-  
-	return num;
-}
-
--- a/libibex/find.c
+++ b/libibex/find.c
@ -1,198 +0,0 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-/*
- * Copyright (C) 2000 Helix Code, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public License
- * as published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- * 
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with the Gnome Library; see the file COPYING.LIB.  If not,
- * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
-
-/* find.c: index file searching ops */
-
-#include <string.h>
-
-#include "ibex_internal.h"
-
-/**
- * ibex_find: search an ibex for a word
- * @ib: an ibex
- * @word: the word
- *
- * This routine searches an ibex for a word and returns a GPtrArray
- * containing the names of the files in the ibex that contain the word.
- * If no matches are found, it will return an empty array (not NULL).
- * The caller must free the array, but MUST NOT free or alter its
- * elements.
- *
- * Return value: the array of filenames containing @word
- **/
-GPtrArray *
-ibex_find (ibex *ib, char *word)
-{
-	GPtrArray *refs, *ret;
-	ibex_file *ibf;
-	int i;
-
-	ret = g_ptr_array_new ();
-	refs = g_hash_table_lookup (ib->words, word);
-	if (refs) {
-		for (i = 0; i < refs->len; i++)	{
-			ibf = g_ptr_array_index (refs, i);
-			g_ptr_array_add (ret, ibf->name);
-		}
-	}
-	return ret;
-}
-
-/**
- * ibex_contains_name:
- * @ib: 
- * @name: 
- * 
- * Returns #TRUE if the ibex @ib has any index entry for
- * the key @name.
- * 
- * Return value: 
- **/
-gboolean
-ibex_contains_name(ibex *ib, char *name)
-{
-	return g_tree_lookup(ib->files, name) != NULL;
-}
-
-/**
- * ibex_find_name: Check if a word occurs in a given file
- * @ib: an ibex
- * @name: a filename
- * @word: a word
- *
- * This checks if the given word occurs in the given file.
- *
- * Return value: TRUE or FALSE
- **/
-gboolean
-ibex_find_name (ibex *ib, char *name, char *word)
-{
-	GPtrArray *refs;
-	ibex_file *ibf;
-	int i;
-
-	refs = g_hash_table_lookup (ib->words, word);
-	if (refs) {
-		for (i = 0; i < refs->len; i++)	{
-			ibf = g_ptr_array_index (refs, i);
-			if (!strcmp (ibf->name, name))
-				return TRUE;
-		}
-	}
-	return FALSE;
-}
-
-static gint
-build_array (gpointer key, gpointer value, gpointer data)
-{
-	char *name = key;
-	unsigned int count = GPOINTER_TO_UINT (value);
-	GPtrArray *ret = data;
-
-	if (count == 1)
-		g_ptr_array_add (ret, name);
-	return FALSE;
-}
-
-/**
- * ibex_find_all: Find files containing multiple words
- * @ib: an ibex
- * @words: a GPtrArray of words
- *
- * This works like ibex_find(), but returns an array of filenames
- * which contain all of the words in @words.
- *
- * Return value: an array of matches
- **/
-GPtrArray *
-ibex_find_all (ibex *ib, GPtrArray *words)
-{
-	GTree *work;
-	GPtrArray *wrefs, *ret;
-	int i, j, count;
-	char *word;
-	ibex_file *ibf;
-
-	if (words->len == 0)
-		return g_ptr_array_new ();
-	else if (words->len == 1)
-		return ibex_find (ib, g_ptr_array_index (words, 0));
-
-	work = g_tree_new ((GCompareFunc) strcmp);
-	for (i = 0; i < words->len; i++) {
-		word = g_ptr_array_index (words, i);
-		wrefs = g_hash_table_lookup (ib->words, word);
-		if (!wrefs) {
-			/* One of the words isn't even in the index. */
-			g_tree_destroy (work);
-			return g_ptr_array_new ();
-		}
-
-		if (i == 0) {
-			/* Copy the references into a tree, using the
-			 * filenames as keys and the size of words as
-			 * the value.
-			 */
-			for (j = 0; j < wrefs->len; j++) {
-				ibf = g_ptr_array_index (wrefs, j);
-				g_tree_insert (work, ibf->name,
-					       GUINT_TO_POINTER (words->len));
-			}
-		} else {
-			/* Increment the counts in the working tree
-			 * for the references for this word.
-			 */
-			for (j = 0; j < wrefs->len; j++) {
-				ibf = g_ptr_array_index (wrefs, j);
-				count = GPOINTER_TO_UINT (g_tree_lookup (work, ibf->name));
-				if (count) {
-					g_tree_insert (work, ibf->name,
-						       GUINT_TO_POINTER (count - 1));
-				}
-			}
-		}
-	}
-
-	/* Build an array with the refs that contain all the words. */
-	ret = g_ptr_array_new ();
-	g_tree_traverse (work, build_array, G_IN_ORDER, ret);
-	g_tree_destroy (work);
-	return ret;
-}
-
-static void
-ibex_dump_foo(char *key, GPtrArray *refs, void *data)
-{
-	int i;
-
-	g_print("%s: ", key);
-	for (i=0;i<refs->len;i++) {
-		ibex_file *ibf = g_ptr_array_index (refs, i);
-		g_print("%c%s", ibf->index==-1?'-':' ', ibf->name);
-	}
-	g_print("\n");
-}
-
-/* debug function to dump the tree, in key order */
-void
-ibex_dump_all (ibex *ib)
-{
-	g_hash_table_foreach(ib->words, (GHFunc) ibex_dump_foo, 0);
-}
--- a/libibex/hash.c
+++ b/libibex/hash.c
@ -0,0 +1,701 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* hash based index mechanism */
+
+#include <stdio.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "block.h"
+#include "index.h"
+
+#define d(x)
+
+#define HASH_SIZE (1024)
+#define KEY_THRESHOLD (sizeof(struct _hashkey) + 4) /* minimum number of free bytes we worry about
+						      maintaining free blocks for */
+#define ARRAY_LEN(a) (sizeof(a)/sizeof(a[0]))
+
+typedef guint32 hashid_t;
+
+static struct _IBEXIndex *hash_create(struct _memcache *bc, int size);
+static struct _IBEXIndex *hash_open(struct _memcache *bc, blockid_t root);
+static int hash_sync(struct _IBEXIndex *index);
+static int hash_close(struct _IBEXIndex *index);
+
+static hashid_t hash_find(struct _IBEXIndex *index, const char *key, int keylen);
+static void hash_remove(struct _IBEXIndex *index, const char *key, int keylen);
+static hashid_t hash_insert(struct _IBEXIndex *index, const char *key, int keylen);
+static char *hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len);
+static void hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail);
+static blockid_t hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail);
+
+struct _IBEXIndexClass ibex_hash_class = {
+	hash_create, hash_open,
+	hash_sync, hash_close,
+	hash_find,
+	hash_remove,
+	hash_insert,
+	hash_get_key,
+	hash_set_data_block,
+	hash_get_data_block,
+};
+
+/* the reason we have the tail here is that otherwise we need to
+   have a 32 bit blockid for the root node; which would make this
+   structure the same size anyway, with about 24 wasted bits */
+struct _hashkey {
+	blockid_t next;		/* next in hash chain */
+	blockid_t tail;
+	unsigned int root:32-BLOCK_BITS;
+	unsigned int keyoffset:BLOCK_BITS;
+};
+
+struct _hashblock {
+	/*blockid_t next;*/		/* all key blocks linked together? */
+	guint32 used;		/* elements used */
+	union {
+		struct _hashkey keys[(BLOCK_SIZE-4)/sizeof(struct _hashkey)];
+		char keydata[BLOCK_SIZE-4];
+	} hashblock_u;
+};
+#define hb_keys hashblock_u.keys
+#define hb_keydata hashblock_u.keydata
+
+/* root block for a hash index */
+struct _hashroot {
+	hashid_t free;		/* free list */
+	guint32 size;		/* how big the hash table is */
+	hashid_t table[(BLOCK_SIZE-8)/sizeof(hashid_t)]; /* pointers to blocks of pointers */
+};
+
+struct _hashtableblock {
+	hashid_t buckets[BLOCK_SIZE/sizeof(hashid_t)];
+};
+
+/* map a hash index to a block index */
+#define HASH_INDEX(b) ((b) & (BLOCK_SIZE-1))
+/* map a hash index to a block number */
+#define HASH_BLOCK(b) ((b) & ~(BLOCK_SIZE-1))
+/* map a block + index to a hash key */
+#define HASH_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1)))
+
+/* taken from tdb/gdbm */
+static unsigned int hash_key(const unsigned char *key, int keylen)
+{
+	char *newkey;
+	newkey = alloca(keylen+1);
+	memcpy(newkey, key, keylen);
+	newkey[keylen]=0;
+	return g_str_hash(newkey);
+#if 0
+	unsigned int value;	/* Used to compute the hash value.  */
+	unsigned int  i;	/* Used to cycle through random values. */
+
+	/* Set the initial value from the key size. */
+	value = 0x238F13AF * keylen;
+	for (i=0; i < keylen; i++) {
+		value = (value + (key[i] << (i*5 % 24)));
+	}
+
+	value = (1103515243 * value + 12345);  
+
+	return value;
+#endif
+}
+
+/* create a new hash table, return a pointer to its root block */
+static struct _IBEXIndex *
+hash_create(struct _memcache *bc, int size)
+{
+	blockid_t root, block;
+	struct _hashroot *hashroot;
+	int i;
+	struct _hashtableblock *table;
+	struct _IBEXIndex *index;
+
+	g_assert(size<=10240);
+
+	d(printf("initialising hash table, size = %d\n", size));
+
+	index = g_malloc0(sizeof(*index));
+	index->blocks = bc;
+	index->klass = &ibex_hash_class;
+	root = ibex_block_get(bc);
+	index->root = root;
+	d(printf(" root = %d\n", root));
+	hashroot = (struct _hashroot *)ibex_block_read(bc, root);
+	hashroot->free = 0;
+	hashroot->size = size;
+	ibex_block_dirty((struct _block *)hashroot);
+	for (i=0;i<size/(BLOCK_SIZE/sizeof(blockid_t));i++) {
+		d(printf("initialising hash table index block %d\n", i));
+		block = hashroot->table[i] = ibex_block_get(bc);
+		table = (struct _hashtableblock *)ibex_block_read(bc, block);
+		memset(table, 0, sizeof(table));
+		ibex_block_dirty((struct _block *)table);
+	}
+
+	return index;
+}
+
+static struct _IBEXIndex *
+hash_open(struct _memcache *bc, blockid_t root)
+{
+	struct _IBEXIndex *index;
+
+	/* FIXME: check a 'magic', and the root for validity */
+
+	index = g_malloc0(sizeof(*index));
+	index->blocks = bc;
+	index->root = root;
+	index->klass = &ibex_hash_class;
+
+	return index;
+}
+
+
+static int hash_sync(struct _IBEXIndex *index)
+{
+	/* nop, index always synced on disk (at least, to blocks) */
+	return 0;
+}
+
+static int hash_close(struct _IBEXIndex *index)
+{
+	g_free(index);
+	return 0;
+}
+
+/* convert a hashbucket id into a name */
+static char *
+hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len)
+{
+	struct _hashblock *bucket;
+	int ind;
+	char *ret, *start, *end;
+
+	if (hashbucket == 0) {
+		if (len)
+			*len = 0;
+		return g_strdup("");
+	}
+
+	bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+	ind = HASH_INDEX(hashbucket);
+
+	g_assert(ind < bucket->used);
+
+	start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset];
+	if (ind == 0) {
+		end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+	} else {
+		end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset];
+	}
+	
+	ret = g_malloc(end-start+1);
+	memcpy(ret, start, end-start);
+	ret[end-start]=0;
+	if (len)
+		*len = end-start;
+	return ret;
+}
+
+/* sigh, this is fnugly code ... */
+static hashid_t
+hash_find(struct _IBEXIndex *index, const char *key, int keylen)
+{
+	struct _hashroot *hashroot;
+	guint32 hash;
+	int hashentry;
+	blockid_t hashtable;
+	hashid_t hashbucket;
+	struct _hashtableblock *table;
+
+	g_assert(index != 0);
+	g_assert(index->root != 0);
+
+	d(printf("finding hash %.*s\n", keylen, key));
+
+	hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+
+	/* find the table containing this entry */
+	hash = hash_key(key, keylen) % hashroot->size;
+	hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))];
+	g_assert(hashtable != 0);
+	table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
+	hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t));
+	/* and its bucket */
+	hashbucket = table->buckets[hashentry];
+
+	/* go down the bucket chain, reading each entry till we are done ... */
+	while (hashbucket != 0) {
+		struct _hashblock *bucket;
+		char *start, *end;
+		int ind;
+
+		d(printf(" checking bucket %d\n", hashbucket));
+
+		/* get the real bucket id from the hashbucket id */
+		bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+		/* and get the key number within the block */
+		ind = HASH_INDEX(hashbucket);
+
+		g_assert(ind < bucket->used);
+
+		start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset];
+		if (ind == 0) {
+			end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+		} else {
+			end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset];
+		}
+		if ( (end-start) == keylen
+		     && memcmp(start, key, keylen) == 0) {
+			return hashbucket;
+		}
+		hashbucket = bucket->hb_keys[ind].next;
+	}
+	return 0;
+}
+
+/* compresses the bucket 'bucket', removing data
+   at index 'index' */
+static void
+hash_compress(struct _hashblock *bucket, int index)
+{
+	int i;
+	char *start, *end, *newstart;
+
+	/* get start/end of area to zap */
+	start = &bucket->hb_keydata[bucket->hb_keys[index].keyoffset];
+	if (index == 0) {
+		end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+	} else {
+		end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset];
+	}
+
+	if (start == end)
+		return;
+
+	/* fixup data */
+	newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset];
+	memmove(newstart+(end-start), newstart, start-newstart);
+
+	/* fixup key pointers */
+	for (i=index;i<bucket->used;i++) {
+		bucket->hb_keys[i].keyoffset += (end-start);
+	}
+	ibex_block_dirty((struct _block *)bucket);
+}
+
+/* make room 'len' for the key 'index' */
+/* assumes key 'index' is already empty (0 length) */
+static void
+hash_expand(struct _hashblock *bucket, int index, int len)
+{
+	int i;
+	char *end, *newstart;
+
+	/* get start/end of area to zap */
+	if (index == 0) {
+		end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+	} else {
+		end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset];
+	}
+
+	/* fixup data */
+	newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset];
+	memmove(newstart-len, newstart, end-newstart);
+
+	/* fixup key pointers */
+	for (i=index;i<bucket->used;i++) {
+		bucket->hb_keys[i].keyoffset -= len;
+	}
+	ibex_block_dirty((struct _block *)bucket);
+}
+
+static void
+hash_remove(struct _IBEXIndex *index, const char *key, int keylen)
+{
+	struct _hashroot *hashroot;
+	guint32 hash;
+	int hashentry;
+	blockid_t hashtable;
+	hashid_t hashbucket, hashprev;
+	struct _hashtableblock *table;
+
+	g_assert(index != 0);
+	g_assert(index->root != 0);
+
+	d(printf("removing hash %.*s\n", keylen, key));
+
+	hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+
+	/* find the table containing this entry */
+	hash = hash_key(key, keylen) % hashroot->size;
+	hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))];
+	table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
+	hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t));
+	/* and its bucket */
+	hashbucket = table->buckets[hashentry];
+
+	/* go down the bucket chain, reading each entry till we are done ... */
+	hashprev = 0;
+	while (hashbucket != 0) {
+		struct _hashblock *bucket;
+		char *start, *end;
+		int ind;
+
+		d(printf(" checking bucket %d\n", hashbucket));
+
+		/* get the real bucket id from the hashbucket id */
+		bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+		/* and get the key number within the block */
+		ind = HASH_INDEX(hashbucket);
+
+		g_assert(ind < bucket->used);
+
+		start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset];
+		if (ind == 0) {
+			end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+		} else {
+			end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset];
+		}
+		if ( (end-start) == keylen
+		     && memcmp(start, key, keylen) == 0) {
+			struct _hashblock *prevbucket;
+
+			if (hashprev == 0) {
+				/* unlink from hash chain */
+				table->buckets[hashentry] = bucket->hb_keys[HASH_INDEX(hashbucket)].next;
+				/* link into free list */
+				bucket->hb_keys[HASH_INDEX(hashbucket)].next = hashroot->free;
+				hashroot->free = hashbucket;
+				/* compress away data */
+				hash_compress(bucket, HASH_INDEX(hashbucket));
+				ibex_block_dirty((struct _block *)bucket);
+				ibex_block_dirty((struct _block *)table);
+				ibex_block_dirty((struct _block *)hashroot);
+			} else {
+				prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashprev));
+				prevbucket->hb_keys[HASH_INDEX(hashprev)].next =
+					bucket->hb_keys[ind].next;
+				/* link into free list */
+				bucket->hb_keys[ind].next = hashroot->free;
+				hashroot->free = hashbucket;
+				/* compress entry */
+				hash_compress(bucket, ind);
+				ibex_block_dirty((struct _block *)bucket);
+				ibex_block_dirty((struct _block *)prevbucket);
+				ibex_block_dirty((struct _block *)hashroot);
+			}
+			return;
+		}
+		hashprev = hashbucket;
+		hashbucket = bucket->hb_keys[ind].next;
+	}
+}
+
+/* set where the datablock is located */
+static void
+hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail)
+{
+	struct _hashblock *bucket;
+
+	d(printf("setting data block hash %d to %d tail %d\n", keyid, blockid, tail));
+
+	/* map to a block number */
+	g_assert((blockid & (BLOCK_SIZE-1)) == 0);
+	blockid >>= BLOCK_BITS;
+
+	bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid));
+	if (bucket->hb_keys[HASH_INDEX(keyid)].root != blockid
+	    || bucket->hb_keys[HASH_INDEX(keyid)].tail != tail) {
+		bucket->hb_keys[HASH_INDEX(keyid)].tail = tail;
+		bucket->hb_keys[HASH_INDEX(keyid)].root = blockid;
+		ibex_block_dirty((struct _block *)bucket);
+	}
+}
+
+static blockid_t
+hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail)
+{
+	struct _hashblock *bucket;
+
+	d(printf("getting data block hash %d\n", keyid));
+
+	if (keyid == 0) {
+		if (tail)
+			*tail = 0;
+		return 0;
+	}
+
+	bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid));
+	if (tail)
+		*tail = bucket->hb_keys[HASH_INDEX(keyid)].tail;
+	return bucket->hb_keys[HASH_INDEX(keyid)].root << BLOCK_BITS;
+}
+
+static hashid_t
+hash_insert(struct _IBEXIndex *index, const char *key, int keylen)
+{
+	struct _hashroot *hashroot;
+	guint32 hash;
+	int hashentry;
+	blockid_t hashtable;
+	hashid_t hashbucket, keybucket, keyprev, keyfree;
+	struct _hashtableblock *table;
+	struct _hashblock *bucket;
+	int count;
+
+	g_assert(index != 0);
+	g_assert(index->root != 0);
+
+	d(printf("inserting hash %.*s\n", keylen, key));
+
+	hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+
+	/* find the table containing this entry */
+	hash = hash_key(key, keylen) % hashroot->size;
+	hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))];
+	table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
+	hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t));
+	/* and its bucket */
+	hashbucket = table->buckets[hashentry];
+
+	/* now look for a free slot, first try the free list */
+	/* but dont try too hard if our key is just too long ... so just scan upto
+	   4 blocks, but if we dont find a space, tough ... */
+	keybucket = hashroot->free;
+	keyprev = 0;
+	count = 0;
+	while (keybucket && count<4) {
+		int space;
+		
+		d(printf(" checking free %d\n", keybucket));
+
+		/* read the bucket containing this free key */
+		bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keybucket));
+
+		/* check if there is enough space for the key */
+		space =  &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset]
+			- (char *)&bucket->hb_keys[bucket->used];
+		if (space >= keylen) {
+			hash_expand(bucket, HASH_INDEX(keybucket), keylen);
+			memcpy(&bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(keybucket)].keyoffset],
+			       key, keylen);
+
+			/* check if there is free space still in this node, and there are no other empty blocks */
+			keyfree = bucket->hb_keys[HASH_INDEX(keybucket)].next;
+			if ((space-keylen) >= KEY_THRESHOLD) {
+				int i;
+				int head = ARRAY_LEN(bucket->hb_keydata);
+				int found = FALSE;
+
+				for (i=0;i<bucket->used;i++) {
+					if (bucket->hb_keys[i].keyoffset == head) {
+						/* already have a free slot in this block, leave it */
+						found = TRUE;
+						break;
+					}
+					head = bucket->hb_keys[i].keyoffset;
+				}
+				if (!found) {
+					/* we should link in a new free slot for this node */
+					bucket->hb_keys[bucket->used].next = bucket->hb_keys[HASH_INDEX(keybucket)].next;
+					bucket->hb_keys[bucket->used].keyoffset = bucket->hb_keys[bucket->used-1].keyoffset;
+					keyfree = HASH_KEY(HASH_BLOCK(keybucket), bucket->used);
+					bucket->used++;
+				}
+			}
+			/* link 'keyfree' back to the parent ... */
+			if (keyprev == 0) {
+				hashroot->free = keyfree;
+				ibex_block_dirty((struct _block *)hashroot);
+			} else {
+				struct _hashblock *prevbucket;
+				prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyprev));
+				prevbucket->hb_keys[HASH_INDEX(keyprev)].next = keyfree;
+				ibex_block_dirty((struct _block *)prevbucket);
+			}
+
+			/* link into the hash chain */
+			bucket->hb_keys[HASH_INDEX(keybucket)].next = hashbucket;
+			bucket->hb_keys[HASH_INDEX(keybucket)].root = 0;
+			bucket->hb_keys[HASH_INDEX(keybucket)].tail = 0;
+			table->buckets[hashentry] = keybucket;
+			ibex_block_dirty((struct _block *)table);
+			ibex_block_dirty((struct _block *)bucket);
+
+			d(printf(" new key id %d\n", keybucket));
+			d(printf(" new free id %d\n", hashroot->free));
+
+			return keybucket;
+		}
+
+		count++;
+		keyprev = keybucket;
+		keybucket = bucket->hb_keys[HASH_INDEX(keybucket)].next;
+	}
+
+	/* else create a new block ... */
+	keybucket = ibex_block_get(index->blocks);
+	bucket = (struct _hashblock *)ibex_block_read(index->blocks, keybucket);
+
+	d(printf("creating new key bucket %d\n", keybucket));
+
+	memset(bucket, 0, sizeof(*bucket));
+
+	bucket->used = 2;
+	/* first block, is the new key */
+	bucket->hb_keys[0].keyoffset = ARRAY_LEN(bucket->hb_keydata) - keylen;
+	memcpy(&bucket->hb_keydata[bucket->hb_keys[0].keyoffset], key, keylen);
+	bucket->hb_keys[0].next = hashbucket;
+	bucket->hb_keys[0].root = 0;
+	bucket->hb_keys[0].tail = 0;
+
+	table->buckets[hashentry] = HASH_KEY(keybucket, 0);
+
+	/* next block is a free block, link into free list */
+	bucket->hb_keys[1].keyoffset = bucket->hb_keys[0].keyoffset;
+	bucket->hb_keys[1].next = hashroot->free;
+	hashroot->free = HASH_KEY(keybucket, 1);
+
+	ibex_block_dirty((struct _block *)hashroot);
+	ibex_block_dirty((struct _block *)table);
+	ibex_block_dirty((struct _block *)bucket);
+
+	d(printf(" new key id %d\n", HASH_KEY(keybucket, 0)));
+	d(printf(" new free id %d\n", hashroot->free));
+
+	return HASH_KEY(keybucket, 0);
+}
+
+/* debug */
+void ibex_hash_dump(struct _IBEXIndex *index);
+
+void
+ibex_hash_dump(struct _IBEXIndex *index)
+{
+	int i;
+	struct _hashtableblock *table;
+	struct _hashblock *bucket;
+	struct _hashroot *hashroot;
+	blockid_t hashtable;
+	hashid_t hashbucket;
+
+	printf("Walking hash tree:\n");
+	hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+	for (i=0;i<hashroot->size;i++) {
+		printf("Hash table chain: %d\n", i);
+		hashtable = hashroot->table[i / (BLOCK_SIZE/sizeof(blockid_t))];
+		table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
+		hashbucket = table->buckets[i % (BLOCK_SIZE/sizeof(blockid_t))];
+		while (hashbucket) {
+			int len;
+
+			bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+			printf(" bucket %d: [used %d]", hashbucket, bucket->used);
+			if (HASH_INDEX(hashbucket) == 0) {
+				len = ARRAY_LEN(bucket->hb_keydata) -
+					bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset;
+			} else {
+				len = bucket->hb_keys[HASH_INDEX(hashbucket)-1].keyoffset -
+					bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset;
+			}
+			printf("'%.*s' = %d next=%d\n", len, &bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset],
+			       bucket->hb_keys[HASH_INDEX(hashbucket)].root,
+			       bucket->hb_keys[HASH_INDEX(hashbucket)].next);
+			ibex_diskarray_dump(index->blocks,
+					    bucket->hb_keys[HASH_INDEX(hashbucket)].root << BLOCK_BITS,
+					    bucket->hb_keys[HASH_INDEX(hashbucket)].tail);
+
+			hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next;
+		}
+		/* make sure its still in the cache */
+		hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+	}
+
+	hashbucket = hashroot->free;
+	printf("Dumping free lists ..\n");
+	while (hashbucket) {
+		printf(" %d", hashbucket);
+		bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+		hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next;
+	}
+	printf("\n");
+}
+
+#if 0
+int main(int argc, char **argv)
+{
+	struct _memcache *bc;
+	struct _IBEXIndex *hash;
+	int i;
+
+	bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600);
+	hash = ibex_hash_class.create(bc, 1024);
+	for (i=0;i<10000;i++) {
+		char key[16];
+		sprintf(key, "key %d", i);
+		ibex_hash_class.insert(hash, key, strlen(key));
+	}
+
+	for (i=500;i<1000;i++) {
+		char key[16];
+		sprintf(key, "key %d", i);
+		ibex_hash_class.remove(hash, key, strlen(key));
+	}
+
+	for (i=500;i<1000;i++) {
+		char key[16];
+		sprintf(key, "key %d", i);
+		ibex_hash_class.insert(hash, key, strlen(key));
+	}
+
+
+	ibex_hash_dump(hash);
+
+	for (i=0;i<2000;i++) {
+		char key[16], *lookup;
+		hashid_t keyid;
+		blockid_t root, tail;
+
+		sprintf(key, "key %d", i);
+		keyid = ibex_hash_class.find(hash, key, strlen(key));
+		lookup = ibex_hash_class.get_key(hash, keyid, 0);
+		root = ibex_hash_class.get_data(hash, keyid, &tail);
+		printf("key %s = %d = '%s' root:%d tail:%d \n", key, keyid, lookup, root, tail);
+		g_free(lookup);
+	}
+
+	ibex_hash_class.close(hash);
+	ibex_block_cache_close(bc);
+	return 0;
+}
+
+#endif
--- a/libibex/ibex_block.c
+++ b/libibex/ibex_block.c
@ -1,33 +1,16 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-/*
- * Copyright (C) 2000 Helix Code, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public License
- * as published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- * 
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with the Gnome Library; see the file COPYING.LIB.  If not,
- * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */

-/* words.c: low-level indexing ops */
+#include <glib.h>

+#include <db.h>
+#include <stdio.h>
+#include <unicode.h>
 #include <ctype.h>
-#include <errno.h>
 #include <string.h>

-#include <unicode.h>
-
 #include "ibex_internal.h"

+#define d(x)
+
 static signed char utf8_trans[] = {
 	'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I',
 	'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U',
@ -60,7 +43,7 @@ static char *utf8_long_trans[] = {
 * It is not safe to call this routine with bad UTF-8.
 */
 static void
-normalize_word (char *start, char *end, char *buf)
+ibex_normalise_word(char *start, char *end, char *buf)
 {
 	unsigned char *s, *d;
 	unicode_char_t uc;
@ -151,39 +134,6 @@ utf8_category (char *sp, char **snp, char *send)
 	}
 }

-static ibex_file *
-get_ibex_file (ibex *ib, char *name)
-{
-	ibex_file *ibf;
-
-	ibf = g_tree_lookup (ib->files, name);
-	if (!ibf) {
-		ibf = g_malloc (sizeof (ibex_file));
-		ibf->name = g_strdup (name);
-		ibf->index = 0;
-		g_tree_insert (ib->files, ibf->name, ibf);
-		ib->dirty = TRUE;
-	}
-	return ibf;
-}
-
-static void
-ref_word (ibex *ib, ibex_file *ibf, char *word)
-{
-	GPtrArray *refs;
-
-	refs = g_hash_table_lookup (ib->words, word);
-	if (!refs) {
-		refs = g_ptr_array_new ();
-		g_hash_table_insert (ib->words, g_strdup (word), refs);
-		g_ptr_array_add (refs, ibf);
-		ib->dirty = TRUE;
-	} else if (g_ptr_array_index (refs, refs->len - 1) != ibf) {
-		g_ptr_array_add (refs, ibf);
-		ib->dirty = TRUE;
-	}
-}
-
 /**
 * ibex_index_buffer: the lowest-level ibex indexing interface
 * @ib: an ibex
@ -206,12 +156,13 @@ ref_word (ibex *ib, ibex_file *ibf, char *word)
 * Return value: 0 on success, -1 on failure.
 **/
 int
-ibex_index_buffer (ibex *ib, char *name, char *buffer,
-		   size_t len, size_t *unread)
+ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread)
 {
 	char *p, *q, *nq, *end, *word;
-	ibex_file *ibf = get_ibex_file (ib, name);
 	int wordsiz, cat;
+	GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal);
+	GPtrArray *wordlist = g_ptr_array_new();
+	int i, ret=-1;

 	if (unread)
 		*unread = 0;
@ -229,12 +180,9 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer,
 			p = q;
 		}
 		if (p == end) {
-			g_free (word);
-			return 0;
+			goto done;
 		} else if (cat == IBEX_INVALID) {
-			errno = EINVAL;
-			g_free (word);
-			return -1;
+			goto error;
 		} else if (cat == IBEX_INCOMPLETE)
 			q = end;

@ -246,24 +194,107 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer,
 		}
 		if (cat == IBEX_INVALID ||
 		    (cat == IBEX_INCOMPLETE && !unread)) {
-			errno = EINVAL;
-			g_free (word);
-			return -1;
+			goto error;
 		} else if (cat == IBEX_INCOMPLETE || (q == end && unread)) {
 			*unread = end - p;
-			g_free (word);
-			return 0;
+			goto done;
 		}

 		if (wordsiz < q - p + 1) {
 			wordsiz = q - p + 1;
 			word = g_realloc (word, wordsiz);
 		}
-		normalize_word (p, q, word);
-		ref_word (ib, ibf, word);
+		ibex_normalise_word (p, q, word);
+		if (word[0]) {
+			if (g_hash_table_lookup(words, word) == 0) {
+				char *newword = g_strdup(word);
+				g_ptr_array_add(wordlist, newword);
+				g_hash_table_insert(words, newword, name);
+			}
+		}
 		p = q;
 	}
-
+done:
+	d(printf("name %s count %d size %d\n", name, wordlist->len, len));
+	ib->words->klass->add_list(ib->words, name, wordlist);
+	ret = 0;
+error:
+	for (i=0;i<wordlist->len;i++)
+		g_free(wordlist->pdata[i]);
+	g_ptr_array_free(wordlist, TRUE);
+	g_hash_table_destroy(words);
 	g_free (word);
+	return ret;
+}
+
+
+ibex *ibex_open (char *file, int flags, int mode)
+{
+	ibex *ib;
+
+	ib = g_malloc0(sizeof(*ib));
+	ib->blocks = ibex_block_cache_open(file, flags, mode);
+	if (ib->blocks == 0) {
+		g_warning("create: Error occured?: %s\n", strerror(errno));
+		g_free(ib);
+		return NULL;
+	}
+	/* FIXME: the blockcache or the wordindex needs to manage the other one */
+	ib->words = ib->blocks->words;
+
+	return ib;
+}
+
+int ibex_save (ibex *ib)
+{
+	printf("syncing database\n");
+	ib->words->klass->sync(ib->words);
+	/* FIXME: some return */
+	ibex_block_cache_sync(ib->blocks);
 	return 0;
 }
+
+int ibex_close (ibex *ib)
+{
+	int ret = 0;
+
+	printf("closing database\n");
+
+	ib->words->klass->close(ib->words);
+	ibex_block_cache_close(ib->blocks);
+	g_free(ib);
+	return ret;
+}
+
+void ibex_unindex (ibex *ib, char *name)
+{
+	d(printf("trying to unindex '%s'\n", name));
+	ib->words->klass->unindex_name(ib->words, name);
+}
+
+GPtrArray *ibex_find (ibex *ib, char *word)
+{
+	char *normal;
+	int len;
+
+	len = strlen(word);
+	normal = alloca(len+1);
+	ibex_normalise_word(word, word+len, normal);
+	return ib->words->klass->find(ib->words, normal);
+}
+
+gboolean ibex_find_name (ibex *ib, char *name, char *word)
+{
+	char *normal;
+	int len;
+
+	len = strlen(word);
+	normal = alloca(len+1);
+	ibex_normalise_word(word, word+len, normal);
+	return ib->words->klass->find_name(ib->words, name, normal);
+}
+
+gboolean ibex_contains_name(ibex *ib, char *name)
+{
+	return ib->words->klass->contains_name(ib->words, name);
+}
--- a/libibex/ibex_db.c
+++ b/libibex/ibex_db.c
@ -0,0 +1,512 @@
+
+#include <glib.h>
+
+#include <db.h>
+#include <stdio.h>
+#include <unicode.h>
+#include <ctype.h>
+
+#include "ibex_internal.h"
+
+#define d(x)
+
+/*
+  Uses 2 databases: 
+
+  names - HASH  - lists which files are stored
+  words - BTREE - index words to files
+
+*/
+
+static int
+db_delete_name(DB *db, const char *name)
+{
+	DBC *cursor;
+	int err, namelen;
+	DBT key, data;
+
+	printf("called to delete name %s\n", name);
+	return 0;
+
+	err = db->cursor(db, NULL, &cursor, 0);
+	if (err != 0) {
+		printf("Error occured?: %s\n", db_strerror(err));
+		return err;
+	}
+
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+	namelen = strlen(name);
+
+	err = cursor->c_get(cursor, &key, &data, DB_FIRST);
+	while (err == 0) {
+		if (data.size == namelen && memcmp(data.data, name, namelen) == 0) {
+			d(printf("Found match, removing it\n"));
+			cursor->c_del(cursor, 0);
+		} else {
+			data.size = namelen;
+			data.data = (void *)name;
+			if (cursor->c_get(cursor, &key, &data, DB_GET_BOTH) == 0) {
+				d(printf("seek to match, removing it\n"));
+				cursor->c_del(cursor, 0);
+			} else {
+				d(printf("seek to match, not found\n"));
+			}
+		}
+		err = cursor->c_get(cursor, &key, &data, DB_NEXT_NODUP);
+	}
+
+	cursor->c_close(cursor);
+
+	return 0;
+}
+
+static int
+db_index_words(DB *db, char *name, char **words)
+{
+	DBT key, data;
+	int err = 0;
+	char *word;
+	
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+	data.data = name;
+	data.size = strlen(name);
+	for (;(word=*words);words++) {
+		/* normalise word first ... */
+		key.data = word;
+		key.size = strlen(word);
+
+		err = db->put(db, NULL, &key, &data, 0);
+		if (err != 0)
+			printf("Error occured?: %s\n", db_strerror(err));
+	}
+
+	return err;
+}
+
+static int
+db_index_word(DB *db, char *name, char *word)
+{
+	DBT key, data;
+	int err = 0;
+	
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+	data.data = name;
+	data.size = strlen(name);
+	key.data = word;
+	key.size = strlen(word);
+
+	err = db->put(db, NULL, &key, &data, 0);
+	if (err != 0)
+		printf("Error occured?: %s\n", db_strerror(err));
+
+	return err;
+}
+
+static GPtrArray *
+db_find(DB *db, char *word)
+{
+	DBT key, data;
+	int err;
+	DBC *cursor;
+	GPtrArray *matches = g_ptr_array_new();
+
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+
+	err = db->cursor(db, NULL, &cursor, 0);
+	if (err != 0) {
+		printf("Error occured?: %s\n", db_strerror(err));
+		return matches;
+	}
+
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+
+	key.size = strlen(word);
+	key.data = word;
+
+	err = cursor->c_get(cursor, &key, &data, DB_SET);
+	while (err == 0) {
+		char *name;
+		name = g_malloc(data.size+1);
+		memcpy(name, data.data, data.size);
+		name[data.size] = '\0';
+		g_ptr_array_add(matches, name);
+		err = cursor->c_get(cursor, &key, &data, DB_NEXT_DUP);
+	}
+	if (err != DB_NOTFOUND) {
+		printf("Error occured?: %s\n", db_strerror(err));
+		/* what to do?  doesn't matter too much its just a search ... */
+	}
+
+	cursor->c_close(cursor);
+
+	return matches;
+}
+
+/* check that db contains key @word, optionally with contents @name */
+static gboolean
+db_contains_word(DB *db, char *name, char *word)
+{
+	DBT key, data;
+
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+
+	if (name != NULL) {
+		data.size = strlen(name);
+		data.data = name;
+	}
+	key.size = strlen(word);
+	key.data = word;
+
+	return db->get(db, NULL, &key, &data, name?DB_GET_BOTH:DB_SET) == 0;
+}
+
+static int
+db_delete_word(DB *db, char *word)
+{
+	DBT key;
+
+	memset(&key, 0, sizeof(key));
+	key.size = strlen(word);
+	key.data = word;
+	return db->del(db, NULL, &key, 0);
+}
+
+
+static signed char utf8_trans[] = {
+	'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I',
+	'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U',
+	'U', 'Y', -3, -4, 'a', 'a', 'a', 'a', 'a', 'a', -5, 'c', 'e', 'e',
+	'e', 'e', 'i', 'i', 'i', 'i', -6, 'n', 'o', 'o', 'o', 'o', 'o', '/',
+	'o', 'u', 'u', 'u', 'u', 'y', -7, 'y', 'A', 'a', 'A', 'a', 'A', 'a',
+	'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e',
+	'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g',
+	'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i',
+	'I', 'i', -8, -9, 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L',
+	'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', -10, -11,
+	'O', 'o', 'O', 'o', 'O', 'o', -12, -13, 'R', 'r', 'R', 'r', 'R', 'r',
+	'S', 'r', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't',
+	'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w',
+	'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's'
+};
+
+static char *utf8_long_trans[] = {
+	"AE", "TH", "TH", "ss", "ae", "th", "th", "IJ", "ij",
+	"NG", "ng", "OE", "oe"
+};
+
+/* This is a bit weird. It takes pointers to the start and end (actually
+ * just past the end) of a UTF-8-encoded word, and a buffer at least 1
+ * byte longer than the length of the word. It copies the word into the
+ * buffer in all lowercase without accents, and splits up ligatures.
+ * (Since any ligature would be a multi-byte character in UTF-8, splitting
+ * them into two US-ASCII characters won't overrun the buffer.)
+ *
+ * It is not safe to call this routine with bad UTF-8.
+ */
+static void
+ibex_normalise_word(char *start, char *end, char *buf)
+{
+	unsigned char *s, *d;
+	unicode_char_t uc;
+
+	s = (unsigned char *)start;
+	d = (unsigned char *)buf;
+	while (s < (unsigned char *)end) {
+		if (*s < 0x80) {
+			/* US-ASCII character: copy unless it's
+			 * an apostrophe.
+			 */
+			if (*s != '\'')
+				*d++ = tolower (*s);
+			s++;
+		} else {
+			char *next = unicode_get_utf8 (s, &uc);
+			if (uc >= 0xc0 && uc < 0xc0 + sizeof (utf8_trans)) {
+				signed char ch = utf8_trans[uc - 0xc0];
+				if (ch > 0)
+					*d++ = tolower (ch);
+				else {
+					*d++ = tolower (utf8_long_trans[-ch - 1][0]);
+					*d++ = tolower (utf8_long_trans[-ch - 1][1]);
+				}
+				s = next;
+			} else {
+				while (s < (unsigned char *)next)
+					*d++ = *s++;
+			}
+		}
+	}
+	*d = '\0';
+}
+
+enum { IBEX_ALPHA, IBEX_NONALPHA, IBEX_INVALID, IBEX_INCOMPLETE };
+
+/* This incorporates parts of libunicode, because there's no way to
+ * force libunicode to not read past a certain point.
+ */
+static int
+utf8_category (char *sp, char **snp, char *send)
+{
+	unsigned char *p = (unsigned char *)sp, **np = (unsigned char **)snp;
+	unsigned char *end = (unsigned char *)send;
+
+	if (isascii (*p)) {
+		*np = p + 1;
+		if (isalpha (*p) || *p == '\'')
+			return IBEX_ALPHA;
+		return IBEX_NONALPHA;
+	} else {
+		unicode_char_t uc;
+		int more;
+
+		if ((*p & 0xe0) == 0xc0) {
+			more = 1;
+			uc = *p & 0x1f;
+		} else if ((*p & 0xf0) == 0xe0) {
+			more = 2;
+			uc = *p & 0x0f;
+		} else if ((*p & 0xf8) == 0xf0) {
+			more = 3;
+			uc = *p & 0x07;
+		} else if ((*p & 0xfc) == 0xf8) {
+			more = 4;
+			uc = *p & 0x03;
+		} else if ((*p & 0xfe) == 0xfc) {
+			more = 5;
+			uc = *p & 0x01;
+		} else
+			return IBEX_INVALID;
+
+		if (p + more > end)
+			return IBEX_INCOMPLETE;
+
+		while (more--) {
+			if ((*++p & 0xc0) != 0x80)
+				return IBEX_INVALID;
+			uc <<= 6;
+			uc |= *p & 0x3f;
+		}
+
+		*np = p + 1;
+		if (unicode_isalpha (uc))
+			return IBEX_ALPHA;
+		else
+			return IBEX_NONALPHA;
+	}
+}
+
+static void
+do_insert_words(char *key, char *name, ibex *ib)
+{
+	db_index_word(ib->words, name, key);
+	g_free(key);
+}
+
+static void
+do_free_words(char *key, char *name, void *data)
+{
+	g_free(key);
+}
+
+/**
+ * ibex_index_buffer: the lowest-level ibex indexing interface
+ * @ib: an ibex
+ * @name: the name of the file being indexed
+ * @buffer: a buffer containing data from the file
+ * @len: the length of @buffer
+ * @unread: an output argument containing the number of unread bytes
+ *
+ * This routine indexes up to @len bytes from @buffer into @ib.
+ * If @unread is NULL, the indexer assumes that the buffer ends on a
+ * word boundary, and will index all the way to the end of the
+ * buffer. If @unread is not NULL, and the buffer ends with an
+ * alphabetic character, the indexer will assume that the buffer has
+ * been cut off in the middle of a word, and return the number of
+ * un-indexed bytes at the end of the buffer in *@unread. The caller
+ * should then read in more data through whatever means it has
+ * and pass in the unread bytes from the original buffer, followed
+ * by the new data, on its next call.
+ *
+ * Return value: 0 on success, -1 on failure.
+ **/
+int
+ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread)
+{
+	char *p, *q, *nq, *end, *word;
+	int wordsiz, cat;
+	GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal);
+
+	if (unread)
+		*unread = 0;
+
+	end = buffer + len;
+	wordsiz = 20;
+	word = g_malloc (wordsiz);
+
+	p = buffer;
+	while (p < end) {
+		while (p < end) {
+			cat = utf8_category (p, &q, end);
+			if (cat != IBEX_NONALPHA)
+				break;
+			p = q;
+		}
+		if (p == end) {
+			goto done;
+		} else if (cat == IBEX_INVALID) {
+			goto error;
+		} else if (cat == IBEX_INCOMPLETE)
+			q = end;
+
+		while (q < end) {
+			cat = utf8_category (q, &nq, end);
+			if (cat != IBEX_ALPHA)
+				break;
+			q = nq;
+		}
+		if (cat == IBEX_INVALID ||
+		    (cat == IBEX_INCOMPLETE && !unread)) {
+			goto error;
+		} else if (cat == IBEX_INCOMPLETE || (q == end && unread)) {
+			*unread = end - p;
+			goto done;
+		}
+
+		if (wordsiz < q - p + 1) {
+			wordsiz = q - p + 1;
+			word = g_realloc (word, wordsiz);
+		}
+		ibex_normalise_word (p, q, word);
+		if (word[0]) {
+			if (g_hash_table_lookup(words, word) == 0) {
+				g_hash_table_insert(words, g_strdup(word), name);
+			}
+		}
+		p = q;
+	}
+done:
+	/* FIXME: do this and inserts within a transaction */
+	if (!db_contains_word(ib->names, NULL, name)) {
+		printf("adding '%s' to database\n", name);
+		db_index_word(ib->names, "", name);
+	}
+	g_hash_table_foreach(words, (GHFunc)do_insert_words, ib);
+	g_hash_table_destroy(words);
+	g_free (word);
+	return 0;
+error:
+	g_hash_table_foreach(words, (GHFunc)do_free_words, NULL);
+	g_hash_table_destroy(words);
+	g_free (word);
+	return -1;
+}
+
+
+ibex *ibex_open (char *file, int flags, int mode)
+{
+	ibex *ib;
+	u_int32_t dbflags = 0;
+	int err;
+
+	ib = g_malloc0(sizeof(*ib));
+	err = db_create(&ib->words, NULL, 0);
+	if (err != 0) {
+		g_warning("create: Error occured?: %s\n", db_strerror(err));
+		g_free(ib);
+		return NULL;
+	}
+
+	err = ib->words->set_flags(ib->words, DB_DUP);
+	if (err != 0) {
+		g_warning("set flags: Error occured?: %s\n", db_strerror(err));
+		ib->words->close(ib->words, 0);
+		g_free(ib);
+		return NULL;
+	}
+
+	if (flags & O_CREAT)
+		dbflags |= DB_CREATE;
+	if (flags & O_EXCL)
+		dbflags |= DB_EXCL;
+	if (flags & O_RDONLY)
+		dbflags |= DB_RDONLY;
+
+	/* 1MB cache size? */
+	ib->words->set_cachesize(ib->words, 0, 1000000, 0);
+
+	err = ib->words->open(ib->words, file, "words", DB_BTREE, dbflags, mode);
+	if (err != 0) {
+		printf("open: Error occured?: %s\n", db_strerror(err));
+		ib->words->close(ib->words, 0);
+		g_free(ib);		
+		return NULL;
+	}
+
+	/* FIXME: check returns */
+	err = db_create(&ib->names, NULL, 0);
+	err = ib->names->open(ib->names, file, "names", DB_HASH, dbflags, mode);
+
+	return ib;
+}
+
+int ibex_save (ibex *ib)
+{
+	printf("syncing database\n");
+	ib->names->sync(ib->names, 0);
+	return ib->words->sync(ib->words, 0);
+}
+
+int ibex_close (ibex *ib)
+{
+	int ret;
+
+	printf("closing database\n");
+
+	ret = ib->names->close(ib->names, 0);
+	ret = ib->words->close(ib->words, 0);
+	g_free(ib);
+	return ret;
+}
+
+void ibex_unindex (ibex *ib, char *name)
+{
+	printf("trying to unindex '%s'\n", name);
+	if (db_contains_word(ib->names, NULL, name)) {
+		/* FIXME: do within transaction? */
+		db_delete_name(ib->words, name);
+		db_delete_word(ib->names, name);
+	}
+}
+
+GPtrArray *ibex_find (ibex *ib, char *word)
+{
+	char *normal;
+	int len;
+
+	len = strlen(word);
+	normal = alloca(len+1);
+	ibex_normalise_word(word, word+len, normal);
+	return db_find(ib->words, normal);
+}
+
+gboolean ibex_find_name (ibex *ib, char *name, char *word)
+{
+	char *normal;
+	int len;
+
+	len = strlen(word);
+	normal = alloca(len+1);
+	ibex_normalise_word(word, word+len, normal);
+	return db_contains_word(ib->words, name, normal);
+}
+
+gboolean ibex_contains_name(ibex *ib, char *name)
+{
+	return db_contains_word(ib->names, NULL, name);
+}
--- a/libibex/ibex_internal.h
+++ b/libibex/ibex_internal.h
@ -21,19 +21,13 @@
 #include <glib.h>

 #include "ibex.h"
+#include "block.h"
+#include "wordindex.h"

-#define IBEX_VERSION "ibex1"
+#define IBEX_VERSION "ibex3"

 struct ibex {
 	char *path;
-	GTree *files;
-	GHashTable *words;
-	GPtrArray *oldfiles;
-	gboolean dirty;
+	struct _memcache *blocks;
+	struct _IBEXWord *words;
 };
-
-struct ibex_file {
-	char *name;
-	long index;
-};
-typedef struct ibex_file ibex_file;
--- a/libibex/index.c
+++ b/libibex/index.c
@ -1,154 +0,0 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-/*
- * Copyright (C) 2000 Helix Code, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public License
- * as published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- * 
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with the Gnome Library; see the file COPYING.LIB.  If not,
- * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
-
-/* index.c: high-level indexing ops */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "ibex_internal.h"
-
-/**
- * ibex_index_file: Index a file by name
- * @ib: an ibex
- * @filename: name of the file to index
- *
- * This indexes the given file into @ib. If @filename is already in
- * the ibex, the existing index entries for it are discarded and the
- * file is indexed anew.
- *
- * Return value: 0 on success, -1 on failure.
- **/
-int
-ibex_index_file (ibex *ib, char *filename)
-{
-	int fd;
-	int status;
-	struct stat st;
-
-	fd = open (filename, O_RDONLY);
-	if (fd < 0)
-		return -1;
-
-	if (fstat (fd, &st) == -1) {
-		close (fd);
-		return -1;
-	}
-	if (!S_ISREG (st.st_mode)) {
-		close (fd);
-		errno = EINVAL;
-		return -1;
-	}
-
-	ibex_unindex (ib, filename);
-	status = ibex_index_fd (ib, filename, fd, st.st_size);
-	close (fd);
-	return status;
-}
-
-/**
- * ibex_index_fd: Index a file given a file descriptor
- * @ib: an ibex
- * @name: the name of the file being indexed
- * @fd: a file descriptor, open for reading
- * @len: the number of bytes to read from the file
- *
- * This indexes a file, or a part of a file, given an open file
- * descriptor and a size. There is no requirement that @name
- * actually correspond to @fd in any particular way.
- *
- * If the function returns successfully, the file descriptor offset of
- * @fd will be exactly @len bytes beyond where it was when the
- * function was called. The indexer assumes that this point is a word
- * boundary.
- *
- * The behavior of this function is not defined if it is not
- * possible to read @len bytes off @fd.
- *
- * Return value: 0 on success, -1 on failure.
- **/
-int
-ibex_index_fd (ibex *ib, char *name, int fd, size_t len)
-{
-	char *buf;
-	int off = 0, nread, status;
-
-	buf = g_malloc (len);
-	do {
-		nread = read (fd, buf + off, len - off);
-		if (nread == -1) {
-			g_free (buf);
-			return -1;
-		}
-		off += nread;
-	} while (off != len);
-
-	status = ibex_index_buffer (ib, name, buf, len, NULL);
-	g_free (buf);
-
-	return status;
-}
-
-/**
- * ibex_unindex: Remove a file from the ibex
- * @ib: an ibex
- * @name: name of the file to remove
- *
- * This removes all references to @name from @ib. No memory is freed
- * right away, but further searches on @ib will never return @name.
- **/
-void
-ibex_unindex (ibex *ib, char *name)
-{
-	ibex_file *ibf;
-
-	ibf = g_tree_lookup (ib->files, name);
-	if (ibf) {
-		ibf->index = -1;
-		g_tree_remove (ib->files, name);
-		g_ptr_array_add (ib->oldfiles, ibf);
-		ib->dirty = TRUE;
-	}
-}
-
-/**
- * ibex_rename: Rename a file in the ibex
- * @ib: an ibex
- * @oldname: the old name of the file
- * @newname: the new name of the file
- *
- * This renames a file in the ibex.
- **/
-void
-ibex_rename (ibex *ib, char *oldname, char *newname)
-{
-	ibex_file *ibf;
-
-	ibf = g_tree_lookup (ib->files, oldname);
-	if (ibf) {
-		g_tree_remove (ib->files, oldname);
-		g_free (ibf->name);
-		ibf->name = g_strdup (newname);
-		g_tree_insert (ib->files, ibf->name, ibf);
-	}
-}
--- a/libibex/index.h
+++ b/libibex/index.h
@ -0,0 +1,82 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _INDEX_H
+#define _INDEX_H
+
+/* an indexing 'class' maps a key to 1 piece of info */
+
+struct _IBEXIndex {
+	struct _IBEXIndexClass *klass;
+	struct _memcache *blocks;
+	blockid_t root;		/* root block of ondisk index data */
+};
+
+struct _IBEXIndexClass {
+
+	struct _IBEXIndex *(*create)(struct _memcache *bc, int size);
+	struct _IBEXIndex *(*open)(struct _memcache *bc, blockid_t root);
+
+	int (*sync)(struct _IBEXIndex *);
+	int (*close)(struct _IBEXIndex *);
+
+	/* lookup a key in the index, returns the keyid of this item, or 0 if not found */
+	guint32 (*find)(struct _IBEXIndex *, const char *key, int keylen);
+
+	/* remove a key from the index */
+	void (*remove)(struct _IBEXIndex *, const char *key, int keylen);
+
+	/* insert a new key into the index, the keyid is returned */
+	guint32 (*insert)(struct _IBEXIndex *, const char *key, int keylen);
+
+	/* get the key contents/key length from the keyid */
+	char *(*get_key)(struct _IBEXIndex *, guint32 keyid, int *keylenptr);
+
+	/* set the key contents based on the keyid */
+	void (*set_data)(struct _IBEXIndex *, guint32 keyid, blockid_t datablock, blockid_t tail);
+
+	/* get the key contents based on the keyid */
+	blockid_t (*get_data)(struct _IBEXIndex *, guint32 keyid, blockid_t *tail);
+};
+
+/* a storage class, stores lists of lists of id's */
+
+struct _IBEXStore {
+	struct _IBEXStoreClass *klass;
+	struct _memcache *blocks;
+};
+
+struct _IBEXStoreClass {
+	struct _IBEXStore *(*create)(struct _memcache *bc);
+	int (*sync)(struct _IBEXStore *store);
+	int (*close)(struct _IBEXStore *store);
+
+	blockid_t (*add)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
+	blockid_t (*add_list)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data);
+	blockid_t (*remove)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
+	void (*free)(struct _IBEXStore *store, blockid_t head, blockid_t tail);
+
+	gboolean (*find)(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data);
+	GArray *(*get)(struct _IBEXStore *store, blockid_t head, blockid_t tail);
+};
+
+#endif
--- a/libibex/wordindex.c
+++ b/libibex/wordindex.c
@ -0,0 +1,488 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* code to manage a word index */
+/* includes a cache for word index writes,
+   but not for name index writes (currently), or any reads.
+
+Note the word cache is only needed during indexing of lots
+of words, and could then be discarded (:flush()).
+
+*/
+
+#include <stdio.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <glib.h>
+
+#include "block.h"
+#include "index.h"
+#include "wordindex.h"
+
+#define d(x)
+
+/*#define WORDCACHE_SIZE (256)*/
+#define WORDCACHE_SIZE (10240)
+
+extern struct _IBEXStoreClass ibex_diskarray_class;
+extern struct _IBEXIndexClass ibex_hash_class;
+
+/* need 2 types of hash key?
+   one that just stores the wordid / wordblock
+   and one that stores the filecount/files?
+*/
+
+
+struct _wordcache {
+	struct _wordcache *next;
+	struct _wordcache *prev;
+	nameid_t wordid;	/* disk wordid */
+	blockid_t wordblock;	/* head of disk list */
+	blockid_t wordtail;	/* and the tail data */
+	int filecount;		/* how many we have in memory */
+	/*nameid_t files[32];*/	/* memory cache of files */
+	nameid_t files[62];	/* memory cache of files */
+	char word[1];		/* actual word follows */
+};
+
+static void unindex_name(struct _IBEXWord *, const char *name); 	/* unindex all entries for name */
+static gboolean contains_name(struct _IBEXWord *, const char *name);	/* index contains data for name */
+static GPtrArray *find(struct _IBEXWord *, const char *word);		/* returns all matches for word */
+static gboolean find_name(struct _IBEXWord *, const char *name, const char *word);	/* find if name contains word */
+static void add(struct _IBEXWord *, const char *name, const char *word);	/* adds a single word to name (slow) */
+static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */
+static int word_sync(struct _IBEXWord *idx);
+static int word_flush(struct _IBEXWord *idx);
+static int word_close(struct _IBEXWord *idx);
+
+struct _IBEXWordClass ibex_word_index_class = {
+	word_sync, word_flush, word_close,
+	unindex_name, contains_name,
+	find, find_name,
+	add, add_list
+};
+
+/* this interface isn't the best, but it'll do for now */
+struct _IBEXWord *
+ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot)
+{
+	struct _IBEXWord *idx;
+
+	idx = g_malloc0(sizeof(*idx));
+	idx->wordcache = g_hash_table_new(g_str_hash, g_str_equal);
+	ibex_list_new(&idx->wordnodes);
+	idx->wordcount = 0;
+	idx->klass = &ibex_word_index_class;
+
+	/* we use the same block array storage for both indexes at the moment */
+	idx->wordstore = ibex_diskarray_class.create(bc);
+	idx->namestore = idx->wordstore;
+
+	/* but not the same indexes! */
+	if (*wordroot) {
+		printf("opening wordindex root = %d\n", *wordroot);
+		idx->wordindex = ibex_hash_class.open(bc, *wordroot);
+	} else {
+		idx->wordindex = ibex_hash_class.create(bc, 1024);
+		*wordroot = idx->wordindex->root;
+		printf("creating wordindex root = %d\n", *wordroot);
+	}
+	if (*nameroot) {
+		printf("opening nameindex root = %d\n", *nameroot);
+		idx->nameindex = ibex_hash_class.open(bc, *nameroot);
+	} else {
+		idx->nameindex = ibex_hash_class.create(bc, 1024);
+		*nameroot = idx->nameindex->root;
+		printf("creating nameindex root = %d\n", *nameroot);
+	}
+	return idx;
+}
+
+#if 0
+static void
+cache_sanity(struct _wordcache *head)
+{
+	while (head->next) {
+		g_assert(head->filecount <= sizeof(head->files)/sizeof(head->files[0]));
+		g_assert(strlen(head->word) != 0);
+		head = head->next;
+	}
+}
+#endif
+
+/* unindex all entries for name */
+static void unindex_name(struct _IBEXWord *idx, const char *name)
+{
+	GArray *words;
+	int i;
+	nameid_t nameid, wordid;
+	blockid_t nameblock, wordblock, newblock, nametail, wordtail, newtail;
+
+	d(printf("unindexing %s\n", name));
+
+	/* lookup the hash key */
+	nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
+	/* get the block for this key */
+	nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
+	/* and the data for this block */
+	words = idx->namestore->klass->get(idx->namestore, nameblock, nametail);
+	/* walk it ... */
+	for (i=0;i<words->len;i++) {
+		/* get the word */
+		wordid = g_array_index(words, nameid_t, i);
+		d(printf(" word %d\n", wordid));
+		/* get the data block */
+		wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
+		/* clear this name from it */
+		newblock = wordblock;
+		newtail = wordtail;
+		idx->wordstore->klass->remove(idx->wordstore, &newblock, &newtail, nameid);
+		if (newblock != wordblock || newtail != wordtail)
+			idx->wordindex->klass->set_data(idx->wordindex, wordid, newblock, newtail);
+
+		/* FIXME: check cache as well */
+	}
+	g_array_free(words, TRUE);
+
+	/* and remove name data and itself */
+	idx->namestore->klass->free(idx->namestore, nameblock, nametail);
+	idx->nameindex->klass->remove(idx->nameindex, name, strlen(name));
+}
+
+/* index contains (any) data for name */
+static gboolean contains_name(struct _IBEXWord *idx, const char *name)
+{
+	return idx->nameindex->klass->find(idx->nameindex, name, strlen(name)) != 0;
+}
+
+/* returns all matches for word */
+static GPtrArray *find(struct _IBEXWord *idx, const char *word)
+{
+	nameid_t wordid, nameid;
+	GPtrArray *res;
+	GArray *names;
+	int i;
+	char *new;
+	struct _wordcache *cache;
+	blockid_t wordblock, wordtail;
+
+	res = g_ptr_array_new();
+
+	cache = g_hash_table_lookup(idx->wordcache, word);
+	if (cache) {
+		/* freshen cache entry if we touch it */
+		ibex_list_remove((struct _listnode *)cache);
+		ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
+		wordid = cache->wordid;
+		wordblock = cache->wordblock;
+		wordtail = cache->wordtail;
+	} else {
+		/* lookup the hash key */
+		wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
+		/* get the block for this key */
+		wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
+	}
+	/* and the data for this block */
+	names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail);
+	/* .. including any memory-only data */
+	if (cache) {
+		g_array_append_vals(names, cache->files, cache->filecount);
+	}
+
+	/* walk it ... converting id's back to strings */
+	g_ptr_array_set_size(res, names->len);
+	for (i=0;i<names->len;i++) {
+		nameid = g_array_index(names, nameid_t, i);
+		new = idx->nameindex->klass->get_key(idx->nameindex, nameid, NULL);
+		res->pdata[i] = new;
+	}
+	g_array_free(names, TRUE);
+	return res;
+}
+
+/* find if name contains word */
+static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *word)
+{
+	nameid_t wordid, nameid;
+	blockid_t nameblock, nametail;
+	struct _wordcache *cache;
+	int i;
+
+	/* lookup the hash key for the name */
+	nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
+	/* get the block for this name */
+	nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
+
+	/* check if there is an in-memory cache for this word, check its file there first */
+	cache = g_hash_table_lookup(idx->wordcache, word);
+	if (cache) {
+		/* freshen cache entry if we touch it */
+		ibex_list_remove((struct _listnode *)cache);
+		ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
+		for (i=0;i<cache->filecount;i++) {
+			if (cache->files[i] == nameid)
+				return TRUE;
+		}
+		/* not there?  well we can use the wordid anyway */
+		wordid = cache->wordid;
+	} else {
+		/* lookup the hash key for word */
+		wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
+	}
+
+	/* see if wordid is in nameblock */
+	return idx->namestore->klass->find(idx->namestore, nameblock, nametail, wordid);
+}
+
+/* cache helper functions */
+/* flush a cache entry to disk, and empty it out */
+static void
+sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache)
+{
+	GArray array; /* just use this as a header */
+	blockid_t oldblock, oldtail;
+	
+	d(printf("syncing cache entry '%s'\n", cache->word));
+	array.data = (char *)cache->files;
+	array.len = cache->filecount;
+	oldblock = cache->wordblock;
+	oldtail = cache->wordtail;
+	idx->wordstore->klass->add_list(idx->wordstore, &cache->wordblock, &cache->wordtail, &array);
+	if (oldblock != cache->wordblock || oldtail != cache->wordtail) {
+		idx->wordindex->klass->set_data(idx->wordindex, cache->wordid, cache->wordblock, cache->wordtail);
+	}
+	cache->filecount = 0;
+}
+
+/* create a new key in an index, returning its id and head block */
+static void
+add_index_key(struct _IBEXIndex *wordindex, const char *word, nameid_t *wordid, blockid_t *wordblock, blockid_t *wordtail)
+{
+	/* initialise cache entry - id of word entry and head block */
+	*wordid = wordindex->klass->find(wordindex, word, strlen(word));
+	if (*wordid == 0) {
+		*wordid = wordindex->klass->insert(wordindex, word, strlen(word));
+		*wordblock = 0;
+		*wordtail = 0;
+	} else {
+		*wordblock = wordindex->klass->get_data(wordindex, *wordid, wordtail);
+	}
+}
+
+/* create a new key in a cached index (only word cache so far), flushing old keys
+   if too much space is being used */
+static struct _wordcache *
+add_index_cache(struct _IBEXWord *idx, const char *word)
+{
+	struct _wordcache *cache;
+
+	d(printf("adding %s to cache\n", word));
+
+	cache = g_hash_table_lookup(idx->wordcache, word);
+	if (cache == 0) {
+		/* see if we have to flush off the last entry */
+		if (idx->wordcount >= WORDCACHE_SIZE) {
+			/* remove last entry, and flush it */
+			cache = (struct _wordcache *)idx->wordnodes.tailpred;
+			d(printf("flushing word from cache %s\n", cache->word));
+			ibex_list_remove((struct _listnode *)cache);
+			g_hash_table_remove(idx->wordcache, cache->word);
+			sync_cache_entry(idx, cache);
+			g_free(cache);
+			idx->wordcount--;
+		}
+		cache = g_malloc0(sizeof(*cache)+strlen(word));
+		/* initialise cache entry - id of word entry and head block */
+		add_index_key(idx->wordindex, word, &cache->wordid, &cache->wordblock, &cache->wordtail);
+		/* other fields */
+		strcpy(cache->word, word);
+		cache->filecount = 0;
+		g_hash_table_insert(idx->wordcache, cache->word, cache);
+		ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
+		idx->wordcount++;
+	} else {
+		/* move cache bucket ot the head of the cache list */
+		ibex_list_remove((struct _listnode *)cache);
+		ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
+	}
+	return cache;
+}
+
+/* adds a single word to name (slow) */
+static void add(struct _IBEXWord *idx, const char *name, const char *word)
+{
+	nameid_t nameid;
+	blockid_t nameblock, newblock, nametail, newtail;
+	struct _wordcache *cache;
+
+	g_error("Dont use wordindex::add()");
+	abort();
+
+	cache = add_index_cache(idx, word);
+
+	/* get the nameid and block start for this name */
+	add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+
+	/* check for repeats of the last name - dont add anything */
+	if (cache->files[cache->filecount] == nameid)
+		return;
+
+	/* yay, now insert the nameindex into the word list, and the word index into the name list */
+	cache->files[cache->filecount] = nameid;
+	cache->filecount++;
+	/* if we are full, force a flush now */
+	if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
+		sync_cache_entry(idx, cache);
+	}
+
+	newtail = nametail;
+	newblock = nameblock;
+	idx->namestore->klass->add(idx->namestore, &newblock, &newtail, cache->wordid);
+	if (newblock != nameblock || newtail != nametail) {
+		idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
+	}
+}
+
+/* adds a bunch of words to a given name */
+static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words)
+{
+	int i;
+	GArray *data = g_array_new(0, 0, sizeof(nameid_t));
+	blockid_t nameblock, newblock, nametail, newtail;
+	nameid_t nameid;
+	struct _wordcache *cache;
+
+	d(printf("Adding words to name %s\n", name));
+
+	d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
+
+	/* get the nameid and block start for this name */
+	add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+
+	d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
+
+	for (i=0;i<words->len;i++) {
+		char *word = words->pdata[i];
+
+		cache = add_index_cache(idx, word);
+
+		/*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
+
+		/* check for duplicates; doesn't catch duplicates over an overflow
+		   boundary.  Watch me care. */
+		if (cache->filecount == 0
+		    || cache->files[cache->filecount-1] != nameid) {
+			cache->files[cache->filecount] = nameid;
+			cache->filecount++;
+			/* if we are full, force a flush now */
+			if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
+				sync_cache_entry(idx, cache);
+			}
+
+			/*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
+
+			/* and append this wordid for this name in memory */
+			g_array_append_val(data, cache->wordid);
+		}
+
+		/*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
+	}
+
+	d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
+
+	/* and append these word id's in one go */
+	newblock = nameblock;
+	newtail = nametail;
+	idx->namestore->klass->add_list(idx->namestore, &newblock, &newtail, data);
+	if (newblock != nameblock || newtail != nametail) {
+		idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
+	}
+
+	d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
+
+	g_array_free(data, TRUE);
+}
+
+/* sync any in-memory data to disk */
+static int
+word_sync(struct _IBEXWord *idx)
+{
+	/* we just flush also, save memory */
+	word_flush(idx);
+
+#if 0
+	struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head;
+
+	while (cache->next) {
+		sync_cache_entry(idx, cache);
+		cache = cache->next;
+	}
+
+	/*ibex_hash_dump(idx->wordindex);*/
+	/*ibex_hash_dump(idx->nameindex);*/
+#endif
+	return 0;
+}
+
+/* sync and flush any in-memory data to disk and free it */
+static int
+word_flush(struct _IBEXWord *idx)
+{
+	struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
+
+	next = cache->next;
+	while (next) {
+		sync_cache_entry(idx, cache);
+		g_hash_table_remove(idx->wordcache, cache->word);
+		g_free(cache);
+		cache = next;
+		next = cache->next;
+	}
+	ibex_list_new(&idx->wordnodes);
+	return 0;
+}
+
+static int word_close(struct _IBEXWord *idx)
+{
+	struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
+
+	next = cache->next;
+	while (next) {
+		sync_cache_entry(idx, cache);
+		g_free(cache);
+		cache = next;
+		next = cache->next;
+	}
+	idx->namestore->klass->close(idx->namestore);
+	idx->nameindex->klass->close(idx->nameindex);
+	/*same as namestore:
+	  idx->wordstore->klass->close(idx->wordstore);*/
+	idx->wordindex->klass->close(idx->wordindex);
+	g_hash_table_destroy(idx->wordcache);
+	g_free(idx);
+
+	return 0;
+}
--- a/libibex/wordindex.h
+++ b/libibex/wordindex.h
@ -0,0 +1,65 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _WORDINDEX_H
+#define _WORDINDEX_H
+
+#include <glib.h>
+
+#include "block.h"
+#include "index.h"
+
+struct _IBEXWord;
+
+/* not used yet */
+typedef void (*IBEXNormaliseFunc)(char *source, int len, char *dest);
+
+struct _IBEXWordClass {
+	int (*sync)(struct _IBEXWord *);
+	int (*flush)(struct _IBEXWord *);
+	int (*close)(struct _IBEXWord *);
+
+	void (*unindex_name)(struct _IBEXWord *, const char *name);		/* unindex all entries for name */
+	gboolean (*contains_name)(struct _IBEXWord *, const char *name);	/* index contains data for name */
+	GPtrArray *(*find)(struct _IBEXWord *, const char *word);		/* returns all matches for word */
+	gboolean (*find_name)(struct _IBEXWord *, const char *name, const char *word);	/* find if name contains word */
+	void (*add)(struct _IBEXWord *, const char *name, const char *word);	/* adds a single word to name */
+	void (*add_list)(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */
+};
+
+struct _IBEXWord {
+	struct _IBEXWordClass *klass;
+	struct _IBEXStore *wordstore;
+	struct _IBEXIndex *wordindex;
+	struct _IBEXStore *namestore;
+	struct _IBEXIndex *nameindex;
+
+	/* word caching info (should probably be modularised) */
+	GHashTable *wordcache;	/* word->struct _wordcache mapping */
+	struct _list wordnodes;	/* LRU list of wordcache structures */
+	int wordcount;		/* how much space used in cache */
+};
+
+
+struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
+
+#endif /* !_WORDINDEX_H */