** Merged from IBEX_DISK branch to head.

2000-09-19  Not Zed  <NotZed@HelixCode.com>

	** Merged from IBEX_DISK branch to head.

svn path=/trunk/; revision=5500
This commit is contained in:
Not Zed
2000-09-19 12:22:00 +00:00
committed by Michael Zucci
parent 122b2426db
commit 288e7bb1fb
16 changed files with 3681 additions and 924 deletions

View File

@ -1,3 +1,60 @@
2000-09-19 Not Zed <NotZed@HelixCode.com>
** Merged from IBEX_DISK branch to head.
* file.c:
* find.c:
* words.c:
* index.c: Removed unused files.
* block.h: Changed block to use only 24 bits for next and 8 for
used, and fixed all relevant code. Some cleanup.
* disktail.c (tail_get): If we use an empty tail node, then make
sure we make it dirty.
2000-09-15 Not Zed <NotZed@HelixCode.com>
* wordindex.c (word_close): Free hashtable on exit too.
* disktail.c: Implemented tail-node storage for the end of long
lists, or for short lists. Should save significant disk space
(5x?).
Implemented special case for 1-item lists, where the tailnode
pointer is used to store the index entry.
2000-09-14 Not Zed <NotZed@HelixCode.com>
* wordindex.c (add_index_key): Keys also handle tails.
* hash.c (hash_set_data_block): Added new parameter to keys - a
tail block (a full 32 bit block pointer).
(hash_get_data_block): And same here.
2000-09-12 Not Zed <NotZed@HelixCode.com>
* wordindex.c (word_close): Dont close namestore twice.
2000-09-11 Not Zed <NotZed@HelixCode.com>
** Redid almost everything, on-disk hash table to store an index
to index records, mroe on the way to modularisation (more to go),
now stores reverse indexes for deleting.
2000-08-31 Not Zed <NotZed@HelixCode.com>
* block.c (add_key_mem): Initialise a memory based array for newly
added index entries.
(add_record): Changed to cache updates in memory until we hit a
limit, and then flush them to disk.
(get_record): Merge in-memory records with disk records.
(remove_record): Remove from memory first, and if that fails, goto
disk.
(find_record): Check memory first, then disk if that fails.
(add_datum_list): oops, copy size * sizeof(blockid_t)
(add_indexed): Make sure we link in the head node when we create a
new one.
2000-08-09 Christopher James Lahey <clahey@helixcode.com>
* file.c, find.c: Fixed some warnings.

View File

@ -2,20 +2,35 @@
noinst_LTLIBRARIES = libibex.la
libibex_la_SOURCES = file.c index.c find.c words.c ibex.h
libibex_la_SOURCES = \
wordindex.c \
block.c ibex.h \
hash.c \
disktail.c \
ibex_block.c
libibex_la_LDFLAGS = -static
noinst_HEADERS = ibex_internal.h
noinst_HEADERS = \
ibex_internal.h \
block.h \
wordindex.h \
index.h
INCLUDES = -I$(srcdir) $(GLIB_CFLAGS) $(UNICODE_CFLAGS) \
-DG_LOG_DOMAIN=\"libibex\"
noinst_PROGRAMS = mkindex lookup
#noinst_PROGRAMS = hash
mkindex_SOURCES = mkindex.c
mkindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
#hash_SOURCES = disktail.c
#hash_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
lookup_SOURCES = lookup.c
lookup_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
#noinst_PROGRAMS = mkindex lookup
#
#mkindex_SOURCES = mkindex.c
#mkindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
#
#lookup_SOURCES = lookup.c
#lookup_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)

481
libibex/block.c Normal file
View File

@ -0,0 +1,481 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
*
* Copyright (C) 2000 Helix Code, Inc.
*
* Authors: Michael Zucchi <notzed@helixcode.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/*
block file/cache/utility functions
*/
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <glib.h>
#include "block.h"
#define d(x)
/*#define DEBUG*/
#ifdef IBEX_STATS
static void
init_stats(struct _memcache *index)
{
index->stats = g_hash_table_new(g_direct_hash, g_direct_equal);
}
static void
dump_1_stat(int id, struct _stat_info *info, struct _memcache *index)
{
printf("%d %d %d %d %d\n", id, info->read, info->write, info->cache_hit, info->cache_miss);
}
static void
dump_stats(struct _memcache *index)
{
printf("Block reads writes hits misses\n");
g_hash_table_foreach(index->stats, dump_1_stat, index);
}
static void
add_read(struct _memcache *index, int id)
{
struct _stat_info *info;
info = g_hash_table_lookup(index->stats, (void *)id);
if (info == NULL) {
info = g_malloc0(sizeof(*info));
g_hash_table_insert(index->stats, (void *)id, info);
}
info->read++;
}
static void
add_write(struct _memcache *index, int id)
{
struct _stat_info *info;
info = g_hash_table_lookup(index->stats, (void *)id);
if (info == NULL) {
info = g_malloc0(sizeof(*info));
g_hash_table_insert(index->stats, (void *)id, info);
}
info->write++;
}
static void
add_hit(struct _memcache *index, int id)
{
struct _stat_info *info;
info = g_hash_table_lookup(index->stats, (void *)id);
if (info == NULL) {
info = g_malloc0(sizeof(*info));
g_hash_table_insert(index->stats, (void *)id, info);
}
info->cache_hit++;
}
static void
add_miss(struct _memcache *index, int id)
{
struct _stat_info *info;
info = g_hash_table_lookup(index->stats, (void *)id);
if (info == NULL) {
info = g_malloc0(sizeof(*info));
g_hash_table_insert(index->stats, (void *)id, info);
}
info->cache_miss++;
}
#endif /* IBEX_STATS */
/* simple list routines (for simplified memory management of cache/lists) */
/**
* ibex_list_new:
* @v:
*
* Initialise a list header. A list header must always be initialised
* before use.
**/
void ibex_list_new(struct _list *v)
{
v->head = (struct _listnode *)&v->tail;
v->tail = 0;
v->tailpred = (struct _listnode *)&v->head;
}
/**
* ibex_list_addhead:
* @l: List.
* @n: Node to append.
*
* Prepend a listnode to the head of the list @l.
*
* Return value: Always @n.
**/
struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n)
{
n->next = l->head;
n->prev = (struct _listnode *)&l->head;
l->head->prev = n;
l->head = n;
return n;
}
/**
* ibex_list_addtail:
* @l:
* @n:
*
* Append a listnode to the end of the list @l.
*
* Return value: Always the same as @n.
**/
struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n)
{
n->next = (struct _listnode *)&l->tail;
n->prev = l->tailpred;
l->tailpred->next = n;
l->tailpred = n;
return n;
}
/**
* ibex_list_remove:
* @n: The node to remove.
*
* Remove a listnode from a list.
*
* Return value: Always the same as @n.
**/
struct _listnode *ibex_list_remove(struct _listnode *n)
{
n->next->prev = n->prev;
n->prev->next = n->next;
return n;
}
static struct _memblock *
memblock_addr(struct _block *block)
{
return (struct _memblock *)(((char *)block) - G_STRUCT_OFFSET(struct _memblock, data));
}
/**
* ibex_block_dirty:
* @block:
*
* Dirty a block. This will cause it to be written to disk on
* a cache sync, or when the block is flushed from the cache.
**/
void
ibex_block_dirty(struct _block *block)
{
memblock_addr(block)->flags |= BLOCK_DIRTY;
}
static void
sync_block(struct _memcache *block_cache, struct _memblock *memblock)
{
lseek(block_cache->fd, memblock->block, SEEK_SET);
if (write(block_cache->fd, &memblock->data, sizeof(memblock->data)) != -1) {
memblock->flags &= ~BLOCK_DIRTY;
}
#ifdef IBEX_STATS
add_write(block_cache, memblock->block);
#endif
}
/**
* ibex_block_cache_sync:
* @block_cache:
*
* Ensure the block cache is fully synced to disk.
**/
void
ibex_block_cache_sync(struct _memcache *block_cache)
{
struct _memblock *memblock;
memblock = (struct _memblock *)block_cache->nodes.head;
while (memblock->next) {
if (memblock->flags & BLOCK_DIRTY) {
sync_block(block_cache, memblock);
}
memblock = memblock->next;
}
fsync(block_cache->fd);
#ifdef IBEX_STATS
dump_stats(block_cache);
#endif
}
/**
* ibex_block_cache_flush:
* @block_cache:
*
* Ensure the block cache is fully synced to disk, and then flush
* its contents from memory.
**/
void
ibex_block_cache_flush(struct _memcache *block_cache)
{
struct _memblock *mw, *mn;
ibex_block_cache_sync(block_cache);
mw = (struct _memblock *)block_cache->nodes.head;
mn = mw->next;
while (mn) {
g_hash_table_remove(block_cache->index, (void *)mw->block);
g_free(mw);
mw = mn;
mn = mn->next;
}
ibex_list_new(&block_cache->nodes);
}
/**
* ibex_block_read:
* @block_cache:
* @blockid:
*
* Read the data of a block by blockid. The data contents is backed by
* the block cache, and should be considered static.
*
* TODO; should this return a NULL block on error?
*
* Return value: The address of the block data (which may be cached).
**/
struct _block *
ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
{
struct _memblock *memblock;
{
/* assert blockid<roof */
if (blockid > 0) {
struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
g_assert(blockid < root->roof);
}
}
memblock = g_hash_table_lookup(block_cache->index, (void *)blockid);
if (memblock) {
d(printf("foudn blockid in cache %d = %p\n", blockid, &memblock->data));
/* 'access' page */
ibex_list_remove((struct _listnode *)memblock);
ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock);
#ifdef IBEX_STATS
add_hit(block_cache, memblock->block);
#endif
return &memblock->data;
}
#ifdef IBEX_STATS
add_miss(block_cache, blockid);
add_read(block_cache, blockid);
#endif
d(printf("loading blockid from disk %d\n", blockid));
memblock = g_malloc(sizeof(*memblock));
memblock->block = blockid;
memblock->flags = 0;
lseek(block_cache->fd, blockid, SEEK_SET);
memset(&memblock->data, 0, sizeof(memblock->data));
read(block_cache->fd, &memblock->data, sizeof(memblock->data));
ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock);
g_hash_table_insert(block_cache->index, (void *)blockid, memblock);
if (block_cache->count >= CACHE_SIZE) {
struct _memblock *old = (struct _memblock *)block_cache->nodes.head;
d(printf("discaring cache block %d\n", old->block));
g_hash_table_remove(block_cache->index, (void *)old->block);
ibex_list_remove((struct _listnode *)old);
if (old->flags & BLOCK_DIRTY) {
sync_block(block_cache, old);
}
g_free(old);
} else {
block_cache->count++;
}
d(printf(" --- cached blocks : %d\n", block_cache->count));
return &memblock->data;
}
/**
* ibex_block_cache_open:
* @name:
* @flags: Flags as to open(2), should use O_RDWR and optionally O_CREAT.
* @mode: Mose as to open(2)
*
* Open a block file.
*
* FIXME; this currently also initialises the word and name indexes
* because their pointers are stored in the root block. Should be
* upto the caller to manage these pointers/data.
*
* Return value: NULL if the backing file could not be opened.
**/
struct _memcache *
ibex_block_cache_open(const char *name, int flags, int mode)
{
struct _root *root;
struct _memcache *block_cache = g_malloc0(sizeof(*block_cache));
d(printf("opening ibex file: %s", name));
/* setup cache */
ibex_list_new(&block_cache->nodes);
block_cache->count = 0;
block_cache->index = g_hash_table_new(g_direct_hash, g_direct_equal);
block_cache->fd = open(name, flags, mode);
if (block_cache->fd == -1) {
g_hash_table_destroy(block_cache->index);
g_free(block_cache);
return NULL;
}
root = (struct _root *)ibex_block_read(block_cache, 0);
if (root->roof == 0 || memcmp(root->version, "ibx3", 4)) {
d(printf("Initialising superblock\n"));
/* reset root data */
memcpy(root->version, "ibx3", 4);
root->roof = 1024;
root->free = 0;
root->words = 0;
root->names = 0;
root->tail = 0; /* list of tail blocks */
ibex_block_dirty((struct _block *)root);
} else {
d(printf("superblock already initialised:\n"
" roof = %d\n free = %d\n words = %d\n names = %d\n",
root->roof, root->free, root->words, root->names));
}
/* this should be moved higher up */
{
struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
block_cache->words = ibex_create_word_index(block_cache, &root->words, &root->names);
}
#ifdef IBEX_STATS
init_stats(block_cache);
#endif
return block_cache;
}
/**
* ibex_block_cache_close:
* @block_cache:
*
* Close the block file, sync any remaining cached data
* to disk, and free all resources.
**/
void
ibex_block_cache_close(struct _memcache *block_cache)
{
struct _memblock *mw, *mn;
ibex_block_cache_sync(block_cache);
close(block_cache->fd);
mw = (struct _memblock *)block_cache->nodes.head;
mn = mw->next;
while (mn) {
g_free(mw);
mw = mn;
mn = mw->next;
}
g_hash_table_destroy(block_cache->index);
g_free(block_cache);
}
/**
* ibex_block_free:
* @block_cache:
* @blockid:
*
* Return a block to the free pool.
**/
void
ibex_block_free(struct _memcache *block_cache, blockid_t blockid)
{
struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
struct _block *block = ibex_block_read(block_cache, blockid);
block->next = root->free;
root->free = blockid;
ibex_block_dirty((struct _block *)root);
ibex_block_dirty((struct _block *)block);
}
/**
* ibex_block_get:
* @block_cache:
*
* Allocate a new block, or access a previously freed block and return
* its block id. The block will have zeroed contents.
*
* Return value: 0 if there are no blocks left (disk full/read only
* file, etc).
**/
blockid_t
ibex_block_get(struct _memcache *block_cache)
{
struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
struct _block *block;
blockid_t head;
if (root->free) {
head = root->free;
block = ibex_block_read(block_cache, head);
root->free = block_location(block->next);
} else {
/* TODO: check the block will fit first */
/* TODO: no need to read this block, can allocate it manually (saves a syscall/read) */
head = root->roof;
root->roof += BLOCK_SIZE;
block = ibex_block_read(block_cache, head);
}
g_assert(head != 0);
d(printf("new block = %d\n", head));
block->next = 0;
block->used = 0;
ibex_block_dirty(block);
ibex_block_dirty((struct _block *)root);
return head;
}

102
libibex/block.h Normal file
View File

@ -0,0 +1,102 @@
/* public interfaces for block io routines */
#ifndef _BLOCK_H
#define _BLOCK_H
/*#define IBEX_STATS*/ /* define to get/dump block access stats */
#include <glib.h>
typedef guint32 nameid_t;
typedef guint32 blockid_t;
#define BLOCK_BITS (8)
#define BLOCK_SIZE (1<<BLOCK_BITS)
#define CACHE_SIZE 256 /* blocks in disk cache */
/* root block */
struct _root {
char version[4];
blockid_t free; /* list of free blocks */
blockid_t roof; /* top of allocated space, everything below is in a free or used list */
blockid_t tail; /* list of 'tail' blocks */
blockid_t words; /* root of words index */
blockid_t names; /* root of names index */
};
/* basic disk structure for (data) blocks */
struct _block {
unsigned int next:32-BLOCK_BITS; /* next block */
unsigned int used:BLOCK_BITS; /* number of elements used */
nameid_t bl_data[(BLOCK_SIZE-4)/4]; /* references */
};
/* custom list structure, for a simple/efficient cache */
struct _listnode {
struct _listnode *next;
struct _listnode *prev;
};
struct _list {
struct _listnode *head;
struct _listnode *tail;
struct _listnode *tailpred;
};
void ibex_list_new(struct _list *v);
struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n);
struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n);
struct _listnode *ibex_list_remove(struct _listnode *n);
/* in-memory structure for block cache */
struct _memblock {
struct _memblock *next;
struct _memblock *prev;
blockid_t block;
int flags;
struct _block data;
};
#define BLOCK_DIRTY (1<<0)
struct _memcache {
struct _list nodes;
int count; /* nodes in cache */
GHashTable *index; /* blockid->memblock mapping */
int fd; /* file fd */
#ifdef IBEX_STATS
GHashTable *stats;
#endif
/* temporary here */
struct _IBEXWord *words; /* word index */
};
#ifdef IBEX_STATS
struct _stat_info {
int read;
int write;
int cache_hit;
int cache_miss;
};
#endif /* IBEX_STATS */
struct _memcache *ibex_block_cache_open(const char *name, int flags, int mode);
void ibex_block_cache_close(struct _memcache *block_cache);
void ibex_block_cache_sync(struct _memcache *block_cache);
void ibex_block_cache_flush(struct _memcache *block_cache);
blockid_t ibex_block_get(struct _memcache *block_cache);
void ibex_block_free(struct _memcache *block_cache, blockid_t blockid);
void ibex_block_dirty(struct _block *block);
struct _block *ibex_block_read(struct _memcache *block_cache, blockid_t blockid);
#define block_number(x) ((x)>>BLOCK_BITS)
#define block_location(x) ((x)<<BLOCK_BITS)
#endif /* ! _BLOCK_H */

255
libibex/diskarray.c Normal file
View File

@ -0,0 +1,255 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
*
* Copyright (C) 2000 Helix Code, Inc.
*
* Authors: Michael Zucchi <notzed@helixcode.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* a disk based array storage class */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <glib.h>
#include "block.h"
#include "index.h"
#define d(x)
/*#define DEBUG*/
static struct _IBEXStore *disk_create(struct _memcache *bc);
static int disk_sync(struct _IBEXStore *store);
static int disk_close(struct _IBEXStore *store);
static blockid_t disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data);
static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data);
static blockid_t disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data);
static void disk_free(struct _IBEXStore *store, blockid_t head);
static gboolean disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data);
static GArray *disk_get(struct _IBEXStore *store, blockid_t head);
struct _IBEXStoreClass ibex_diskarray_class = {
disk_create, disk_sync, disk_close,
disk_add, disk_add_list,
disk_remove, disk_free,
disk_find, disk_get
};
static struct _IBEXStore *disk_create(struct _memcache *bc)
{
struct _IBEXStore *store;
store = g_malloc0(sizeof(*store));
store->klass = &ibex_diskarray_class;
store->blocks = bc;
return store;
}
static int disk_sync(struct _IBEXStore *store)
{
/* no cache, nop */
return 0;
}
static int disk_close(struct _IBEXStore *store)
{
g_free(store);
return 0;
}
static blockid_t
disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data)
{
struct _block *block;
struct _block *newblock;
blockid_t new;
if (head == 0) {
head = ibex_block_get(store->blocks);
}
block = ibex_block_read(store->blocks, head);
d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next));
if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) {
d(printf("adding record into block %d %d\n", head, data));
block->bl_data[block->used] = data;
block->used++;
ibex_block_dirty(block);
return head;
} else {
new = ibex_block_get(store->blocks);
newblock = ibex_block_read(store->blocks, new);
newblock->next = head;
newblock->bl_data[0] = data;
newblock->used = 1;
d(printf("adding record into new %d %d, next =%d\n", new, data, newblock->next));
ibex_block_dirty(newblock);
return new;
}
}
static blockid_t
disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data)
{
struct _block *block;
struct _block *newblock;
blockid_t new;
int copied = 0;
int left, space, tocopy;
if (head == 0) {
head = ibex_block_get(store->blocks);
}
block = ibex_block_read(store->blocks, head);
while (copied < data->len) {
left = data->len - copied;
space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used;
if (space) {
tocopy = MIN(left, space);
memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
block->used += tocopy;
ibex_block_dirty(block);
} else {
new = ibex_block_get(store->blocks);
newblock = ibex_block_read(store->blocks, new);
newblock->next = head;
tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0]));
memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
newblock->used = tocopy;
block = newblock;
head = new;
ibex_block_dirty(newblock);
}
copied += tocopy;
}
return head;
}
static blockid_t
disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data)
{
blockid_t node = head;
d(printf("removing %d from %d\n", data, head));
while (node) {
struct _block *block = ibex_block_read(store->blocks, node);
int i;
for (i=0;i<block->used;i++) {
if (block->bl_data[i] == data) {
struct _block *start = ibex_block_read(store->blocks, head);
start->used--;
block->bl_data[i] = start->bl_data[start->used];
if (start->used == 0) {
struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0);
blockid_t new;
d(printf("dropping block %d, new head = %d\n", head, start->next));
new = start->next;
start->next = root->free;
root->free = head;
head = new;
ibex_block_dirty((struct _block *)root);
}
ibex_block_dirty(block);
ibex_block_dirty(start);
return head;
}
}
node = block->next;
}
return head;
}
static void disk_free(struct _IBEXStore *store, blockid_t head)
{
blockid_t next;
struct _block *block;
while (head) {
block = ibex_block_read(store->blocks, head);
next = block->next;
ibex_block_free(store->blocks, head);
head = next;
}
}
static gboolean
disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data)
{
blockid_t node = head;
d(printf("finding %d from %d\n", data, head));
while (node) {
struct _block *block = ibex_block_read(store->blocks, node);
int i;
for (i=0;i<block->used;i++) {
if (block->bl_data[i] == data) {
return TRUE;
}
}
node = block->next;
}
return FALSE;
}
static GArray *
disk_get(struct _IBEXStore *store, blockid_t head)
{
GArray *result = g_array_new(0, 0, sizeof(nameid_t));
while (head) {
struct _block *block = ibex_block_read(store->blocks, head);
d(printf("getting data from block %d\n", head));
g_array_append_vals(result, block->bl_data, block->used);
head = block->next;
d(printf("next = %d\n", head));
}
return result;
}
void
ibex_diskarray_dump(struct _memcache *blocks, blockid_t head)
{
blockid_t node = head;
printf("dumping list %d\n", node);
while (node) {
struct _block *block = ibex_block_read(blocks, node);
int i;
printf(" block %d used %d\n ", node, block->used);
for (i=0;i<block->used;i++) {
printf(" %d", block->bl_data[i]);
}
printf("\n");
node = block->next;
}
}

807
libibex/disktail.c Normal file
View File

@ -0,0 +1,807 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
*
* Copyright (C) 2000 Helix Code, Inc.
*
* Authors: Michael Zucchi <notzed@helixcode.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* a disk based array storage class that stores the tails of data lists
in common blocks */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <glib.h>
#include <string.h>
#include <stdio.h>
#include "block.h"
#include "index.h"
#define d(x)
/*#define DEBUG*/
/* marker to define which root keys indicate a single length key */
#define BLOCK_ONE (1<<BLOCK_BITS)
/* tail blocks only contain tail data ... */
/* and we pack it in, similar to the key data, only more compact */
struct _tailblock {
unsigned int next:32-BLOCK_BITS; /* only needs to point to block numbers */
unsigned int used:BLOCK_BITS; /* how many entries are used */
union {
unsigned char offset[BLOCK_SIZE-4]; /* works upto blocksize of 1024 bytes */
nameid_t data[(BLOCK_SIZE-4)/4];
} tailblock_u;
};
#define tb_offset tailblock_u.offset
#define tb_data tailblock_u.data
/* map a tail index to a block index */
#define TAIL_INDEX(b) ((b) & (BLOCK_SIZE-1))
/* map a tail index to a block number */
#define TAIL_BLOCK(b) ((b) & ~(BLOCK_SIZE-1))
/* map a block + index to a tailid */
#define TAIL_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1)))
#define TAIL_THRESHOLD ((BLOCK_SIZE-8)/6)
static struct _IBEXStore *disk_create(struct _memcache *bc);
static int disk_sync(struct _IBEXStore *store);
static int disk_close(struct _IBEXStore *store);
static blockid_t disk_add(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data);
static blockid_t disk_remove(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail);
static gboolean disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data);
static GArray *disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail);
struct _IBEXStoreClass ibex_diskarray_class = {
disk_create, disk_sync, disk_close,
disk_add, disk_add_list,
disk_remove, disk_free,
disk_find, disk_get
};
static int
tail_info(struct _tailblock *bucket, nameid_t tailid, blockid_t **startptr)
{
blockid_t *start, *end;
int index;
/* get start/end of area to zap */
index = TAIL_INDEX(tailid);
start = &bucket->tb_data[bucket->tb_offset[index]];
if (index == 0) {
end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])];
} else {
end = &bucket->tb_data[bucket->tb_offset[index-1]];
}
if (startptr)
*startptr = start;
return end-start;
}
/* compresses (or expand) the bucket entry, to the new size */
static void
tail_compress(struct _tailblock *bucket, int index, int newsize)
{
int i;
blockid_t *start, *end, *newstart;
/* get start/end of area to zap */
start = &bucket->tb_data[bucket->tb_offset[index]];
if (index == 0) {
end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])];
} else {
end = &bucket->tb_data[bucket->tb_offset[index-1]];
}
if (end-start == newsize)
return;
/*
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy
0 20 25
newsize = 0
end = 25
newstart = 0
start = 20
newstart+(end-start)-newsize = 5
i = start-newstart
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy
0 20 25
newsize = 2
end = 25
newstart = 0
start = 20
newstart+(end-start)-newsize = 3
i = start-newstart + MIN(end-start, newsize)) = 22
XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
5 25
newsize = 5
end = 25
start = 25
newstart = 5
newstart+(end-start)-newsize = 0
i = start-newstart = 20
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyy
3 23 25
newsize = 5
end = 25
start = 23
newstart = 3
newstart+(end-start)-newsize = 0
i = start-newstart + MIN(end-start, newsize) = 22
*/
/* fixup data */
newstart = &bucket->tb_data[bucket->tb_offset[bucket->used-1]];
g_assert(newstart+(end-start)-newsize <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]);
g_assert(newstart + (start-newstart) + MIN(end-start, newsize) <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]);
g_assert(newstart+(end-start)-newsize >= &bucket->tb_offset[bucket->used]);
g_assert(newstart + (start-newstart) + MIN(end-start, newsize) >= &bucket->tb_offset[bucket->used]);
memmove(newstart+(end-start)-newsize, newstart, ((start-newstart)+MIN(end-start, newsize)) * sizeof(blockid_t));
/* fixup key pointers */
for (i=index;i<bucket->used;i++) {
bucket->tb_offset[i] += (end-start)-newsize;
}
ibex_block_dirty((struct _block *)bucket);
}
/*
returns the number of blockid's free
*/
static int
tail_space(struct _tailblock *tail)
{
if (tail->used == 0)
return sizeof(tail->tb_data)/sizeof(tail->tb_data[0])-1;
return &tail->tb_data[tail->tb_offset[tail->used-1]]
- (blockid_t *)&tail->tb_offset[tail->used];
}
static void
tail_dump(struct _memcache *blocks, blockid_t tailid)
{
int i;
struct _tailblock *tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid));
printf("Block %d, used %d\n", tailid, tail->used);
for (i=0;i<sizeof(struct _tailblock)/sizeof(unsigned int);i++) {
printf(" %08x", ((unsigned int *)tail)[i]);
}
printf("\n");
}
static blockid_t
tail_get(struct _memcache *blocks, int size)
{
struct _root *root = (struct _root *)ibex_block_read(blocks, 0);
blockid_t tailid;
struct _tailblock *tail;
int freeindex;
blockid_t *end;
int i, count = 0;
d(printf("looking for a tail node with %d items in it\n", size));
/* look for a node with enough space, if we dont find it fairly
quickly, just quit. needs a better free algorithm i think ... */
tailid = root->tail;
while (tailid && count<5) {
int space;
d(printf(" checking tail node %d\n", tailid));
tail = (struct _tailblock *)ibex_block_read(blocks, tailid);
if (tail->used == 0) {
/* assume its big enough ... */
tail->used = 1;
tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size;
d(printf("allocated %d (%d), used %d\n", tailid, tailid, tail->used));
ibex_block_dirty((struct _block *)tail);
return tailid;
}
/* see if we have a free slot first */
freeindex = -1;
end = &tail->tb_data[sizeof(tail->tb_data)/sizeof(tail->tb_data[0])];
for (i=0;i<tail->used;i++) {
if (end == &tail->tb_data[tail->tb_offset[i]]) {
freeindex = i;
break;
}
end = &tail->tb_data[tail->tb_offset[i]];
}
/* determine how much space we have available - incl any extra header we might need */
space = ((char *)&tail->tb_data[tail->tb_offset[tail->used-1]])
- ((char *)&tail->tb_offset[tail->used])
/* FIXMEL work out why this is out a little bit */
- 8;
if (freeindex == -1)
space -= sizeof(tail->tb_offset[0]);
/* if we have enough, set it up, creating a new entry if necessary */
/* for some really odd reason the compiler promotes this expression to unsigned, hence
the requirement for the space>0 check ... */
if (space>0 && space > size*sizeof(blockid_t)) {
d(printf("space = %d, size = %d size*sizeof() = %d truth = %d\n", space, size, size*sizeof(blockid_t), space>size*sizeof(blockid_t)));
if (freeindex == -1) {
freeindex = tail->used;
tail->tb_offset[tail->used] = tail->tb_offset[tail->used-1];
tail->used++;
}
tail_compress(tail, freeindex, size);
ibex_block_dirty((struct _block *)tail);
d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, freeindex), tail->used));
return TAIL_KEY(tailid, freeindex);
}
count++;
tailid = block_location(tail->next);
}
d(printf("allocating new data node for tail data\n"));
/* didn't find one, setup a new one */
root = (struct _root *)ibex_block_read(blocks, 0);
tailid = ibex_block_get(blocks);
tail = (struct _tailblock *)ibex_block_read(blocks, tailid);
tail->next = block_number(root->tail);
root->tail = tailid;
tail->used = 1;
tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size;
ibex_block_dirty((struct _block *)tail);
d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, 0), tail->used));
return TAIL_KEY(tailid, 0);
}
static void
tail_free(struct _memcache *blocks, blockid_t tailid)
{
struct _tailblock *tail;
d(printf("freeing tail id %d\n", tailid));
if (tailid == 0)
return;
tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid));
d(printf(" tail %d used %d\n", tailid, tail->used));
g_assert(tail->used);
if (TAIL_INDEX(tailid) == tail->used - 1) {
tail->used --;
} else {
tail_compress(tail, TAIL_INDEX(tailid), 0);
}
ibex_block_dirty((struct _block *)tail);
}
static struct _IBEXStore *disk_create(struct _memcache *bc)
{
struct _IBEXStore *store;
store = g_malloc0(sizeof(*store));
store->klass = &ibex_diskarray_class;
store->blocks = bc;
return store;
}
static int disk_sync(struct _IBEXStore *store)
{
/* no cache, nop */
return 0;
}
static int disk_close(struct _IBEXStore *store)
{
g_free(store);
return 0;
}
static blockid_t
disk_add(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data)
{
struct _block *block;
struct _block *newblock;
blockid_t new, head = *headptr /*, tail = *tailptr*/;
g_error("Inbimplemented");
if (head == 0) {
head = ibex_block_get(store->blocks);
}
block = ibex_block_read(store->blocks, head);
d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next));
if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) {
(printf("adding record into block %d %d\n", head, data));
block->bl_data[block->used] = data;
block->used++;
ibex_block_dirty(block);
return head;
} else {
new = ibex_block_get(store->blocks);
newblock = ibex_block_read(store->blocks, new);
newblock->next = block_number(head);
newblock->bl_data[0] = data;
newblock->used = 1;
d(printf("adding record into new %d %d, next =%d\n", new, data, newblock->next));
ibex_block_dirty(newblock);
return new;
}
}
static blockid_t
disk_add_blocks_internal(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data)
{
blockid_t head = *headptr, tail = *tailptr, new;
int tocopy;
struct _tailblock *tailblock;
struct _block *block, *newblock;
int space, copied = 0, left;
/* assumes this funciton is in control of any tail creation */
g_assert(tail == 0);
d(printf("Adding %d items to block list\n", data->len));
if (head == 0) {
head = ibex_block_get(store->blocks);
d(printf("allocating new head %d\n", head));
}
block = ibex_block_read(store->blocks, head);
/* ensure the first block is full before proceeding */
space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used;
if (space) {
tocopy = MIN(data->len, space);
memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
block->used += tocopy;
ibex_block_dirty(block);
copied = tocopy;
d(printf("copied %d items to left over last node\n", tocopy));
}
while (copied < data->len) {
left = data->len - copied;
/* do we drop the rest in a tail? */
if (left < TAIL_THRESHOLD) {
d(printf("Storing remaining %d items in tail\n", left));
tocopy = left;
new = tail_get(store->blocks, tocopy);
tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new));
memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(new)]],
&g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
*tailptr = new;
} else {
new = ibex_block_get(store->blocks);
newblock = (struct _block *)ibex_block_read(store->blocks, new);
newblock->next = block_number(head);
tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0]));
d(printf("storing %d items in own block\n", tocopy));
memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
newblock->used = tocopy;
block = newblock;
head = new;
ibex_block_dirty(newblock);
}
copied += tocopy;
}
*headptr = head;
return head;
}
/*
case 1:
no head, no tail, adding a lot of data
add blocks until the last, which goes in a tail node
case 2:
no head, no tail, adding a little data
just add a tail node
case 3:
no head, tail, total exceeds a block
merge as much as possible into new full blocks, then the remainder in the tail
case 4:
no head, tail, room in existing tail for data
add new data to tail node
case 5:
no head, tail, no room in existing tail for data
add a new tail node, copy data across, free old tail node
*/
static blockid_t
disk_add_list(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data)
{
blockid_t new, head = *headptr, tail = *tailptr, *start;
struct _tailblock *tailblock, *tailnew;
int len;
GArray *tmpdata = NULL;
d(printf("adding %d items head=%d tail=%d\n", data->len, head, tail));
if (data->len == 0)
return head;
/* store length=1 data in the tail pointer */
if (head == 0 && tail == 0 && data->len == 1) {
*headptr = BLOCK_ONE;
*tailptr = g_array_index(data, blockid_t, 0);
return BLOCK_ONE;
}
/* if we got length=1 data to append to, upgrade it to a real block list */
if (head == BLOCK_ONE) {
tmpdata = g_array_new(0, 0, sizeof(blockid_t));
g_array_append_vals(tmpdata, data->data, data->len);
g_array_append_val(tmpdata, tail);
head = *headptr = 0;
tail = *tailptr = 0;
}
/* if we have no head, then check the tail */
if (head == 0) {
if (tail == 0) {
if (data->len >= TAIL_THRESHOLD) {
/* add normally */
head = disk_add_blocks_internal(store, headptr, tailptr, data);
} else {
/* else add to a tail block */
new = tail_get(store->blocks, data->len);
d(printf("adding %d items to a tail block %d\n", data->len, new));
tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new));
memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]],
data->data, data->len*sizeof(blockid_t));
*tailptr = new;
ibex_block_dirty((struct _block *)tailnew);
}
} else {
tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
len = tail_info(tailblock, tail, &start);
/* case 3 */
if (len + data->len >= TAIL_THRESHOLD) {
/* this is suboptimal, but should work - merge the tail data with
our new data, and write it out */
if (tmpdata == NULL) {
tmpdata = g_array_new(0, 0, sizeof(blockid_t));
g_array_append_vals(tmpdata, data->data, data->len);
}
g_array_append_vals(tmpdata, start, len);
*tailptr = 0;
tail_free(store->blocks, tail);
head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata);
} else if (tail_space(tailblock) >= data->len) {
/* can we just expand this in our node, or do we need a new one? */
tail_compress(tailblock, TAIL_INDEX(tail), data->len + len);
memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(tail)] + len],
data->data, data->len * sizeof(blockid_t));
ibex_block_dirty((struct _block *)tailblock);
} else {
/* we need to allocate a new tail node */
new = tail_get(store->blocks, data->len + len);
/* and copy the data across */
tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new));
memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]],
start, len*sizeof(blockid_t));
memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]+len],
data->data, data->len*sizeof(blockid_t));
tail_free(store->blocks, tail);
ibex_block_dirty((struct _block *)tailnew);
*tailptr = new;
}
}
} else {
if (tail) {
/* read/merge the tail with the new data, rewrite out.
suboptimal, but it should be 'ok' ? */
tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
len = tail_info(tailblock, tail, &start);
tmpdata = g_array_new(0, 0, sizeof(blockid_t));
g_array_append_vals(tmpdata, start, len);
g_array_append_vals(tmpdata, data->data, data->len);
*tailptr = 0;
tail_free(store->blocks, tail);
head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata);
} else {
head = disk_add_blocks_internal(store, headptr, tailptr, data);
}
}
if (tmpdata)
g_array_free(tmpdata, TRUE);
return head;
}
static blockid_t
disk_remove(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data)
{
blockid_t head = *headptr, node = *headptr, tail = *tailptr;
int i;
/* special case for 1-item nodes */
if (head == BLOCK_ONE) {
if (tail == data) {
*tailptr = 0;
*headptr = 0;
head = 0;
}
return head;
}
d(printf("removing %d from %d\n", data, head));
while (node) {
struct _block *block = ibex_block_read(store->blocks, node);
for (i=0;i<block->used;i++) {
if (block->bl_data[i] == data) {
struct _block *start = ibex_block_read(store->blocks, head);
d(printf("removing data from block %d\n ", head));
start->used--;
block->bl_data[i] = start->bl_data[start->used];
if (start->used == 0) {
struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0);
blockid_t new;
d(printf("dropping block %d, new head = %d\n", head, start->next));
new = block_location(start->next);
start->next = block_number(root->free);
root->free = head;
head = new;
ibex_block_dirty((struct _block *)root);
}
ibex_block_dirty(block);
ibex_block_dirty(start);
*headptr = head;
return head;
}
}
node = block_location(block->next);
}
if (tail) {
struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
int len;
blockid_t *start;
len = tail_info(tailblock, tail, &start);
for (i=0;i<len;i++) {
if (start[i] == data) {
for (;i<len-1;i++)
start[i] = start[i+1];
if (len == 1)
*tailptr = 0;
if (len == 1 && (tailblock->used-1) == TAIL_INDEX(tail)) {
d(printf("dropping/unlinking tailblock %d (%d) used = %d\n",
TAIL_BLOCK(tail), tail, tailblock->used));
tailblock->used--;
/* drop/unlink block? */
ibex_block_dirty((struct _block *)tailblock);
*tailptr = 0;
} else {
tail_compress(tailblock, TAIL_INDEX(tail), len-1);
ibex_block_dirty((struct _block *)tailblock);
}
}
}
}
return head;
}
static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail)
{
blockid_t next;
struct _block *block;
if (head == BLOCK_ONE)
return;
while (head) {
d(printf("freeing block %d\n", head));
block = ibex_block_read(store->blocks, head);
next = block_location(block->next);
ibex_block_free(store->blocks, head);
head = next;
}
if (tail) {
struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
d(printf("freeing tail block %d (%d)\n", TAIL_BLOCK(tail), tail));
tail_compress(tailblock, TAIL_INDEX(tail), 0);
ibex_block_dirty((struct _block *)tailblock);
}
}
static gboolean
disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data)
{
blockid_t node = head;
int i;
d(printf("finding %d from %d\n", data, head));
if (head == BLOCK_ONE)
return data == tail;
/* first check full blocks */
while (node) {
struct _block *block = ibex_block_read(store->blocks, node);
for (i=0;i<block->used;i++) {
if (block->bl_data[i] == data) {
return TRUE;
}
}
node = block_location(block->next);
}
/* then check tail */
if (tail) {
struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
int len;
blockid_t *start;
len = tail_info(tailblock, tail, &start);
for (i=0;i<len;i++) {
if (start[i] == data)
return TRUE;
}
}
return FALSE;
}
static GArray *
disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail)
{
GArray *result = g_array_new(0, 0, sizeof(nameid_t));
if (head == BLOCK_ONE) {
g_array_append_val(result, tail);
return result;
}
while (head) {
struct _block *block = ibex_block_read(store->blocks, head);
d(printf("getting data from block %d\n", head));
g_array_append_vals(result, block->bl_data, block->used);
head = block_location(block->next);
d(printf("next = %d\n", head));
}
/* chuck on any tail data as well */
if (tail) {
struct _tailblock *tailblock;
int len;
blockid_t *start;
tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
len = tail_info(tailblock, tail, &start);
g_array_append_vals(result, start, len);
}
return result;
}
void
ibex_diskarray_dump(struct _memcache *blocks, blockid_t head, blockid_t tail)
{
blockid_t node = head;
printf("dumping list %d tail %d\n", node, tail);
if (head == BLOCK_ONE) {
printf(" 1 length index: %d\n", tail);
return;
}
while (node) {
struct _block *block = ibex_block_read(blocks, node);
int i;
printf(" block %d used %d\n ", node, block->used);
for (i=0;i<block->used;i++) {
printf(" %d", block->bl_data[i]);
}
printf("\n");
node = block_location(block->next);
}
printf("tail: ");
if (tail) {
struct _tailblock *tailblock;
int len;
blockid_t *start;
int i;
tailblock = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tail));
len = tail_info(tailblock, tail, &start);
for (i=0;i<len;i++)
printf(" %d", start[i]);
}
printf("\n");
}
#if 0
int main(int argc, char **argv)
{
struct _memcache *bc;
struct _IBEXStore *disk;
int i;
blockid_t data[100];
GArray adata;
blockid_t head=0, tail=0;
for (i=0;i<100;i++) {
data[i] = i;
}
bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600);
disk = ibex_diskarray_class.create(bc);
head = 0;
tail = 0;
adata.data = data;
adata.len = 70;
for (i=0;i<100;i++) {
printf("Loop %d\n", i);
ibex_diskarray_class.add_list(disk, &head, &tail, &adata);
ibex_diskarray_dump(bc, head, tail);
}
#if 0
for (i=1;i<100;i++) {
printf("inserting %d items\n", i);
adata.data = data;
adata.len = i;
head=0;
tail=0;
ibex_diskarray_class.add_list(disk, &head, &tail, &adata);
ibex_diskarray_dump(bc, head, tail);
}
#endif
ibex_diskarray_class.close(disk);
ibex_block_cache_close(bc);
return 0;
}
#endif

View File

@ -1,481 +0,0 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 2000 Helix Code, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* file.c: index file read/write ops */
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "ibex_internal.h"
static unsigned long read_number (FILE *f);
static void write_number (FILE *f, unsigned long n);
static char *get_compressed_word (FILE *f, char **lastword);
static gint free_file (gpointer key, gpointer value, gpointer data);
static void free_word (gpointer key, gpointer value, gpointer data);
/* The file format is:
*
* version string (currently "ibex1")
* file count
* list of compressed filenames, separated by \0
* word count
* list of compressed words, each followed by \0, a count, and that
* many references.
*
* All numbers are stored 7-bit big-endian, with the high bit telling
* whether or not the number continues to the next byte.
*
* compressed text consists of a byte telling how many characters the
* line has in common with the line before it, followed by the rest of
* the string. Obviously this only really works if the lists are sorted.
*/
/**
* ibex_open: open (or possibly create) an ibex index
* @file: the name of the file
* @flags: open flags, see open(2).
* @mode: If O_CREAT is passed in flags, then the file mode
* to create the new file with. It will be anded with the current
* umask.
*
* Open and/or create the named ibex file and return a handle to it.
*
* Return value: an ibex handle, or NULL if an error occurred.
**/
ibex *
ibex_open (char *file, int flags, int mode)
{
ibex *ib;
FILE *f;
char vbuf[sizeof (IBEX_VERSION) - 1];
char *word, *lastword;
unsigned long nfiles, nwords, nrefs, ref;
ibex_file **ibfs = NULL;
int i;
GPtrArray *refs;
int fd;
char *modestr;
fd = open(file, flags, mode);
if (fd == -1) {
return NULL;
}
/* yuck, this is because we use FILE * interface
internally */
switch (flags & O_ACCMODE) {
case O_RDONLY:
modestr = "r";
break;
case O_RDWR:
if (flags & O_APPEND)
modestr = "a+";
else
modestr = "w+";
break;
case O_WRONLY:
if (flags & O_APPEND)
modestr = "a";
else
modestr = "w";
break;
default:
if (flags & O_APPEND)
modestr = "a+";
else
modestr = "r+";
break;
}
f = fdopen(fd, modestr);
if (f == NULL) {
if (errno == 0)
errno = ENOMEM;
close(fd);
return NULL;
}
ib = g_malloc (sizeof (ibex));
ib->dirty = FALSE;
ib->path = g_strdup (file);
ib->files = g_tree_new ((GCompareFunc) strcmp);
ib->words = g_hash_table_new (g_str_hash, g_str_equal);
ib->oldfiles = g_ptr_array_new ();
if (!f) {
close(fd);
return ib;
}
/* Check version. If its empty, then we have just created it */
if (fread (vbuf, 1, sizeof (vbuf), f) != sizeof (vbuf)) {
if (feof (f)) {
fclose(f);
close(fd);
return ib;
}
}
if (strncmp (vbuf, IBEX_VERSION, sizeof (vbuf) != 0)) {
errno = EINVAL;
goto errout;
}
/* Read list of files. */
nfiles = read_number (f);
ibfs = g_malloc (nfiles * sizeof (ibex_file *));
lastword = NULL;
for (i = 0; i < nfiles; i++) {
ibfs[i] = g_malloc (sizeof (ibex_file));
ibfs[i]->name = get_compressed_word (f, &lastword);
if (!ibfs[i]->name) {
goto errout;
}
ibfs[i]->index = 0;
g_tree_insert (ib->files, ibfs[i]->name, ibfs[i]);
}
/* Read list of words. */
nwords = read_number (f);
lastword = NULL;
for (i = 0; i < nwords; i++) {
word = get_compressed_word (f, &lastword);
if (!word) {
goto errout;
}
nrefs = read_number (f);
refs = g_ptr_array_new ();
g_ptr_array_set_size (refs, nrefs);
while (nrefs--) {
ref = read_number (f);
if (ref >= nfiles) {
goto errout;
}
refs->pdata[nrefs] = ibfs[ref];
}
g_hash_table_insert (ib->words, word, refs);
}
g_free (ibfs);
fclose (f);
close(fd);
return ib;
errout:
fclose (f);
close(fd);
g_tree_traverse (ib->files, free_file, G_IN_ORDER, NULL);
g_tree_destroy (ib->files);
g_hash_table_foreach (ib->words, free_word, NULL);
g_hash_table_destroy (ib->words);
g_ptr_array_free (ib->oldfiles, TRUE);
if (ibfs)
g_free (ibfs);
g_free (ib->path);
g_free (ib);
return NULL;
}
struct ibex_write_data {
unsigned long index;
FILE *f;
char *lastname;
};
/* This is an internal function to find the longest common initial
* prefix between the last-written word and the current word.
*/
static int
get_prefix (struct ibex_write_data *iwd, char *name)
{
int i = 0;
if (iwd->lastname) {
while (!strncmp (iwd->lastname, name, i + 1))
i++;
}
iwd->lastname = name;
return i;
}
static gint
write_file (gpointer key, gpointer value, gpointer data)
{
char *file = key;
ibex_file *ibf = value;
struct ibex_write_data *iwd = data;
int prefix;
ibf->index = iwd->index++;
prefix = get_prefix (iwd, file);
fprintf (iwd->f, "%c%s", prefix, file + prefix);
fputc (0, iwd->f);
return FALSE;
}
/* scans for words which still exist in the index (after
index removals), and adds them to the ordered tree for
writing out in order */
static void
store_word (gpointer key, gpointer value, gpointer data)
{
GTree *wtree = data;
GPtrArray *refs = value;
int i;
ibex_file *ibf;
for (i = 0; i < refs->len; i++) {
ibf = g_ptr_array_index (refs, i);
if (ibf->index == -1) {
g_ptr_array_remove_index_fast (refs, i);
i--;
}
}
if (refs->len > 0) {
g_tree_insert (wtree, key, value);
}
}
/* writes a word out, in order */
static gint
write_word (gpointer key, gpointer value, gpointer data)
{
char *word = key;
GPtrArray *refs = value;
struct ibex_write_data *iwd = data;
int i, prefix;
ibex_file *ibf;
prefix = get_prefix (iwd, word);
fprintf (iwd->f, "%c%s", prefix, word + prefix);
fputc (0, iwd->f);
write_number (iwd->f, refs->len);
for (i = 0; i < refs->len; i++) {
ibf = g_ptr_array_index (refs, i);
write_number (iwd->f, ibf->index);
}
return FALSE;
}
/**
* ibex_write: Write an ibex out to disk.
* @ib: the ibex
*
* This writes an ibex to disk.
*
* Return value: 0 for success, -1 for failure (in which case errno
* is set).
**/
int
ibex_write (ibex *ib)
{
struct ibex_write_data iwd;
GTree *wtree;
char *tmpfile;
tmpfile = g_strdup_printf ("%s~", ib->path);
iwd.f = fopen (tmpfile, "w");
if (!iwd.f) {
if (errno == 0)
errno = ENOMEM;
g_free (tmpfile);
return -1;
}
fputs (IBEX_VERSION, iwd.f);
if (ferror (iwd.f))
goto lose;
iwd.index = 0;
iwd.lastname = NULL;
write_number (iwd.f, g_tree_nnodes (ib->files));
if (ferror (iwd.f))
goto lose;
g_tree_traverse (ib->files, write_file, G_IN_ORDER, &iwd);
if (ferror (iwd.f))
goto lose;
iwd.lastname = NULL;
wtree = g_tree_new ((GCompareFunc) strcmp);
g_hash_table_foreach (ib->words, store_word, wtree);
write_number (iwd.f, g_tree_nnodes(wtree));
if (ferror (iwd.f))
goto lose;
g_tree_traverse (wtree, write_word, G_IN_ORDER, &iwd);
g_tree_destroy (wtree);
if (ferror (iwd.f))
goto lose;
if (fclose (iwd.f) == 0 && rename (tmpfile, ib->path) == 0) {
g_free (tmpfile);
ib->dirty = FALSE;
return 0;
}
lose:
unlink (tmpfile);
g_free (tmpfile);
return -1;
}
/**
* ibex_save:
* @ib:
*
* Only write out an ibex if it is dirty.
*
* Return value: Same as ibex_write.
**/
int
ibex_save (ibex *ib)
{
if (ib->dirty)
return ibex_write(ib);
return 0;
}
/**
* ibex_close: Write out the ibex file (if it has changed) and free
* the data associated with it.
* @ib: the ibex
*
* If this ibex file has been modified since it was opened, this will
* call ibex_write() to write it out to disk. It will then free all data
* associated with the ibex. After calling ibex_close(), @ib will no
* longer be a valid ibex.
*
* Return value: 0 on success, -1 on an ibex_write() failure (in which
* case @ib will not be destroyed).
**/
int
ibex_close (ibex *ib)
{
ibex_file *ibf;
if (ib->dirty && ibex_write (ib) == -1)
return -1;
g_tree_traverse (ib->files, free_file, G_IN_ORDER, NULL);
g_tree_destroy (ib->files);
g_hash_table_foreach (ib->words, free_word, NULL);
g_hash_table_destroy (ib->words);
while (ib->oldfiles->len) {
ibf = g_ptr_array_remove_index (ib->oldfiles, 0);
g_free (ibf->name);
g_free (ibf);
}
g_ptr_array_free (ib->oldfiles, TRUE);
g_free (ib->path);
g_free (ib);
return 0;
}
static gint
free_file (gpointer key, gpointer value, gpointer data)
{
ibex_file *ibf = value;
g_free (ibf->name);
g_free (ibf);
return FALSE;
}
static void
free_word (gpointer key, gpointer value, gpointer data)
{
g_free (key);
g_ptr_array_free (value, TRUE);
}
static char *
get_compressed_word (FILE *f, char **lastword)
{
char *buf, *p;
int c, size;
c = getc (f);
if (c == EOF)
return NULL;
size = c + 10;
buf = g_malloc (size);
if (*lastword)
strncpy (buf, *lastword, c);
p = buf + c;
do {
c = getc (f);
if (c == EOF)
return NULL;
if (p == buf + size) {
buf = g_realloc (buf, size + 10);
p = buf + size;
size += 10;
}
*p++ = c;
} while (c != 0);
*lastword = buf;
return buf;
}
static void
write_number (FILE *f, unsigned long number)
{
int i, flag = 0;
char buf[4];
i = 4;
do {
buf[--i] = (number & 0x7F) | flag;
number = number >> 7;
flag = 0x80;
} while (number != 0);
fwrite (buf + i, 1, 4 - i, f);
}
static unsigned long
read_number (FILE *f)
{
int byte;
unsigned long num;
num = 0;
do {
byte = getc (f);
num = num << 7 | (byte & 0x7F);
} while (byte & 0x80);
return num;
}

View File

@ -1,198 +0,0 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 2000 Helix Code, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* find.c: index file searching ops */
#include <string.h>
#include "ibex_internal.h"
/**
* ibex_find: search an ibex for a word
* @ib: an ibex
* @word: the word
*
* This routine searches an ibex for a word and returns a GPtrArray
* containing the names of the files in the ibex that contain the word.
* If no matches are found, it will return an empty array (not NULL).
* The caller must free the array, but MUST NOT free or alter its
* elements.
*
* Return value: the array of filenames containing @word
**/
GPtrArray *
ibex_find (ibex *ib, char *word)
{
GPtrArray *refs, *ret;
ibex_file *ibf;
int i;
ret = g_ptr_array_new ();
refs = g_hash_table_lookup (ib->words, word);
if (refs) {
for (i = 0; i < refs->len; i++) {
ibf = g_ptr_array_index (refs, i);
g_ptr_array_add (ret, ibf->name);
}
}
return ret;
}
/**
* ibex_contains_name:
* @ib:
* @name:
*
* Returns #TRUE if the ibex @ib has any index entry for
* the key @name.
*
* Return value:
**/
gboolean
ibex_contains_name(ibex *ib, char *name)
{
return g_tree_lookup(ib->files, name) != NULL;
}
/**
* ibex_find_name: Check if a word occurs in a given file
* @ib: an ibex
* @name: a filename
* @word: a word
*
* This checks if the given word occurs in the given file.
*
* Return value: TRUE or FALSE
**/
gboolean
ibex_find_name (ibex *ib, char *name, char *word)
{
GPtrArray *refs;
ibex_file *ibf;
int i;
refs = g_hash_table_lookup (ib->words, word);
if (refs) {
for (i = 0; i < refs->len; i++) {
ibf = g_ptr_array_index (refs, i);
if (!strcmp (ibf->name, name))
return TRUE;
}
}
return FALSE;
}
static gint
build_array (gpointer key, gpointer value, gpointer data)
{
char *name = key;
unsigned int count = GPOINTER_TO_UINT (value);
GPtrArray *ret = data;
if (count == 1)
g_ptr_array_add (ret, name);
return FALSE;
}
/**
* ibex_find_all: Find files containing multiple words
* @ib: an ibex
* @words: a GPtrArray of words
*
* This works like ibex_find(), but returns an array of filenames
* which contain all of the words in @words.
*
* Return value: an array of matches
**/
GPtrArray *
ibex_find_all (ibex *ib, GPtrArray *words)
{
GTree *work;
GPtrArray *wrefs, *ret;
int i, j, count;
char *word;
ibex_file *ibf;
if (words->len == 0)
return g_ptr_array_new ();
else if (words->len == 1)
return ibex_find (ib, g_ptr_array_index (words, 0));
work = g_tree_new ((GCompareFunc) strcmp);
for (i = 0; i < words->len; i++) {
word = g_ptr_array_index (words, i);
wrefs = g_hash_table_lookup (ib->words, word);
if (!wrefs) {
/* One of the words isn't even in the index. */
g_tree_destroy (work);
return g_ptr_array_new ();
}
if (i == 0) {
/* Copy the references into a tree, using the
* filenames as keys and the size of words as
* the value.
*/
for (j = 0; j < wrefs->len; j++) {
ibf = g_ptr_array_index (wrefs, j);
g_tree_insert (work, ibf->name,
GUINT_TO_POINTER (words->len));
}
} else {
/* Increment the counts in the working tree
* for the references for this word.
*/
for (j = 0; j < wrefs->len; j++) {
ibf = g_ptr_array_index (wrefs, j);
count = GPOINTER_TO_UINT (g_tree_lookup (work, ibf->name));
if (count) {
g_tree_insert (work, ibf->name,
GUINT_TO_POINTER (count - 1));
}
}
}
}
/* Build an array with the refs that contain all the words. */
ret = g_ptr_array_new ();
g_tree_traverse (work, build_array, G_IN_ORDER, ret);
g_tree_destroy (work);
return ret;
}
static void
ibex_dump_foo(char *key, GPtrArray *refs, void *data)
{
int i;
g_print("%s: ", key);
for (i=0;i<refs->len;i++) {
ibex_file *ibf = g_ptr_array_index (refs, i);
g_print("%c%s", ibf->index==-1?'-':' ', ibf->name);
}
g_print("\n");
}
/* debug function to dump the tree, in key order */
void
ibex_dump_all (ibex *ib)
{
g_hash_table_foreach(ib->words, (GHFunc) ibex_dump_foo, 0);
}

701
libibex/hash.c Normal file
View File

@ -0,0 +1,701 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
*
* Copyright (C) 2000 Helix Code, Inc.
*
* Authors: Michael Zucchi <notzed@helixcode.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* hash based index mechanism */
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include "block.h"
#include "index.h"
#define d(x)
#define HASH_SIZE (1024)
#define KEY_THRESHOLD (sizeof(struct _hashkey) + 4) /* minimum number of free bytes we worry about
maintaining free blocks for */
#define ARRAY_LEN(a) (sizeof(a)/sizeof(a[0]))
typedef guint32 hashid_t;
static struct _IBEXIndex *hash_create(struct _memcache *bc, int size);
static struct _IBEXIndex *hash_open(struct _memcache *bc, blockid_t root);
static int hash_sync(struct _IBEXIndex *index);
static int hash_close(struct _IBEXIndex *index);
static hashid_t hash_find(struct _IBEXIndex *index, const char *key, int keylen);
static void hash_remove(struct _IBEXIndex *index, const char *key, int keylen);
static hashid_t hash_insert(struct _IBEXIndex *index, const char *key, int keylen);
static char *hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len);
static void hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail);
static blockid_t hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail);
struct _IBEXIndexClass ibex_hash_class = {
hash_create, hash_open,
hash_sync, hash_close,
hash_find,
hash_remove,
hash_insert,
hash_get_key,
hash_set_data_block,
hash_get_data_block,
};
/* the reason we have the tail here is that otherwise we need to
have a 32 bit blockid for the root node; which would make this
structure the same size anyway, with about 24 wasted bits */
struct _hashkey {
blockid_t next; /* next in hash chain */
blockid_t tail;
unsigned int root:32-BLOCK_BITS;
unsigned int keyoffset:BLOCK_BITS;
};
struct _hashblock {
/*blockid_t next;*/ /* all key blocks linked together? */
guint32 used; /* elements used */
union {
struct _hashkey keys[(BLOCK_SIZE-4)/sizeof(struct _hashkey)];
char keydata[BLOCK_SIZE-4];
} hashblock_u;
};
#define hb_keys hashblock_u.keys
#define hb_keydata hashblock_u.keydata
/* root block for a hash index */
struct _hashroot {
hashid_t free; /* free list */
guint32 size; /* how big the hash table is */
hashid_t table[(BLOCK_SIZE-8)/sizeof(hashid_t)]; /* pointers to blocks of pointers */
};
struct _hashtableblock {
hashid_t buckets[BLOCK_SIZE/sizeof(hashid_t)];
};
/* map a hash index to a block index */
#define HASH_INDEX(b) ((b) & (BLOCK_SIZE-1))
/* map a hash index to a block number */
#define HASH_BLOCK(b) ((b) & ~(BLOCK_SIZE-1))
/* map a block + index to a hash key */
#define HASH_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1)))
/* taken from tdb/gdbm */
static unsigned int hash_key(const unsigned char *key, int keylen)
{
char *newkey;
newkey = alloca(keylen+1);
memcpy(newkey, key, keylen);
newkey[keylen]=0;
return g_str_hash(newkey);
#if 0
unsigned int value; /* Used to compute the hash value. */
unsigned int i; /* Used to cycle through random values. */
/* Set the initial value from the key size. */
value = 0x238F13AF * keylen;
for (i=0; i < keylen; i++) {
value = (value + (key[i] << (i*5 % 24)));
}
value = (1103515243 * value + 12345);
return value;
#endif
}
/* create a new hash table, return a pointer to its root block */
static struct _IBEXIndex *
hash_create(struct _memcache *bc, int size)
{
blockid_t root, block;
struct _hashroot *hashroot;
int i;
struct _hashtableblock *table;
struct _IBEXIndex *index;
g_assert(size<=10240);
d(printf("initialising hash table, size = %d\n", size));
index = g_malloc0(sizeof(*index));
index->blocks = bc;
index->klass = &ibex_hash_class;
root = ibex_block_get(bc);
index->root = root;
d(printf(" root = %d\n", root));
hashroot = (struct _hashroot *)ibex_block_read(bc, root);
hashroot->free = 0;
hashroot->size = size;
ibex_block_dirty((struct _block *)hashroot);
for (i=0;i<size/(BLOCK_SIZE/sizeof(blockid_t));i++) {
d(printf("initialising hash table index block %d\n", i));
block = hashroot->table[i] = ibex_block_get(bc);
table = (struct _hashtableblock *)ibex_block_read(bc, block);
memset(table, 0, sizeof(table));
ibex_block_dirty((struct _block *)table);
}
return index;
}
static struct _IBEXIndex *
hash_open(struct _memcache *bc, blockid_t root)
{
struct _IBEXIndex *index;
/* FIXME: check a 'magic', and the root for validity */
index = g_malloc0(sizeof(*index));
index->blocks = bc;
index->root = root;
index->klass = &ibex_hash_class;
return index;
}
static int hash_sync(struct _IBEXIndex *index)
{
/* nop, index always synced on disk (at least, to blocks) */
return 0;
}
static int hash_close(struct _IBEXIndex *index)
{
g_free(index);
return 0;
}
/* convert a hashbucket id into a name */
static char *
hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len)
{
struct _hashblock *bucket;
int ind;
char *ret, *start, *end;
if (hashbucket == 0) {
if (len)
*len = 0;
return g_strdup("");
}
bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
ind = HASH_INDEX(hashbucket);
g_assert(ind < bucket->used);
start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset];
if (ind == 0) {
end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
} else {
end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset];
}
ret = g_malloc(end-start+1);
memcpy(ret, start, end-start);
ret[end-start]=0;
if (len)
*len = end-start;
return ret;
}
/* sigh, this is fnugly code ... */
static hashid_t
hash_find(struct _IBEXIndex *index, const char *key, int keylen)
{
struct _hashroot *hashroot;
guint32 hash;
int hashentry;
blockid_t hashtable;
hashid_t hashbucket;
struct _hashtableblock *table;
g_assert(index != 0);
g_assert(index->root != 0);
d(printf("finding hash %.*s\n", keylen, key));
hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
/* find the table containing this entry */
hash = hash_key(key, keylen) % hashroot->size;
hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))];
g_assert(hashtable != 0);
table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t));
/* and its bucket */
hashbucket = table->buckets[hashentry];
/* go down the bucket chain, reading each entry till we are done ... */
while (hashbucket != 0) {
struct _hashblock *bucket;
char *start, *end;
int ind;
d(printf(" checking bucket %d\n", hashbucket));
/* get the real bucket id from the hashbucket id */
bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
/* and get the key number within the block */
ind = HASH_INDEX(hashbucket);
g_assert(ind < bucket->used);
start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset];
if (ind == 0) {
end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
} else {
end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset];
}
if ( (end-start) == keylen
&& memcmp(start, key, keylen) == 0) {
return hashbucket;
}
hashbucket = bucket->hb_keys[ind].next;
}
return 0;
}
/* compresses the bucket 'bucket', removing data
at index 'index' */
static void
hash_compress(struct _hashblock *bucket, int index)
{
int i;
char *start, *end, *newstart;
/* get start/end of area to zap */
start = &bucket->hb_keydata[bucket->hb_keys[index].keyoffset];
if (index == 0) {
end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
} else {
end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset];
}
if (start == end)
return;
/* fixup data */
newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset];
memmove(newstart+(end-start), newstart, start-newstart);
/* fixup key pointers */
for (i=index;i<bucket->used;i++) {
bucket->hb_keys[i].keyoffset += (end-start);
}
ibex_block_dirty((struct _block *)bucket);
}
/* make room 'len' for the key 'index' */
/* assumes key 'index' is already empty (0 length) */
static void
hash_expand(struct _hashblock *bucket, int index, int len)
{
int i;
char *end, *newstart;
/* get start/end of area to zap */
if (index == 0) {
end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
} else {
end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset];
}
/* fixup data */
newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset];
memmove(newstart-len, newstart, end-newstart);
/* fixup key pointers */
for (i=index;i<bucket->used;i++) {
bucket->hb_keys[i].keyoffset -= len;
}
ibex_block_dirty((struct _block *)bucket);
}
static void
hash_remove(struct _IBEXIndex *index, const char *key, int keylen)
{
struct _hashroot *hashroot;
guint32 hash;
int hashentry;
blockid_t hashtable;
hashid_t hashbucket, hashprev;
struct _hashtableblock *table;
g_assert(index != 0);
g_assert(index->root != 0);
d(printf("removing hash %.*s\n", keylen, key));
hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
/* find the table containing this entry */
hash = hash_key(key, keylen) % hashroot->size;
hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))];
table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t));
/* and its bucket */
hashbucket = table->buckets[hashentry];
/* go down the bucket chain, reading each entry till we are done ... */
hashprev = 0;
while (hashbucket != 0) {
struct _hashblock *bucket;
char *start, *end;
int ind;
d(printf(" checking bucket %d\n", hashbucket));
/* get the real bucket id from the hashbucket id */
bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
/* and get the key number within the block */
ind = HASH_INDEX(hashbucket);
g_assert(ind < bucket->used);
start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset];
if (ind == 0) {
end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
} else {
end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset];
}
if ( (end-start) == keylen
&& memcmp(start, key, keylen) == 0) {
struct _hashblock *prevbucket;
if (hashprev == 0) {
/* unlink from hash chain */
table->buckets[hashentry] = bucket->hb_keys[HASH_INDEX(hashbucket)].next;
/* link into free list */
bucket->hb_keys[HASH_INDEX(hashbucket)].next = hashroot->free;
hashroot->free = hashbucket;
/* compress away data */
hash_compress(bucket, HASH_INDEX(hashbucket));
ibex_block_dirty((struct _block *)bucket);
ibex_block_dirty((struct _block *)table);
ibex_block_dirty((struct _block *)hashroot);
} else {
prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashprev));
prevbucket->hb_keys[HASH_INDEX(hashprev)].next =
bucket->hb_keys[ind].next;
/* link into free list */
bucket->hb_keys[ind].next = hashroot->free;
hashroot->free = hashbucket;
/* compress entry */
hash_compress(bucket, ind);
ibex_block_dirty((struct _block *)bucket);
ibex_block_dirty((struct _block *)prevbucket);
ibex_block_dirty((struct _block *)hashroot);
}
return;
}
hashprev = hashbucket;
hashbucket = bucket->hb_keys[ind].next;
}
}
/* set where the datablock is located */
static void
hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail)
{
struct _hashblock *bucket;
d(printf("setting data block hash %d to %d tail %d\n", keyid, blockid, tail));
/* map to a block number */
g_assert((blockid & (BLOCK_SIZE-1)) == 0);
blockid >>= BLOCK_BITS;
bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid));
if (bucket->hb_keys[HASH_INDEX(keyid)].root != blockid
|| bucket->hb_keys[HASH_INDEX(keyid)].tail != tail) {
bucket->hb_keys[HASH_INDEX(keyid)].tail = tail;
bucket->hb_keys[HASH_INDEX(keyid)].root = blockid;
ibex_block_dirty((struct _block *)bucket);
}
}
static blockid_t
hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail)
{
struct _hashblock *bucket;
d(printf("getting data block hash %d\n", keyid));
if (keyid == 0) {
if (tail)
*tail = 0;
return 0;
}
bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid));
if (tail)
*tail = bucket->hb_keys[HASH_INDEX(keyid)].tail;
return bucket->hb_keys[HASH_INDEX(keyid)].root << BLOCK_BITS;
}
static hashid_t
hash_insert(struct _IBEXIndex *index, const char *key, int keylen)
{
struct _hashroot *hashroot;
guint32 hash;
int hashentry;
blockid_t hashtable;
hashid_t hashbucket, keybucket, keyprev, keyfree;
struct _hashtableblock *table;
struct _hashblock *bucket;
int count;
g_assert(index != 0);
g_assert(index->root != 0);
d(printf("inserting hash %.*s\n", keylen, key));
hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
/* find the table containing this entry */
hash = hash_key(key, keylen) % hashroot->size;
hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))];
table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t));
/* and its bucket */
hashbucket = table->buckets[hashentry];
/* now look for a free slot, first try the free list */
/* but dont try too hard if our key is just too long ... so just scan upto
4 blocks, but if we dont find a space, tough ... */
keybucket = hashroot->free;
keyprev = 0;
count = 0;
while (keybucket && count<4) {
int space;
d(printf(" checking free %d\n", keybucket));
/* read the bucket containing this free key */
bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keybucket));
/* check if there is enough space for the key */
space = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset]
- (char *)&bucket->hb_keys[bucket->used];
if (space >= keylen) {
hash_expand(bucket, HASH_INDEX(keybucket), keylen);
memcpy(&bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(keybucket)].keyoffset],
key, keylen);
/* check if there is free space still in this node, and there are no other empty blocks */
keyfree = bucket->hb_keys[HASH_INDEX(keybucket)].next;
if ((space-keylen) >= KEY_THRESHOLD) {
int i;
int head = ARRAY_LEN(bucket->hb_keydata);
int found = FALSE;
for (i=0;i<bucket->used;i++) {
if (bucket->hb_keys[i].keyoffset == head) {
/* already have a free slot in this block, leave it */
found = TRUE;
break;
}
head = bucket->hb_keys[i].keyoffset;
}
if (!found) {
/* we should link in a new free slot for this node */
bucket->hb_keys[bucket->used].next = bucket->hb_keys[HASH_INDEX(keybucket)].next;
bucket->hb_keys[bucket->used].keyoffset = bucket->hb_keys[bucket->used-1].keyoffset;
keyfree = HASH_KEY(HASH_BLOCK(keybucket), bucket->used);
bucket->used++;
}
}
/* link 'keyfree' back to the parent ... */
if (keyprev == 0) {
hashroot->free = keyfree;
ibex_block_dirty((struct _block *)hashroot);
} else {
struct _hashblock *prevbucket;
prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyprev));
prevbucket->hb_keys[HASH_INDEX(keyprev)].next = keyfree;
ibex_block_dirty((struct _block *)prevbucket);
}
/* link into the hash chain */
bucket->hb_keys[HASH_INDEX(keybucket)].next = hashbucket;
bucket->hb_keys[HASH_INDEX(keybucket)].root = 0;
bucket->hb_keys[HASH_INDEX(keybucket)].tail = 0;
table->buckets[hashentry] = keybucket;
ibex_block_dirty((struct _block *)table);
ibex_block_dirty((struct _block *)bucket);
d(printf(" new key id %d\n", keybucket));
d(printf(" new free id %d\n", hashroot->free));
return keybucket;
}
count++;
keyprev = keybucket;
keybucket = bucket->hb_keys[HASH_INDEX(keybucket)].next;
}
/* else create a new block ... */
keybucket = ibex_block_get(index->blocks);
bucket = (struct _hashblock *)ibex_block_read(index->blocks, keybucket);
d(printf("creating new key bucket %d\n", keybucket));
memset(bucket, 0, sizeof(*bucket));
bucket->used = 2;
/* first block, is the new key */
bucket->hb_keys[0].keyoffset = ARRAY_LEN(bucket->hb_keydata) - keylen;
memcpy(&bucket->hb_keydata[bucket->hb_keys[0].keyoffset], key, keylen);
bucket->hb_keys[0].next = hashbucket;
bucket->hb_keys[0].root = 0;
bucket->hb_keys[0].tail = 0;
table->buckets[hashentry] = HASH_KEY(keybucket, 0);
/* next block is a free block, link into free list */
bucket->hb_keys[1].keyoffset = bucket->hb_keys[0].keyoffset;
bucket->hb_keys[1].next = hashroot->free;
hashroot->free = HASH_KEY(keybucket, 1);
ibex_block_dirty((struct _block *)hashroot);
ibex_block_dirty((struct _block *)table);
ibex_block_dirty((struct _block *)bucket);
d(printf(" new key id %d\n", HASH_KEY(keybucket, 0)));
d(printf(" new free id %d\n", hashroot->free));
return HASH_KEY(keybucket, 0);
}
/* debug */
void ibex_hash_dump(struct _IBEXIndex *index);
void
ibex_hash_dump(struct _IBEXIndex *index)
{
int i;
struct _hashtableblock *table;
struct _hashblock *bucket;
struct _hashroot *hashroot;
blockid_t hashtable;
hashid_t hashbucket;
printf("Walking hash tree:\n");
hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
for (i=0;i<hashroot->size;i++) {
printf("Hash table chain: %d\n", i);
hashtable = hashroot->table[i / (BLOCK_SIZE/sizeof(blockid_t))];
table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
hashbucket = table->buckets[i % (BLOCK_SIZE/sizeof(blockid_t))];
while (hashbucket) {
int len;
bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
printf(" bucket %d: [used %d]", hashbucket, bucket->used);
if (HASH_INDEX(hashbucket) == 0) {
len = ARRAY_LEN(bucket->hb_keydata) -
bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset;
} else {
len = bucket->hb_keys[HASH_INDEX(hashbucket)-1].keyoffset -
bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset;
}
printf("'%.*s' = %d next=%d\n", len, &bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset],
bucket->hb_keys[HASH_INDEX(hashbucket)].root,
bucket->hb_keys[HASH_INDEX(hashbucket)].next);
ibex_diskarray_dump(index->blocks,
bucket->hb_keys[HASH_INDEX(hashbucket)].root << BLOCK_BITS,
bucket->hb_keys[HASH_INDEX(hashbucket)].tail);
hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next;
}
/* make sure its still in the cache */
hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
}
hashbucket = hashroot->free;
printf("Dumping free lists ..\n");
while (hashbucket) {
printf(" %d", hashbucket);
bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next;
}
printf("\n");
}
#if 0
int main(int argc, char **argv)
{
struct _memcache *bc;
struct _IBEXIndex *hash;
int i;
bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600);
hash = ibex_hash_class.create(bc, 1024);
for (i=0;i<10000;i++) {
char key[16];
sprintf(key, "key %d", i);
ibex_hash_class.insert(hash, key, strlen(key));
}
for (i=500;i<1000;i++) {
char key[16];
sprintf(key, "key %d", i);
ibex_hash_class.remove(hash, key, strlen(key));
}
for (i=500;i<1000;i++) {
char key[16];
sprintf(key, "key %d", i);
ibex_hash_class.insert(hash, key, strlen(key));
}
ibex_hash_dump(hash);
for (i=0;i<2000;i++) {
char key[16], *lookup;
hashid_t keyid;
blockid_t root, tail;
sprintf(key, "key %d", i);
keyid = ibex_hash_class.find(hash, key, strlen(key));
lookup = ibex_hash_class.get_key(hash, keyid, 0);
root = ibex_hash_class.get_data(hash, keyid, &tail);
printf("key %s = %d = '%s' root:%d tail:%d \n", key, keyid, lookup, root, tail);
g_free(lookup);
}
ibex_hash_class.close(hash);
ibex_block_cache_close(bc);
return 0;
}
#endif

View File

@ -1,33 +1,16 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 2000 Helix Code, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* words.c: low-level indexing ops */
#include <glib.h>
#include <db.h>
#include <stdio.h>
#include <unicode.h>
#include <ctype.h>
#include <errno.h>
#include <string.h>
#include <unicode.h>
#include "ibex_internal.h"
#define d(x)
static signed char utf8_trans[] = {
'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I',
'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U',
@ -60,7 +43,7 @@ static char *utf8_long_trans[] = {
* It is not safe to call this routine with bad UTF-8.
*/
static void
normalize_word (char *start, char *end, char *buf)
ibex_normalise_word(char *start, char *end, char *buf)
{
unsigned char *s, *d;
unicode_char_t uc;
@ -151,39 +134,6 @@ utf8_category (char *sp, char **snp, char *send)
}
}
static ibex_file *
get_ibex_file (ibex *ib, char *name)
{
ibex_file *ibf;
ibf = g_tree_lookup (ib->files, name);
if (!ibf) {
ibf = g_malloc (sizeof (ibex_file));
ibf->name = g_strdup (name);
ibf->index = 0;
g_tree_insert (ib->files, ibf->name, ibf);
ib->dirty = TRUE;
}
return ibf;
}
static void
ref_word (ibex *ib, ibex_file *ibf, char *word)
{
GPtrArray *refs;
refs = g_hash_table_lookup (ib->words, word);
if (!refs) {
refs = g_ptr_array_new ();
g_hash_table_insert (ib->words, g_strdup (word), refs);
g_ptr_array_add (refs, ibf);
ib->dirty = TRUE;
} else if (g_ptr_array_index (refs, refs->len - 1) != ibf) {
g_ptr_array_add (refs, ibf);
ib->dirty = TRUE;
}
}
/**
* ibex_index_buffer: the lowest-level ibex indexing interface
* @ib: an ibex
@ -206,12 +156,13 @@ ref_word (ibex *ib, ibex_file *ibf, char *word)
* Return value: 0 on success, -1 on failure.
**/
int
ibex_index_buffer (ibex *ib, char *name, char *buffer,
size_t len, size_t *unread)
ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread)
{
char *p, *q, *nq, *end, *word;
ibex_file *ibf = get_ibex_file (ib, name);
int wordsiz, cat;
GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal);
GPtrArray *wordlist = g_ptr_array_new();
int i, ret=-1;
if (unread)
*unread = 0;
@ -229,12 +180,9 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer,
p = q;
}
if (p == end) {
g_free (word);
return 0;
goto done;
} else if (cat == IBEX_INVALID) {
errno = EINVAL;
g_free (word);
return -1;
goto error;
} else if (cat == IBEX_INCOMPLETE)
q = end;
@ -246,24 +194,107 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer,
}
if (cat == IBEX_INVALID ||
(cat == IBEX_INCOMPLETE && !unread)) {
errno = EINVAL;
g_free (word);
return -1;
goto error;
} else if (cat == IBEX_INCOMPLETE || (q == end && unread)) {
*unread = end - p;
g_free (word);
return 0;
goto done;
}
if (wordsiz < q - p + 1) {
wordsiz = q - p + 1;
word = g_realloc (word, wordsiz);
}
normalize_word (p, q, word);
ref_word (ib, ibf, word);
ibex_normalise_word (p, q, word);
if (word[0]) {
if (g_hash_table_lookup(words, word) == 0) {
char *newword = g_strdup(word);
g_ptr_array_add(wordlist, newword);
g_hash_table_insert(words, newword, name);
}
}
p = q;
}
done:
d(printf("name %s count %d size %d\n", name, wordlist->len, len));
ib->words->klass->add_list(ib->words, name, wordlist);
ret = 0;
error:
for (i=0;i<wordlist->len;i++)
g_free(wordlist->pdata[i]);
g_ptr_array_free(wordlist, TRUE);
g_hash_table_destroy(words);
g_free (word);
return ret;
}
ibex *ibex_open (char *file, int flags, int mode)
{
ibex *ib;
ib = g_malloc0(sizeof(*ib));
ib->blocks = ibex_block_cache_open(file, flags, mode);
if (ib->blocks == 0) {
g_warning("create: Error occured?: %s\n", strerror(errno));
g_free(ib);
return NULL;
}
/* FIXME: the blockcache or the wordindex needs to manage the other one */
ib->words = ib->blocks->words;
return ib;
}
int ibex_save (ibex *ib)
{
printf("syncing database\n");
ib->words->klass->sync(ib->words);
/* FIXME: some return */
ibex_block_cache_sync(ib->blocks);
return 0;
}
int ibex_close (ibex *ib)
{
int ret = 0;
printf("closing database\n");
ib->words->klass->close(ib->words);
ibex_block_cache_close(ib->blocks);
g_free(ib);
return ret;
}
void ibex_unindex (ibex *ib, char *name)
{
d(printf("trying to unindex '%s'\n", name));
ib->words->klass->unindex_name(ib->words, name);
}
GPtrArray *ibex_find (ibex *ib, char *word)
{
char *normal;
int len;
len = strlen(word);
normal = alloca(len+1);
ibex_normalise_word(word, word+len, normal);
return ib->words->klass->find(ib->words, normal);
}
gboolean ibex_find_name (ibex *ib, char *name, char *word)
{
char *normal;
int len;
len = strlen(word);
normal = alloca(len+1);
ibex_normalise_word(word, word+len, normal);
return ib->words->klass->find_name(ib->words, name, normal);
}
gboolean ibex_contains_name(ibex *ib, char *name)
{
return ib->words->klass->contains_name(ib->words, name);
}

512
libibex/ibex_db.c Normal file
View File

@ -0,0 +1,512 @@
#include <glib.h>
#include <db.h>
#include <stdio.h>
#include <unicode.h>
#include <ctype.h>
#include "ibex_internal.h"
#define d(x)
/*
Uses 2 databases:
names - HASH - lists which files are stored
words - BTREE - index words to files
*/
static int
db_delete_name(DB *db, const char *name)
{
DBC *cursor;
int err, namelen;
DBT key, data;
printf("called to delete name %s\n", name);
return 0;
err = db->cursor(db, NULL, &cursor, 0);
if (err != 0) {
printf("Error occured?: %s\n", db_strerror(err));
return err;
}
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
namelen = strlen(name);
err = cursor->c_get(cursor, &key, &data, DB_FIRST);
while (err == 0) {
if (data.size == namelen && memcmp(data.data, name, namelen) == 0) {
d(printf("Found match, removing it\n"));
cursor->c_del(cursor, 0);
} else {
data.size = namelen;
data.data = (void *)name;
if (cursor->c_get(cursor, &key, &data, DB_GET_BOTH) == 0) {
d(printf("seek to match, removing it\n"));
cursor->c_del(cursor, 0);
} else {
d(printf("seek to match, not found\n"));
}
}
err = cursor->c_get(cursor, &key, &data, DB_NEXT_NODUP);
}
cursor->c_close(cursor);
return 0;
}
static int
db_index_words(DB *db, char *name, char **words)
{
DBT key, data;
int err = 0;
char *word;
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
data.data = name;
data.size = strlen(name);
for (;(word=*words);words++) {
/* normalise word first ... */
key.data = word;
key.size = strlen(word);
err = db->put(db, NULL, &key, &data, 0);
if (err != 0)
printf("Error occured?: %s\n", db_strerror(err));
}
return err;
}
static int
db_index_word(DB *db, char *name, char *word)
{
DBT key, data;
int err = 0;
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
data.data = name;
data.size = strlen(name);
key.data = word;
key.size = strlen(word);
err = db->put(db, NULL, &key, &data, 0);
if (err != 0)
printf("Error occured?: %s\n", db_strerror(err));
return err;
}
static GPtrArray *
db_find(DB *db, char *word)
{
DBT key, data;
int err;
DBC *cursor;
GPtrArray *matches = g_ptr_array_new();
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
err = db->cursor(db, NULL, &cursor, 0);
if (err != 0) {
printf("Error occured?: %s\n", db_strerror(err));
return matches;
}
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
key.size = strlen(word);
key.data = word;
err = cursor->c_get(cursor, &key, &data, DB_SET);
while (err == 0) {
char *name;
name = g_malloc(data.size+1);
memcpy(name, data.data, data.size);
name[data.size] = '\0';
g_ptr_array_add(matches, name);
err = cursor->c_get(cursor, &key, &data, DB_NEXT_DUP);
}
if (err != DB_NOTFOUND) {
printf("Error occured?: %s\n", db_strerror(err));
/* what to do? doesn't matter too much its just a search ... */
}
cursor->c_close(cursor);
return matches;
}
/* check that db contains key @word, optionally with contents @name */
static gboolean
db_contains_word(DB *db, char *name, char *word)
{
DBT key, data;
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
if (name != NULL) {
data.size = strlen(name);
data.data = name;
}
key.size = strlen(word);
key.data = word;
return db->get(db, NULL, &key, &data, name?DB_GET_BOTH:DB_SET) == 0;
}
static int
db_delete_word(DB *db, char *word)
{
DBT key;
memset(&key, 0, sizeof(key));
key.size = strlen(word);
key.data = word;
return db->del(db, NULL, &key, 0);
}
static signed char utf8_trans[] = {
'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I',
'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U',
'U', 'Y', -3, -4, 'a', 'a', 'a', 'a', 'a', 'a', -5, 'c', 'e', 'e',
'e', 'e', 'i', 'i', 'i', 'i', -6, 'n', 'o', 'o', 'o', 'o', 'o', '/',
'o', 'u', 'u', 'u', 'u', 'y', -7, 'y', 'A', 'a', 'A', 'a', 'A', 'a',
'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e',
'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g',
'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i',
'I', 'i', -8, -9, 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L',
'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', -10, -11,
'O', 'o', 'O', 'o', 'O', 'o', -12, -13, 'R', 'r', 'R', 'r', 'R', 'r',
'S', 'r', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't',
'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w',
'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's'
};
static char *utf8_long_trans[] = {
"AE", "TH", "TH", "ss", "ae", "th", "th", "IJ", "ij",
"NG", "ng", "OE", "oe"
};
/* This is a bit weird. It takes pointers to the start and end (actually
* just past the end) of a UTF-8-encoded word, and a buffer at least 1
* byte longer than the length of the word. It copies the word into the
* buffer in all lowercase without accents, and splits up ligatures.
* (Since any ligature would be a multi-byte character in UTF-8, splitting
* them into two US-ASCII characters won't overrun the buffer.)
*
* It is not safe to call this routine with bad UTF-8.
*/
static void
ibex_normalise_word(char *start, char *end, char *buf)
{
unsigned char *s, *d;
unicode_char_t uc;
s = (unsigned char *)start;
d = (unsigned char *)buf;
while (s < (unsigned char *)end) {
if (*s < 0x80) {
/* US-ASCII character: copy unless it's
* an apostrophe.
*/
if (*s != '\'')
*d++ = tolower (*s);
s++;
} else {
char *next = unicode_get_utf8 (s, &uc);
if (uc >= 0xc0 && uc < 0xc0 + sizeof (utf8_trans)) {
signed char ch = utf8_trans[uc - 0xc0];
if (ch > 0)
*d++ = tolower (ch);
else {
*d++ = tolower (utf8_long_trans[-ch - 1][0]);
*d++ = tolower (utf8_long_trans[-ch - 1][1]);
}
s = next;
} else {
while (s < (unsigned char *)next)
*d++ = *s++;
}
}
}
*d = '\0';
}
enum { IBEX_ALPHA, IBEX_NONALPHA, IBEX_INVALID, IBEX_INCOMPLETE };
/* This incorporates parts of libunicode, because there's no way to
* force libunicode to not read past a certain point.
*/
static int
utf8_category (char *sp, char **snp, char *send)
{
unsigned char *p = (unsigned char *)sp, **np = (unsigned char **)snp;
unsigned char *end = (unsigned char *)send;
if (isascii (*p)) {
*np = p + 1;
if (isalpha (*p) || *p == '\'')
return IBEX_ALPHA;
return IBEX_NONALPHA;
} else {
unicode_char_t uc;
int more;
if ((*p & 0xe0) == 0xc0) {
more = 1;
uc = *p & 0x1f;
} else if ((*p & 0xf0) == 0xe0) {
more = 2;
uc = *p & 0x0f;
} else if ((*p & 0xf8) == 0xf0) {
more = 3;
uc = *p & 0x07;
} else if ((*p & 0xfc) == 0xf8) {
more = 4;
uc = *p & 0x03;
} else if ((*p & 0xfe) == 0xfc) {
more = 5;
uc = *p & 0x01;
} else
return IBEX_INVALID;
if (p + more > end)
return IBEX_INCOMPLETE;
while (more--) {
if ((*++p & 0xc0) != 0x80)
return IBEX_INVALID;
uc <<= 6;
uc |= *p & 0x3f;
}
*np = p + 1;
if (unicode_isalpha (uc))
return IBEX_ALPHA;
else
return IBEX_NONALPHA;
}
}
static void
do_insert_words(char *key, char *name, ibex *ib)
{
db_index_word(ib->words, name, key);
g_free(key);
}
static void
do_free_words(char *key, char *name, void *data)
{
g_free(key);
}
/**
* ibex_index_buffer: the lowest-level ibex indexing interface
* @ib: an ibex
* @name: the name of the file being indexed
* @buffer: a buffer containing data from the file
* @len: the length of @buffer
* @unread: an output argument containing the number of unread bytes
*
* This routine indexes up to @len bytes from @buffer into @ib.
* If @unread is NULL, the indexer assumes that the buffer ends on a
* word boundary, and will index all the way to the end of the
* buffer. If @unread is not NULL, and the buffer ends with an
* alphabetic character, the indexer will assume that the buffer has
* been cut off in the middle of a word, and return the number of
* un-indexed bytes at the end of the buffer in *@unread. The caller
* should then read in more data through whatever means it has
* and pass in the unread bytes from the original buffer, followed
* by the new data, on its next call.
*
* Return value: 0 on success, -1 on failure.
**/
int
ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread)
{
char *p, *q, *nq, *end, *word;
int wordsiz, cat;
GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal);
if (unread)
*unread = 0;
end = buffer + len;
wordsiz = 20;
word = g_malloc (wordsiz);
p = buffer;
while (p < end) {
while (p < end) {
cat = utf8_category (p, &q, end);
if (cat != IBEX_NONALPHA)
break;
p = q;
}
if (p == end) {
goto done;
} else if (cat == IBEX_INVALID) {
goto error;
} else if (cat == IBEX_INCOMPLETE)
q = end;
while (q < end) {
cat = utf8_category (q, &nq, end);
if (cat != IBEX_ALPHA)
break;
q = nq;
}
if (cat == IBEX_INVALID ||
(cat == IBEX_INCOMPLETE && !unread)) {
goto error;
} else if (cat == IBEX_INCOMPLETE || (q == end && unread)) {
*unread = end - p;
goto done;
}
if (wordsiz < q - p + 1) {
wordsiz = q - p + 1;
word = g_realloc (word, wordsiz);
}
ibex_normalise_word (p, q, word);
if (word[0]) {
if (g_hash_table_lookup(words, word) == 0) {
g_hash_table_insert(words, g_strdup(word), name);
}
}
p = q;
}
done:
/* FIXME: do this and inserts within a transaction */
if (!db_contains_word(ib->names, NULL, name)) {
printf("adding '%s' to database\n", name);
db_index_word(ib->names, "", name);
}
g_hash_table_foreach(words, (GHFunc)do_insert_words, ib);
g_hash_table_destroy(words);
g_free (word);
return 0;
error:
g_hash_table_foreach(words, (GHFunc)do_free_words, NULL);
g_hash_table_destroy(words);
g_free (word);
return -1;
}
ibex *ibex_open (char *file, int flags, int mode)
{
ibex *ib;
u_int32_t dbflags = 0;
int err;
ib = g_malloc0(sizeof(*ib));
err = db_create(&ib->words, NULL, 0);
if (err != 0) {
g_warning("create: Error occured?: %s\n", db_strerror(err));
g_free(ib);
return NULL;
}
err = ib->words->set_flags(ib->words, DB_DUP);
if (err != 0) {
g_warning("set flags: Error occured?: %s\n", db_strerror(err));
ib->words->close(ib->words, 0);
g_free(ib);
return NULL;
}
if (flags & O_CREAT)
dbflags |= DB_CREATE;
if (flags & O_EXCL)
dbflags |= DB_EXCL;
if (flags & O_RDONLY)
dbflags |= DB_RDONLY;
/* 1MB cache size? */
ib->words->set_cachesize(ib->words, 0, 1000000, 0);
err = ib->words->open(ib->words, file, "words", DB_BTREE, dbflags, mode);
if (err != 0) {
printf("open: Error occured?: %s\n", db_strerror(err));
ib->words->close(ib->words, 0);
g_free(ib);
return NULL;
}
/* FIXME: check returns */
err = db_create(&ib->names, NULL, 0);
err = ib->names->open(ib->names, file, "names", DB_HASH, dbflags, mode);
return ib;
}
int ibex_save (ibex *ib)
{
printf("syncing database\n");
ib->names->sync(ib->names, 0);
return ib->words->sync(ib->words, 0);
}
int ibex_close (ibex *ib)
{
int ret;
printf("closing database\n");
ret = ib->names->close(ib->names, 0);
ret = ib->words->close(ib->words, 0);
g_free(ib);
return ret;
}
void ibex_unindex (ibex *ib, char *name)
{
printf("trying to unindex '%s'\n", name);
if (db_contains_word(ib->names, NULL, name)) {
/* FIXME: do within transaction? */
db_delete_name(ib->words, name);
db_delete_word(ib->names, name);
}
}
GPtrArray *ibex_find (ibex *ib, char *word)
{
char *normal;
int len;
len = strlen(word);
normal = alloca(len+1);
ibex_normalise_word(word, word+len, normal);
return db_find(ib->words, normal);
}
gboolean ibex_find_name (ibex *ib, char *name, char *word)
{
char *normal;
int len;
len = strlen(word);
normal = alloca(len+1);
ibex_normalise_word(word, word+len, normal);
return db_contains_word(ib->words, name, normal);
}
gboolean ibex_contains_name(ibex *ib, char *name)
{
return db_contains_word(ib->names, NULL, name);
}

View File

@ -21,19 +21,13 @@
#include <glib.h>
#include "ibex.h"
#include "block.h"
#include "wordindex.h"
#define IBEX_VERSION "ibex1"
#define IBEX_VERSION "ibex3"
struct ibex {
char *path;
GTree *files;
GHashTable *words;
GPtrArray *oldfiles;
gboolean dirty;
struct _memcache *blocks;
struct _IBEXWord *words;
};
struct ibex_file {
char *name;
long index;
};
typedef struct ibex_file ibex_file;

View File

@ -1,154 +0,0 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 2000 Helix Code, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* index.c: high-level indexing ops */
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include "ibex_internal.h"
/**
* ibex_index_file: Index a file by name
* @ib: an ibex
* @filename: name of the file to index
*
* This indexes the given file into @ib. If @filename is already in
* the ibex, the existing index entries for it are discarded and the
* file is indexed anew.
*
* Return value: 0 on success, -1 on failure.
**/
int
ibex_index_file (ibex *ib, char *filename)
{
int fd;
int status;
struct stat st;
fd = open (filename, O_RDONLY);
if (fd < 0)
return -1;
if (fstat (fd, &st) == -1) {
close (fd);
return -1;
}
if (!S_ISREG (st.st_mode)) {
close (fd);
errno = EINVAL;
return -1;
}
ibex_unindex (ib, filename);
status = ibex_index_fd (ib, filename, fd, st.st_size);
close (fd);
return status;
}
/**
* ibex_index_fd: Index a file given a file descriptor
* @ib: an ibex
* @name: the name of the file being indexed
* @fd: a file descriptor, open for reading
* @len: the number of bytes to read from the file
*
* This indexes a file, or a part of a file, given an open file
* descriptor and a size. There is no requirement that @name
* actually correspond to @fd in any particular way.
*
* If the function returns successfully, the file descriptor offset of
* @fd will be exactly @len bytes beyond where it was when the
* function was called. The indexer assumes that this point is a word
* boundary.
*
* The behavior of this function is not defined if it is not
* possible to read @len bytes off @fd.
*
* Return value: 0 on success, -1 on failure.
**/
int
ibex_index_fd (ibex *ib, char *name, int fd, size_t len)
{
char *buf;
int off = 0, nread, status;
buf = g_malloc (len);
do {
nread = read (fd, buf + off, len - off);
if (nread == -1) {
g_free (buf);
return -1;
}
off += nread;
} while (off != len);
status = ibex_index_buffer (ib, name, buf, len, NULL);
g_free (buf);
return status;
}
/**
* ibex_unindex: Remove a file from the ibex
* @ib: an ibex
* @name: name of the file to remove
*
* This removes all references to @name from @ib. No memory is freed
* right away, but further searches on @ib will never return @name.
**/
void
ibex_unindex (ibex *ib, char *name)
{
ibex_file *ibf;
ibf = g_tree_lookup (ib->files, name);
if (ibf) {
ibf->index = -1;
g_tree_remove (ib->files, name);
g_ptr_array_add (ib->oldfiles, ibf);
ib->dirty = TRUE;
}
}
/**
* ibex_rename: Rename a file in the ibex
* @ib: an ibex
* @oldname: the old name of the file
* @newname: the new name of the file
*
* This renames a file in the ibex.
**/
void
ibex_rename (ibex *ib, char *oldname, char *newname)
{
ibex_file *ibf;
ibf = g_tree_lookup (ib->files, oldname);
if (ibf) {
g_tree_remove (ib->files, oldname);
g_free (ibf->name);
ibf->name = g_strdup (newname);
g_tree_insert (ib->files, ibf->name, ibf);
}
}

82
libibex/index.h Normal file
View File

@ -0,0 +1,82 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
*
* Copyright (C) 2000 Helix Code, Inc.
*
* Authors: Michael Zucchi <notzed@helixcode.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#ifndef _INDEX_H
#define _INDEX_H
/* an indexing 'class' maps a key to 1 piece of info */
struct _IBEXIndex {
struct _IBEXIndexClass *klass;
struct _memcache *blocks;
blockid_t root; /* root block of ondisk index data */
};
struct _IBEXIndexClass {
struct _IBEXIndex *(*create)(struct _memcache *bc, int size);
struct _IBEXIndex *(*open)(struct _memcache *bc, blockid_t root);
int (*sync)(struct _IBEXIndex *);
int (*close)(struct _IBEXIndex *);
/* lookup a key in the index, returns the keyid of this item, or 0 if not found */
guint32 (*find)(struct _IBEXIndex *, const char *key, int keylen);
/* remove a key from the index */
void (*remove)(struct _IBEXIndex *, const char *key, int keylen);
/* insert a new key into the index, the keyid is returned */
guint32 (*insert)(struct _IBEXIndex *, const char *key, int keylen);
/* get the key contents/key length from the keyid */
char *(*get_key)(struct _IBEXIndex *, guint32 keyid, int *keylenptr);
/* set the key contents based on the keyid */
void (*set_data)(struct _IBEXIndex *, guint32 keyid, blockid_t datablock, blockid_t tail);
/* get the key contents based on the keyid */
blockid_t (*get_data)(struct _IBEXIndex *, guint32 keyid, blockid_t *tail);
};
/* a storage class, stores lists of lists of id's */
struct _IBEXStore {
struct _IBEXStoreClass *klass;
struct _memcache *blocks;
};
struct _IBEXStoreClass {
struct _IBEXStore *(*create)(struct _memcache *bc);
int (*sync)(struct _IBEXStore *store);
int (*close)(struct _IBEXStore *store);
blockid_t (*add)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
blockid_t (*add_list)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data);
blockid_t (*remove)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
void (*free)(struct _IBEXStore *store, blockid_t head, blockid_t tail);
gboolean (*find)(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data);
GArray *(*get)(struct _IBEXStore *store, blockid_t head, blockid_t tail);
};
#endif

488
libibex/wordindex.c Normal file
View File

@ -0,0 +1,488 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
*
* Copyright (C) 2000 Helix Code, Inc.
*
* Authors: Michael Zucchi <notzed@helixcode.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* code to manage a word index */
/* includes a cache for word index writes,
but not for name index writes (currently), or any reads.
Note the word cache is only needed during indexing of lots
of words, and could then be discarded (:flush()).
*/
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <glib.h>
#include "block.h"
#include "index.h"
#include "wordindex.h"
#define d(x)
/*#define WORDCACHE_SIZE (256)*/
#define WORDCACHE_SIZE (10240)
extern struct _IBEXStoreClass ibex_diskarray_class;
extern struct _IBEXIndexClass ibex_hash_class;
/* need 2 types of hash key?
one that just stores the wordid / wordblock
and one that stores the filecount/files?
*/
struct _wordcache {
struct _wordcache *next;
struct _wordcache *prev;
nameid_t wordid; /* disk wordid */
blockid_t wordblock; /* head of disk list */
blockid_t wordtail; /* and the tail data */
int filecount; /* how many we have in memory */
/*nameid_t files[32];*/ /* memory cache of files */
nameid_t files[62]; /* memory cache of files */
char word[1]; /* actual word follows */
};
static void unindex_name(struct _IBEXWord *, const char *name); /* unindex all entries for name */
static gboolean contains_name(struct _IBEXWord *, const char *name); /* index contains data for name */
static GPtrArray *find(struct _IBEXWord *, const char *word); /* returns all matches for word */
static gboolean find_name(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */
static void add(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name (slow) */
static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */
static int word_sync(struct _IBEXWord *idx);
static int word_flush(struct _IBEXWord *idx);
static int word_close(struct _IBEXWord *idx);
struct _IBEXWordClass ibex_word_index_class = {
word_sync, word_flush, word_close,
unindex_name, contains_name,
find, find_name,
add, add_list
};
/* this interface isn't the best, but it'll do for now */
struct _IBEXWord *
ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot)
{
struct _IBEXWord *idx;
idx = g_malloc0(sizeof(*idx));
idx->wordcache = g_hash_table_new(g_str_hash, g_str_equal);
ibex_list_new(&idx->wordnodes);
idx->wordcount = 0;
idx->klass = &ibex_word_index_class;
/* we use the same block array storage for both indexes at the moment */
idx->wordstore = ibex_diskarray_class.create(bc);
idx->namestore = idx->wordstore;
/* but not the same indexes! */
if (*wordroot) {
printf("opening wordindex root = %d\n", *wordroot);
idx->wordindex = ibex_hash_class.open(bc, *wordroot);
} else {
idx->wordindex = ibex_hash_class.create(bc, 1024);
*wordroot = idx->wordindex->root;
printf("creating wordindex root = %d\n", *wordroot);
}
if (*nameroot) {
printf("opening nameindex root = %d\n", *nameroot);
idx->nameindex = ibex_hash_class.open(bc, *nameroot);
} else {
idx->nameindex = ibex_hash_class.create(bc, 1024);
*nameroot = idx->nameindex->root;
printf("creating nameindex root = %d\n", *nameroot);
}
return idx;
}
#if 0
static void
cache_sanity(struct _wordcache *head)
{
while (head->next) {
g_assert(head->filecount <= sizeof(head->files)/sizeof(head->files[0]));
g_assert(strlen(head->word) != 0);
head = head->next;
}
}
#endif
/* unindex all entries for name */
static void unindex_name(struct _IBEXWord *idx, const char *name)
{
GArray *words;
int i;
nameid_t nameid, wordid;
blockid_t nameblock, wordblock, newblock, nametail, wordtail, newtail;
d(printf("unindexing %s\n", name));
/* lookup the hash key */
nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
/* get the block for this key */
nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
/* and the data for this block */
words = idx->namestore->klass->get(idx->namestore, nameblock, nametail);
/* walk it ... */
for (i=0;i<words->len;i++) {
/* get the word */
wordid = g_array_index(words, nameid_t, i);
d(printf(" word %d\n", wordid));
/* get the data block */
wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
/* clear this name from it */
newblock = wordblock;
newtail = wordtail;
idx->wordstore->klass->remove(idx->wordstore, &newblock, &newtail, nameid);
if (newblock != wordblock || newtail != wordtail)
idx->wordindex->klass->set_data(idx->wordindex, wordid, newblock, newtail);
/* FIXME: check cache as well */
}
g_array_free(words, TRUE);
/* and remove name data and itself */
idx->namestore->klass->free(idx->namestore, nameblock, nametail);
idx->nameindex->klass->remove(idx->nameindex, name, strlen(name));
}
/* index contains (any) data for name */
static gboolean contains_name(struct _IBEXWord *idx, const char *name)
{
return idx->nameindex->klass->find(idx->nameindex, name, strlen(name)) != 0;
}
/* returns all matches for word */
static GPtrArray *find(struct _IBEXWord *idx, const char *word)
{
nameid_t wordid, nameid;
GPtrArray *res;
GArray *names;
int i;
char *new;
struct _wordcache *cache;
blockid_t wordblock, wordtail;
res = g_ptr_array_new();
cache = g_hash_table_lookup(idx->wordcache, word);
if (cache) {
/* freshen cache entry if we touch it */
ibex_list_remove((struct _listnode *)cache);
ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
wordid = cache->wordid;
wordblock = cache->wordblock;
wordtail = cache->wordtail;
} else {
/* lookup the hash key */
wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
/* get the block for this key */
wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
}
/* and the data for this block */
names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail);
/* .. including any memory-only data */
if (cache) {
g_array_append_vals(names, cache->files, cache->filecount);
}
/* walk it ... converting id's back to strings */
g_ptr_array_set_size(res, names->len);
for (i=0;i<names->len;i++) {
nameid = g_array_index(names, nameid_t, i);
new = idx->nameindex->klass->get_key(idx->nameindex, nameid, NULL);
res->pdata[i] = new;
}
g_array_free(names, TRUE);
return res;
}
/* find if name contains word */
static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *word)
{
nameid_t wordid, nameid;
blockid_t nameblock, nametail;
struct _wordcache *cache;
int i;
/* lookup the hash key for the name */
nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
/* get the block for this name */
nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
/* check if there is an in-memory cache for this word, check its file there first */
cache = g_hash_table_lookup(idx->wordcache, word);
if (cache) {
/* freshen cache entry if we touch it */
ibex_list_remove((struct _listnode *)cache);
ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
for (i=0;i<cache->filecount;i++) {
if (cache->files[i] == nameid)
return TRUE;
}
/* not there? well we can use the wordid anyway */
wordid = cache->wordid;
} else {
/* lookup the hash key for word */
wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
}
/* see if wordid is in nameblock */
return idx->namestore->klass->find(idx->namestore, nameblock, nametail, wordid);
}
/* cache helper functions */
/* flush a cache entry to disk, and empty it out */
static void
sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache)
{
GArray array; /* just use this as a header */
blockid_t oldblock, oldtail;
d(printf("syncing cache entry '%s'\n", cache->word));
array.data = (char *)cache->files;
array.len = cache->filecount;
oldblock = cache->wordblock;
oldtail = cache->wordtail;
idx->wordstore->klass->add_list(idx->wordstore, &cache->wordblock, &cache->wordtail, &array);
if (oldblock != cache->wordblock || oldtail != cache->wordtail) {
idx->wordindex->klass->set_data(idx->wordindex, cache->wordid, cache->wordblock, cache->wordtail);
}
cache->filecount = 0;
}
/* create a new key in an index, returning its id and head block */
static void
add_index_key(struct _IBEXIndex *wordindex, const char *word, nameid_t *wordid, blockid_t *wordblock, blockid_t *wordtail)
{
/* initialise cache entry - id of word entry and head block */
*wordid = wordindex->klass->find(wordindex, word, strlen(word));
if (*wordid == 0) {
*wordid = wordindex->klass->insert(wordindex, word, strlen(word));
*wordblock = 0;
*wordtail = 0;
} else {
*wordblock = wordindex->klass->get_data(wordindex, *wordid, wordtail);
}
}
/* create a new key in a cached index (only word cache so far), flushing old keys
if too much space is being used */
static struct _wordcache *
add_index_cache(struct _IBEXWord *idx, const char *word)
{
struct _wordcache *cache;
d(printf("adding %s to cache\n", word));
cache = g_hash_table_lookup(idx->wordcache, word);
if (cache == 0) {
/* see if we have to flush off the last entry */
if (idx->wordcount >= WORDCACHE_SIZE) {
/* remove last entry, and flush it */
cache = (struct _wordcache *)idx->wordnodes.tailpred;
d(printf("flushing word from cache %s\n", cache->word));
ibex_list_remove((struct _listnode *)cache);
g_hash_table_remove(idx->wordcache, cache->word);
sync_cache_entry(idx, cache);
g_free(cache);
idx->wordcount--;
}
cache = g_malloc0(sizeof(*cache)+strlen(word));
/* initialise cache entry - id of word entry and head block */
add_index_key(idx->wordindex, word, &cache->wordid, &cache->wordblock, &cache->wordtail);
/* other fields */
strcpy(cache->word, word);
cache->filecount = 0;
g_hash_table_insert(idx->wordcache, cache->word, cache);
ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
idx->wordcount++;
} else {
/* move cache bucket ot the head of the cache list */
ibex_list_remove((struct _listnode *)cache);
ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
}
return cache;
}
/* adds a single word to name (slow) */
static void add(struct _IBEXWord *idx, const char *name, const char *word)
{
nameid_t nameid;
blockid_t nameblock, newblock, nametail, newtail;
struct _wordcache *cache;
g_error("Dont use wordindex::add()");
abort();
cache = add_index_cache(idx, word);
/* get the nameid and block start for this name */
add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
/* check for repeats of the last name - dont add anything */
if (cache->files[cache->filecount] == nameid)
return;
/* yay, now insert the nameindex into the word list, and the word index into the name list */
cache->files[cache->filecount] = nameid;
cache->filecount++;
/* if we are full, force a flush now */
if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
sync_cache_entry(idx, cache);
}
newtail = nametail;
newblock = nameblock;
idx->namestore->klass->add(idx->namestore, &newblock, &newtail, cache->wordid);
if (newblock != nameblock || newtail != nametail) {
idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
}
}
/* adds a bunch of words to a given name */
static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words)
{
int i;
GArray *data = g_array_new(0, 0, sizeof(nameid_t));
blockid_t nameblock, newblock, nametail, newtail;
nameid_t nameid;
struct _wordcache *cache;
d(printf("Adding words to name %s\n", name));
d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
/* get the nameid and block start for this name */
add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
for (i=0;i<words->len;i++) {
char *word = words->pdata[i];
cache = add_index_cache(idx, word);
/*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
/* check for duplicates; doesn't catch duplicates over an overflow
boundary. Watch me care. */
if (cache->filecount == 0
|| cache->files[cache->filecount-1] != nameid) {
cache->files[cache->filecount] = nameid;
cache->filecount++;
/* if we are full, force a flush now */
if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
sync_cache_entry(idx, cache);
}
/*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
/* and append this wordid for this name in memory */
g_array_append_val(data, cache->wordid);
}
/*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
}
d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
/* and append these word id's in one go */
newblock = nameblock;
newtail = nametail;
idx->namestore->klass->add_list(idx->namestore, &newblock, &newtail, data);
if (newblock != nameblock || newtail != nametail) {
idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
}
d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
g_array_free(data, TRUE);
}
/* sync any in-memory data to disk */
static int
word_sync(struct _IBEXWord *idx)
{
/* we just flush also, save memory */
word_flush(idx);
#if 0
struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head;
while (cache->next) {
sync_cache_entry(idx, cache);
cache = cache->next;
}
/*ibex_hash_dump(idx->wordindex);*/
/*ibex_hash_dump(idx->nameindex);*/
#endif
return 0;
}
/* sync and flush any in-memory data to disk and free it */
static int
word_flush(struct _IBEXWord *idx)
{
struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
next = cache->next;
while (next) {
sync_cache_entry(idx, cache);
g_hash_table_remove(idx->wordcache, cache->word);
g_free(cache);
cache = next;
next = cache->next;
}
ibex_list_new(&idx->wordnodes);
return 0;
}
static int word_close(struct _IBEXWord *idx)
{
struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
next = cache->next;
while (next) {
sync_cache_entry(idx, cache);
g_free(cache);
cache = next;
next = cache->next;
}
idx->namestore->klass->close(idx->namestore);
idx->nameindex->klass->close(idx->nameindex);
/*same as namestore:
idx->wordstore->klass->close(idx->wordstore);*/
idx->wordindex->klass->close(idx->wordindex);
g_hash_table_destroy(idx->wordcache);
g_free(idx);
return 0;
}

65
libibex/wordindex.h Normal file
View File

@ -0,0 +1,65 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
*
* Copyright (C) 2000 Helix Code, Inc.
*
* Authors: Michael Zucchi <notzed@helixcode.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#ifndef _WORDINDEX_H
#define _WORDINDEX_H
#include <glib.h>
#include "block.h"
#include "index.h"
struct _IBEXWord;
/* not used yet */
typedef void (*IBEXNormaliseFunc)(char *source, int len, char *dest);
struct _IBEXWordClass {
int (*sync)(struct _IBEXWord *);
int (*flush)(struct _IBEXWord *);
int (*close)(struct _IBEXWord *);
void (*unindex_name)(struct _IBEXWord *, const char *name); /* unindex all entries for name */
gboolean (*contains_name)(struct _IBEXWord *, const char *name); /* index contains data for name */
GPtrArray *(*find)(struct _IBEXWord *, const char *word); /* returns all matches for word */
gboolean (*find_name)(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */
void (*add)(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name */
void (*add_list)(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */
};
struct _IBEXWord {
struct _IBEXWordClass *klass;
struct _IBEXStore *wordstore;
struct _IBEXIndex *wordindex;
struct _IBEXStore *namestore;
struct _IBEXIndex *nameindex;
/* word caching info (should probably be modularised) */
GHashTable *wordcache; /* word->struct _wordcache mapping */
struct _list wordnodes; /* LRU list of wordcache structures */
int wordcount; /* how much space used in cache */
};
struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
#endif /* !_WORDINDEX_H */