This is the mail archive of the guile@cygnus.com mailing list for the guile project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Re: Scheme style auto-resizing hashtable (fwd)


/*****************************************************************************
 * Copyright (C) 1998, Jay Glascoe, NASA Goddard Institute for Space Studies
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *****************************************************************************/

/*****************************************************************************
 *  File "hashtab-type.c", by Jay Glascoe.  10/27/98
 *  Inspired by Jim Blandy's image-type example,
 *  the Python dictionary object, and a whole host
 *  of messages on the Guile mailing list.
 *  Special thanks to Maciej Stachowiak of MIT
 *  and Harvey J. Stein of BFM Financial Research.
 *
 *  This is a Guile extension providing procedures to
 *  create and operate on auto-resizing hash tables.
 *
 *  A hash table is a pair, it looks like this
 *
 *     (header . vector)
 *
 *  The car of the pair, the header, is a vector.  It's used to
 *  keep track of various book keeping details.  It looks
 *  like this
 *
 *     #(number-entries number-nonempty-buckets auto-shrink-flag)
 *
 *  The cdr of the hash table pair is also a vector.  It's
 *  elements are referred to as "buckets" or "entry lists".
 *  It looks like this
 *
 *     #(bucket1 bucket2 ... bucketN)
 *
 *  Each bucket is a list of entries
 *
 *     (entry1 entry2 ... entryN)
 *
 *  Each entry is a pair, or "association".  The car of the
 *  association is the "hash value" of the entry's key.  The
 *  cdr of the association is a key-value pair.  Entries look
 *  like this
 *
 *     (hash (key . value))
 *
 *  The procedures defined in this extension are listed below.
 *
 *  primitive: make-hashtab
 *      Return a new auto-resizing hash table (auto-shrink is
 *      disabled by default).
 *
 *  primitive: hashtab-enable hashtab symbol
 *  primitive: hashtab-disable hashtab symbol
 *      "symbol" should be one of 'auto-shrink, ... [no others yet]
 *      Turn the specified option on/off.
 *
 *  primitive: hashtab-set! hashtab key value
 *      Insert "value" in hash table "hashtab" under key "key".
 *      If "key" is already present in "hashtab", overwrite the
 *      previous associated value with "value".
 *
 *  primitive: hashtab-ref hashtab key [default]
 *      Look up "key" in the hash table "hashtab", and return the value
 *	(if any) associated with it. If key is not found, return
 *      default (or #f if no default argument is supplied). 
 *
 *  primitive: hashtab-del! hashtab key [default]
 *      Delete the entry in hash table "hashtab" having key "key",
 *      and return the value associated with "key" (if any).
 *      If key is not found, return default (or #f if no default
 *      argument is supplied). 
 *
 *****************************************************************************/

#include <math.h>
#include <libguile.h>
#include <guile/gh.h>

/* #include "my-hasher.h" */
/* this is okay for hashes with up to 1,000,000 entries */
#define my_hasher(hash_ptr, obj, message, proc_name) \
        *(hash_ptr) = scm_hasher((obj), 2097157L, 10)

#define DEFAULT_NUMBER_BUCKETS 4
#define MAX_MEAN_NONEMPTY_BUCKETS_SIZE 3  /* 3.14159265 ;) */

static SCM
my_scm_make_vector(register long i, SCM fill)
{
    /* modified to allow for a long i (rather than a SCM i) */
    SCM v;
    register long j;
    SCM *velts;
    SCM_NEWCELL(v);
    SCM_DEFER_INTS;
    SCM_SETCHARS(v, scm_must_malloc(i?(long)(i*sizeof(SCM)):1L,
				    "my_scm_make_vector"));
    SCM_SETLENGTH(v, i, scm_tc7_vector);
    velts = SCM_VELTS(v);
    j = 0;
    while (--i >= j)
	(velts)[i] = fill;
    
    SCM_ALLOW_INTS;
    return v;
}

#define make_dflt_bucket() SCM_EOL

#define make_entry(hash, key, value) \
        scm_cons(scm_long2num(hash), scm_cons(key, value))

static SCM auto_shrink_symbol;
static SCM default_bucket;

static void
init_mysymbols(void)  /* called by init_hashtab_type */
{
    default_bucket = 
	SCM_CDR(scm_sysintern("*default-hashtab-bucket*", make_dflt_bucket()));
    auto_shrink_symbol = SCM_CAR(scm_sysintern0("auto-shrink"));
    return;
}

#define NUMBER_ENTRIES_INDEX 0
#define NUMBER_NONEMPTY_BUCKETS_INDEX 1
#define AUTO_SHRINK_FLAG_INDEX 2
#define NUMBER_HEADER_ELEMENTS 3

static SCM
make_header(SCM number_entries, SCM number_nonempty_buckets,
	    SCM auto_shrink_flag)
{
    /*  #(number-entries number-nonempty-buckets auto-shrink-flag)
     */
    SCM vector = my_scm_make_vector(NUMBER_HEADER_ELEMENTS, SCM_UNDEFINED);
    SCM *velts = SCM_VELTS(vector);
    velts[NUMBER_ENTRIES_INDEX] = number_entries;
    velts[NUMBER_NONEMPTY_BUCKETS_INDEX] = number_nonempty_buckets;
    velts[AUTO_SHRINK_FLAG_INDEX] = auto_shrink_flag;
    return vector;
}

static SCM
my_make_hashtab(SCM, SCM);

SCM_PROC(s_make_hashtab, "make-hashtab", 0, 0, 0, make_hashtab);

static SCM
make_hashtab(void)
{
    /*  an argument-less constructor.  fool proof! ;)  
     */
    long inumber_buckets = DEFAULT_NUMBER_BUCKETS;
    SCM auto_shrink_flag = SCM_BOOL_F;
    return my_make_hashtab(inumber_buckets, auto_shrink_flag);
}

static SCM
my_make_hashtab(long inumber_buckets, SCM auto_shrink_flag)
{
    SCM header = make_header(SCM_INUM0, SCM_INUM0, auto_shrink_flag);
    SCM vector = my_scm_make_vector(inumber_buckets, SCM_EOL);
    SCM hashtab = scm_cons(header, vector);
    return hashtab;
}

SCM_PROC(s_hashtab_enable, "hashtab-enable", 2, 0, 0, hashtab_enable);
SCM_PROC(s_hashtab_disable, "hashtab-disable", 2, 0, 0, hashtab_disable);

static SCM
hashtab_enable(SCM hashtab, SCM symbol)
{
    /* turn auto-shrink on */
    SCM header = SCM_CAR(hashtab);
    SCM *header_elts = SCM_VELTS(header);
    SCM_ASSERT(symbol == auto_shrink_symbol,
	       symbol, SCM_ARG2, "hashtab-enable");
    header_elts[AUTO_SHRINK_FLAG_INDEX] = SCM_BOOL_T;
    return SCM_BOOL_T;
}

static SCM
hashtab_disable(SCM hashtab, SCM symbol)
{
    /* turn auto-shrink off */
    SCM header = SCM_CAR(hashtab);
    SCM *header_elts = SCM_VELTS(header);
    SCM_ASSERT(symbol == auto_shrink_symbol,
	       symbol, SCM_ARG2, "hashtab-disable");
    header_elts[AUTO_SHRINK_FLAG_INDEX] = SCM_BOOL_T;
    return SCM_BOOL_F;
}

SCM_PROC(s_hashtab_ref, "hashtab-ref", 2, 1, 0, hashtab_ref);

static SCM
hashtab_ref(SCM hashtab, SCM key, SCM not_here)
{
    /*  Given a hash table, a key, and (optionally) a "not-here"
     *  object, return the value associated with the key.
     *  Return "not-here" (#f by default) if there is no such
     *  key in the hashtable.
     */
    SCM vector = SCM_CDR(hashtab);
    SCM *velts = SCM_VELTS(vector);
    long vec_len = SCM_LENGTH(vector);
    
    long hash = 0;
    long i;
	
    SCM bucket, tail;

    my_hasher(&hash, key, SCM_ARG2, "hashtab-ref");
    i = hash & (vec_len - 1);
    bucket = velts[i];

    if (SCM_UNBNDP(not_here))
	not_here = SCM_BOOL_F;
    
    for (tail = bucket; tail != SCM_EOL; tail = SCM_CDR(tail))
    {
	SCM entry = SCM_CAR(tail);
	    
	SCM my_scm_hash = SCM_CAR(entry);
	long myhash =
	    scm_num2long(my_scm_hash, "woops!", "hashtab-ref");
	    
	if (myhash == hash)
	{
	    SCM pair = SCM_CDR(entry);
	    SCM mykey = SCM_CAR(pair);
	    if (scm_equal_p(mykey, key) == SCM_BOOL_T)
		return SCM_CDR(pair);
	}
    }
    return not_here;
}

static SCM
my_hashtab_set(SCM hashtab,long hash, SCM key, SCM value,int *resize_flag_ptr)
{
    /*   called by hashtab_setx
     *   Insert the key-value pair into the hashtable.
     *   Signal the need for resize if the table is too small.
     */
    SCM myvec = SCM_CDR(hashtab);
    SCM header = SCM_CAR(hashtab);
    SCM *header_elts = SCM_VELTS(header);
    long vec_len = SCM_LENGTH(myvec);
    
    long i = hash & (vec_len - 1);

    SCM number_entries = header_elts[NUMBER_ENTRIES_INDEX];
    long inumber_entries = SCM_INUM(number_entries);

    SCM *velts = SCM_VELTS(myvec);
    SCM bucket = velts[i];
    int bucket_size = 1;
    SCM old_entry_list = bucket;
    SCM entry_list;

    if (bucket == SCM_EOL)
	bucket_size = 0;
    
    for (entry_list = old_entry_list;
	 entry_list != SCM_EOL;
	 entry_list = SCM_CDR(entry_list))
    {
	SCM entry = SCM_CAR(entry_list);
	SCM my_scm_hash = SCM_CAR(entry);
	SCM pair;
	SCM mykey;
	long myhash;
	myhash = scm_num2long(my_scm_hash, "bif!", "hashtab-set!");
	
	if (myhash != hash)
	    continue;

	pair = SCM_CDR(entry);
	mykey = SCM_CAR(pair);

	if (scm_equal_p(mykey, key) == SCM_BOOL_T)
	{
	    SCM_SETCDR(pair, value);
	    return value;
	}
    }

    /*   cons new entry onto bucket, and do some book-keeping
     */
    {
	SCM entry = make_entry(hash, key, value);

	SCM number_entries = header_elts[NUMBER_ENTRIES_INDEX];
	long inumber_entries = SCM_INUM(number_entries);

	SCM number_nonempty_buckets =
	    header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX];
	long inumber_nonempty_buckets = SCM_INUM(number_nonempty_buckets);

	if (bucket_size == 0)
	{
	    ++inumber_nonempty_buckets;
	    header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX] =
		SCM_MAKINUM(inumber_nonempty_buckets);
	    velts[i] = scm_cons(entry, SCM_EOL);
	}
	else
	    SCM_SETCDR(bucket, scm_cons(entry, SCM_CDR(bucket)));
	    
	
	++inumber_entries;
	header_elts[NUMBER_ENTRIES_INDEX] = SCM_MAKINUM(inumber_entries);

	if  (inumber_entries > 
	     MAX_MEAN_NONEMPTY_BUCKETS_SIZE * inumber_nonempty_buckets)
	  *resize_flag_ptr = 1;

	return value;
    }
}

SCM_PROC(s_hashtab_setx, "hashtab-set!", 3, 0, 0, hashtab_setx);

static SCM
hashtab_setx(SCM hashtab, SCM key, SCM value)
{
    /*  Calls my_hashtab_set after hashing the key argument.
     *  If my_hashtab_set signals that resize is necessary,
     *  then call resize_hashtab.
     */
    long hash = 0;
    int resize_flag = 0;
    SCM retval;
    my_hasher(&hash, key, SCM_ARG2, "hashtab-set!");
    retval = my_hashtab_set(hashtab, hash, key, value, &resize_flag);
    if (resize_flag == 1)
    {
	SCM new_hashtab = resize_hashtab(hashtab, 1);
	SCM_SETCAR(hashtab, SCM_CAR(new_hashtab));
	SCM_SETCDR(hashtab, SCM_CDR(new_hashtab));
    }
    return retval;
}

static SCM
resize_hashtab(SCM hashtab, int flag)
{
    /*  assuming (correctly) that each key from the old
     *  hash table is unique is a big win here, so we don't
     *  call hashtab_setx.  instead, we duplicate its code
     *  and make the optimizing modifications
     */
    SCM header = SCM_CAR(hashtab);
    SCM *header_elts = SCM_VELTS(header);

    SCM old_vec = SCM_CDR(hashtab);
    SCM *old_velts = SCM_VELTS(old_vec);

    register long old_len = SCM_LENGTH(old_vec);

    SCM auto_shrink_flag = header_elts[AUTO_SHRINK_FLAG_INDEX];
    
    register long inumber_nonempty_buckets = 0;
    register long inumber_entries = 0;

    long new_len;
    SCM new_hashtab;
    SCM *new_velts;
    SCM *new_header_elts;
    SCM my_inum1 = SCM_MAKINUM(1);
    long new_len_minus_one;

    register long i;

    if (flag == -1) /* shrink the hash table */
	new_len = old_len / 2;
    else            /* grow the hash table   */
	new_len = old_len * 2;
    
    new_len_minus_one = new_len - 1;
    new_hashtab = my_make_hashtab(new_len, auto_shrink_flag);
    new_velts = SCM_VELTS(SCM_CDR(new_hashtab));
    new_header_elts = SCM_VELTS(SCM_CAR(new_hashtab));
    
    for (i = 0; i < old_len; ++i)
    {
	SCM entry_list;
	SCM bucket = old_velts[i];

	for (entry_list = bucket;
	     entry_list != SCM_EOL;
	     entry_list = SCM_CDR(entry_list),   
		 SCM_SETCDR(bucket, entry_list))  /*   delete the old bucket
						   *   en passant.
						   */
	{
	    SCM entry = SCM_CAR(entry_list);
	    long myhash;
	    SCM my_scm_hash = SCM_CAR(entry);
	    long index = 0;
	    SCM new_bucket;

	    myhash = scm_num2long(my_scm_hash, "jeez!", "resize_hashtab");

	    index = myhash & new_len_minus_one;
	    new_bucket = new_velts[index];

	    if (new_bucket == SCM_EOL)
	    {
		new_velts[index] = scm_cons(entry, SCM_EOL);
		++inumber_nonempty_buckets;
	    }
	    else
		SCM_SETCDR(new_bucket, scm_cons(entry, SCM_CDR(new_bucket)));
	    ++inumber_entries;
	}
	/*  let go of the old bucket (what's left of it anyway)
	 *  so Guile may gc it.  Apologies to anyone holding on
	 *  to the old bucket, but any code depending on the previous
	 *  state of the hash should fail gracefully (any such code
	 *  shouldn't have been written in the first place).
	 */
	old_velts[i] = SCM_EOL;
    }

    /*   book keeping
     */
    {
	SCM new_number_nonempty_buckets =SCM_MAKINUM(inumber_nonempty_buckets);
	SCM new_number_entries = SCM_MAKINUM(inumber_entries);
	SCM new_header = SCM_CAR(new_hashtab);
	SCM *new_header_elts = SCM_VELTS(new_header);

	new_header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX] =
	    new_number_nonempty_buckets;
	new_header_elts[NUMBER_ENTRIES_INDEX] =
	    new_number_entries;

	if (flag == -1 && inumber_entries
	    > MAX_MEAN_NONEMPTY_BUCKETS_SIZE * inumber_nonempty_buckets)
	{
	    fprintf(stderr, "PANIC: downsized hash table prematurely.  "
		    "disabling auto-shrink.\n");
	    fflush(stderr);
	    new_header_elts[AUTO_SHRINK_FLAG_INDEX] = SCM_BOOL_F;
	}
	
	return new_hashtab;
    }
}

static void
my_scm_delete_x(register long hash, SCM *bucket_ptr, SCM key,
		SCM *value_ptr, register short *found_flag_ptr)
{
    /*   heavily modified version of scm_delete_x.
     *   now PH balanced for women.
     */
    SCM *prev;
    SCM walk;

    for (prev = bucket_ptr, walk = *bucket_ptr;
	 walk != SCM_EOL;
	 walk = SCM_CDR (walk)) 
    {
	SCM entry = SCM_CAR(walk);
	if (hash == scm_num2long(SCM_CAR(entry), "tweet!", "my_scm_delete_x"))
	{
	    SCM pair = SCM_CDR(entry);
	    if (SCM_BOOL_T == scm_equal_p(SCM_CAR(pair), key))
	    {
		*value_ptr = SCM_CDR(pair);
		*prev = SCM_CDR(walk);
		*found_flag_ptr = 1;
		break;
	    }
	    else
		prev = SCM_CDRLOC (walk);
	}
	else
	    prev = SCM_CDRLOC (walk);
    }
    return;
}

SCM_PROC(s_hashtab_delx, "hashtab-del!", 2, 1, 0, hashtab_delx);

static SCM
hashtab_delx(SCM hashtab, SCM key, SCM not_here)
{
    /*  calls my_scm_delete_x for the actual deletion
     *  then performs book keeping and auto-shrinks if necessary
     *
     *  NOTE: I've put as many optimizations as I can think of in
     *  here, but it's still slow (relative to Guile's fixed size
     *  hash tables).  Why?  Because (hash (key . value)) is
     *  significantly larger than just (key . value) and guile
     *  has an easier time gc'ing the smaller of the two.
     */
    SCM vector = SCM_CDR(hashtab);
    SCM *velts = SCM_VELTS(vector);
    long vec_len = SCM_LENGTH(vector);

    long hash = 0;
    long i;
    SCM value;
    short found_flag = 0;
    SCM *bucket_ptr;
    
    if (SCM_UNBNDP(not_here))
	not_here = SCM_BOOL_F;

    my_hasher(&hash, key, SCM_ARG2, "hashtab-del!");

    i = hash & (vec_len - 1);
    bucket_ptr = velts + i;

    my_scm_delete_x(hash, bucket_ptr, key, &value, &found_flag);

    if (! found_flag)
	return not_here;

    {
	SCM header = SCM_CAR(hashtab);
	SCM *header_elts = SCM_VELTS(header);
	
	SCM number_nonempty_buckets =
	    header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX];
	long inumber_nonempty_buckets = SCM_INUM(number_nonempty_buckets);

	SCM number_entries = header_elts[NUMBER_ENTRIES_INDEX];
	long inumber_entries = SCM_INUM(number_entries);

	SCM auto_shrink_flag = header_elts[AUTO_SHRINK_FLAG_INDEX];

	if (*bucket_ptr == SCM_EOL)
	{
	    --inumber_nonempty_buckets;
	    number_nonempty_buckets = SCM_MAKINUM(inumber_nonempty_buckets);
	    header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX] =
		number_nonempty_buckets;
	}
	--inumber_entries;
	number_entries = SCM_MAKINUM(inumber_entries);
	header_elts[NUMBER_ENTRIES_INDEX] = number_entries;

	if (auto_shrink_flag == SCM_BOOL_F)
	    return value;
	
	if (vec_len > DEFAULT_NUMBER_BUCKETS &&
	    2 * inumber_entries < inumber_nonempty_buckets
	    * MAX_MEAN_NONEMPTY_BUCKETS_SIZE)
	{
	    SCM new_hashtab = resize_hashtab(hashtab, -1);
	    SCM_SETCAR(hashtab, SCM_CAR(new_hashtab));
	    SCM_SETCDR(hashtab, SCM_CDR(new_hashtab));
	}
	return value;
    }
}

void
init_hashtab_type(void)
{
    init_mysymbols();
    scm_make_gsubr (s_make_hashtab, 0, 0, 0, make_hashtab);
    scm_make_gsubr (s_hashtab_ref, 2, 1, 0, hashtab_ref);
    scm_make_gsubr (s_hashtab_setx, 3, 0, 0, hashtab_setx);
    scm_make_gsubr (s_hashtab_delx, 2, 1, 0, hashtab_delx);
    scm_make_gsubr (s_hashtab_enable, 2, 0, 0, hashtab_enable);
    scm_make_gsubr (s_hashtab_disable, 2, 0, 0, hashtab_disable);
}