Index: Makefile.am
===================================================================
RCS file: /cvsroot/gift/OpenFT/src/Makefile.am,v
retrieving revision 1.4
diff -u -r1.4 Makefile.am
--- Makefile.am	26 Dec 2003 12:22:59 -0000	1.4
+++ Makefile.am	13 Jun 2004 23:15:53 -0000
@@ -32,6 +32,7 @@
 	ft_search_exec.c ft_search_exec.h \
 	ft_search_obj.c  ft_search_obj.h \
 	ft_netorg.c      ft_netorg.h \
+	ft_tokenize.c    ft_tokenize.h \
 	ft_transfer.c    ft_transfer.h \
 	ft_http.c        ft_http.h \
 	ft_http_client.c ft_http_client.h \
Index: ft_search_db.c
===================================================================
RCS file: /cvsroot/gift/OpenFT/src/ft_search_db.c,v
retrieving revision 1.108
diff -u -r1.108 ft_search_db.c
--- ft_search_db.c	14 Apr 2004 10:19:35 -0000	1.108
+++ ft_search_db.c	13 Jun 2004 23:15:57 -0000
@@ -26,6 +26,7 @@
 #include "md5.h"
 
 #include "ft_search_db.h"
+#include "ft_tokenize.h"
 
 /*****************************************************************************/
 
@@ -55,6 +56,26 @@
 /* #define SEARCH_DB_BENCHMARK */
 
 /*
+ * Primitive interactive querying when using the test suite.
+ */
+/* #define INTERACTIVE */
+
+/* 
+ * Whenever a host is removed, check the entire tokens index to ensure
+ * there are no stray tokens left from that host. Expensive, use with
+ * caution.
+ */
+/* #define PARANOID */
+
+/*
+ * Track duplicates (identical files shared by the same host)
+ * explicitly, and bail at the first hint of inconsistency. Beware:
+ * this can abort if a remote node requests removal of a file they
+ * never shared.
+ */
+/* #define CHECK_DUPS */
+
+/*
  * Sync all databases after changes have been made to ease debugging.  This
  * greatly hurts performance and should not be enabled for every day usage.
  */
@@ -67,16 +88,23 @@
  *
  * NOTE:
  *
- * It has been determined that this causes a great deal of problems in DB 4.1
- * and presumably 4.0.  Once you create a database with a custom hash
- * function, it seems you no longer have the ability to remove it, even if
- * you reset the hash function prior to calling DB->remove.  It appears as
- * though DB 3.x is unaffected, but I have no further information than that.
+ * It has been determined that this causes a great deal of problems in
+ * DB 4.1 and presumably 4.0.  Once you create a database with a
+ * custom hash function in a file containing multiple databases, it
+ * seems you no longer have the ability to remove it, even if you
+ * reset the hash function prior to calling DB->remove.  It appears as
+ * though DB 3.x is unaffected, but I have no further information than
+ * that.
  *
  * UPDATE:
  *
  * We have discovered a work-around to this problem that is documented in
  * direct_md5_hash.
+ *
+ * UPDATE 2:
+ *
+ * The workaround is currently not required since the share data was
+ * moved into a single database.
  */
 #define SEARCH_DB_DIRHASH
 
@@ -149,7 +177,12 @@
 {
 	FTSearchDB   *sdb;                 /* raw pointer into the database */
 	unsigned char md5[16];
-};
+	unsigned char order;
+}
+#ifdef __GNUC__
+__attribute ((packed)); /* avoid alignment padding */
+#endif
+;
 
 /*****************************************************************************/
 
@@ -162,6 +195,7 @@
 struct sharedata_data
 {
 	off_t          size;               /* file size */
+	u_int16_t      order;              /* offset of order list */
 	u_int16_t      path;               /* offset of path */
 	u_int16_t      mime;               /* offset of mime type */
 	u_int16_t      meta;               /* offset of meta data */
@@ -233,7 +267,7 @@
 static BOOL db_remove_host_timer (FTSearchDB *sdb);
 static BOOL db_remove_sharedata (FTSearchDB *sdb, unsigned char *md5);
 static BOOL db_remove_shareidx (FTSearchDB *sdb, unsigned char *md5);
-static Share *db_lookup_md5 (FTSearchDB *sdb, unsigned char *md5);
+static Share *db_lookup_md5 (FTSearchDB *sdb, unsigned char *md5, uint8_t **order);
 static BOOL db_close (FTSearchDB *sdb, BOOL rm);
 static BOOL db_sync (FTSearchDB *sdb);
 static BOOL db_abort (FTSearchDB *sdb);
@@ -249,6 +283,9 @@
 
 	node->session->search_db = sdb;
 	sdb->node = node;
+#ifdef CHECK_DUPS
+	sdb->dups = NULL;
+#endif
 
 	return sdb;
 }
@@ -725,6 +762,12 @@
 	           FT_CFG_SEARCH_MINPEERS, FT_CFG_SEARCH_MAXPEERS,
 	           FT_CFG_MAX_CHILDREN, FT_CFG_SEARCH_TTL, FT_CFG_SEARCH_RESULTS);
 
+#ifdef SEARCH_DB_BENCHMARK
+#ifndef OPENFT_TEST_SUITE
+	FT->DBGFN (FT, "BENCHMARKING ON!");
+#endif
+#endif
+
 	if ((ret = env_search->set_cachesize (env_search, c_gbytes, c_bytes, 0)))
 	{
 		ERR_DB("DB_ENV->set_cachesize", ret);
@@ -747,8 +790,10 @@
 		return FALSE;
 	}
 
+#ifndef SEARCH_DB_BENCHMARK
 	if (!(remove_queue = array_new (NULL)))
 		return FALSE;
+#endif
 
 	return TRUE;
 }
@@ -763,6 +808,26 @@
 	clean_db_path (envpath);
 }
 
+#ifdef CHECK_DUPS
+/* Free every element popped off sdb->dups, log the tally if non-zero, unset the array. */
+static void free_dups (FTSearchDB *sdb)
+{
+	void *item;
+	int   nfreed;
+
+	for (nfreed = 0; (item = array_pop (&sdb->dups)) != NULL; nfreed++)
+		free (item);
+
+	if (nfreed != 0)
+	{
+		FT->DBGFN (FT, "%s: %d duplicates freed",
+		           sdb->share_idx_name, nfreed);
+	}
+
+	array_unset (&sdb->dups);
+}
+#endif
+
 /*****************************************************************************/
 
 static u_int16_t serialize_fld (struct sharedata_data *datarec,
@@ -800,7 +865,7 @@
 }
 
 static int serialize_record (DBT *data, Hash *hash, uint32_t *tokens,
-                             Share *share)
+			     uint8_t *order, Share *share)
 {
 	static struct sharedata_data datarec;
 
@@ -810,8 +875,9 @@
 	datarec.size = share->size;
 
 	/* copy the variably sized fields */
-	datarec.path = serialize_fld (&datarec, share->path, STRLEN_0(share->path));
-	datarec.mime = serialize_fld (&datarec, share->mime, STRLEN_0(share->mime));
+	datarec.order = serialize_fld (&datarec, order, STRLEN_0(order));
+	datarec.path  = serialize_fld (&datarec, share->path, STRLEN_0(share->path));
+	datarec.mime  = serialize_fld (&datarec, share->mime, STRLEN_0(share->mime));
 
 	datarec.meta = datarec.data_len;
 	share_foreach_meta (share, DS_FOREACH(sharedata_meta), &datarec);
@@ -847,7 +913,7 @@
 }
 
 static Share *unserialize_record (FTSearchDB *sdb, unsigned char *md5,
-                                  struct sharedata_data *datarec)
+                                  struct sharedata_data *datarec, uint8_t **order)
 {
 	Share *share;
 
@@ -865,6 +931,9 @@
 	             datarec->data + datarec->meta,
 	             datarec->data_len - datarec->meta);
 
+	if (order)
+		*order = STRDUP (datarec->data + datarec->order);
+
 	return share;
 }
 
@@ -923,7 +992,7 @@
 }
 
 static BOOL db_insert_sharedata (FTSearchDB *sdb, Hash *hash,
-                                 uint32_t *tokens, Share *share)
+                                 uint32_t *tokens, uint8_t *order, Share *share)
 {
 	static struct sharedata_key keyrec;
 	DB *dbp;
@@ -949,18 +1018,20 @@
 	key.size = sizeof (keyrec);
 
 	/* get the complete serialized record */
-	if (!serialize_record (&data, hash, tokens, share))
+	if (!serialize_record (&data, hash, tokens, order, share))
 		return FALSE;
 
 	/* insert a unique entry, resulting in an error if data already exists at
 	 * this key */
 	if ((ret = dbp->put (dbp, NULL, &key, &data, DB_NOOVERWRITE)))
 	{
-		ERR_DB_SDB("DB->put", ret, sdb);
-
 		assert (ret == DB_KEYEXIST);
+#ifdef CHECK_DUPS
+		array_push (&sdb->dups, md5_dup (keyrec.md5));
+#else
 		FT->DBGFN (FT, "duplicate MD5 %s (%s)",
 		           md5_fmt (keyrec.md5), sdb->share_idx_name);
+#endif
 
 		return FALSE;
 	}
@@ -1014,6 +1085,7 @@
 	DBT  key;
 	DBT  data;
 	int  ret;
+	int  i;
 
 	if (!(dbp = db_tokenidx()))
 		return FALSE;
@@ -1037,11 +1109,12 @@
 
 	/* insert one record for each token in the stream, effectively
 	 * "pre-searching" for this file */
-	for (; tokens && *tokens; tokens++)
+	for (i=0; tokens && *tokens; i++, tokens++)
 	{
 		/* key.data points to &keyrec, so this is actually going to modify
 		 * the data libdb is seeing */
 		keyrec.token = *tokens;
+		datarec.order = i + ORDER_MIN;
 
 		if ((ret = dbcp->c_put (dbcp, &key, &data, DB_KEYFIRST)))
 		{
@@ -1060,6 +1133,7 @@
 {
 	Hash     *hash;
 	uint32_t *tokens;
+	uint8_t  *order;
 	BOOL      success;
 
 	/* make sure the master databases are open and ready to go */
@@ -1069,16 +1143,18 @@
 	if (!(hash = share_get_hash (share, "MD5")))
 		return FALSE;
 
-	if (!(tokens = ft_search_tokenizef (share)))
+	if (!(tokens = ft_tokenize_share (share, &order)))
 		return FALSE;
 
+	success = TRUE;
+
 	/*
 	 * Insert into the global primary and secondary databases, as well as the
 	 * host-specific primary database.  See ft_search_db.h for more details
 	 * on exactly how this stuff is designed, if you're willing to believe it
 	 * was designed at all :)
 	 */
-	if ((success = db_insert_sharedata (sdb, hash, tokens, share)))
+	if (db_insert_sharedata (sdb, hash, tokens, order, share))
 	{
 		success = db_insert_shareidx (sdb, hash, tokens);
 		assert (success == TRUE);
@@ -1107,6 +1183,7 @@
 	}
 
 	free (tokens);
+	free (order);
 
 	return success;
 }
@@ -1274,12 +1351,49 @@
 	if (!db_md5idx() || !db_tokenidx())
 		return FALSE;
 
+	FT->DBGFN(FT,"%s: single_remove %s", ft_node_fmt (sdb->node), md5_fmt(md5));
+
+#ifdef CHECK_DUPS
+	/* dup lookup first; yes, this is horribly inefficient,
+	 * and despite it probably being safe to check only if lookup
+	 * fails, I'm being cautious for now
+	 */
+	if (sdb->dups)
+	{
+		int i, len;
+		len = array_count (&sdb->dups);
+		for (i=0; i<len; i++)
+		{
+			unsigned char *dup;
+			dup = array_index (&sdb->dups, i);
+			assert (dup);
+			if (!memcmp (md5, dup, 16))
+			{
+				/* Now remove this dup, and don't touch the db */
+				free (dup);
+				array_splice (&sdb->dups, i, 1, NULL);
+
+#ifndef SEARCH_DB_BENCHMARK
+				FT->DBGFN (FT, "%s: removed duplicate %s", 
+					   sdb->share_idx_name, md5_fmt (md5));
+#endif
+
+				assert (len - array_count (&sdb->dups) == 1);
+
+				/* FIXME: stats */
+
+				return TRUE;
+			}
+		}
+	}
+#endif
+
 	/*
 	 * Grab the per-user data entry at the supplied key, which will contain
 	 * enough information to get the token list for removal from the
 	 * secondary database and the size for statistics purposes.
 	 */
-	if (!(share = db_lookup_md5 (sdb, md5)))
+	if (!(share = db_lookup_md5 (sdb, md5, NULL)))
 	{
 		FT->DBGFN (FT, "%s: unable to locate md5 %s for removal",
 		           ft_node_fmt (sdb->node), md5_fmt (md5));
@@ -1299,19 +1413,40 @@
 	 * drudging on as long as possible just to try to tidy up the database
 	 * as much as possible.  Perhaps this is unwise?
 	 */
-	failure  = ! db_remove_md5idx    (sdb, md5);
-	failure |= ! db_remove_sharedata (sdb, md5);
-	failure |= ! db_remove_shareidx  (sdb, md5);
+	if ((failure  = ! db_remove_md5idx    (sdb, md5))) 
+	{
+		FT->DBGFN (FT, "%s: remove_md5idx failed for '%s'", 
+			   ft_node_fmt (sdb->node), md5_fmt (md5));
+	}
+	if (! db_remove_sharedata (sdb, md5))
+	{
+		FT->DBGFN (FT, "%s: remove_sharedata failed for '%s'", 
+			   ft_node_fmt (sdb->node), md5_fmt (md5));
+		failure = TRUE;
+	}
+	if (! db_remove_shareidx  (sdb, md5))
+	{
+		FT->DBGFN (FT, "%s: remove_shareidx failed for '%s'", 
+			   ft_node_fmt (sdb->node), md5_fmt (md5));
+		failure = TRUE;
+	}
 
 	/* tokenize so that we know exactly what we're supposed to be removing
 	 * from the secondary database */
-	if (!(tokens = ft_search_tokenizef (share)))
-		failure = TRUE;
+
+	if (!(tokens = ft_tokenize_share (share, NULL)))
+		db_abort (sdb);
 	else
 	{
 		/* attempt to remove each token individually from the secondary
 		 * token index */
-		failure |= ! db_remove_tokenidx (sdb, tokens, md5);
+		if (! db_remove_tokenidx (sdb, tokens, md5))
+		{
+			FT->DBGFN (FT, "%s: remove_tokenidx failed for '%s'", 
+				   ft_node_fmt (sdb->node), md5_fmt (md5));
+			failure = TRUE;
+		}
+
 		free (tokens);
 	}
 
@@ -1322,6 +1457,60 @@
 	return !failure;
 }
 
+#ifdef PARANOID
+/* PARANOID: scan the whole token index and db_abort if a record still refers to sdb. */
+static void check_tokens (FTSearchDB *sdb)
+{
+	struct tokenidx_data *datarec;
+	DB  *dbp;
+	DBC *dbcp;
+	DBT  key, data;
+	int  ret;
+	int  count = 0;
+
+	if (!(dbp = db_tokenidx()))
+	{
+		/* stop here: dbp must not be dereferenced should db_abort return */
+		db_abort (sdb);
+		return;
+	}
+
+	FT->DBGFN (FT, "checking tokens idx after removing %p", (void *)sdb);
+
+	if ((ret = dbp->cursor (dbp, NULL, &dbcp, 0)) || !dbcp)
+	{
+		ERR_DB_SDB("DB->cursor", ret, sdb);
+		return;
+	}
+
+	memset (&key, 0, sizeof (key));
+	memset (&data, 0, sizeof (data));
+
+	while (!(ret = dbcp->c_get (dbcp, &key, &data, DB_NEXT)))
+	{
+		assert (key.size == sizeof (struct tokenidx_key));
+		assert (data.size == sizeof (*datarec));
+
+		datarec = data.data;
+		if (datarec->sdb == sdb)
+			db_abort (sdb);
+
+		count++;
+	}
+
+	if (ret != DB_NOTFOUND)
+	{
+		ERR_DB_SDB("DBcursor->c_get", ret, sdb);
+		db_abort (sdb);
+	}
+
+	if ((ret = dbcp->c_close (dbcp)))
+		ERR_DB_SDB("DBcursor->c_close", ret, sdb);
+
+	FT->DBGFN (FT, "all done (%d tokens checked)", count);
+}
+#endif
+
 /*****************************************************************************/
 
 static BOOL db_remove_host_init (FTSearchDB *sdb)
@@ -1365,6 +1554,14 @@
 	/* close and remove db_shareidx */
 	db_close (sdb, TRUE);
 
+#ifdef PARANOID
+	check_tokens (sdb);
+#endif
+
+#ifdef CHECK_DUPS
+	free_dups (sdb);
+#endif
+
 	/* clean up the search database handle allocated to the node structure */
 	search_db_free (sdb);
 }
@@ -1639,7 +1836,7 @@
 
 /*****************************************************************************/
 
-static Share *db_lookup_md5 (FTSearchDB *sdb, unsigned char *md5)
+static Share *db_lookup_md5 (FTSearchDB *sdb, unsigned char *md5, uint8_t **order)
 {
 	static struct sharedata_key   keyrec;
 	static struct sharedata_data *datarec;
@@ -1663,6 +1860,31 @@
 
 	if ((ret = dbp->get (dbp, NULL, &key, &data, 0)))
 	{
+		assert (ret == DB_NOTFOUND);
+
+#ifdef CHECK_DUPS
+		/* make sure we actually have some dups */
+		if (!sdb->dups)
+			db_abort (sdb);
+
+		{
+			int i, len;
+			len = array_count (&sdb->dups);
+			for (i=0; i<len; i++)
+			{
+				unsigned char *dup;
+				dup = array_index (&sdb->dups, i);
+				assert (dup);
+				if (!memcmp (md5, dup, 16))
+					break;
+			}
+			
+			/* abort if we didn't find it in the dups list either */
+			if (i == len)
+				db_abort (sdb);
+		}
+#endif
+
 		ERR_DB_SDB("DB->get", ret, sdb);
 		return NULL;
 	}
@@ -1676,7 +1898,7 @@
 	 * to be called to gaurantee we are gathering the exact token stream that
 	 * was created at insert.
 	 */
-	if (!(share = unserialize_record (sdb, md5, datarec)))
+	if (!(share = unserialize_record (sdb, md5, datarec, order)))
 		return NULL;
 
 	return share;
@@ -1749,7 +1971,7 @@
 	 * completely unserialize the original FileShare object inserted into
 	 * the database.
 	 */
-	if (!(share = db_lookup_md5 (sdb, md5)))
+	if (!(share = db_lookup_md5 (sdb, md5, NULL)))
 	{
 		FT->DBGFN (FT, "%s: unable to lookup md5 %s",
 		           ft_node_fmt (sdb->node), md5_fmt (md5));
@@ -1827,14 +2049,19 @@
 {
 	DBC      *cursor;
 	u_int32_t flags;
-};
 
-static int cleanup_matches (DBT *data, void *udata)
-{
-	free (data->data);
-	free (data);
-	return TRUE;
-}
+	/* the token this is searching for */
+	uint32_t  token;
+	
+	/* where we should store the order after lookup */ 
+	uint8_t   *optr;
+
+	/* whether this is an exclude token */
+	BOOL       exclude;
+
+	/* duplicate count */
+	db_recno_t  len;
+};
 
 static int cleanup_cursors (struct cursor_stream *s, void *udata)
 {
@@ -1843,15 +2070,14 @@
 	return TRUE;
 }
 
-static void token_cleanup (List *matches, List *cursors)
+static void token_cleanup (List *cursors)
 {
-	list_foreach_remove (matches, (ListForeachFunc)cleanup_matches, NULL);
 	list_foreach_remove (cursors, (ListForeachFunc)cleanup_cursors, NULL);
 }
 
 static DBC *get_cursor (DB *dbp, uint32_t token)
 {
-	static struct tokenidx_key keyrec;
+	struct tokenidx_key keyrec;
 	DBC *dbcp;
 	DBT  key;
 	DBT  data;
@@ -1879,13 +2105,19 @@
 	return dbcp;
 }
 
-static List *token_gather_cursors (DB *dbp, uint32_t *tokens)
+static List *token_gather_cursors (DB *dbp, uint32_t *qtokens,
+				   uint32_t *etokens, uint8_t *ordmap)
 {
 	uint32_t *t;
 	List     *cursors = NULL;
 	DBC      *dbcp;
+	uint8_t  *ordptr;
 
-	for (t = tokens; t && *t; t++)
+	/* ignore queries with just exclude tokens */
+	if (!qtokens || !*qtokens)
+		return NULL;
+
+	for (t = qtokens, ordptr = ordmap; t && *t; t++, ordptr++)
 	{
 		struct cursor_stream *s;
 
@@ -1893,39 +2125,45 @@
 		 * abort the search (and return 0 results) */
 		if (!(dbcp = get_cursor (dbp, *t)))
 		{
-			token_cleanup (NULL, cursors);
+			token_cleanup (cursors);
 			return NULL;
 		}
 
 		if (!(s = malloc (sizeof (struct cursor_stream))))
 			continue;
 
-		s->cursor = dbcp;
-		s->flags  = DB_CURRENT;
+		s->cursor  = dbcp;
+		s->flags   = DB_CURRENT;
+		s->token   = *t;
+		s->optr    = ordmap ? ordptr : NULL;
+		s->exclude = FALSE;
 
 		cursors = list_prepend (cursors, s);
 	}
 
-	return cursors;
-}
+	for (t = etokens; t && *t; t++)
+	{
+		struct cursor_stream *s;
 
-static void token_add_result (List **results, DBT *data)
-{
-	DBT *copy;
+		/* skip any missing tokens: we didn't want them
+		 * anyway */
+		if (!(dbcp = get_cursor (dbp, *t)))
+			continue;
 
-	/* make a complete allocated copy and append to the list */
-	if (!(copy = MALLOC (sizeof (DBT))))
-		return;
+		if (!(s = malloc (sizeof (struct cursor_stream))))
+			continue;
 
-	copy->size = data->size;
+		s->cursor  = dbcp;
+		s->flags   = DB_CURRENT;
+		s->token   = *t;
+		s->optr    = NULL;
+		s->exclude = TRUE;
 
-	if (!(copy->data = gift_memdup (data->data, data->size)))
-	{
-		free (copy);
-		return;
+		cursors = list_prepend (cursors, s);
 	}
 
-	*results = list_prepend (*results, copy);
+
+	return cursors;
 }
 
 /*
@@ -1941,8 +2179,12 @@
 	assert (data->size == sizeof (*datarec));
 	datarec = data->data;
 
+#ifndef SEARCH_DB_BENCHMARK
 	if (datarec->sdb->node == NULL)
 		assert (remove_active == TRUE);
+#else
+	assert (datarec->sdb->node != NULL);
+#endif
 
 	/* if node is NULL, stale is TRUE */
 	return BOOL_EXPR (datarec->sdb->node == NULL);
@@ -1950,8 +2192,8 @@
 
 static BOOL look_for (struct cursor_stream *s, DBT *data_cmp)
 {
-	static struct tokenidx_data *datarec_cmp;
 	static struct tokenidx_data *datarec;
+	static struct tokenidx_data *datarec_cmp;
 	DBT key;
 	DBT data;
 	int cmp;
@@ -1967,7 +2209,7 @@
 	 * while attempting to locate any possible token intersection according
 	 * to the md5sum (compare cmp_data vs data).
 	 */
-	for (; (s->cursor)->c_get (s->cursor, &key, &data, s->flags) == 0;
+	for (; s->flags && (s->cursor)->c_get (s->cursor, &key, &data, s->flags) == 0;
 	     s->flags = DB_NEXT_DUP)
 	{
 		/* ignore nodes currently being removed */
@@ -1997,17 +2239,25 @@
 		/* matched, note that we will not reset flags as this exact position
 		 * will be passed by the parent cursor as well */
 		if (cmp == 0)
+		{
+			/* fill in the order while we have the
+			 * tokenidx_data record available */
+			if (s->optr)
+				(*s->optr) = ((struct tokenidx_data *)
+					      (data.data))->order;
+
 			return TRUE;
+		}
 	}
 
-	/* this set has exhausted, no more data left...we should really set
-	 * some special cursor flag so that we stop searching this stream. */
+	s->flags = 0;
+
+	/* this set has exhausted, no more data left */
 	return FALSE;
 }
 
-static void calc_shortest (struct cursor_stream *s, void **args)
+static void calc_length (struct cursor_stream *s, void *udata)
 {
-	db_recno_t *count = args[0];
 	db_recno_t  scnt;
 	int         ret;
 
@@ -2021,47 +2271,79 @@
 		return;
 	}
 
-	/*
-	 * This cursor's length is shorter than the last known cursor stream, so
-	 * we should reset the smallest length and current "located" stream
-	 * value on the args data.
-	 */
-	if (*count == 0 || scnt < *count)
-	{
-		*count = scnt;
-		args[1] = s;
-	}
+	s->len = scnt;
+}
+
+/*
+ * Comparator handed to list_sort: returns 1 when a has more duplicates
+ * than b, -1 when it has fewer, and 0 when the counts are equal.
+ */
+static int compare_length (struct cursor_stream *a, struct cursor_stream *b)
+{
+	db_recno_t la = a->len, lb = b->len;
+
+	return (la > lb) - (la < lb);
+}
 
 static struct cursor_stream *get_start_cursor (List **qt)
 {
 	struct cursor_stream *s;
 	List      *link;
-	void      *args[2];
-	db_recno_t count = 0;
-
-	args[0] = &count;
-	args[1] = NULL;
 
 	/*
 	 * Loop through all cursor streams in order to calculate the shortest
-	 * cursor (in terms of number of duplicates).  See below (match_qt) for an
+	 * cursor (in terms of number of duplicates).  See below (match_tokens) for an
 	 * explanation of why we do this.  Note that if we only have one
 	 * element in this list we can assume it is the shortest and skip the
 	 * cursor count retrieval.
 	 */
 	if (list_next (*qt))
-		list_foreach (*qt, (ListForeachFunc)calc_shortest, args);
+	{
+		List *ptr;
+		uint32_t last_token = 0;
+		
+		list_foreach (*qt, (ListForeachFunc)calc_length, NULL);
 
-	/*
-	 * If args[1] is non-NULL, we have located an appropriate node, remove from
-	 * the cursor list and return the beginning cursor stream.  If args[1]
-	 * is NULL, we should simply pop off the 0th element and return it.
-	 */
-	if (args[1])
-		link = list_find (*qt, args[1]);
+		/*
+		 * We sort separately rather than doing sorted inserts
+		 * so that we can avoid counting if some of the tokens
+		 * don't exist.
+		 */
+		*qt = list_sort (*qt, (CompareFunc)compare_length);
+
+		/*
+		 * Bail out if two adjacent streams carry the same token,
+		 * i.e. it is both queried and excluded (each list was
+		 * uniq'd on its own).  NOTE(review): equal tokens share a
+		 * length, but sorting by length alone need not place
+		 * them next to each other when other streams tie.
+		 */
+		for (ptr = *qt; ptr; ptr = list_next (ptr))
+		{
+			s = ptr->data;
+			
+			if (s->token == last_token)
+				return NULL;
+			
+			last_token = s->token;
+		}
+
+		/* loop until we find a query token */
+		for (ptr = *qt; ptr; ptr = list_next (ptr))
+		{
+			s = ptr->data;
+
+			if (s->exclude == FALSE)
+				break;
+		}
+
+		link = ptr;
+	}
 	else
-		link = list_nth (*qt, 0);
+		link = *qt;
+
+	/* remove from the cursor list and return the beginning cursor
+	 * stream */
 
 	if (!link)
 		return NULL;
@@ -2069,12 +2351,67 @@
 	/* we need to assign this before we remove the link as it will be
 	 * freed by removal */
 	s = link->data;
+
 	*qt = list_remove_link (*qt, link);
 
 	return s;
 }
 
-static int match_qt (List **results, List **qt, int max_results)
+/*
+ * We keep a list of the ordering of tokens in the query. This is
+ * mapped to the actual values of the "order" fields in tokenidx_data
+ * for each token, which give the token numbers used when this result
+ * was added (and thus those used in the order list in
+ * sharedata_data). Simple string comparison is then used to check
+ * that each search result contains the mapped ordering(s) in the
+ * share data. The order list for each share is a temporary copy, so,
+ * after matching, we overwrite the matched orders with separators so
+ * they can only match once.
+ *
+ * Returns FALSE if a phrase is missing or the check cannot run (no
+ * ordmap, allocation failure); TRUE otherwise.
+ */
+static BOOL check_order (uint8_t *share_order, uint8_t *order, uint8_t *ordmap)
+{
+	uint8_t *newmap, *ptr, *sptr;
+	int ordlen, i;
+	BOOL ret = TRUE;
+
+	if (order == NULL || share_order == NULL)
+		return TRUE;
+
+	ordlen = strlen ((char *)order);
+
+	/* without the map or the scratch buffer nothing can be verified */
+	if (ordmap == NULL || !(newmap = MALLOC (ordlen + 1)))
+		return FALSE;
+
+	for (i = 0, ptr = newmap; i <= ordlen; i++)
+	{
+		if (order[i] > ORDER_SEP)
+			*(ptr++) = ordmap[order[i] - ORDER_MIN];
+		else
+		{
+			/* we have a full phrase to check */
+			*ptr = 0;
+			sptr = (uint8_t *)strstr ((char *)share_order, (char *)newmap);
+			if (sptr == NULL)
+			{
+				ret = FALSE;
+				break;
+			}
+			/* erase it so it won't match next time */
+			memset (sptr, ORDER_SEP, ptr - newmap);
+			ptr = newmap;
+		}
+	}
+
+	free (newmap);
+	return ret;
+}
+
+static int match_tokens (Array **results, List **qt, int max_results,
+		     uint8_t *order, uint8_t *ordmap)
 {
 	struct cursor_stream *s;
 	List      *ptr;
@@ -2115,6 +2452,9 @@
 
 		lost = FALSE;
 
+		if (s->optr)
+			(*s->optr) = ((struct tokenidx_data *)(data.data))->order;
+
 		/*
 		 * Walk along all the other tokens looking for an intersection.  Note
 		 * that this code holds the last position of the cursor so that we
@@ -2122,7 +2462,8 @@
 		 */
 		for (ptr = *qt; ptr; ptr = list_next (ptr))
 		{
-			if (!look_for (ptr->data, &data))
+			if (look_for (ptr->data, &data) ==
+			    ((struct cursor_stream *)(ptr->data))->exclude)
 			{
 				lost = TRUE;
 				break;
@@ -2135,11 +2476,36 @@
 		 */
 		if (lost == FALSE)
 		{
-			token_add_result (results, &data);
+			struct tokenidx_data *datarec = data.data;
+			uint8_t *share_order;
+			Share *share;
+			
+			/* grab the Share and order list */
+			if (!(share = db_lookup_md5 (datarec->sdb, datarec->md5,
+						     &share_order)))
+			{
+				FT->DBGFN (FT, "%s: unable to lookup md5 %s",
+					   ft_node_fmt (datarec->sdb->node),
+					   md5_fmt (datarec->md5));
+				continue;
+			}
 
-			/* make sure we cap the size of the results
-			 * TODO: this is a major bug here!  we cant cap the size of
-			 * the results until after we apply the exclusion set! */
+			/* do some more verification first */
+			if (order && check_order (share_order, order, ordmap) == FALSE)
+			{
+				ft_share_unref (share);
+				free (share_order);
+
+				continue;
+			}
+			
+			free (share_order);
+
+			/* and add it */
+			if (max_results)
+				array_push (results, share);
+
+			/* make sure we cap the size of the results */
 			matches++;
 
 			if (max_results && matches >= max_results)
@@ -2156,66 +2522,6 @@
 	return matches;
 }
 
-static int match_et (List **results, List **et, int max_results)
-{
-	if (!(*results) || !(*et))
-		return 0;
-
-	return 0;
-}
-
-static List *token_lookup_match (List *qt, List *et, int max_results)
-{
-	List *results = NULL;
-
-	match_qt (&results, &qt, max_results);
-	match_et (&results, &et, max_results);
-
-	token_cleanup (NULL, qt);
-	token_cleanup (NULL, et);
-
-	return results;
-}
-
-static int lookup_ret (DBT *dbt, void **args)
-{
-	Array **a           = args[0];
-	int    *max_results = args[1];
-	int    *matches     = args[2];
-	static struct tokenidx_data *datarec;
-
-	/* do not process more results than we were allowed */
-	if (*max_results && *matches >= *max_results)
-		return TRUE;
-
-	assert (dbt->size == sizeof (struct tokenidx_data));
-	datarec = dbt->data;
-
-	/*
-	 * WARNING/TODO: We do not match realm here, and we REALLY NEED TO.
-	 * Realm matches need to use a partial database lookup before the result
-	 * is fully selected and added to the list.
-	 */
-	if (add_search_result (a, datarec->sdb, datarec->md5))
-		(*matches)++;
-
-	cleanup_matches (dbt, NULL);
-
-	return TRUE;
-}
-
-static int token_lookup_ret (Array **a, List *cursors, char *realm,
-                             int max_results)
-{
-	int nmatches = 0;
-	void *args[] = { a, &max_results, &nmatches, realm };
-
-	cursors = list_foreach_remove (cursors, (ListForeachFunc)lookup_ret, args);
-	list_free (cursors);
-
-	return nmatches;
-}
-
 /*
  * Perform a search through the query and exclude token sets.  This adds a
  * huge level of complexity to the search algorithm, and uses a specialized
@@ -2231,42 +2537,35 @@
  */
 static int db_search_tokens (Array **a, char *realm,
                              uint32_t *query, uint32_t *exclude,
-                             int max_results)
+			     uint8_t *order, int max_results)
 {
 	DB   *dbp;
-    List *qt_cursors = NULL;
-	List *et_cursors = NULL;
 	List *cursors = NULL;
 	int   results = 0;
+	uint8_t *ordmap = NULL;
 
 	if (!(dbp = db_tokenidx()))
 		return 0;
 
+	/* map the tokens list to their ordering in the original
+	 * record */
+	if (order)
+	{
+		int len = get_tokens_len (query);
+
+		ordmap = CALLOC (len, 1);
+	}
+
 	/* construct a list of all positioned cursors, effectively retrieving a
 	 * list of token result streams */
-	qt_cursors = token_gather_cursors (dbp, query);
-	et_cursors = token_gather_cursors (dbp, exclude);
+	cursors = token_gather_cursors (dbp, query, exclude, ordmap);
 
-	/*
-	 * Find the list of cursors which successfully matched this query by
-	 * first identifying the intersection of all cursors within qt_cursors,
-	 * and then excluding all matches from et_cursor.  Returns a newly
-	 * allocated list containing all share host ip addresses and MD5s that
-	 * matched the search.
-	 *
-	 * NOTE:
-	 * The cursors list result is not in the same "format" as qt_cursors,
-	 * the data held within is completely different.
-	 */
-	cursors = token_lookup_match (qt_cursors, et_cursors, max_results);
+	results = match_tokens (a, &cursors, max_results,
+				      order, ordmap);
 
-	/*
-	 * Add all results to the main result list, after unserialization
-	 * occurs.  This logic also handles cleanup of all non-returned data
-	 * held within the cursors list.
-	 */
-	if (cursors)
-		results = token_lookup_ret (a, cursors, realm, max_results);
+	token_cleanup (cursors);
+	
+	free (ordmap);
 
 	return results;
 }
@@ -2315,6 +2614,8 @@
 		env_search_path = NULL;
 	}
 #endif /* USE_LIBDB */
+
+	db_initialized = FALSE;
 }
 
 /*****************************************************************************/
@@ -2360,6 +2661,11 @@
 	{
 		FT_SEARCH_DB(node)->shares--;
 		FT_SEARCH_DB(node)->size -= ((float)size / 1024.0) / 1024.0;
+		FT->DBGFN (FT, "%s: removed '%s' (%d, %d left)", ft_node_fmt(node), md5_fmt(md5), size, FT_SEARCH_DB(node)->shares);
+	}
+	else
+	{
+		FT->DBGFN (FT, "%s: '%s' removal failed", ft_node_fmt(node), md5_fmt(md5));
 	}
 #endif /* USE_LIBDB */
 
@@ -2383,6 +2689,11 @@
 	if (!sdb)
 		return TRUE;
 
+#ifndef SEARCH_DB_BENCHMARK
+	if (openft->shutdown)
+		return TRUE;
+#endif
+
 	/*
 	 * Schedule the removal and leave.  We cannot block for the entire
 	 * duration of this operation, but we can disassociate the FTSearchDB
@@ -2468,7 +2779,7 @@
 		return NULL;
 
 #ifdef USE_LIBDB
-	ret = db_lookup_md5 (FT_SEARCH_DB(node), md5);
+	ret = db_lookup_md5 (FT_SEARCH_DB(node), md5, NULL);
 #endif /* USE_LIBDB */
 
 	return ret;
@@ -2490,7 +2801,7 @@
 
 int ft_search_db_tokens (Array **a, char *realm,
                          uint32_t *query, uint32_t *exclude,
-                         int max_results)
+			 uint8_t *order, int max_results)
 {
 	int results = 0;
 
@@ -2498,7 +2809,7 @@
 		return results;
 
 #ifdef USE_LIBDB
-	results = db_search_tokens (a, realm, query, exclude, max_results);
+	results = db_search_tokens (a, realm, query, exclude, order, max_results);
 #endif /* USE_LIBDB */
 
 	return results;
@@ -2540,8 +2851,8 @@
 static int nodes = 0;
 static int files = 0;
 
-static int minnodes = 100;
-static int maxnodes = 500;
+static int minnodes = 150;
+static int maxnodes = 150;
 
 static int minqueries =  5000;
 static int maxqueries = 10000;
@@ -2603,10 +2914,16 @@
 		Share         *record;
 		unsigned char *md5 = key.data;
 
-		record = unserialize_record (FT_SEARCH_DB(node), md5, data.data);
+		record = unserialize_record (FT_SEARCH_DB(node), md5, data.data, NULL);
 		assert (record != NULL);
 
 		array_push (files, record);
+
+#if 0
+		/* make some duplicates for good measure */
+		if (rand() > RAND_MAX/100)
+			array_push (files, record);
+#endif
 		nfiles++;
 	}
 
@@ -2641,7 +2958,7 @@
 	}
 
 	nodes = stats->bt_ndata;
-	FT->dbg (FT, "%s: contains %d nodes\n", dbfile, nodes);
+	FT->dbg (FT, "%s: contains %d nodes", dbfile, nodes);
 
 	if (nodes > maxnodes)
 		nodes = maxnodes;
@@ -2718,7 +3035,7 @@
 
 		bm_close_db (hostdb);
 
-		if (i > maxnodes)
+		if (i >= maxnodes)
 			break;
 	}
 
@@ -2772,6 +3089,39 @@
 	return n;
 }
 
+/* Free each string shifted off the queries array, then unset the array. */
+static void free_queries (void)
+{
+	char *q;
+
+	/* the first NULL from array_shift ends the loop */
+	for (q = array_shift (&queries); q != NULL; q = array_shift (&queries))
+		free (q);
+
+	array_unset (&queries);
+}
+
+/* Release every node in nodelist along with its shares, then nodelist itself. */
+static void free_test_data (void)
+{
+	int idx;
+
+	for (idx = 0; idx < nodes; idx++)
+	{
+		FTNode *owner  = nodelist[idx].node;
+		Array  *shares = nodelist[idx].files;
+		Share  *sh;
+
+		while ((sh = array_shift (&shares)) != NULL)
+			ft_share_unref (sh);
+		array_unset (&shares);
+		ft_node_free (owner);
+	}
+
+	free (nodelist);
+}
+
+
 static double run_insert (void)
 {
 	StopWatch *gsw;
@@ -2794,6 +3144,10 @@
 			Share *share = array_index (&hl->files, j);
 			assert (share != NULL);
 
+#if 0
+			FT->dbg (FT, "inserting file %s",
+				 md5_fmt (share_get_hash (share,"MD5")->data));
+#endif
 			if (!(ft_search_db_insert (hl->node, share)))
 			{
 				FT->err (FT, "%s(%s): error inserting file",
@@ -2821,25 +3175,56 @@
 	gsw = stopwatch_new (TRUE);
 	assert (gsw != NULL);
 
+#ifndef INTERACTIVE
 	for (i = 0; i < nqueries; i++)
 	{
 		char *query = array_index (&queries, i);
+		char *exclude = "";
+#else
+	char query[100];
+	char exclude[100];
+
+	db_sync (NULL);
+
+	/* fixed size buffers *and* gets()?! 
+	 * hey, this is only for testing... */
+	while (gets (query))
+	{
+		StopWatch *sw;
+
+#endif
 		uint32_t *qtokens;
-		uint32_t etokens = 0;
+		uint32_t *etokens;
+		uint8_t *order;
 		Array *matches = NULL;
 		int hits;
 		int j;
-
-		qtokens = ft_search_tokenize (query);
+		
+#ifdef INTERACTIVE
+		gets (exclude);
+		sw = stopwatch_new (TRUE);
+#endif
+		qtokens = ft_tokenize_query (query, &order);
 		assert (qtokens != NULL);
+		etokens = ft_tokenize_query (exclude, NULL);
 
-		hits = ft_search_db_tokens (&matches, NULL, qtokens, &etokens, 100000);
+		hits = ft_search_db_tokens (&matches, NULL, qtokens, etokens,
+					    order, 100000);
 		free (qtokens);
+		free (etokens);
+		free (order);
 
+#ifdef INTERACTIVE
+		printf("'%s' (-'%s'): %d hits, %.06f elapsed\n", query, exclude, hits, stopwatch_free_elapsed (sw));
+#endif
 		for (j = 0; j < hits; j++)
 		{
 			Share *share = array_index (&matches, j);
-			share_unref (share);
+#ifdef INTERACTIVE
+			if (hits < 30)
+				printf ("%s\n", share->path);
+#endif
+			ft_share_unref (share);
 		}
 
 		array_unset (&matches);
@@ -2850,7 +3235,7 @@
 	return stime;
 }
 
-static double run_remove (void)
+static double run_remove (BOOL remove_singly)
 {
 	StopWatch *gsw;
 	double rtime;
@@ -2868,20 +3253,77 @@
 		assert (sw != NULL);
 #endif
 
-		if (!(ft_search_db_remove_host (node)))
+		if (remove_singly)
 		{
-			FT->err (FT, "error removing node %s", ft_node_fmt (node));
-			abort ();
+			int j;
+			Array *f=nodelist[i].files;
+			int flen=array_count(&f);
+
+			ft_search_db_open (node);
+
+			for(j=0;j<flen/2;j++) {
+				Share *file=array_splice (&f, j, 0, NULL);
+				Hash *hash=share_get_hash (file, "MD5");
+				if (hash) {
+					FT->dbg (FT,"removing file %s (%s)", file->path, ft_node_fmt(node));
+
+					if (!ft_search_db_remove(node,hash->data))
+						FT->err (FT,"error removing file %s (%s)", file->path, ft_node_fmt(node));
+				} else 
+					FT->err (FT, "error reading file array");
+			}
+			
+			ft_search_db_close (node, FALSE);
+
+			{
+				DB_BTREE_STAT *stats;
+				DB *db=FT_SEARCH_DB(node)->share_idx;
+				if (db)
+				{
+					if (!db->stat(db, &stats, 0))
+						assert (stats->bt_ndata == 0);
+					else
+					{
+						FT->err(FT, "failed to get sdb stats");
+						abort ();
+					}
+				}
+			}
+		}
+
+		if (1)
+		{
+			if (!(ft_search_db_remove_host (node)))
+			{
+				FT->err (FT, "error removing node %s", ft_node_fmt (node));
+				abort ();
+			}
 		}
 
 #if 0
 		FT->dbg (FT, "delete %s(%lu): %.06fs elapsed", ft_node_fmt (node),
 				 shares, stopwatch_free_elapsed (sw));
 #endif
-
 		ft_search_db_sync (node);
 	}
 
+	{
+		/* hosts were removed above; the token index is expected to hold no data */
+		DB_BTREE_STAT *stats;
+		DB *db=db_tokenidx();
+		assert (db);
+		if (!db->stat(db, &stats, 0))
+		{
+			FT->dbg (FT, "tokenidx: %u key, %u data", (unsigned int)stats->bt_nkeys, (unsigned int)stats->bt_ndata);
+			assert (stats->bt_ndata == 0);
+		}
+		else
+		{
+			FT->err(FT, "failed to get sdb stats");
+			abort ();
+		}
+	}
+	
 	rtime = stopwatch_free_elapsed (gsw);
 
 	return rtime;
@@ -2903,13 +3345,20 @@
 	         nqueries, stime, (float)nqueries / stime);
 
 	/* remove */
-	rtime = run_remove ();
+	rtime = run_remove (FALSE);
 	FT->dbg (FT, "remove(%lu): %.06fs elapsed (avg %.02f files/s)",
 	         files, rtime, (float)files / rtime);
+
+	db_sync (NULL);
 }
 
 BOOL test_suite_search_db (Protocol *p)
 {
+#ifndef SEARCH_DB_BENCHMARK
+	FT->err (FT, "benchmarking requested but disabled");
+	return FALSE;
+
+#else
 	int ret;
 
 	if ((ret = load_test_data ("test.data")) <= 0)
@@ -2924,10 +3373,14 @@
 		return FALSE;
 	}
 
-	ft_search_db_init ("benchtemp", 209715200);
+	ft_search_db_init ("benchtemp", 20971520);
 	test_benchmarks ();
 
+	free_test_data ();
+	free_queries ();
+
 	return TRUE;
+#endif
 }
 
 #endif /* OPENFT_TEST_SUITE */
Index: ft_search_db.h
===================================================================
RCS file: /cvsroot/gift/OpenFT/src/ft_search_db.h,v
retrieving revision 1.21
diff -u -r1.21 ft_search_db.h
--- ft_search_db.h	26 Oct 2003 13:15:43 -0000	1.21
+++ ft_search_db.h	13 Jun 2004 23:15:57 -0000
@@ -111,6 +111,7 @@
 
 	unsigned long shares;              /**< total files currently shared */
 	double        size;                /**< total size (MB) */
+	Array   *dups;                     /**< duplicated hashes */
 } FTSearchDB;
 
 /* shorthand */
@@ -239,7 +240,7 @@
  * queried and the search is much more complex and expensive.
  */
 int ft_search_db_tokens (Array **a, char *realm, uint32_t *query,
-						 uint32_t *exclude, int max_results);
+			 uint32_t *exclude, uint8_t *order, int max_results);
 
 /*****************************************************************************/
 
Index: ft_search_exec.c
===================================================================
RCS file: /cvsroot/gift/OpenFT/src/ft_search_exec.c,v
retrieving revision 1.58
diff -u -r1.58 ft_search_exec.c
--- ft_search_exec.c	7 Jan 2004 12:06:42 -0000	1.58
+++ ft_search_exec.c	13 Jun 2004 23:15:58 -0000
@@ -35,22 +35,7 @@
 
 #include "ft_search_exec.h"
 
-/*****************************************************************************/
-
-/*
- * Defines a special set of characters which will be skipped when tokenizing
- * each individual word.  For example, "foo!bar" would produce the same
- * token as "foobar".
- */
-#define SEARCH_TOKEN_PUNCT ",`'!?*"
-
-/*
- * Defines the set of characters which will be used a delimiters between
- * individual token words.  In addition to this set, numbers are treated
- * as special delimiters through a second string scan.  For example,
- * "s03e21" will produce the token word "s03e21", "3", and "21".
- */
-#define SEARCH_TOKEN_DELIM "\\/ _-.[]()\t"
+#include "ft_tokenize.h"
 
 /*****************************************************************************/
 
@@ -93,6 +78,7 @@
 	char             *f_exclude;       /* exclude string */
 	uint32_t         *f_qtokens;       /* query tokens list */
 	uint32_t         *f_etokens;       /* exclude tokens list */
+	uint8_t          *f_order;
 	char             *f_realm;         /* optional realm to filter by */
 
 	/**
@@ -201,268 +187,6 @@
 
 /*****************************************************************************/
 
-static BOOL is_token_punct (int c)
-{
-	const char *ptr;
-
-	/* TODO: Lots of room for optimization here */
-	for (ptr = SEARCH_TOKEN_PUNCT; *ptr != '\0'; ptr++)
-	{
-		if (*ptr == c)
-			return TRUE;
-	}
-
-	return FALSE;
-}
-
-static int next_letter (const char **strref, size_t *lenref)
-{
-	const char *str = *strref;
-	size_t len = *lenref;
-	int c;
-
-	if (len == 0)
-		return 0;
-
-	/* skip any punctuation characters while scanning the word so that we
-	 * don't need to actually modify the word */
-	while (is_token_punct (*str) == TRUE)
-	{
-		if (len == 1)
-			return 0;
-
-		str++;
-		len--;
-	}
-
-	c = tolower (*str);
-	assert (c != '\0');
-
-	*strref = str + 1;
-	*lenref = len - 1;
-
-	return c;
-}
-
-/*
- * I believe this came from an old version of GLib or something.  Verify
- * later and add the appropriate credit.
- */
-static uint32_t make_token (const char *word, size_t len)
-{
-	uint32_t hash = 0;
-	int letter;
-
-	if (word == NULL)
-		return 0;
-
-	if ((letter = next_letter (&word, &len)) == 0)
-		return 0;
-
-	hash = letter;
-
-	while ((letter = next_letter (&word, &len)) != 0)
-		hash = (hash << 5) - hash + letter;
-
-	return hash;
-}
-
-/*****************************************************************************/
-
-struct token_list
-{
-	uint32_t *tokens;
-	size_t    nmemb;
-	size_t    size;
-};
-
-static void tlist_init (struct token_list *tlist)
-{
-	tlist->tokens = NULL;
-	tlist->nmemb = 0;
-	tlist->size = 0;
-}
-
-static BOOL tlist_resize_min (struct token_list *tlist, size_t nmemb)
-{
-	uint32_t *newalloc;
-	size_t size;
-
-	if (tlist->size >= nmemb)
-		return TRUE;
-
-	if ((size = tlist->size) == 0)
-		size = 1;
-
-	while (size < nmemb)
-		size *= 2;
-
-	if (!(newalloc = realloc (tlist->tokens, size * sizeof (uint32_t))))
-		return FALSE;
-
-	tlist->tokens = newalloc;
-	tlist->size = size;
-
-	return TRUE;
-}
-
-static BOOL tlist_add (struct token_list *tlist, uint32_t token)
-{
-	if (tlist_resize_min (tlist, tlist->nmemb + 1) == FALSE)
-		return FALSE;
-
-	tlist->tokens[tlist->nmemb++] = token;
-
-	return TRUE;
-}
-
-static BOOL tlist_addword (struct token_list *tlist,
-                           const char *word, size_t wordlen)
-{
-	uint32_t token;
-
-	if ((token = make_token (word, wordlen)) > 0)
-		return tlist_add (tlist, token);
-
-	return FALSE;
-}
-
-static void add_numbers (struct token_list *tlist, const char *str)
-{
-	char *ptr;
-	size_t numlen;
-
-	/* implicitly scan past leading 0's as they are found only for padding,
-	 * not to represent octal numbers */
-	while ((ptr = strpbrk (str, "123456789")))
-	{
-		numlen = strspn (ptr, "0123456789");
-		assert (numlen > 0);
-
-		tlist_addword (tlist, ptr, numlen);
-
-		str = ptr + numlen;
-	}
-}
-
-static void add_words (struct token_list *tlist, const char *str)
-{
-	size_t wordlen;
-
-	while (1)
-	{
-		if ((wordlen = strcspn (str, SEARCH_TOKEN_DELIM)) > 0)
-			tlist_addword (tlist, str, wordlen);
-
-		if (str[wordlen] == '\0')
-			break;
-
-		str += wordlen + 1;
-	}
-}
-
-static void tlist_addstr (struct token_list *tlist, const char *str)
-{
-	/* lets us be lazy in the usage */
-	if (str == NULL)
-		return;
-
-	add_numbers (tlist, str);
-	add_words (tlist, str);
-}
-
-static int token_cmp (const void *a, const void *b)
-{
-	return INTCMP (*(uint32_t *)a, *(uint32_t *)b);
-}
-
-static void sort_and_uniq (struct token_list *tlist)
-{
-	size_t i;
-	size_t nmemb = 0;
-	uint32_t lasttoken = 0;
-
-	if (tlist->nmemb == 0)
-		return;
-
-	/* sort */
-	qsort (tlist->tokens, tlist->nmemb, sizeof (uint32_t), token_cmp);
-
-	/* ... and uniq */
-	for (i = 0; i < tlist->nmemb; i++)
-	{
-		if (lasttoken > 0)
-		{
-			/* skip duplicates */
-			if (tlist->tokens[i] == lasttoken)
-				continue;
-		}
-
-		lasttoken = tlist->tokens[i];
-		assert (lasttoken != 0);
-
-		/*
-		 * Only update the token list position if we have actually detected a
-		 * duplicate and the number of elements in the new list will differ
-		 * from the old list.
-		 */
-		if (nmemb != i)
-			tlist->tokens[nmemb] = lasttoken;
-
-		nmemb++;
-	}
-
-	tlist->nmemb = nmemb;
-}
-
-static uint32_t *tlist_finish (struct token_list *tlist)
-{
-	/* sort the token list, then remove duplicates (by way of rewinding the
-	 * stream) */
-	sort_and_uniq (tlist);
-
-	/* add the sentinel (token=0) */
-	tlist_add (tlist, 0);
-
-	return tlist->tokens;
-}
-
-/*****************************************************************************/
-
-uint32_t *ft_search_tokenize (const char *string)
-{
-	struct token_list tlist;
-
-	if (string == NULL)
-		return NULL;
-
-	tlist_init (&tlist);
-	tlist_addstr (&tlist, string);
-
-	return tlist_finish (&tlist);
-}
-
-uint32_t *ft_search_tokenizef (Share *file)
-{
-	struct token_list tlist;
-
-	if (file == NULL)
-		return NULL;
-
-	tlist_init (&tlist);
-
-	tlist_addstr (&tlist, SHARE_DATA(file)->path);
-	tlist_addstr (&tlist, share_get_meta (file, "tracknumber"));
-	tlist_addstr (&tlist, share_get_meta (file, "artist"));
-	tlist_addstr (&tlist, share_get_meta (file, "album"));
-	tlist_addstr (&tlist, share_get_meta (file, "title"));
-	tlist_addstr (&tlist, share_get_meta (file, "genre"));
-
-	return tlist_finish (&tlist);
-}
-
-/*****************************************************************************/
-
 static int fill_sdata (SearchData *sdata, int nmax,
                        FTSearchResultFn resultfn, void *udata,
                        ft_search_flags_t type, const char *realm,
@@ -484,6 +208,7 @@
 		{
 			uint32_t *qtokens;
 			uint32_t *etokens;
+			uint8_t *order;
 
 			/* hidden searches are pretokenized */
 			if (sdata->type & FT_SEARCH_HIDDEN)
@@ -493,13 +218,14 @@
 
 				query   = NULL;
 				exclude = NULL;
+				order = NULL;
 			}
 			else
 			{
-				if (!(qtokens = ft_search_tokenize (query)))
+				if (!(qtokens = ft_tokenize_query (query, &order)))
 					return FALSE;
 
-				etokens = ft_search_tokenize (exclude);
+				etokens = ft_tokenize_query (exclude, NULL);
 			}
 
 			sdata->sfunc     = (FTSearchFunc)cmp_filename;
@@ -507,6 +233,7 @@
 			sdata->f_exclude = exclude;
 			sdata->f_qtokens = qtokens;
 			sdata->f_etokens = etokens;
+			sdata->f_order   = order;
 			sdata->f_realm   = (char *)realm;
 		}
 		break;
@@ -550,6 +277,7 @@
 		{
 			free (sdata->f_qtokens);
 			free (sdata->f_etokens);
+			free (sdata->f_order);
 		}
 	}
 	else if (FT_SEARCH_METHOD(sdata->type) == FT_SEARCH_MD5)
@@ -634,7 +362,7 @@
 	 case FT_SEARCH_FILENAME:
 		hits = ft_search_db_tokens (&matches, sdata->f_realm,
 		                            sdata->f_qtokens, sdata->f_etokens,
-		                            max_hits);
+					    sdata->f_order, max_hits);
 		break;
 	 default:
 		abort ();                      /* shouldnt happen */
Index: ft_search_obj.c
===================================================================
RCS file: /cvsroot/gift/OpenFT/src/ft_search_obj.c,v
retrieving revision 1.16
diff -u -r1.16 ft_search_obj.c
--- ft_search_obj.c	15 Nov 2003 12:15:58 -0000	1.16
+++ ft_search_obj.c	13 Jun 2004 23:15:59 -0000
@@ -20,6 +20,7 @@
 #include "ft_search_exec.h"
 
 #include "ft_search_obj.h"
+#include "ft_tokenize.h"
 
 /*****************************************************************************/
 
@@ -42,8 +43,8 @@
 	dst->realm   = STRDUP (realm);
 	dst->query   = STRDUP (query);
 	dst->exclude = STRDUP (exclude);
-	dst->qtokens = ft_search_tokenize (query);
-	dst->etokens = ft_search_tokenize (exclude);
+	dst->qtokens = ft_tokenize_query (query, &dst->order);
+	dst->etokens = ft_tokenize_query (exclude, NULL);
 }
 
 static int search_timeout (FTSearch *srch)
@@ -88,6 +89,7 @@
 	free (params->exclude);
 	free (params->qtokens);
 	free (params->etokens);
+	free (params->order);
 }
 
 static void search_free (FTSearch *srch)
Index: ft_search_obj.h
===================================================================
RCS file: /cvsroot/gift/OpenFT/src/ft_search_obj.h,v
retrieving revision 1.8
diff -u -r1.8 ft_search_obj.h
--- ft_search_obj.h	2 Nov 2003 12:09:06 -0000	1.8
+++ ft_search_obj.h	13 Jun 2004 23:15:59 -0000
@@ -90,6 +90,8 @@
 
 	uint32_t         *qtokens;         /**< Query tokens */
 	uint32_t         *etokens;         /**< Exclude tokens */
+
+	uint8_t          *order;           /**< Order of quoted phrases */
 } ft_search_parms_t;
 
 /**
Index: ft_share_file.c
===================================================================
RCS file: /cvsroot/gift/OpenFT/src/ft_share_file.c,v
retrieving revision 1.21
diff -u -r1.21 ft_share_file.c
--- ft_share_file.c	2 Nov 2003 12:09:07 -0000	1.21
+++ ft_share_file.c	13 Jun 2004 23:16:00 -0000
@@ -21,6 +21,7 @@
 #include "ft_search.h"
 #include "ft_search_exec.h"
 #include "ft_share_file.h"
+#include "ft_tokenize.h"
 
 /*****************************************************************************/
 
@@ -40,7 +41,7 @@
 
 	share->node = node;
 	share->ninfo = ninfo;
-	share->tokens = ft_search_tokenizef (file);
+	share->tokens = ft_tokenize_share (file, &share->order);
 
 	return share;
 }
@@ -88,6 +89,7 @@
 		return;
 
 	free (share->tokens);
+	free (share->order);
 	free (share);
 }
 
@@ -138,7 +140,7 @@
 
 	/* tokenize this query for fast searching */
 	if (!share->tokens)
-		share->tokens = ft_search_tokenizef (file);
+		share->tokens = ft_tokenize_share (file, &share->order);
 
 	if (!share->tokens)
 		return FALSE;
Index: ft_share_file.h
===================================================================
RCS file: /cvsroot/gift/OpenFT/src/ft_share_file.h,v
retrieving revision 1.14
diff -u -r1.14 ft_share_file.h
--- ft_share_file.h	2 Nov 2003 12:09:07 -0000	1.14
+++ ft_share_file.h	13 Jun 2004 23:16:00 -0000
@@ -47,6 +47,7 @@
 	                                    *   search results */
 	uint32_t           *tokens;        /**< List of searchable tokens for
 	                                    *   for this file */
+	uint8_t            *order;         /**< Order of tokens */
 } FTShare;
 
 /*****************************************************************************/
