MonetDB: default - Move tokenizer to the attic

Stefan Manegold Stefan.Manegold at cwi.nl
Mon Feb 24 09:42:06 CET 2014


Martin, et al.,

please be aware that this also means that
monetdb5/extras/rdf (aka --enable-rdf)
no longer compiles,
as it depends on the (now removed) tokenizer.

Best,
Stefan

----- Original Message -----
> Changeset: 155c3a3fcfdb for MonetDB
> URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=155c3a3fcfdb
> Removed Files:
> 	monetdb5/modules/mal/tokenizer.c
> 	monetdb5/modules/mal/tokenizer.h
> 	monetdb5/modules/mal/tokenizer.mal
> Modified Files:
> 	monetdb5/modules/mal/Makefile.ag
> Branch: default
> Log Message:
> 
> Move tokenizer to the attic
> The experiment to organise urls using a variation of graph-based
> tokenization requires a major overhaul to support the void-headed approach.
> 
> 
> diffs (truncated from 737 to 300 lines):
> 
> diff --git a/monetdb5/modules/mal/Makefile.ag
> b/monetdb5/modules/mal/Makefile.ag
> --- a/monetdb5/modules/mal/Makefile.ag
> +++ b/monetdb5/modules/mal/Makefile.ag
> @@ -54,7 +54,6 @@ lib_mal = {
>  		sabaoth.c sabaoth.h \
>  		sysmon.c sysmon.h \
>  		tablet.c tablet.h \
> -		tokenizer.c tokenizer.h \
>  		trader.c trader.h \
>  		transaction.c \
>  		txtsim.c txtsim.h \
> @@ -76,7 +75,7 @@ headers_mal = {
>  		mal_mapi.mal sabaoth.mal remote.mal  \
>  		txtsim.mal recycle.mal \
>  		cluster.mal trader.mal \
> -		tokenizer.mal zorder.mal sample.mal json_util.mal \
> +		zorder.mal sample.mal json_util.mal \
>  		calc.mal batcalc.mal batmtime.mal querylog.mal sysmon.mal
>  }
>  
> @@ -84,7 +83,7 @@ EXTRA_DIST = batExtensions.mal iterator.
>  	groupby.mal mal_init.mal manual.mal mkey.mal manifold.mal pcre.mal \
>  	profiler.mal recycle.mal remote.mal sabaoth.mal trader.mal \
>  	transaction.mal txtsim.mal tablet.mal tablet.h sample.mal json_util.mal \
> -	mal_mapi.mal mat.mal tokenizer.mal pqueue.mal calc.mal \
> +	mal_mapi.mal mat.mal pqueue.mal calc.mal \
>  	batcalc.mal batmtime.mal querylog.mal sysmon.mal
>  
>  EXTRA_DIST_DIR = Tests
> diff --git a/monetdb5/modules/mal/tokenizer.c
> b/monetdb5/modules/mal/tokenizer.c
> deleted file mode 100644
> --- a/monetdb5/modules/mal/tokenizer.c
> +++ /dev/null
> @@ -1,585 +0,0 @@
> -/*
> - * The contents of this file are subject to the MonetDB Public License
> - * Version 1.1 (the "License"); you may not use this file except in
> - * compliance with the License. You may obtain a copy of the License at
> - * http://www.monetdb.org/Legal/MonetDBLicense
> - *
> - * Software distributed under the License is distributed on an "AS IS"
> - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
> - * License for the specific language governing rights and limitations
> - * under the License.
> - *
> - * The Original Code is the MonetDB Database System.
> - *
> - * The Initial Developer of the Original Code is CWI.
> - * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
> - * Copyright August 2008-2014 MonetDB B.V.
> - * All Rights Reserved.
> - */
> -
> -/*
> - * author Lefteris Sidirourgos
> - * Tokenizer
> - * This module implements a vertical fragmented tokenizer for strings.
> - * It is based on the ideas of the urlbox module by mk.
> - *
> - * The input string is tokenized according to a separator character.
> - * Each token is inserted to the next BAT with the same order of
> - * appearance in the string. We currently support 255 tokens in each
> - * string as this module is intended for use with short and similar
> - * strings such as URLs. In addition we maintain a 2-dimensional index
> - * that points to the depth and height of the last token of each string.
> - * The 2-dimensional index is combined to one BAT where the 8 least
> - * significant bits represent the depth, and the rest bits the height.
> - *
> - * The tokenizer can be accessed in two ways. Given the oid retrieve the
> - * re-constructed string, or given a string return its oid if present,
> - * otherwise nil.
> - *
> - * Strings can be added either in batch (from a file or a bat of
> - * strings) and by appending a single string. Duplicate elimination is
> - * always performed.
> - *
> - * There can be only one tokenizer open at the same time. This is
> - * achieved by setting a TRANSaction bat. This might change in the
> - * future. However there can be more than one tokenizers stored in the
> - * disk, each of which is identified by its name (usually the name of
> - * the active schema of the db). These administrative issues and
> - * security aspects (e.g., opening a tokenizer of a different schema)
> - * should be addressed more thoroughly.
> - */
> -#include "monetdb_config.h"
> -#include "bat5.h"
> -#include "tokenizer.h"
> -#include "mal_linker.h"
> -
> -#define MAX_TKNZR_DEPTH 256
> -#define INDEX MAX_TKNZR_DEPTH
> -static int tokenDepth = 0;
> -static BAT *tokenBAT[MAX_TKNZR_DEPTH + 1];
> -static BAT *TRANS = NULL;   /* the catalog of tokenizers */
> -static char name[128];
> -
> -#if SIZEOF_OID == 4 /* 32-bit oid */
> -#define MAX_h ((((oid) 1) << 23) - 1)
> -#else /* 64-bit oid */
> -#define MAX_h ((((oid) 1) << 55) - 1)
> -#endif
> -
> -#define COMP(h, d) ((h << 8) | (d & 255))
> -#define GET_d(x) ((sht) ((x) & 255))
> -#define GET_h(x) ((x) >> 8)
> -
> -static int prvlocate(BAT* b, oid *prv, str part)
> -{
> -	BAT *m = BATmirror(b);
> -	BATiter mi = bat_iterator(m);
> -	BUN p;
> -	if (m->H->hash == NULL)
> -		BAThash(m, 2 * BATcount(m));
> -	HASHloop_str(mi, m->H->hash, p, part)
> -	{
> -		if (*((oid *) BUNtail(mi, p)) == *prv) {
> -			*prv = (oid) p;
> -			return TRUE;
> -		}
> -	}
> -	return FALSE;
> -}
> -
> -str
> -TKNZRopen(int *ret, str *in)
> -{
> -	int depth, r;
> -	bat idx;
> -	str batname = NULL;
> -	BAT *b;
> -
> -	(void) ret;
> -	if (strlen(*in) > 127)
> -		throw(MAL, "tokenizer.open",
> -				ILLEGAL_ARGUMENT " tokenizer name too long");
> -
> -	MT_lock_set(&mal_contextLock, "tokenizer");
> -	if (TRANS != NULL) {
> -		MT_lock_unset(&mal_contextLock, "tokenizer");
> -		throw(MAL, "tokenizer.open", "Another tokenizer is already open");
> -	}
> -
> -	for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
> -		tokenBAT[depth] = 0;
> -	}
> -	tokenDepth = 0;
> -
> -	TRANS = BATnew(TYPE_void, TYPE_str, MAX_TKNZR_DEPTH + 1);
> -	if (TRANS == NULL) {
> -		MT_lock_unset(&mal_contextLock, "tokenizer");
> -		throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL);
> -	}
> -	/* now we are sure that none overwrites the tokenizer table*/
> -	MT_lock_unset(&mal_contextLock, "tokenizer");
> -	BATseqbase(TRANS, 0);
> -
> -	snprintf(name, 128, "%s", *in);
> -	batname = (str) GDKmalloc(134 * sizeof(char));
> -	if( batname == NULL)
> -		throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL);
> -
> -	snprintf(batname, 134, "%s_index", name);
> -	idx = BBPindex(batname);
> -
> -	if (idx == 0) { /* new tokenizer */
> -		b = BATnew(TYPE_void, TYPE_oid, 1024);
> -		if (b == NULL)
> -			throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL);
> -		BATkey(b, FALSE);
> -		BATseqbase(b, 0);
> -		tokenBAT[INDEX] = b;
> -		if (BKCsetName(&r, (int *) &(b->batCacheid), (str *) &batname) !=
> MAL_SUCCEED)
> -			throw(MAL, "tokenizer.open", OPERATION_FAILED);
> -		if (BKCsetPersistent(&r, (int *) &(b->batCacheid)) != MAL_SUCCEED)
> -			throw(MAL, "tokenizer.open", OPERATION_FAILED);
> -		BUNappend(TRANS, batname, FALSE);
> -	} else { /* existing tokenizer */
> -		tokenBAT[INDEX] = BATdescriptor(idx);
> -		BUNappend(TRANS, batname, FALSE);
> -
> -		for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
> -			snprintf(batname, 128, "%s_%d", name, depth);
> -			idx = BBPindex(batname);
> -			if (idx == 0)
> -				break;
> -			tokenBAT[depth] = BATdescriptor(idx);
> -			BUNappend(TRANS, batname, FALSE);
> -		}
> -		tokenDepth = depth;
> -	}
> -
> -	GDKfree(batname);
> -	return MAL_SUCCEED;
> -}
> -
> -str
> -TKNZRclose(int *r)
> -{
> -	int i;
> -	(void) r;
> -
> -	if (TRANS == NULL)
> -		throw(MAL, "tokenizer", "no tokenizer store open");
> -
> -	TMsubcommit(TRANS);
> -
> -	for (i = 0; i < tokenDepth; i++) {
> -		BBPunfix(tokenBAT[i]->batCacheid);
> -	}
> -	BBPunfix(tokenBAT[INDEX]->batCacheid);
> -
> -	tokenDepth = 0;
> -
> -	BBPreclaim(TRANS);
> -	TRANS = NULL;
> -	return MAL_SUCCEED;
> -}
> -
> -/*
> - * Tokenize operations
> - * The tokenizer operation assumes a private copy to mark the end of the
> - * token separators with a zero byte. Tokens are separated by a single
> - * character for simplicity.  Might be a good scheme to assume that
> - * strings to be broken are properly ended with either 0 or nl, not
> - * both.  It seems 0 can be assumed.
> - */
> -static int
> -TKNZRtokenize(str in, str *parts, char tkn)
> -{
> -	char *s, *t;
> -	int depth = 0;
> -
> -	s = in;
> -	while (*s && *s != '\n') {
> -		t = s;
> -		while (*t != tkn && *t != '\n' && *t)
> -			t++;
> -		parts[depth++] = s;
> -		s = t + (*t != 0);
> -		*t = 0;
> -		if (depth > MAX_TKNZR_DEPTH)
> -			break;
> -	}
> -	return depth;
> -}
> -
> -str
> -TKNZRappend(oid *pos, str *s)
> -{
> -	str url;
> -	str batname;
> -	str parts[MAX_TKNZR_DEPTH];
> -	int i, new, r, depth;
> -	BAT *b;
> -	BUN p;
> -	BUN idx = 0;
> -	oid prv = 0;
> -	oid comp;
> -
> -	if (TRANS == NULL)
> -		throw(MAL, "tokenizer", "no tokenizer store open");
> -
> -	if ((url = GDKstrdup(*s)) == NULL) {
> -		throw(MAL, "tokenizer.append", OPERATION_FAILED MAL_MALLOC_FAIL);
> -	}
> -
> -	depth = TKNZRtokenize(url, parts, '/');
> -	new = depth;
> -
> -	if (depth == 0) {
> -		GDKfree(url);
> -		return MAL_SUCCEED;
> -	}
> -	if (depth > MAX_TKNZR_DEPTH) {
> -		GDKfree(url);
> -		throw(MAL, "tokenizer",
> -				ILLEGAL_ARGUMENT "input string breaks to too many parts");
> -	}
> -	if (depth > tokenDepth || tokenBAT[0] == NULL) {
> -		new = tokenDepth;
> -		for (i = tokenDepth; i < depth; i++) {
> -			/* make new bat */
> -			batname = (str) GDKmalloc(128 * sizeof(char));
> -			snprintf(batname, 128, "%s_%d", name, i);
> -			b = BATnew(TYPE_oid, TYPE_str, 1024);
> -			if (b == NULL) {
> -				GDKfree(batname);
> -				GDKfree(url);
> -				throw(MAL, "tokenizer.append", MAL_MALLOC_FAIL);
> -			}
> -			BATkey(b, FALSE);
> -			tokenBAT[i] = b;
> -
> -			if (BKCsetName(&r, (int *) &(b->batCacheid), (str *) &batname)
> -				!= MAL_SUCCEED) {
> -				GDKfree(batname);
> -				GDKfree(url);
> -				throw(MAL, "tokenizer.open", OPERATION_FAILED);
> -			}
> -			if (BKCsetPersistent(&r, (int *) &(b->batCacheid))
> _______________________________________________
> checkin-list mailing list
> checkin-list at monetdb.org
> https://www.monetdb.org/mailman/listinfo/checkin-list
> 

-- 
| Stefan.Manegold at CWI.nl | DB Architectures   (DA) |
| www.CWI.nl/~manegold/  | Science Park 123 (L321) |
| +31 (0)20 592-4212     | 1098 XG Amsterdam  (NL) |




More information about the developers-list mailing list