Re: MonetDB: default - Move tokenizer to the attic
Martin, et al.,
please be aware that this also means that monetdb5/extras/rdf (aka --enable-rdf) no longer compiles, as it depends on the (now gone) tokenizer.
Best, Stefan
----- Original Message -----
Changeset: 155c3a3fcfdb for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=155c3a3fcfdb Removed Files: monetdb5/modules/mal/tokenizer.c monetdb5/modules/mal/tokenizer.h monetdb5/modules/mal/tokenizer.mal Modified Files: monetdb5/modules/mal/Makefile.ag Branch: default Log Message:
Move tokenizer to the attic The experiment to organise urls using a variation of graph-based tokenization requires a major overhaul to support the void-headed approach.
diffs (truncated from 737 to 300 lines):
diff --git a/monetdb5/modules/mal/Makefile.ag b/monetdb5/modules/mal/Makefile.ag --- a/monetdb5/modules/mal/Makefile.ag +++ b/monetdb5/modules/mal/Makefile.ag @@ -54,7 +54,6 @@ lib_mal = { sabaoth.c sabaoth.h \ sysmon.c sysmon.h \ tablet.c tablet.h \
trader.c trader.h \ transaction.c \ txtsim.c txtsim.h \ tokenizer.c tokenizer.h \
@@ -76,7 +75,7 @@ headers_mal = { mal_mapi.mal sabaoth.mal remote.mal \ txtsim.mal recycle.mal \ cluster.mal trader.mal \
tokenizer.mal zorder.mal sample.mal json_util.mal \
calc.mal batcalc.mal batmtime.mal querylog.mal sysmon.mal zorder.mal sample.mal json_util.mal \
}
@@ -84,7 +83,7 @@ EXTRA_DIST = batExtensions.mal iterator. groupby.mal mal_init.mal manual.mal mkey.mal manifold.mal pcre.mal \ profiler.mal recycle.mal remote.mal sabaoth.mal trader.mal \ transaction.mal txtsim.mal tablet.mal tablet.h sample.mal json_util.mal \
- mal_mapi.mal mat.mal tokenizer.mal pqueue.mal calc.mal \
- mal_mapi.mal mat.mal pqueue.mal calc.mal \ batcalc.mal batmtime.mal querylog.mal sysmon.mal
EXTRA_DIST_DIR = Tests diff --git a/monetdb5/modules/mal/tokenizer.c b/monetdb5/modules/mal/tokenizer.c deleted file mode 100644 --- a/monetdb5/modules/mal/tokenizer.c +++ /dev/null @@ -1,585 +0,0 @@ -/*
- The contents of this file are subject to the MonetDB Public License
- Version 1.1 (the "License"); you may not use this file except in
- compliance with the License. You may obtain a copy of the License at
- Software distributed under the License is distributed on an "AS IS"
- basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
- License for the specific language governing rights and limitations
- under the License.
- The Original Code is the MonetDB Database System.
- The Initial Developer of the Original Code is CWI.
- Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
- Copyright August 2008-2014 MonetDB B.V.
- All Rights Reserved.
- */
-/*
- author Lefteris Sidirourgos
- Tokenizer
- This module implements a vertical fragmented tokenizer for strings.
- It is based on the ideas of the urlbox module by mk.
- The input string is tokenized according to a separator character.
- Each token is inserted to the next BAT with the same order of
- appearance in the string. We currently support 255 tokens in each
- string as this module is intended for use with short and similar
- strings such as URLs. In addition we maintain a 2-dimensional index
- that points to the depth and height of the last token of each string.
- The 2-dimensional index is combined to one BAT where the 8 least
- significant bits represent the depth, and the rest bits the height.
- The tokenizer can be accessed in two ways. Given the oid retrieve the
- re-constructed string, or given a string return its oid if present,
- otherwise nil.
- Strings can be added either in batch (from a file or a bat of
- strings) and by appending a single string. Duplicate elimination is
- always performed.
- There can be only one tokenizer open at the same time. This is
- achieved by setting a TRANSaction bat. This might change in the
- future. However there can be more than one tokenizers stored in the
- disk, each of which is identified by its name (usually the name of
- the active schema of the db). These administrative issues and
- security aspects (e.g., opening a tokenizer of a different schema)
- should be addressed more thoroughly.
- */
-#include "monetdb_config.h" -#include "bat5.h" -#include "tokenizer.h" -#include "mal_linker.h"
-#define MAX_TKNZR_DEPTH 256 -#define INDEX MAX_TKNZR_DEPTH -static int tokenDepth = 0; -static BAT *tokenBAT[MAX_TKNZR_DEPTH + 1]; -static BAT *TRANS = NULL; /* the catalog of tokenizers */ -static char name[128];
-#if SIZEOF_OID == 4 /* 32-bit oid */ -#define MAX_h ((((oid) 1) << 23) - 1) -#else /* 64-bit oid */ -#define MAX_h ((((oid) 1) << 55) - 1) -#endif
-#define COMP(h, d) ((h << 8) | (d & 255)) -#define GET_d(x) ((sht) ((x) & 255)) -#define GET_h(x) ((x) >> 8)
-static int prvlocate(BAT* b, oid *prv, str part) -{
- BAT *m = BATmirror(b);
- BATiter mi = bat_iterator(m);
- BUN p;
- if (m->H->hash == NULL)
BAThash(m, 2 * BATcount(m));
- HASHloop_str(mi, m->H->hash, p, part)
- {
if (*((oid *) BUNtail(mi, p)) == *prv) {
*prv = (oid) p;
return TRUE;
}
- }
- return FALSE;
-}
-str -TKNZRopen(int *ret, str *in) -{
- int depth, r;
- bat idx;
- str batname = NULL;
- BAT *b;
- (void) ret;
- if (strlen(*in) > 127)
throw(MAL, "tokenizer.open",
ILLEGAL_ARGUMENT " tokenizer name too long");
- MT_lock_set(&mal_contextLock, "tokenizer");
- if (TRANS != NULL) {
MT_lock_unset(&mal_contextLock, "tokenizer");
throw(MAL, "tokenizer.open", "Another tokenizer is already open");
- }
- for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
tokenBAT[depth] = 0;
- }
- tokenDepth = 0;
- TRANS = BATnew(TYPE_void, TYPE_str, MAX_TKNZR_DEPTH + 1);
- if (TRANS == NULL) {
MT_lock_unset(&mal_contextLock, "tokenizer");
throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL);
- }
- /* now we are sure that none overwrites the tokenizer table*/
- MT_lock_unset(&mal_contextLock, "tokenizer");
- BATseqbase(TRANS, 0);
- snprintf(name, 128, "%s", *in);
- batname = (str) GDKmalloc(134 * sizeof(char));
- if( batname == NULL)
throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL);
- snprintf(batname, 134, "%s_index", name);
- idx = BBPindex(batname);
- if (idx == 0) { /* new tokenizer */
b = BATnew(TYPE_void, TYPE_oid, 1024);
if (b == NULL)
throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL);
BATkey(b, FALSE);
BATseqbase(b, 0);
tokenBAT[INDEX] = b;
if (BKCsetName(&r, (int *) &(b->batCacheid), (str *) &batname) !=
MAL_SUCCEED)
throw(MAL, "tokenizer.open", OPERATION_FAILED);
if (BKCsetPersistent(&r, (int *) &(b->batCacheid)) != MAL_SUCCEED)
throw(MAL, "tokenizer.open", OPERATION_FAILED);
BUNappend(TRANS, batname, FALSE);
- } else { /* existing tokenizer */
tokenBAT[INDEX] = BATdescriptor(idx);
BUNappend(TRANS, batname, FALSE);
for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
snprintf(batname, 128, "%s_%d", name, depth);
idx = BBPindex(batname);
if (idx == 0)
break;
tokenBAT[depth] = BATdescriptor(idx);
BUNappend(TRANS, batname, FALSE);
}
tokenDepth = depth;
- }
- GDKfree(batname);
- return MAL_SUCCEED;
-}
-str -TKNZRclose(int *r) -{
- int i;
- (void) r;
- if (TRANS == NULL)
throw(MAL, "tokenizer", "no tokenizer store open");
- TMsubcommit(TRANS);
- for (i = 0; i < tokenDepth; i++) {
BBPunfix(tokenBAT[i]->batCacheid);
- }
- BBPunfix(tokenBAT[INDEX]->batCacheid);
- tokenDepth = 0;
- BBPreclaim(TRANS);
- TRANS = NULL;
- return MAL_SUCCEED;
-}
-/*
- Tokenize operations
- The tokenizer operation assumes a private copy to mark the end of the
- token separators with a zero byte. Tokens are separated by a single
- character for simplicity. Might be a good scheme to assume that
- strings to be broken are properly ended with either 0 or nl, not
- both. It seems 0 can be assumed.
- */
-static int -TKNZRtokenize(str in, str *parts, char tkn) -{
- char *s, *t;
- int depth = 0;
- s = in;
- while (*s && *s != '\n') {
t = s;
while (*t != tkn && *t != '\n' && *t)
t++;
parts[depth++] = s;
s = t + (*t != 0);
*t = 0;
if (depth > MAX_TKNZR_DEPTH)
break;
- }
- return depth;
-}
-str -TKNZRappend(oid *pos, str *s) -{
- str url;
- str batname;
- str parts[MAX_TKNZR_DEPTH];
- int i, new, r, depth;
- BAT *b;
- BUN p;
- BUN idx = 0;
- oid prv = 0;
- oid comp;
- if (TRANS == NULL)
throw(MAL, "tokenizer", "no tokenizer store open");
- if ((url = GDKstrdup(*s)) == NULL) {
throw(MAL, "tokenizer.append", OPERATION_FAILED MAL_MALLOC_FAIL);
- }
- depth = TKNZRtokenize(url, parts, '/');
- new = depth;
- if (depth == 0) {
GDKfree(url);
return MAL_SUCCEED;
- }
- if (depth > MAX_TKNZR_DEPTH) {
GDKfree(url);
throw(MAL, "tokenizer",
ILLEGAL_ARGUMENT "input string breaks to too many parts");
- }
- if (depth > tokenDepth || tokenBAT[0] == NULL) {
new = tokenDepth;
for (i = tokenDepth; i < depth; i++) {
/* make new bat */
batname = (str) GDKmalloc(128 * sizeof(char));
snprintf(batname, 128, "%s_%d", name, i);
b = BATnew(TYPE_oid, TYPE_str, 1024);
if (b == NULL) {
GDKfree(batname);
GDKfree(url);
throw(MAL, "tokenizer.append", MAL_MALLOC_FAIL);
}
BATkey(b, FALSE);
tokenBAT[i] = b;
if (BKCsetName(&r, (int *) &(b->batCacheid), (str *) &batname)
!= MAL_SUCCEED) {
GDKfree(batname);
GDKfree(url);
throw(MAL, "tokenizer.open", OPERATION_FAILED);
}
if (BKCsetPersistent(&r, (int *) &(b->batCacheid))
checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
participants (1)
-
Stefan Manegold