[Monetdb-developers] [Monetdb-pf-checkins] pathfinder/modules/pftijah nexi.c, 1.49, 1.50 pftijah_tokenize.l, 1.12, 1.13 pftijah_util.mx, 1.2, 1.3 serialize_pftijah.mx, 1.41, 1.42

Sjoerd Mullender sjoerd at acm.org
Tue Feb 27 16:49:48 CET 2007


On 2007-02-27 16:43, Jan Flokstra wrote:
> Update of /cvsroot/monetdb/pathfinder/modules/pftijah
> In directory sc8-pr-cvs7.sourceforge.net:/tmp/cvs-serv27686
> 
> Modified Files:
> 	nexi.c pftijah_tokenize.l pftijah_util.mx serialize_pftijah.mx 
> Log Message:
> * repair BBP refcount bug for BAT

Is this a fix which also applies to the stable branch?

> * reimplement the direct bat access methods in pftijah serialization for more
>   speed (and clarity).
> 
> * Start optimizing the pftijah tokenizer. The flex functions are called once
>   per handle_character() call. This leads to 2 malloc's per call. I tried to
>   do without the malloc's but this caused a lot of strange results:-)
>   I am now planning to craft the flexer by hand. The first small experiment
>   shows there is a lot to gain there. (25% speedup in indexing time).
> 
> 
> 
> Index: serialize_pftijah.mx
> ===================================================================
> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/serialize_pftijah.mx,v
> retrieving revision 1.41
> retrieving revision 1.42
> diff -u -d -r1.41 -r1.42
> --- serialize_pftijah.mx	23 Feb 2007 15:11:07 -0000	1.41
> +++ serialize_pftijah.mx	27 Feb 2007 15:43:37 -0000	1.42
> @@ -31,8 +31,8 @@
>  
>  extern int handleTijahTerm(struct tijahContextStruct *ctx, char* term);
>  
> -extern int useFlexScanner(char* buf, int len, struct tijahContextStruct* tjCtx); /* FLEX */
> -extern char* flexScanOneTerm(char* buf, int len);
> +extern int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx); /* FLEX */
> +extern char* flexScanOneTerm(char* buf);
>  
>  extern char* normalizeTerm(struct tijahContextStruct *ctx, char* term );
>  
> @@ -70,15 +70,10 @@
>  typedef struct dbat_struct {
>  	const char*	name;
>  	BAT*		bat;
> -	int		oid_mark;
> -	int		max_i;
> -	int		max_sz;
> -	bit		dflt;	  /* fill with default value during extend */
> -	int		dflt_int; /* the default int value */
> -	chr		dflt_chr; /* the default chr value */
> -	oid		dflt_oid; /* the default oid value */
> -	/* */
> -        union { /* cast to perform direct indexex insert in [void,any] BATs */
> +	oid		raw_max;
> +	oid		seqbase;
> +	oid		seq_max;
> +        union { /* cast to perform direct indexe insert in [void,any] BATs */
>              void* voidCAST; /* the basecast */
>              chr*  chrCAST;  /* cast for [void,chr] BAT */
>              int*  intCAST;  /* cast for [void,int] BAT */
> @@ -89,7 +84,6 @@
>  int dbat_init(const char* name, dbat* dbat, BAT* b) {
>  	dbat->name = name;
>  	dbat->bat  = b;
> -	dbat->dflt = FALSE;
>  	if ( dbat->bat->htype != TYPE_void ) {
>  	    stream_printf(GDKerr,"ERROR: dbat_init(%s) non void BAT\n",dbat->name);
>  	    return 0;
> @@ -98,31 +92,25 @@
>  	    stream_printf(GDKerr,"ERROR: dbat_init(%s) unknown ttype(%d)\n",dbat->name,dbat->bat->ttype);
>  	    return 0;
>  	}
> -        dbat->oid_mark = b->hseqbase;
> -	dbat->max_i = dbat->max_sz = BATcount(dbat->bat);
> +        dbat->seqbase = (oid)b->hseqbase;
> +	dbat->raw_max = (oid)BATcount(dbat->bat);
> +	dbat->seq_max = dbat->raw_max + dbat->seqbase;
>  	dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
>  	/* */
>  	return 1;
>  }
>  
> -int dbat_finalize(dbat* dbat) {
> -        BAT* b = dbat->bat;
>  
> +int dbat_finalize(dbat* dbat, int topidx) {
>          void* top;
> +        BAT* b = dbat->bat;
>          
> -        int bottomTop = dbat->max_i;
> -	if ( 0 ) stream_printf(GDKout,"dbat_finalize(size=%d)\n",dbat->max_i);
> +	topidx -= (int)dbat->seqbase;
> +        int bottomTop = topidx;
>          switch( b->ttype ) {
>           case TYPE_int :
>                  top = &dbat->cast.intCAST[bottomTop];
>                  break;
> -         case TYPE_chr: {
> -                b->batBuns->free = dbat->max_i; 
> -                BATsetcount(b, dbat->max_i);
> -                b->tsorted = 0;
> -		b->batDirty = TRUE; /* VERY important this one */
> -                return 1;
> -                }
>           case TYPE_oid:
>                  top = &dbat->cast.oidCAST[bottomTop];
>                  break;
> @@ -137,7 +125,7 @@
>  	/* */
>  	dbat->name  = NULL;
>  	dbat->bat   = NULL;
> -	dbat->max_i = dbat->max_sz = 0;
> +	dbat->raw_max = dbat->seqbase = 0;
>  	/* */
>  	return 1;
>  }
> @@ -145,9 +133,14 @@
>  #define MINCHUNK 8192
>  #define MAXCHUNK 67108864
>  
> -int dbat_extend(dbat* dbat, int i_mark) {
> -    /* CHECK THIS if ( i_mark ) i = i_mark - dbat->oid_mark; */
> -    size_t newsize = MAX(MIN(MAX(MINCHUNK,dbat->max_sz*2),(size_t)(dbat->max_sz+MAXCHUNK)),(size_t)i_mark);
> +int dbat_extend(dbat* dbat, oid min_i /*raw-index*/, size_t forced_size) {
> +    size_t newsize;
> +    
> +    if ( forced_size ) {
> +       newsize = forced_size;
> +    } else {
> +       newsize = MAX(MIN(MAX(MINCHUNK,dbat->raw_max*2),(size_t)(dbat->raw_max+MAXCHUNK)),(size_t)min_i);
> +    }
>  
>      /* first check if the number of BUN's < INT_MAX. If this was the case
>       * and the previous time INT_MAX was returned this means the BAT cannot
> @@ -156,94 +149,34 @@
>      if ( newsize > INT_MAX ) {
>      	newsize = INT_MAX;
>  
> -	if ( dbat->max_sz == INT_MAX ) {
> +	if ( dbat->raw_max == INT_MAX ) {
>          	GDKerror("dbat_extend: BATextend[\"%s\"](size>INT_MAX) fails\n","incomplete");
>  		return -1;
>  	}
>      }
> -    if ( 0 ) { stream_printf(GDKout,"dbat_extend[%s](%d -> %d)\n",dbat->name,dbat->max_sz,newsize); }
> -    dbat->max_sz= newsize;
> +#if 0
> +    stream_printf(GDKout,"dbat_extend[%s](%d -> %d)\n",dbat->name,dbat->raw_max,newsize);
> +#endif
> +    dbat->raw_max= newsize;
> +    dbat->seq_max = dbat->raw_max + dbat->seqbase;
>      if ( !(dbat->bat = BATextend(dbat->bat,newsize)) ) {
>          GDKerror("dbat_extend: BATextend[\"%s\"](to %d) fails\n","incomplete",newsize);
>          return -1;
>      }
>      dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
> -    /*
> -     * now check if there's a default value handler used  
> -     *
> -     */
> -    if ( dbat->dflt ) {
> -        switch( dbat->bat->ttype ) {
> -         case TYPE_int : {
> -		int v   = dbat->dflt_int;
> -		int *to = &dbat->cast.intCAST[dbat->max_sz];
> -		for(register int *p = &dbat->cast.intCAST[dbat->max_i]; p<to;)
> -		    *p++ = v;
> -                break;
> -		}
> -         case TYPE_chr: {
> -		chr v   = dbat->dflt_chr;
> -		chr *to = &dbat->cast.chrCAST[dbat->max_sz];
> -		for(register chr *p = &dbat->cast.chrCAST[dbat->max_i]; p<to;)
> -		    *p++ = v;
> -                break;
> -                }
> -         case TYPE_oid: {
> -		oid v   = dbat->dflt_oid;
> -		oid *to = &dbat->cast.oidCAST[dbat->max_sz];
> -		for(register oid *p = &dbat->cast.oidCAST[dbat->max_i]; p<to;)
> -		    *p++ = v;
> -                break;
> -		}
> -         default:
> -                GDKerror("dbat_extend: bad ttype\n");
> -                return -1;
> -        }
> -    }
> -    /* */
>      return 1;
>  }
>  
>  int dbat_sizeHint(dbat* dbat, int sizeHint_mark) {
> -        int sizeHint = sizeHint_mark - dbat->oid_mark;
> -	int estimate = dbat->max_i + sizeHint;
> -
> -	return dbat_extend(dbat, estimate);
> -}
> -
> -INLINE static int dbat_set_oid(dbat* dbat, int pos_mark, oid v) {
> -	register int pos;
> +        int sizeHint = sizeHint_mark - dbat->seqbase;
> +	int estimate = dbat->raw_max + sizeHint;
>  
> -	if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
> -	    dbat->cast.oidCAST[pos] = v;
> -	    return 1;
> -	} else {
> -	    if ( pos >= dbat->max_sz ) {
> -		if ( dbat_extend(dbat,pos) < 0 )
> -		    return -1;
> -	    }
> -	    dbat->max_i = pos + 1;
> -	    dbat->cast.oidCAST[pos] = v;
> -	    return 1;
> -	}
> +	return dbat_extend(dbat, estimate, 0);
>  }
>  
> -INLINE static int dbat_set_int(dbat* dbat, int pos_mark, int v) {
> -	register int pos;
> +#define dbat_set_oid(DBAT,I,V) (DBAT)->cast.oidCAST[I-(DBAT)->seqbase] = V
>  
> -	if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
> -	    dbat->cast.intCAST[pos] = v;
> -	    return 1;
> -	} else {
> -	    if ( pos >= dbat->max_sz ) {
> -		if ( dbat_extend(dbat,pos) < 0 )
> -		    return -1;
> -	    }
> -	    dbat->max_i = pos + 1;
> -	    dbat->cast.intCAST[pos] = v;
> -	    return 1;
> -	}
> -}
> +#define dbat_set_int(DBAT,I,V) (DBAT)->cast.intCAST[I-(DBAT)->seqbase] = V
>  
>  /************************************************
>   *
> @@ -315,9 +248,6 @@
>  
>  /************************************************
>   *
> - *
> - * First the temporary shredder for Tijah by JF
> - *
>   */
>  
>  INLINE static oid
> @@ -328,15 +258,15 @@
>      BUN bun;
>  
>      HASHfnd_str(bun, tjctx->hm_globalTag, (str)t);
> -    if ( bun )
> -    /* if ( (bun = BUNfnd(tjctx->hm_globalTag,t)) ) OLD */
> +    if ( bun ) {
>          return *(oid*)BUNtail(tjctx->hm_globalTag,bun);
> -    else {
> -    	if ( !BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, FALSE) ) {
> +    } else {
> +    	if ( BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, FALSE) ) {
> +    	    return tjctx->n_globalTag++;
> +        } else {
>      	    GDKerror("INSERT OF \"%s\" in globalTag fails.\n");
>      	    return oid_nil;
> -        } else
> -    	    return tjctx->n_globalTag++;
> +        }
>      }
>  #endif
>  }
> @@ -349,10 +279,7 @@
>     BUN bun;
>  
>     HASHfnd_str(bun, tjctx->hm_globalTerm, (str)t);
> -   if ( bun ) {
> -       return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
> -   } else
> -       return oid_nil;
> +   return ( bun ? *(oid*)BUNtail(tjctx->hm_globalTerm,bun) : oid_nil );
>  }
>  
>  INLINE static oid
> @@ -366,22 +293,35 @@
>      if ( bun )
>          return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
>      else {
> -    	if ( !BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, FALSE)){
> +    	if ( BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, FALSE)){
> +    	    return tjctx->n_globalTerm++;
> +        } else { 
>      	    GDKerror("INSERT OF \"%s\" in globalTerm fails.\n");
>      	    return oid_nil;
> -        } else 
> -    	    return tjctx->n_globalTerm++;
> +        }
>      }
>  #endif
>  }
>  
> -#define tj_add2plane(TJCTX,O) \
> -    ((dbat_set_oid(&(TJCTX)->dbat_collPre, (TJCTX)->tijahPre, O) < 0) \
> -    	   	? oid_nil : ((oid)(TJCTX)->tijahPre++))
> +INLINE oid tj_extend_plane(struct tijahContextStruct *tjctx) {
> +    oid base = tjctx->tijahPre - tjctx->dbat_collPre.seqbase; 
>  
> -#define insertPreSize(TJCTX,POS,SIZE) \
> -    dbat_set_int(&TJCTX->dbat_collSize,(int)POS,SIZE)
> +    if ( base >= tjctx->dbat_collPre.raw_max ) {
> +	if ( dbat_extend(&tjctx->dbat_collPre,base, 0) < 0 )
> +	    return oid_nil;
> +	/* IMPORTANT: the size of the two bats is synchronized by the use
> +	 * of the forced size (last) parameter of dbat_extend
> +	 */
> +	if ( dbat_extend(&tjctx->dbat_collSize,base,tjctx->dbat_collPre.raw_max) < 0 )
> +	    return oid_nil;
> +    }
> +    return tjctx->tijahPre++;
> +}
>  
> +#define tj_newPre(TJCTX) \
> +	(((TJCTX)->tijahPre < (TJCTX)->dbat_collPre.seq_max) \
> +	? \
> +	((oid)(TJCTX)->tijahPre++) : tj_extend_plane(TJCTX))
>  
>  int
>  handleTijahTerm(struct tijahContextStruct *tjctx, char* term) {
> @@ -397,13 +337,13 @@
>  	    }
>            }
>            if ( (termOid = tj_termOid(tjctx, term)) == oid_nil )
> -    	    return 0;
> +    	    return -1;
>  	}
>  	if ( termOid ) { /* term is not a stopword */
> -            if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
> +            if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
>      	        return 0;
> -            if ( insertPreSize(tjctx,tjPre,0) < 0 )
> -       	        return -1;
> +            dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
> +            dbat_set_int(&tjctx->dbat_collSize,(int)tjPre,0);
>  #ifdef TJ_TRACE
>              if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:TERM: \"%s\", termoid=%d, tjPre=%d\n",tjctx->name,term,termOid,tjPre);
>  #endif
> @@ -416,40 +356,13 @@
>       return 1;
>  }
>  
> -/************
> - *
> - * The part where the Strings from Pathfinder are shredded into words
> - * by Tijah. The USE_FLEX macro determines if the strings is shredded
> - * by Hennings fancy flex scanner or Jan's simple strtok() scanner.
> - */
> -
> -const char* obsoleteNexiChars = " \t\n\r,:;&*%$#!@=";
> -
> -int 
> -useStrtokScanner(tjCtx* tjctx, char* s)
> -{
> -    char *t;
> -    int  sz = 0;
> -
> -#ifdef TJ_TRACE
> -    if (TJ_TRACE) stream_printf(GDKout,"C[%s]:CHARACTERS:\n",tjctx->name);
> -#endif
> -    if ( (t = strtok(s,obsoleteNexiChars)) ) do {
> -	/* not the empty string here */
> -        if ( handleTijahTerm(tjctx,t) < 0 )
> -             return -1;
> -    	sz++;
> -    } while ( (t=strtok(NULL,obsoleteNexiChars)) );
> -    return 1;
> -}
> -
>  /************************************************
>   *
>   * Now the real output handlers
>   */
>  
>  
> -#ifdef notused
> +#if 0
>  static int
>  handle_sizeHint(XqueryCtx* ctx, int hinted_size) {
>      tjCtx* tjctx = (tjCtx*)ctx->driverWs;
> @@ -502,14 +415,12 @@
>  	    return (str)str_nil;
>  }
>  
> -#define GUESSFORCE FALSE
> -
>  /* 
>   * Replace the value of a collection parameter int the collection parameter
>   * bat
>   */
>  static int replaceCollParam(tjCtx* tjctx, str param, str val) {
> -	return ( BUNreplace(tjctx->b_collParam,param,val,GUESSFORCE) != NULL );
> +	return ( BUNreplace(tjctx->b_collParam,param,val,FALSE) != NULL );
>  }
>  
>  static BAT*
> @@ -894,10 +805,10 @@
>      /* if ( DOEMIT(tjctx) ) { */
>          if ( (termOid = tj_tagOid(tjctx, name)) == oid_nil )
>      	    return 0;
> -        if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
> +        if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
>      	    return 0;
> +        dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
>          if ( tj_pushTag(tjctx,tjPre) < 0 ) return 0;
> -        if ( 0 ) stream_printf(GDKout,"C[%s]:startElement: \"%s\", termoid=%d, Tijah pre#=%d, Pathfinder pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
>  #ifdef TJ_TRACE
>          if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:startElement: \"%s\", termoid=%d, Tijah pre#=%d, Pathfinder pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
>  #endif
> @@ -913,8 +824,7 @@
>      --tjctx->doc_height;
>      oid start = tj_popTag(tjctx); /* oid of the first node of the element */
>      int size  = tjctx->tijahPre - start - 1; /* the Tijah element size */
> -    if ( insertPreSize(tjctx,start,size) < 0 )
> -	    return 0;
> +    dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
>  #ifdef TJ_TRACE
>      if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:empty_endElement: \"%s\"\n", tjctx->name,"");
>  #endif
> @@ -934,8 +844,7 @@
>      /* if ( DOEMIT(tjctx) ) { */
>          oid start = tj_popTag(tjctx); /* oid of the first node of the element */
>          int size  = tjctx->tijahPre - start - 1; /* the Tijah element size */
> -	if ( insertPreSize(tjctx,start,size) < 0 )
> -	    return 0;
> +	dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
>  #ifdef TJ_TRACE
>          if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:endElement: \"%s\"\n", tjctx->name,name);
>  #endif
> @@ -944,8 +853,6 @@
>      return 1;
>  }
>  
> -#define USE_FLEX 1
> -
>  /**
>   * Output generation handler. Handles equivalent of * SAX characters() event.
>   */
> @@ -954,28 +861,23 @@
>      EMPTY_CHECK;
>      tjCtx* tjctx = (tjCtx*)ctx->driverWs;
>  
> +    register char* p = (char*)ch;
> +    while( *p && isspace(*p) ) p++;
> +    if ( !*p )
> +        return 1;
>  #ifdef TJ_TRACE
> -    if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s) start\n",tjctx->name, (char*)ch);
> +    if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s) start\n",tjctx->name, p);
>  #endif
>  
>      if ( DOEMIT(tjctx) ) {
> -#ifdef USE_FLEX
> -        return useFlexScanner((char*)ch,strlen((char*)ch),tjctx);
> -#else
> -        return useStrtokScanner(tjctx,(char*)ch);
> -#endif
> +        return useFlexScanner(p,tjctx);
>      }
>      return 1;
>  }
>  
>  char* normalizeTerm(struct tijahContextStruct *tjctx, char* term ) {
>  	char *res;
> -#ifdef USE_FLEX
> -        res = flexScanOneTerm((char*)term,strlen((char*)term));
> -#else
> -	res = strtok(term,obsoleteNexiChars);
> -#endif
> -	/* INCOMPLETE, should make shure tijahContext is always avail. here */
> +        res = flexScanOneTerm((char*)term);
>          if ( res && tjctx && tjctx->stemCtx->stem) {
>      	    if ( !(res = (char*)tjctx->stemCtx->stem(tjctx->stemCtx,(char*)res)) ) {
>  	        /* must be a stopword */
> @@ -986,14 +888,6 @@
>  }
>  
>  int CMDtj_normalizeTerm(char** res, str term, str stemmer) {
> -//Leave tokenization disabled for now
> -//    char* tokenized;
> -//#ifdef USE_FLEX
> -//    tokenized = flexScanOneTerm(term,strlen(term));
> -//#else
> -//    tokenized = strtok(term,obsoleteNexiChars);
> -//#endif
> -
>      tjStemCtx* stemCtx = getStemmingContext( stemmer );
>  
>      if ( stemCtx->stem ) {
> @@ -1123,13 +1017,9 @@
>  #ifdef TJ_TRACE
>  	if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINISH INDEXING\n",tjctx->name);
>  #endif
> -
> -        /* feature not used anymore ????? */
> -	if ( 0 /* ROEL CHANGE VIRTUAL ROOT SIZE HERE */ )
> -	        insertPreSize(tjctx,0,tjctx->tijahPre - 1);
> -	if ( dbat_finalize(&tjctx->dbat_collPre) < 0 )
> +	if ( dbat_finalize(&tjctx->dbat_collPre, tjctx->tijahPre) < 0 )
>  		return GDK_FAIL;
> -	if ( dbat_finalize(&tjctx->dbat_collSize) < 0 )
> +	if ( dbat_finalize(&tjctx->dbat_collSize, tjctx->tijahPre) < 0 )
>  		return GDK_FAIL;
>  #ifdef TJ_TRACE
>  	if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINALIZED DIRECT BATS\n",tjctx->name);
> 
> Index: pftijah_tokenize.l
> ===================================================================
> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
> retrieving revision 1.12
> retrieving revision 1.13
> diff -u -d -r1.12 -r1.13
> --- pftijah_tokenize.l	9 Jan 2007 15:44:39 -0000	1.12
> +++ pftijah_tokenize.l	27 Feb 2007 15:43:37 -0000	1.13
> @@ -115,7 +115,40 @@
>  
>  %%
>  
> -int useFlexScanner(char* buf, int len, struct tijahContextStruct* tjCtx) {
> +int OPT0useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
> +  /* UPDATE: this delivers very strange testset results and should not be
> +   * used I think.
> +   */
> +  /* This is an optimized version of the flex scanner which does not copy the
> +   * input buffer. The only strange thing about this interface is that it
> +   * requires 2 YY_END_OF_BUFFER_CHAR (eg. 0) at the end of the buffer. The
> +   * size of the buffer is inclusive the 2 0's.
> +   * The last zero is toggled with its original value to prevent corruption
> +   * of memory management tables. This was for me the only way to prevent
> +   * copying here.
> +   */
> +  int len = strlen(buf);
> +  char remember = buf[len+1];
> +  buf[len+1] = YY_END_OF_BUFFER_CHAR;
> +  YY_BUFFER_STATE myBuf = yy_scan_buffer(buf, len+2);
> +
> +  if ( !myBuf ) {
> +      stream_printf(GDKout,"# useFlexScanner: unable to get setup non-copy buffer.");
> +      return 0;
> +  }
> +  while ( pftijah_tokenizelex() ) {
> +      /* stream_printf(GDKout,"# scan(%s).\n",pftijah_tokenizetext); */
> +      if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
> +          return 0;
> +  }
> +  yy_delete_buffer(myBuf);
> +  buf[len+1] = remember;
> +  return 1;
> +}
> +
> +int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
> +  // the original
> +  int len = strlen(buf);
>    YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>    while (pftijah_tokenizelex()) {
>        if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
> @@ -125,6 +158,40 @@
>    return 1;
>  }
>  
> +int OPT2useFlexScanner(char* input, struct tijahContextStruct* tjctx)
> +{
> +    /* the fast function. This function is in the pftijah context with lots
> +     * of small strings to tokenize many times faster as the flex and the 
> +     * strtok() methods which seem to have a rather larger overhead
> +     */
> +    register char* s = input;
> +    register char x;
> +// #define EMIT x=*s; *s=0; stream_printf(GDKout,"#[%s]\n",base);if (!handleTijahTerm(tjctx,base)) return 0; *s=x
> +#define EMIT x=*s; *s=0; if (!handleTijahTerm(tjctx,base)) return 0; *s=x
> +
> +    while ( 1 ) {
> +      while ( isspace( *s ) ) s++;
> +      if ( *s ) {
> +	  char* base = s;
> +	  if ( isalnum(*s) ) {
> +	      if ( isdigit(*s) ) {
> +	          while ( isdigit(*++s) ) ;
> +		  EMIT;
> +	      } else {
> +	          if (isupper(*s)) *s=tolower(*s);
> +	          while ( isalnum(*++s) ) if (isupper(*s)) *s=tolower(*s);
> +		  EMIT;
> +	      }
> +	  } else {
> +	      // INCOMPLETE, ENTITIES HERE
> +	      // stream_printf(GDKout,"#[SKIPPING:%c]\n",*s);
> +	      s++;
> +	  }
> +      } else 
> +          return 1;
> +    }
> +}
> +
>  char* tijah_tokenize_string(char* buf, int len, char* outbuf) {
>    int cnt = 0;
>    YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
> @@ -137,9 +204,10 @@
>    return outbuf;
>  }
>  
> -char* flexScanOneTerm(char* buf, int len) {
> +char* flexScanOneTerm(char* buf) {
>    char *res;
>    char resBUFF[256];
> +  int len = strlen(buf);
>  
>    YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>    if ( pftijah_tokenizelex() ) {
> 
> Index: nexi.c
> ===================================================================
> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
> retrieving revision 1.49
> retrieving revision 1.50
> diff -u -d -r1.49 -r1.50
> --- nexi.c	23 Feb 2007 15:11:05 -0000	1.49
> +++ nexi.c	27 Feb 2007 15:43:37 -0000	1.50
> @@ -455,6 +455,7 @@
>      /*
>       * Now find out if the collection is fragmented or not.
>       */
> +    /* INCOMPLETE, ERROR HERE WITH REFCOUNTS IN HEAD */
>      BAT* fb = pftu_lookup_bat(pftu_batname1("tj_%s_fragments",(char*)parserCtx->collection,0));
>      if ( ! fb ) {
>             stream_printf(GDKerr,"Error: cannot find fragments bat for collection \"%s\".\n",parserCtx->collection);
> @@ -471,6 +472,8 @@
>                parserCtx->ffPfx        = "";
>                parserCtx->flastPfx     = ", str(1)";
>      }
> +    BBPunfix(BBPcacheid(fb));
> +    fb = NULL;
>      // Some special cases for NLLR, since NLLR only works with COARSE2 at the moment
>      if ( txt_retr_model->model == MODEL_NLLR ) {
>          // Switch to COARSE2 algebra for NLLR
> 
> Index: pftijah_util.mx
> ===================================================================
> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_util.mx,v
> retrieving revision 1.2
> retrieving revision 1.3
> diff -u -d -r1.2 -r1.3
> --- pftijah_util.mx	9 Jan 2007 17:15:23 -0000	1.2
> +++ pftijah_util.mx	27 Feb 2007 15:43:37 -0000	1.3
> @@ -73,6 +73,7 @@
>      if ( b == bat_nil ) {
>      	return NULL;
>      } else {
> +        BBPfix(b);
>      	return BBPdescriptor(b);
>      }
>  }
> 
> 
> -------------------------------------------------------------------------
> Take Surveys. Earn Cash. Influence the Future of IT
> Join SourceForge.net's Techsay panel and you'll get the chance to share your
> opinions on IT & business topics through brief surveys-and earn cash
> http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
> _______________________________________________
> Monetdb-pf-checkins mailing list
> Monetdb-pf-checkins at lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins


-- 
Sjoerd Mullender

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 370 bytes
Desc: OpenPGP digital signature
URL: <http://www.monetdb.org/pipermail/developers-list/attachments/20070227/da75987b/attachment.sig>


More information about the developers-list mailing list