[Monetdb-developers] [Monetdb-pf-checkins] pathfinder/modules/pftijah nexi.c, 1.49, 1.50 pftijah_tokenize.l, 1.12, 1.13 pftijah_util.mx, 1.2, 1.3 serialize_pftijah.mx, 1.41, 1.42

flokstra flokstra at cs.utwente.nl
Tue Feb 27 18:44:52 CET 2007


On 2/27/2007, "Sjoerd Mullender" <sjoerd at acm.org> wrote:

>On 2007-02-27 16:43, Jan Flokstra wrote:
>> Update of /cvsroot/monetdb/pathfinder/modules/pftijah
>> In directory sc8-pr-cvs7.sourceforge.net:/tmp/cvs-serv27686
>> 
>> Modified Files:
>> 	nexi.c pftijah_tokenize.l pftijah_util.mx serialize_pftijah.mx 
>> Log Message:
>> * repair BBP refcount bug for BAT
>
>Is this a fix which also applies to the stable branch?

I'm not sure yet. The bug only shows in the HEAD branch and does not
occur in the release branch. The problem was that I did:

bat b = BBPindex(......);
if ( b != bat_nil )
    return BBPdescriptor(b)

The refcount assert crash occurs in BBPdescriptor(). I used this
construction before and never had any problem. The bug made the
"Current" branch useless so I decided to (un)fix(:) it quickly with a
BBPfix() / BBPunfix(). I will try to figure out what to do next in the
near future. Maybe I will even try to consult the CWI people :-)

>
>> * reimplement the direct bat access methods in pftijah serialization for more
>>   speed (and clarity).
>> 
>> * Start optimizing the pftijah tokenizer. The flex functions are called once
>>   per handle_character() call. This leads to 2 malloc's per call. I tried to
>>   do without the malloc's but this led to a lot of strange results:-)
>>   I am now planning to craft the flexer by hand. The first small experiment
>>   shows there is a lot to gain there. (25% speedup in indexing time).
>> 
>> 
>> 
>> Index: serialize_pftijah.mx
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/serialize_pftijah.mx,v
>> retrieving revision 1.41
>> retrieving revision 1.42
>> diff -u -d -r1.41 -r1.42
>> --- serialize_pftijah.mx	23 Feb 2007 15:11:07 -0000	1.41
>> +++ serialize_pftijah.mx	27 Feb 2007 15:43:37 -0000	1.42
>> @@ -31,8 +31,8 @@
>>  
>>  extern int handleTijahTerm(struct tijahContextStruct *ctx, char* term);
>>  
>> -extern int useFlexScanner(char* buf, int len, struct tijahContextStruct* tjCtx); /* FLEX */
>> -extern char* flexScanOneTerm(char* buf, int len);
>> +extern int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx); /* FLEX */
>> +extern char* flexScanOneTerm(char* buf);
>>  
>>  extern char* normalizeTerm(struct tijahContextStruct *ctx, char* term );
>>  
>> @@ -70,15 +70,10 @@
>>  typedef struct dbat_struct {
>>  	const char*	name;
>>  	BAT*		bat;
>> -	int		oid_mark;
>> -	int		max_i;
>> -	int		max_sz;
>> -	bit		dflt;	  /* fill with default value during extend */
>> -	int		dflt_int; /* the default int value */
>> -	chr		dflt_chr; /* the default chr value */
>> -	oid		dflt_oid; /* the default oid value */
>> -	/* */
>> -        union { /* cast to perform direct indexex insert in [void,any] BATs */
>> +	oid		raw_max;
>> +	oid		seqbase;
>> +	oid		seq_max;
>> +        union { /* cast to perform direct indexe insert in [void,any] BATs */
>>              void* voidCAST; /* the basecast */
>>              chr*  chrCAST;  /* cast for [void,chr] BAT */
>>              int*  intCAST;  /* cast for [void,int] BAT */
>> @@ -89,7 +84,6 @@
>>  int dbat_init(const char* name, dbat* dbat, BAT* b) {
>>  	dbat->name = name;
>>  	dbat->bat  = b;
>> -	dbat->dflt = FALSE;
>>  	if ( dbat->bat->htype != TYPE_void ) {
>>  	    stream_printf(GDKerr,"ERROR: dbat_init(%s) non void BAT\n",dbat->name);
>>  	    return 0;
>> @@ -98,31 +92,25 @@
>>  	    stream_printf(GDKerr,"ERROR: dbat_init(%s) unknown ttype(%d)\n",dbat->name,dbat->bat->ttype);
>>  	    return 0;
>>  	}
>> -        dbat->oid_mark = b->hseqbase;
>> -	dbat->max_i = dbat->max_sz = BATcount(dbat->bat);
>> +        dbat->seqbase = (oid)b->hseqbase;
>> +	dbat->raw_max = (oid)BATcount(dbat->bat);
>> +	dbat->seq_max = dbat->raw_max + dbat->seqbase;
>>  	dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
>>  	/* */
>>  	return 1;
>>  }
>>  
>> -int dbat_finalize(dbat* dbat) {
>> -        BAT* b = dbat->bat;
>>  
>> +int dbat_finalize(dbat* dbat, int topidx) {
>>          void* top;
>> +        BAT* b = dbat->bat;
>>          
>> -        int bottomTop = dbat->max_i;
>> -	if ( 0 ) stream_printf(GDKout,"dbat_finalize(size=%d)\n",dbat->max_i);
>> +	topidx -= (int)dbat->seqbase;
>> +        int bottomTop = topidx;
>>          switch( b->ttype ) {
>>           case TYPE_int :
>>                  top = &dbat->cast.intCAST[bottomTop];
>>                  break;
>> -         case TYPE_chr: {
>> -                b->batBuns->free = dbat->max_i; 
>> -                BATsetcount(b, dbat->max_i);
>> -                b->tsorted = 0;
>> -		b->batDirty = TRUE; /* VERY important this one */
>> -                return 1;
>> -                }
>>           case TYPE_oid:
>>                  top = &dbat->cast.oidCAST[bottomTop];
>>                  break;
>> @@ -137,7 +125,7 @@
>>  	/* */
>>  	dbat->name  = NULL;
>>  	dbat->bat   = NULL;
>> -	dbat->max_i = dbat->max_sz = 0;
>> +	dbat->raw_max = dbat->seqbase = 0;
>>  	/* */
>>  	return 1;
>>  }
>> @@ -145,9 +133,14 @@
>>  #define MINCHUNK 8192
>>  #define MAXCHUNK 67108864
>>  
>> -int dbat_extend(dbat* dbat, int i_mark) {
>> -    /* CHECK THIS if ( i_mark ) i = i_mark - dbat->oid_mark; */
>> -    size_t newsize = MAX(MIN(MAX(MINCHUNK,dbat->max_sz*2),(size_t)(dbat->max_sz+MAXCHUNK)),(size_t)i_mark);
>> +int dbat_extend(dbat* dbat, oid min_i /*raw-index*/, size_t forced_size) {
>> +    size_t newsize;
>> +    
>> +    if ( forced_size ) {
>> +       newsize = forced_size;
>> +    } else {
>> +       newsize = MAX(MIN(MAX(MINCHUNK,dbat->raw_max*2),(size_t)(dbat->raw_max+MAXCHUNK)),(size_t)min_i);
>> +    }
>>  
>>      /* first check if the number of BUN's < INT_MAX. If this was the case
>>       * and the previous time INT_MAX was returned this means the BAT cannot
>> @@ -156,94 +149,34 @@
>>      if ( newsize > INT_MAX ) {
>>      	newsize = INT_MAX;
>>  
>> -	if ( dbat->max_sz == INT_MAX ) {
>> +	if ( dbat->raw_max == INT_MAX ) {
>>          	GDKerror("dbat_extend: BATextend[\"%s\"](size>INT_MAX) fails\n","incomplete");
>>  		return -1;
>>  	}
>>      }
>> -    if ( 0 ) { stream_printf(GDKout,"dbat_extend[%s](%d -> %d)\n",dbat->name,dbat->max_sz,newsize); }
>> -    dbat->max_sz= newsize;
>> +#if 0
>> +    stream_printf(GDKout,"dbat_extend[%s](%d -> %d)\n",dbat->name,dbat->raw_max,newsize);
>> +#endif
>> +    dbat->raw_max= newsize;
>> +    dbat->seq_max = dbat->raw_max + dbat->seqbase;
>>      if ( !(dbat->bat = BATextend(dbat->bat,newsize)) ) {
>>          GDKerror("dbat_extend: BATextend[\"%s\"](to %d) fails\n","incomplete",newsize);
>>          return -1;
>>      }
>>      dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
>> -    /*
>> -     * now check if there's a default value handler used  
>> -     *
>> -     */
>> -    if ( dbat->dflt ) {
>> -        switch( dbat->bat->ttype ) {
>> -         case TYPE_int : {
>> -		int v   = dbat->dflt_int;
>> -		int *to = &dbat->cast.intCAST[dbat->max_sz];
>> -		for(register int *p = &dbat->cast.intCAST[dbat->max_i]; p<to;)
>> -		    *p++ = v;
>> -                break;
>> -		}
>> -         case TYPE_chr: {
>> -		chr v   = dbat->dflt_chr;
>> -		chr *to = &dbat->cast.chrCAST[dbat->max_sz];
>> -		for(register chr *p = &dbat->cast.chrCAST[dbat->max_i]; p<to;)
>> -		    *p++ = v;
>> -                break;
>> -                }
>> -         case TYPE_oid: {
>> -		oid v   = dbat->dflt_oid;
>> -		oid *to = &dbat->cast.oidCAST[dbat->max_sz];
>> -		for(register oid *p = &dbat->cast.oidCAST[dbat->max_i]; p<to;)
>> -		    *p++ = v;
>> -                break;
>> -		}
>> -         default:
>> -                GDKerror("dbat_extend: bad ttype\n");
>> -                return -1;
>> -        }
>> -    }
>> -    /* */
>>      return 1;
>>  }
>>  
>>  int dbat_sizeHint(dbat* dbat, int sizeHint_mark) {
>> -        int sizeHint = sizeHint_mark - dbat->oid_mark;
>> -	int estimate = dbat->max_i + sizeHint;
>> -
>> -	return dbat_extend(dbat, estimate);
>> -}
>> -
>> -INLINE static int dbat_set_oid(dbat* dbat, int pos_mark, oid v) {
>> -	register int pos;
>> +        int sizeHint = sizeHint_mark - dbat->seqbase;
>> +	int estimate = dbat->raw_max + sizeHint;
>>  
>> -	if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
>> -	    dbat->cast.oidCAST[pos] = v;
>> -	    return 1;
>> -	} else {
>> -	    if ( pos >= dbat->max_sz ) {
>> -		if ( dbat_extend(dbat,pos) < 0 )
>> -		    return -1;
>> -	    }
>> -	    dbat->max_i = pos + 1;
>> -	    dbat->cast.oidCAST[pos] = v;
>> -	    return 1;
>> -	}
>> +	return dbat_extend(dbat, estimate, 0);
>>  }
>>  
>> -INLINE static int dbat_set_int(dbat* dbat, int pos_mark, int v) {
>> -	register int pos;
>> +#define dbat_set_oid(DBAT,I,V) (DBAT)->cast.oidCAST[I-(DBAT)->seqbase] = V
>>  
>> -	if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
>> -	    dbat->cast.intCAST[pos] = v;
>> -	    return 1;
>> -	} else {
>> -	    if ( pos >= dbat->max_sz ) {
>> -		if ( dbat_extend(dbat,pos) < 0 )
>> -		    return -1;
>> -	    }
>> -	    dbat->max_i = pos + 1;
>> -	    dbat->cast.intCAST[pos] = v;
>> -	    return 1;
>> -	}
>> -}
>> +#define dbat_set_int(DBAT,I,V) (DBAT)->cast.intCAST[I-(DBAT)->seqbase] = V
>>  
>>  /************************************************
>>   *
>> @@ -315,9 +248,6 @@
>>  
>>  /************************************************
>>   *
>> - *
>> - * First the temporary shredder for Tijah by JF
>> - *
>>   */
>>  
>>  INLINE static oid
>> @@ -328,15 +258,15 @@
>>      BUN bun;
>>  
>>      HASHfnd_str(bun, tjctx->hm_globalTag, (str)t);
>> -    if ( bun )
>> -    /* if ( (bun = BUNfnd(tjctx->hm_globalTag,t)) ) OLD */
>> +    if ( bun ) {
>>          return *(oid*)BUNtail(tjctx->hm_globalTag,bun);
>> -    else {
>> -    	if ( !BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, FALSE) ) {
>> +    } else {
>> +    	if ( BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, FALSE) ) {
>> +    	    return tjctx->n_globalTag++;
>> +        } else {
>>      	    GDKerror("INSERT OF \"%s\" in globalTag fails.\n");
>>      	    return oid_nil;
>> -        } else
>> -    	    return tjctx->n_globalTag++;
>> +        }
>>      }
>>  #endif
>>  }
>> @@ -349,10 +279,7 @@
>>     BUN bun;
>>  
>>     HASHfnd_str(bun, tjctx->hm_globalTerm, (str)t);
>> -   if ( bun ) {
>> -       return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
>> -   } else
>> -       return oid_nil;
>> +   return ( bun ? *(oid*)BUNtail(tjctx->hm_globalTerm,bun) : oid_nil );
>>  }
>>  
>>  INLINE static oid
>> @@ -366,22 +293,35 @@
>>      if ( bun )
>>          return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
>>      else {
>> -    	if ( !BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, FALSE)){
>> +    	if ( BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, FALSE)){
>> +    	    return tjctx->n_globalTerm++;
>> +        } else { 
>>      	    GDKerror("INSERT OF \"%s\" in globalTerm fails.\n");
>>      	    return oid_nil;
>> -        } else 
>> -    	    return tjctx->n_globalTerm++;
>> +        }
>>      }
>>  #endif
>>  }
>>  
>> -#define tj_add2plane(TJCTX,O) \
>> -    ((dbat_set_oid(&(TJCTX)->dbat_collPre, (TJCTX)->tijahPre, O) < 0) \
>> -    	   	? oid_nil : ((oid)(TJCTX)->tijahPre++))
>> +INLINE oid tj_extend_plane(struct tijahContextStruct *tjctx) {
>> +    oid base = tjctx->tijahPre - tjctx->dbat_collPre.seqbase; 
>>  
>> -#define insertPreSize(TJCTX,POS,SIZE) \
>> -    dbat_set_int(&TJCTX->dbat_collSize,(int)POS,SIZE)
>> +    if ( base >= tjctx->dbat_collPre.raw_max ) {
>> +	if ( dbat_extend(&tjctx->dbat_collPre,base, 0) < 0 )
>> +	    return oid_nil;
>> +	/* IMPORTANT: the size of the two bats is synchronized by the use
>> +	 * of the forced size (last) parameter of dbat_extend
>> +	 */
>> +	if ( dbat_extend(&tjctx->dbat_collSize,base,tjctx->dbat_collPre.raw_max) < 0 )
>> +	    return oid_nil;
>> +    }
>> +    return tjctx->tijahPre++;
>> +}
>>  
>> +#define tj_newPre(TJCTX) \
>> +	(((TJCTX)->tijahPre < (TJCTX)->dbat_collPre.seq_max) \
>> +	? \
>> +	((oid)(TJCTX)->tijahPre++) : tj_extend_plane(TJCTX))
>>  
>>  int
>>  handleTijahTerm(struct tijahContextStruct *tjctx, char* term) {
>> @@ -397,13 +337,13 @@
>>  	    }
>>            }
>>            if ( (termOid = tj_termOid(tjctx, term)) == oid_nil )
>> -    	    return 0;
>> +    	    return -1;
>>  	}
>>  	if ( termOid ) { /* term is not a stopword */
>> -            if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
>> +            if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
>>      	        return 0;
>> -            if ( insertPreSize(tjctx,tjPre,0) < 0 )
>> -       	        return -1;
>> +            dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
>> +            dbat_set_int(&tjctx->dbat_collSize,(int)tjPre,0);
>>  #ifdef TJ_TRACE
>>              if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:TERM: \"%s\", termoid=%d, tjPre=%d\n",tjctx->name,term,termOid,tjPre);
>>  #endif
>> @@ -416,40 +356,13 @@
>>       return 1;
>>  }
>>  
>> -/************
>> - *
>> - * The part where the Strings from Pathfinder are shredded into words
>> - * by Tijah. The USE_FLEX macro determines if the strings is shredded
>> - * by Hennings fancy flex scanner or Jan's simple strtok() scanner.
>> - */
>> -
>> -const char* obsoleteNexiChars = " \t\n\r,:;&*%$#!@=";
>> -
>> -int 
>> -useStrtokScanner(tjCtx* tjctx, char* s)
>> -{
>> -    char *t;
>> -    int  sz = 0;
>> -
>> -#ifdef TJ_TRACE
>> -    if (TJ_TRACE) stream_printf(GDKout,"C[%s]:CHARACTERS:\n",tjctx->name);
>> -#endif
>> -    if ( (t = strtok(s,obsoleteNexiChars)) ) do {
>> -	/* not the empty string here */
>> -        if ( handleTijahTerm(tjctx,t) < 0 )
>> -             return -1;
>> -    	sz++;
>> -    } while ( (t=strtok(NULL,obsoleteNexiChars)) );
>> -    return 1;
>> -}
>> -
>>  /************************************************
>>   *
>>   * Now the real output handlers
>>   */
>>  
>>  
>> -#ifdef notused
>> +#if 0
>>  static int
>>  handle_sizeHint(XqueryCtx* ctx, int hinted_size) {
>>      tjCtx* tjctx = (tjCtx*)ctx->driverWs;
>> @@ -502,14 +415,12 @@
>>  	    return (str)str_nil;
>>  }
>>  
>> -#define GUESSFORCE FALSE
>> -
>>  /* 
>>   * Replace the value of a collection parameter int the collection parameter
>>   * bat
>>   */
>>  static int replaceCollParam(tjCtx* tjctx, str param, str val) {
>> -	return ( BUNreplace(tjctx->b_collParam,param,val,GUESSFORCE) != NULL );
>> +	return ( BUNreplace(tjctx->b_collParam,param,val,FALSE) != NULL );
>>  }
>>  
>>  static BAT*
>> @@ -894,10 +805,10 @@
>>      /* if ( DOEMIT(tjctx) ) { */
>>          if ( (termOid = tj_tagOid(tjctx, name)) == oid_nil )
>>      	    return 0;
>> -        if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
>> +        if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
>>      	    return 0;
>> +        dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
>>          if ( tj_pushTag(tjctx,tjPre) < 0 ) return 0;
>> -        if ( 0 ) stream_printf(GDKout,"C[%s]:startElement: \"%s\", termoid=%d, Tijah pre#=%d, Pathfinder pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
>>  #ifdef TJ_TRACE
>>          if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:startElement: \"%s\", termoid=%d, Tijah pre#=%d, Pathfinder pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
>>  #endif
>> @@ -913,8 +824,7 @@
>>      --tjctx->doc_height;
>>      oid start = tj_popTag(tjctx); /* oid of the first node of the element */
>>      int size  = tjctx->tijahPre - start - 1; /* the Tijah element size */
>> -    if ( insertPreSize(tjctx,start,size) < 0 )
>> -	    return 0;
>> +    dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
>>  #ifdef TJ_TRACE
>>      if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:empty_endElement: \"%s\"\n", tjctx->name,"");
>>  #endif
>> @@ -934,8 +844,7 @@
>>      /* if ( DOEMIT(tjctx) ) { */
>>          oid start = tj_popTag(tjctx); /* oid of the first node of the element */
>>          int size  = tjctx->tijahPre - start - 1; /* the Tijah element size */
>> -	if ( insertPreSize(tjctx,start,size) < 0 )
>> -	    return 0;
>> +	dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
>>  #ifdef TJ_TRACE
>>          if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:endElement: \"%s\"\n", tjctx->name,name);
>>  #endif
>> @@ -944,8 +853,6 @@
>>      return 1;
>>  }
>>  
>> -#define USE_FLEX 1
>> -
>>  /**
>>   * Output generation handler. Handles equivalent of * SAX characters() event.
>>   */
>> @@ -954,28 +861,23 @@
>>      EMPTY_CHECK;
>>      tjCtx* tjctx = (tjCtx*)ctx->driverWs;
>>  
>> +    register char* p = (char*)ch;
>> +    while( *p && isspace(*p) ) p++;
>> +    if ( !*p )
>> +        return 1;
>>  #ifdef TJ_TRACE
>> -    if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s) start\n",tjctx->name, (char*)ch);
>> +    if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s) start\n",tjctx->name, p);
>>  #endif
>>  
>>      if ( DOEMIT(tjctx) ) {
>> -#ifdef USE_FLEX
>> -        return useFlexScanner((char*)ch,strlen((char*)ch),tjctx);
>> -#else
>> -        return useStrtokScanner(tjctx,(char*)ch);
>> -#endif
>> +        return useFlexScanner(p,tjctx);
>>      }
>>      return 1;
>>  }
>>  
>>  char* normalizeTerm(struct tijahContextStruct *tjctx, char* term ) {
>>  	char *res;
>> -#ifdef USE_FLEX
>> -        res = flexScanOneTerm((char*)term,strlen((char*)term));
>> -#else
>> -	res = strtok(term,obsoleteNexiChars);
>> -#endif
>> -	/* INCOMPLETE, should make shure tijahContext is always avail. here */
>> +        res = flexScanOneTerm((char*)term);
>>          if ( res && tjctx && tjctx->stemCtx->stem) {
>>      	    if ( !(res = (char*)tjctx->stemCtx->stem(tjctx->stemCtx,(char*)res)) ) {
>>  	        /* must be a stopword */
>> @@ -986,14 +888,6 @@
>>  }
>>  
>>  int CMDtj_normalizeTerm(char** res, str term, str stemmer) {
>> -//Leave tokenization disabled for now
>> -//    char* tokenized;
>> -//#ifdef USE_FLEX
>> -//    tokenized = flexScanOneTerm(term,strlen(term));
>> -//#else
>> -//    tokenized = strtok(term,obsoleteNexiChars);
>> -//#endif
>> -
>>      tjStemCtx* stemCtx = getStemmingContext( stemmer );
>>  
>>      if ( stemCtx->stem ) {
>> @@ -1123,13 +1017,9 @@
>>  #ifdef TJ_TRACE
>>  	if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINISH INDEXING\n",tjctx->name);
>>  #endif
>> -
>> -        /* feature not used anymore ????? */
>> -	if ( 0 /* ROEL CHANGE VIRTUAL ROOT SIZE HERE */ )
>> -	        insertPreSize(tjctx,0,tjctx->tijahPre - 1);
>> -	if ( dbat_finalize(&tjctx->dbat_collPre) < 0 )
>> +	if ( dbat_finalize(&tjctx->dbat_collPre, tjctx->tijahPre) < 0 )
>>  		return GDK_FAIL;
>> -	if ( dbat_finalize(&tjctx->dbat_collSize) < 0 )
>> +	if ( dbat_finalize(&tjctx->dbat_collSize, tjctx->tijahPre) < 0 )
>>  		return GDK_FAIL;
>>  #ifdef TJ_TRACE
>>  	if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINALIZED DIRECT BATS\n",tjctx->name);
>> 
>> Index: pftijah_tokenize.l
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
>> retrieving revision 1.12
>> retrieving revision 1.13
>> diff -u -d -r1.12 -r1.13
>> --- pftijah_tokenize.l	9 Jan 2007 15:44:39 -0000	1.12
>> +++ pftijah_tokenize.l	27 Feb 2007 15:43:37 -0000	1.13
>> @@ -115,7 +115,40 @@
>>  
>>  %%
>>  
>> -int useFlexScanner(char* buf, int len, struct tijahContextStruct* tjCtx) {
>> +int OPT0useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
>> +  /* UPDATE: this delivers very strange testset results and should not be
>> +   * used I think.
>> +   */
>> +  /* This is an optimized version of the flex scanner which does not copy the
>> +   * input buffer. The only strange thing about this interface is that it
>> +   * requires 2 YY_END_OF_BUFFER_CHAR (eg. 0) at the end of the buffer. The
>> +   * size of the buffer is inclusive the 2 0's.
>> +   * The last zero is toggled with its original value to prevent corruption
>> +   * of memory management tables. This was for me the only way to prevent
>> +   * copying here.
>> +   */
>> +  int len = strlen(buf);
>> +  char remember = buf[len+1];
>> +  buf[len+1] = YY_END_OF_BUFFER_CHAR;
>> +  YY_BUFFER_STATE myBuf = yy_scan_buffer(buf, len+2);
>> +
>> +  if ( !myBuf ) {
>> +      stream_printf(GDKout,"# useFlexScanner: unable to get setup non-copy buffer.");
>> +      return 0;
>> +  }
>> +  while ( pftijah_tokenizelex() ) {
>> +      /* stream_printf(GDKout,"# scan(%s).\n",pftijah_tokenizetext); */
>> +      if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
>> +          return 0;
>> +  }
>> +  yy_delete_buffer(myBuf);
>> +  buf[len+1] = remember;
>> +  return 1;
>> +}
>> +
>> +int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
>> +  // the original
>> +  int len = strlen(buf);
>>    YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>>    while (pftijah_tokenizelex()) {
>>        if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
>> @@ -125,6 +158,40 @@
>>    return 1;
>>  }
>>  
>> +int OPT2useFlexScanner(char* input, struct tijahContextStruct* tjctx)
>> +{
>> +    /* the fast function. This function is in the pftijah context with lots
>> +     * of small strings to tokenize many times faster as the flex and the 
>> +     * strtok() methods which seem to have a rather larger overhead
>> +     */
>> +    register char* s = input;
>> +    register char x;
>> +// #define EMIT x=*s; *s=0; stream_printf(GDKout,"#[%s]\n",base);if (!handleTijahTerm(tjctx,base)) return 0; *s=x
>> +#define EMIT x=*s; *s=0; if (!handleTijahTerm(tjctx,base)) return 0; *s=x
>> +
>> +    while ( 1 ) {
>> +      while ( isspace( *s ) ) s++;
>> +      if ( *s ) {
>> +	  char* base = s;
>> +	  if ( isalnum(*s) ) {
>> +	      if ( isdigit(*s) ) {
>> +	          while ( isdigit(*++s) ) ;
>> +		  EMIT;
>> +	      } else {
>> +	          if (isupper(*s)) *s=tolower(*s);
>> +	          while ( isalnum(*++s) ) if (isupper(*s)) *s=tolower(*s);
>> +		  EMIT;
>> +	      }
>> +	  } else {
>> +	      // INCOMPLETE, ENTITIES HERE
>> +	      // stream_printf(GDKout,"#[SKIPPING:%c]\n",*s);
>> +	      s++;
>> +	  }
>> +      } else 
>> +          return 1;
>> +    }
>> +}
>> +
>>  char* tijah_tokenize_string(char* buf, int len, char* outbuf) {
>>    int cnt = 0;
>>    YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>> @@ -137,9 +204,10 @@
>>    return outbuf;
>>  }
>>  
>> -char* flexScanOneTerm(char* buf, int len) {
>> +char* flexScanOneTerm(char* buf) {
>>    char *res;
>>    char resBUFF[256];
>> +  int len = strlen(buf);
>>  
>>    YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>>    if ( pftijah_tokenizelex() ) {
>> 
>> Index: nexi.c
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
>> retrieving revision 1.49
>> retrieving revision 1.50
>> diff -u -d -r1.49 -r1.50
>> --- nexi.c	23 Feb 2007 15:11:05 -0000	1.49
>> +++ nexi.c	27 Feb 2007 15:43:37 -0000	1.50
>> @@ -455,6 +455,7 @@
>>      /*
>>       * Now find out if the collection is fragmented or not.
>>       */
>> +    /* INCOMPLETE, ERROR HERE WITH REFCOUNTS IN HEAD */
>>      BAT* fb = pftu_lookup_bat(pftu_batname1("tj_%s_fragments",(char*)parserCtx->collection,0));
>>      if ( ! fb ) {
>>             stream_printf(GDKerr,"Error: cannot find fragments bat for collection \"%s\".\n",parserCtx->collection);
>> @@ -471,6 +472,8 @@
>>                parserCtx->ffPfx        = "";
>>                parserCtx->flastPfx     = ", str(1)";
>>      }
>> +    BBPunfix(BBPcacheid(fb));
>> +    fb = NULL;
>>      // Some special cases for NLLR, since NLLR only works with COARSE2 at the moment
>>      if ( txt_retr_model->model == MODEL_NLLR ) {
>>          // Switch to COARSE2 algebra for NLLR
>> 
>> Index: pftijah_util.mx
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_util.mx,v
>> retrieving revision 1.2
>> retrieving revision 1.3
>> diff -u -d -r1.2 -r1.3
>> --- pftijah_util.mx	9 Jan 2007 17:15:23 -0000	1.2
>> +++ pftijah_util.mx	27 Feb 2007 15:43:37 -0000	1.3
>> @@ -73,6 +73,7 @@
>>      if ( b == bat_nil ) {
>>      	return NULL;
>>      } else {
>> +        BBPfix(b);
>>      	return BBPdescriptor(b);
>>      }
>>  }
>> 
>> 
>> -------------------------------------------------------------------------
>> Take Surveys. Earn Cash. Influence the Future of IT
>> Join SourceForge.net's Techsay panel and you'll get the chance to share your
>> opinions on IT & business topics through brief surveys-and earn cash
>> http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
>> _______________________________________________
>> Monetdb-pf-checkins mailing list
>> Monetdb-pf-checkins at lists.sourceforge.net
>> https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins
>
>
>-- 
>Sjoerd Mullender
>




More information about the developers-list mailing list