[Monetdb-developers] [Monetdb-checkins] MonetDB/src/gdk gdk_posix.mx, Feb2010, 1.176.2.24, 1.176.2.25 gdk_storage.mx, Feb2010, 1.149.2.36, 1.149.2.37

Stefan Manegold Stefan.Manegold at cwi.nl
Fri Feb 19 16:59:24 CET 2010


On Fri, Feb 19, 2010 at 03:40:10PM +0000, Peter Boncz wrote:
> Update of /cvsroot/monetdb/MonetDB/src/gdk
> In directory sfp-cvsdas-1.v30.ch3.sourceforge.com:/tmp/cvs-serv10221
> 
> Modified Files:
>       Tag: Feb2010
> 	gdk_posix.mx gdk_storage.mx 
> Log Message:
> - fix newly introduced bug (BUF_SEQUENTIAL passed to posix_madvise instead of MMAP_SEQUENTIAL)
>   most of this was done by Stefan, but the intent of the code was, once all users
>   are gone, to give uniform (sequential) advice to the whole heap
> - make sure the length passed with the advice is page aligned (not strictly needed on
>   Linux, but maybe on other Un*ces)
> - in case of shared vheaps (as done by leftfetchjoin into a string bat), assume that
>   access to the string heap will be random.

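Shouldn't
	len & ~MT_pagesize()
rather be
	len & ~(MT_pagesize()-1)

(assuming the page size is, as usual, a power of 2)?

With a power-of-2 page size, ~MT_pagesize() clears only the single bit that
equals the page size, whereas ~(MT_pagesize()-1) clears all lower-order bits
and thus rounds len down to a multiple of the page size.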

Stefan


> 
> 
> Index: gdk_storage.mx
> ===================================================================
> RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_storage.mx,v
> retrieving revision 1.149.2.36
> retrieving revision 1.149.2.37
> diff -u -d -r1.149.2.36 -r1.149.2.37
> --- gdk_storage.mx	19 Feb 2010 13:13:38 -0000	1.149.2.36
> +++ gdk_storage.mx	19 Feb 2010 15:40:07 -0000	1.149.2.37
> @@ -707,17 +707,19 @@
>   * Peter Feb2010: I tried to do prefetches further apart, to trigger multiple readahead
>   *                units in parallel, but it does improve performance visibly 
>   */
> -static size_t access_heap(str id, str hp, Heap *h, char* base, size_t sz, int touch, int preload, int advise) {
> +static size_t access_heap(str id, str hp, Heap *h, char* base, size_t sz, int touch, int preload, int adv) {
>  	size_t v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 =0, v6 = 0, v7 = 0, page = MT_pagesize();
> +	str advise = (adv==MMAP_WILLNEED)?"WILLNEED":(adv==MMAP_SEQUENTIAL)?"SEQUENTIAL":(adv==MMAP_RANDOM)?"RANDOM":(adv==MMAP_NORMAL)?"NORMAL":NULL;
>  	int t = GDKms();
> +	assert(advise);
>  	if (h->storage != STORE_MEM && h->size > MT_MMAP_TILE) {
> -		MT_mmap_inform(h->base, h->size, preload, advise, 0);
> +		MT_mmap_inform(h->base, h->size, preload, adv, 0);
>  		if (preload > 0) {
> -			void* alignedbase = (void*) (((size_t) base) & ~(page-1));
> -			size_t alignedsz = (sz + (page-1)) & ~(page-1);
> -                	int ret = posix_madvise(alignedbase, sz, advise);
> -        		if (ret) THRprintf(GDKerr, "#MT_mmap_inform: posix_madvise(file=%s, base="PTRFMT", len="SZFMT"MB, advice=%d) = %d\n", 
> -					h->filename, PTRFMTCAST alignedbase, alignedsz >> 20, advise, errno);
> +			size_t alignskip = (page - (((size_t) base) & (page-1))) & (page-1);
> +			size_t alignedsz = (size_t) (((sz < alignskip)?0:((size_t) (sz-alignskip))) & ~(page-1));
> +                	int ret = posix_madvise(base + alignskip, alignedsz, adv);
> +        		if (ret) THRprintf(GDKerr, "#MT_mmap_inform: posix_madvise(file=%s, base="PTRFMT", len="SZFMT"MB, advice=%s) = %d\n", 
> +					h->filename, PTRFMTCAST (base + alignskip), alignedsz >> 20, advise, errno);
>  		}
>  	}
>  	if (touch && preload > 0) {
> @@ -731,8 +733,7 @@
>  		}
>  		for (hi += 7*page; lo <= hi; lo +=page) v0 += *lo;
>  	}
> -	IODEBUG THRprintf(GDKout,"#BATpreload(%s->%s,preload=%d,sz=%dMB,%s) = %dms \n", id, hp, preload, (int) (sz>>20), 
> -		(advise==MMAP_WILLNEED)?"WILLNEED":(advise==MMAP_SEQUENTIAL)?"SEQUENTIAL":"UNKNOWN", GDKms()-t);
> +	IODEBUG THRprintf(GDKout,"#BATpreload(%s->%s,preload=%d,sz=%dMB,%s) = %dms \n", id, hp, preload, (int) (sz>>20), advise, GDKms()-t);
>  	return v0+v1+v2+v3+v4+v5+v6+v7;
>  }
>  
> @@ -743,7 +744,6 @@
>  	str id = BATgetId(b);
>  	BATiter bi = bat_iterator(b);
>  
> -	assert(advise==MMAP_NORMAL||advise==MMAP_RANDOM||advise==MMAP_SEQUENTIAL||advise==MMAP_WILLNEED||advise==MMAP_DONTNEED);
>  	if (BATcount(b) == 0) return 0;
>  
>  	/* HASH indices (inherent random access). handle first as they *will* be access randomly (one can always hope for locality on the other heaps) */
> @@ -760,30 +760,46 @@
>  		gdk_unset_lock(GDKhashLock(ABS(b->batCacheid) & BBP_BATMASK), "BATaccess");
>  	}
>  
> -	/* we only touch stuff that is going to be read randomly (WILLNEED). Note varheaps are sequential wrt to the references, or small */
> -	if ( what&USE_HEAD) {
> +	/* vheaps next, as shared vheaps are not seq-correlated needing WILLNEED (use prefetch budget for this first) */
> +	if ( what&USE_HEAD ) {
> +		if (b->H->vheap && b->H->vheap->base) {
> +			char *lo = BUNhead(bi, BUNfirst(b)), *hi = BUNhead(bi, BUNlast(b)-1);
> +			int heap_advise = advise;
> +			if (b->H->vheap->copied) { /* shared string heaps are not (likely) to be sequentially correlated */
> +				lo = b->H->vheap->base; hi = lo + b->H->vheap->free;
> +				heap_advise = MADV_WILLNEED;
> +			}
> +			budget -= sz = ((hi-lo) > budget)?budget:(hi-lo);
> +			v += access_heap(id, "hheap", b->H->vheap, lo, sz, (advise == BUF_WILLNEED), preload, heap_advise);
> +		}
> +	}
> +	if ( what&USE_TAIL ) {
> +		if (b->T->vheap && b->T->vheap->base) {
> +			char *lo = BUNtail(bi, BUNfirst(b)), *hi = BUNtail(bi, BUNlast(b)-1);
> +			int heap_advise = advise;
> +			if (b->T->vheap->copied) { /* shared string heaps are not (likely) to be sequentially correlated */
> +				lo = b->T->vheap->base; hi = lo + b->T->vheap->free;
> +				heap_advise = MADV_WILLNEED;
> +			}
> +			budget -= sz = ((hi-lo) > budget)?budget:(hi-lo);
> +			v += access_heap(id, "theap", b->T->vheap, lo, sz, (advise == BUF_WILLNEED), preload, heap_advise);
> +		}
> +	}
> +
> +	/* BUN heaps are last in line for prefetch budget */
> +	if ( what&USE_HEAD ) {
>  		if (b->H->heap.base) {
>  			char *lo = BUNhloc(bi, BUNfirst(b)), *hi = BUNhloc(bi, BUNlast(b)-1);
>  			budget -= sz = ((hi-lo) > budget)?budget:(hi-lo);
>  			v += access_heap(id, "hbuns", &b->H->heap, lo, sz, (advise == MMAP_WILLNEED), preload, advise);
>  		}
> -		if (b->H->vheap && b->H->vheap->base) {
> -			char *lo = BUNhead(bi, BUNfirst(b)), *hi = BUNhead(bi, BUNlast(b)-1);
> -			budget -= sz = ((hi-lo) > budget)?budget:(hi-lo);
> -			v += access_heap(id, "hheap", b->H->vheap, lo, sz, (advise == MMAP_WILLNEED), preload, advise);
> -		}
>  	}
> -	if ( what&USE_TAIL) {
> +	if ( what&USE_TAIL ) {
>  		if (b->T->heap.base) {
>  			char *lo = BUNtloc(bi, BUNfirst(b)), *hi = BUNtloc(bi, BUNlast(b)-1);
>  			budget -= sz = ((hi-lo) > budget)?budget:(hi-lo);
>  			v += access_heap(id, "tbuns", &b->T->heap, lo, sz, (advise == MMAP_WILLNEED), preload, advise);
>  		}
> -		if (b->T->vheap && b->T->vheap->base) {
> -			char *lo = BUNtail(bi, BUNfirst(b)), *hi = BUNtail(bi, BUNlast(b)-1);
> -			budget -= sz = ((hi-lo) > budget)?budget:(hi-lo);
> -			v += access_heap(id, "theap", b->T->vheap, lo, sz, (advise == MMAP_WILLNEED), preload, advise);
> -		}
>  	}
>  	return v;
>  }
> 
> Index: gdk_posix.mx
> ===================================================================
> RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_posix.mx,v
> retrieving revision 1.176.2.24
> retrieving revision 1.176.2.25
> diff -u -d -r1.176.2.24 -r1.176.2.25
> --- gdk_posix.mx	19 Feb 2010 13:13:38 -0000	1.176.2.24
> +++ gdk_posix.mx	19 Feb 2010 15:40:07 -0000	1.176.2.25
> @@ -675,7 +675,7 @@
>  #ifdef HAVE_POSIX_FADVISE
>  			if (!do_not_use_posix_fadvise && MT_mmap_tab[victim].fd >= 0) {
>  				/* tell the OS quite clearly that you want to drop this */
> -				ret = posix_fadvise(MT_mmap_tab[victim].fd, 0LL, MT_mmap_tab[victim].len, POSIX_FADV_DONTNEED);
> +				ret = posix_fadvise(MT_mmap_tab[victim].fd, 0LL, MT_mmap_tab[victim].len & ~MT_pagesize(), POSIX_FADV_DONTNEED);
>  #ifdef MMAP_DEBUG
>  				stream_printf(GDKerr, "#MT_mmap_del: posix_fadvise(%s,fd=%d,%uMB,POSIX_FADV_DONTNEED) = %d\n", MT_mmap_tab[victim].path, MT_mmap_tab[victim].fd, (unsigned int) (MT_mmap_tab[victim].len >> 20), ret);
>  #endif
> @@ -709,7 +709,7 @@
>  		i = MT_mmap_idx(base, len);
>  		if (i >= 0) {
>  			if (MT_mmap_tab[i].fd >= 0) {
> -				ret = posix_fadvise(MT_mmap_tab[i].fd, 0, len, advice);
> +				ret = posix_fadvise(MT_mmap_tab[i].fd, 0, len & ~MT_pagesize(), advice);
>  #ifdef MMAP_DEBUG
>  				stream_printf(GDKerr, "#MT_fadvise: posix_fadvise(%s,fd=%d,%uMB,%d) = %d\n", MT_mmap_tab[i].path, MT_mmap_tab[i].fd, (unsigned int) (len >> 20), advice, ret);
>  #endif
> @@ -733,7 +733,7 @@
>  {
>  	size_t len = MIN((size_t) MT_MMAP_TILE, MT_mmap_tab[i].len - off);
>  	/* tell Linux to please stop caching this stuff */
> -	int ret = posix_madvise(MT_mmap_tab[i].base + off, len, POSIX_MADV_DONTNEED);
> +	int ret = posix_madvise(MT_mmap_tab[i].base + off, len & ~MT_pagesize(), POSIX_MADV_DONTNEED);
>  
>  	if (err) {
>  		stream_printf(err, "#MT_mmap_unload_tile: posix_madvise(%s,off=%uMB,%uMB,fd=%d,POSIX_MADV_DONTNEED) = %d\n",
> @@ -743,7 +743,7 @@
>  #ifdef HAVE_POSIX_FADVISE
>  	if (!do_not_use_posix_fadvise) {
>  		/* tell the OS quite clearly that you want to drop this */
> -		ret = posix_fadvise(MT_mmap_tab[i].fd, off, len, POSIX_FADV_DONTNEED);
> +		ret = posix_fadvise(MT_mmap_tab[i].fd, off, len & ~MT_pagesize(), POSIX_FADV_DONTNEED);
>  		if (err) {
>  			stream_printf(err, "#MT_mmap_unload_tile: posix_fadvise(%s,off=%uMB,%uMB,fd=%d,POSIX_MADV_DONTNEED) = %d\n",
>  				      MT_mmap_tab[i].path, (unsigned int) (off >> 20),
> @@ -908,10 +908,9 @@
>  		MT_mmap_tab[i].random += preload * (advise == MMAP_WILLNEED); /* done as a counter to keep track of multiple threads */
>  		MT_mmap_tab[i].usecnt += preload; /* active thread count */
>  		unload = MT_mmap_tab[i].usecnt == 0;
> +		if (unload) ret = posix_madvise(MT_mmap_tab[i].base, MT_mmap_tab[i].len & ~MT_pagesize(), MMAP_SEQUENTIAL);
>  	}
>  	(void) pthread_mutex_unlock(&MT_mmap_lock);
> -	if (unload)
> -		ret = posix_madvise(base, len, MMAP_SEQUENTIAL);
>  	if (ret) {
>  		stream_printf(GDKerr, "#MT_mmap_inform: posix_madvise(file=%s, fd=%d, base="PTRFMT", len="SZFMT"MB, advice=%d) = %d\n",
>  			      (i >= 0 ? MT_mmap_tab[i].path : ""), (i >= 0 ? MT_mmap_tab[i].fd : -1),
> @@ -1156,7 +1155,7 @@
>  
>  	if (ret != (void *) -1L) {
>  		if (hdl->mode & MMAP_ADVISE) {
> -			(void) MT_madvise(ret, len, hdl->mode & MMAP_ADVISE);
> +			(void) MT_madvise(ret, len & ~MT_pagesize(), hdl->mode & MMAP_ADVISE);
>  		}
>  		hdl->fixed = (void *) ((char *) ret + len);
>  	}
> @@ -1199,7 +1198,7 @@
>  int
>  MT_madvise(void *p, size_t len, int advise)
>  {
> -	int ret = posix_madvise(p, len, advise);
> +	int ret = posix_madvise(p, len & ~MT_pagesize(), advise);
>  
>  #ifdef MMAP_DEBUG
>  	stream_printf(GDKerr, "#posix_madvise(" PTRFMT "," SZFMT ",%d) = %d\n", PTRFMTCAST p, len, advise, ret);
> 
> 
> ------------------------------------------------------------------------------
> Download Intel® Parallel Studio Eval
> Try the new software tools for yourself. Speed compiling, find bugs
> proactively, and fine-tune applications for parallel performance.
> See why Intel Parallel Studio got high marks during beta.
> http://p.sf.net/sfu/intel-sw-dev
> _______________________________________________
> Monetdb-checkins mailing list
> Monetdb-checkins at lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
> 

-- 
| Dr. Stefan Manegold | mailto:Stefan.Manegold at cwi.nl |
| CWI,  P.O.Box 94079 | http://www.cwi.nl/~manegold/  |
| 1090 GB Amsterdam   | Tel.: +31 (20) 592-4212       |
| The Netherlands     | Fax : +31 (20) 592-4199       |




More information about the developers-list mailing list