MonetDB: default - The storage cost calculator

Stefan Manegold Stefan.Manegold at cwi.nl
Thu Oct 25 21:34:02 CEST 2012


This checkin --- strangely dated Sep 21 2012; cf. http://dev.monetdb.org/hg/MonetDB/rev/523120729e8d !?? --- breaks compilation:
http://monetdb.cwi.nl/testweb/web/status.php


Martin Kersten <commits at monetdb.org> wrote:

Changeset: 523120729e8d for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=523120729e8d
Added Files:
	sql/scripts/75_storagemodel.sql
Modified Files:
	monetdb5/mal/mal_profiler.c
	sql/backends/monet5/sql.mx
	sql/scripts/Makefile.ag
Branch: default
Log Message:

The storage cost calculator
This script gives the database administrator insight in the actual
footprint of the persistent tables and the maximum playground used
when indices are introduced upon them.
By chancing the storagemodelinput table directly, the footprint for
yet to be loaded databases can be assessed.

The actual storage footprint of an existing database can be
obtained by the table procuding function storage()
It represents the actual state of affairs, i.e. storage on disk
of columns and foreign key indices, and possible temporary hash indices.
For strings we take a sample to determine their average length.


diffs (truncated from 425 to 300 lines):

diff --git a/monetdb5/mal/mal_profiler.c b/monetdb5/mal/mal_profiler.c
--- a/monetdb5/mal/mal_profiler.c
+++ b/monetdb5/mal/mal_profiler.c
@@ -182,6 +182,10 @@ profilerEvent(int idx, MalBlkPtr mb, Mal
 	}
 	if (profileCounter[PROFstart].status == 0 && start)
 		return;
+	if (myname == 0)
+		myname = putName("profiler", 8);
+	if (getModuleId(getInstrPtr(mb, pc)) == myname)
+		return;
 	if (offlineProfiling)
 		offlineProfilerEvent(idx, mb, stk, pc,start);
 	if (cachedProfiling)
diff --git a/sql/backends/monet5/sql.mx b/sql/backends/monet5/sql.mx
--- a/sql/backends/monet5/sql.mx
+++ b/sql/backends/monet5/sql.mx
@@ -504,12 +504,13 @@ pattern storage()(
 	schema:bat[:oid,:str],
 	table:bat[:oid,:str],
 	column:bat[:oid,:str],
+	type:bat[:oid,:str],
 	location:bat[:oid,:str],
 	count:bat[:oid,:lng],
-	capacity:bat[:oid,:lng],
-	width:bat[:oid,:int],
-	size:bat[:oid,:lng],
-	hashsize:bat[:oid,:lng],
+	atomwidth:bat[:oid,:int],
+	columnsize:bat[:oid,:lng],
+	heap:bat[:oid,:lng],
+	indices:bat[:oid,:lng],
 	sorted:bat[:oid,:bit])
 address sql_storage
 comment "return a table with storage information ";
@@ -7347,13 +7348,13 @@ str SQLoptimizersUpdate(Client cntxt, Ma
  * Inspection of the actual storage footprint is a recurring question of users.
  * This is modelled as a generic SQL table producing function.
  * create function storage()
- * returns table ("schema" string, "table" string, "column" string, location string, "count" bigint, capacity bigint, width int, size bigint, hashsize bigint, sorted int)
+ * returns table ("schema" string, "table" string, "column" string, "type" string, location string, "count" bigint, width int, columnsize bigint, heapsize bigint indices bigint, sorted int)
  * external name sql.storage;
  */
 str
 sql_storage(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
 {
-	BAT *sch, *tab, *col, *loc, *cnt, *cap, *atom, *size, *aux, *sort;
+	BAT *sch, *tab, *col, *type, *loc, *cnt, *atom, *size, *heap, *indices, *sort;
 	mvc *m = NULL;
 	str msg = getSQLContext(cntxt,mb, &m, NULL);
 	sql_trans *tr = m->session->tr;
@@ -7362,13 +7363,14 @@ sql_storage(Client cntxt, MalBlkPtr mb, 
 	int *rsch = (int*) getArgReference(stk,pci,0);
 	int *rtab = (int*) getArgReference(stk,pci,1);
 	int *rcol = (int*) getArgReference(stk,pci,2);
-	int *rloc = (int*) getArgReference(stk,pci,3);
-	int *rcnt = (int*) getArgReference(stk,pci,4);
-	int *rcap = (int*) getArgReference(stk,pci,5);
+	int *rtype = (int*) getArgReference(stk,pci,3);
+	int *rloc = (int*) getArgReference(stk,pci,4);
+	int *rcnt = (int*) getArgReference(stk,pci,5);
 	int *ratom = (int*) getArgReference(stk,pci,6);
 	int *rsize = (int*) getArgReference(stk,pci,7);
-	int *raux = (int*) getArgReference(stk,pci,8);
-	int *rsort = (int*) getArgReference(stk,pci,9);
+	int *rheap = (int*) getArgReference(stk,pci,8);
+	int *rindices = (int*) getArgReference(stk,pci,9);
+	int *rsort = (int*) getArgReference(stk,pci,10);
 
 	if (msg)
 		return msg;
@@ -7379,31 +7381,34 @@ sql_storage(Client cntxt, MalBlkPtr mb, 
 	BATseqbase(tab, 0);
 	col = BATnew(TYPE_void,TYPE_str, 0);
 	BATseqbase(col, 0);
+	type = BATnew(TYPE_void,TYPE_str, 0);
+	BATseqbase(type, 0);
 	loc = BATnew(TYPE_void,TYPE_str, 0);
 	BATseqbase(loc, 0);
 	cnt = BATnew(TYPE_void,TYPE_lng, 0);
 	BATseqbase(cnt, 0);
-	cap = BATnew(TYPE_void,TYPE_lng, 0);
-	BATseqbase(cap, 0);
 	atom = BATnew(TYPE_void,TYPE_int, 0);
 	BATseqbase(atom, 0);
 	size = BATnew(TYPE_void,TYPE_lng, 0);
 	BATseqbase(size, 0);
-	aux = BATnew(TYPE_void,TYPE_lng, 0);
-	BATseqbase(aux, 0);
+	heap = BATnew(TYPE_void,TYPE_lng, 0);
+	BATseqbase(heap, 0);
+	indices = BATnew(TYPE_void,TYPE_lng, 0);
+	BATseqbase(indices, 0);
 	sort = BATnew(TYPE_void,TYPE_bit, 0);
 	BATseqbase(sort, 0);
-	if ( sch == NULL || tab == NULL || col == NULL || loc == NULL || sort == NULL ||
-		 cnt == NULL || cap == NULL || atom == NULL || size == NULL || aux == NULL){
+	if ( sch == NULL || tab == NULL || col == NULL || type == NULL || loc == NULL || sort == NULL ||
+		 cnt == NULL || atom == NULL || size == NULL || heap == NULL ||indices == NULL){
 		if ( sch ) BBPreleaseref(sch->batCacheid);
 		if ( tab ) BBPreleaseref(tab->batCacheid);
 		if ( col ) BBPreleaseref(col->batCacheid);
 		if ( loc ) BBPreleaseref(loc->batCacheid);
 		if ( cnt ) BBPreleaseref(cnt->batCacheid);
-		if ( cap ) BBPreleaseref(cap->batCacheid);
+		if ( type ) BBPreleaseref(type->batCacheid);
 		if ( atom ) BBPreleaseref(atom->batCacheid);
 		if ( size ) BBPreleaseref(size->batCacheid);
-		if ( aux ) BBPreleaseref(aux->batCacheid);
+		if ( heap ) BBPreleaseref(heap->batCacheid);
+		if ( indices ) BBPreleaseref(indices->batCacheid);
 		if ( sort ) BBPreleaseref(sort->batCacheid);
 		throw(SQL,"sql.storage", MAL_MALLOC_FAIL);
 	}
@@ -7412,7 +7417,8 @@ sql_storage(Client cntxt, MalBlkPtr mb, 
 		sql_schema *s= (sql_schema*) nsch->data;
 		if ( isalpha((int)b->name[0]) )
 
-		if (s->tables.set) for(ntab= (s)->tables.set->h ;ntab; ntab= ntab->next){
+		if (s->tables.set) 
+		for(ntab= (s)->tables.set->h ;ntab; ntab= ntab->next){
 			sql_base *bt= ntab->data;
 			sql_table *t= (sql_table*) bt;
 			if (isTable(t))
@@ -7426,28 +7432,49 @@ sql_storage(Client cntxt, MalBlkPtr mb, 
 				sch = BUNappend(sch, b->name, FALSE);
 				tab = BUNappend(tab, bt->name, FALSE);
 				col = BUNappend(col, bc->name, FALSE);
+				type = BUNappend(type, c->type.type->sqlname, FALSE);
 
 				/*printf(" cnt "BUNFMT, BATcount(bn));*/
 				sz= BATcount(bn);
 				cnt = BUNappend(cnt, &sz, FALSE);
-				/*printf(" cap "BUNFMT, BATcapacity(bn));*/
-				sz= BATcapacity(bn);
-				cap = BUNappend(cap, &sz, FALSE);
 
 				/*printf(" loc %s", BBP_physical(bn->batCacheid));*/
 				loc = BUNappend(loc, BBP_physical(bn->batCacheid), FALSE);
 				/*printf(" width %d", bn->T->width);*/
 				w= bn->T->width;
+				if ( bn->ttype == TYPE_str){
+					BUN p,q;
+					double sum=0;
+					BATiter bi = bat_iterator(bn);
+					lng cnt1,cnt2= cnt1=(lng) BATcount(bn);
+					
+					/* just take a sample */
+					if ( cnt1 > 512)
+						cnt1 = cnt2 = 512;
+					BATloop(bn,p,q){
+						str s = BUNtail(bi,p);
+						if( s != NULL && strcmp(s, str_nil))
+							sum += (int) strlen(s);
+						if ( --cnt1 <= 0)
+							break;
+					}
+					if ( cnt2)
+						w = (int) (sum/cnt2);
+				}
 				atom = BUNappend(atom, &w, FALSE);
-				/*printf(" size "BUNFMT, tailsize(bn,BATcount(bn)) + (bn->T->vheap? bn->T->vheap->size:0));*/
-				sz = tailsize(bn,BATcapacity(bn)) + (bn->T->vheap? bn->T->vheap->size:0);
-				sz += headsize(bn,BATcapacity(bn)) + (bn->H->vheap? bn->H->vheap->size:0);
+
+				sz = tailsize(bn,BATcount(bn));
+				sz += headsize(bn,BATcount(bn));
 				size = BUNappend(size, &sz, FALSE);
 				
+				sz = bn->T->vheap? bn->T->vheap->size:0;
+				sz += bn->H->vheap? bn->H->vheap->size:0;
+				heap = BUNappend(heap, &sz, FALSE);
+
 				sz =  bn->T->hash?bn->T->hash->heap->size:0;
 				sz += bn->H->hash?bn->H->hash->heap->size:0;
-				aux = BUNappend(aux, &sz, FALSE);
-				/*printf(" auxsize "BUNFMT, bn->T->hash?bn->T->hash->heap->size:0);*/
+				indices = BUNappend(indices, &sz, FALSE);
+				/*printf(" indices "BUNFMT, bn->T->hash?bn->T->hash->heap->size:0);*/
 				/*printf("\n");*/
 
 				w =  BATtordered(bn);
@@ -7468,28 +7495,49 @@ sql_storage(Client cntxt, MalBlkPtr mb, 
 							sch = BUNappend(sch, b->name, FALSE);
 							tab = BUNappend(tab, bt->name, FALSE);
 							col = BUNappend(col, bc->name, FALSE);
+							type = BUNappend(type, "oid", FALSE);
 			
 							/*printf(" cnt "BUNFMT, BATcount(bn));*/
 							sz= BATcount(bn);
 							cnt = BUNappend(cnt, &sz, FALSE);
-							/*printf(" cap "BUNFMT, BATcapacity(bn));*/
-							sz= BATcapacity(bn);
-							cap = BUNappend(cap, &sz, FALSE);
 			
 							/*printf(" loc %s", BBP_physical(bn->batCacheid));*/
 							loc = BUNappend(loc, BBP_physical(bn->batCacheid), FALSE);
 							/*printf(" width %d", bn->T->width);*/
 							w= bn->T->width;
+							if ( bn->ttype == TYPE_str){
+								BUN p,q;
+								double sum=0;
+								BATiter bi = bat_iterator(bn);
+								lng cnt1, cnt2= cnt1 = BATcount(bn);
+								
+								/* just take a sample */
+								if ( cnt1 > 512)
+									cnt1 = cnt2 = 512;
+								BATloop(bn,p,q){
+									str s = BUNtail(bi,p);
+									if( s != NULL && strcmp(s, str_nil))
+										sum += (int) strlen(s);
+									if ( --cnt1 <= 0)
+										break;
+								}
+								if ( cnt2)
+									w = (int) (sum/cnt2);
+							}
 							atom = BUNappend(atom, &w, FALSE);
 							/*printf(" size "BUNFMT, tailsize(bn,BATcount(bn)) + (bn->T->vheap? bn->T->vheap->size:0));*/
-							sz = tailsize(bn,BATcapacity(bn)) + (bn->T->vheap? bn->T->vheap->size:0);
-							sz += headsize(bn,BATcapacity(bn)) + (bn->H->vheap? bn->H->vheap->size:0);
+							sz = tailsize(bn,BATcount(bn));
+							sz += headsize(bn,BATcount(bn));
 							size = BUNappend(size, &sz, FALSE);
 							
+							sz = bn->T->vheap? bn->T->vheap->size:0;
+							sz += bn->H->vheap? bn->H->vheap->size:0;
+							heap = BUNappend(heap, &sz, FALSE);
+
 							sz =  bn->T->hash?bn->T->hash->heap->size:0;
 							sz += bn->H->hash?bn->H->hash->heap->size:0;
-							aux = BUNappend(aux, &sz, FALSE);
-							/*printf(" auxsize "BUNFMT, bn->T->hash?bn->T->hash->heap->size:0);*/
+							indices = BUNappend(indices, &sz, FALSE);
+							/*printf(" indices "BUNFMT, bn->T->hash?bn->T->hash->heap->size:0);*/
 							/*printf("\n");*/
 							w =  BATtordered(bn);
 							sort = BUNappend(sort, &w, FALSE);
@@ -7504,11 +7552,12 @@ sql_storage(Client cntxt, MalBlkPtr mb, 
 	BBPkeepref(*rtab = tab->batCacheid);
 	BBPkeepref(*rcol = col->batCacheid);
 	BBPkeepref(*rloc = loc->batCacheid);
+	BBPkeepref(*rtype = type->batCacheid);
 	BBPkeepref(*rcnt = cnt->batCacheid);
-	BBPkeepref(*rcap = cap->batCacheid);
 	BBPkeepref(*ratom = atom->batCacheid);
 	BBPkeepref(*rsize = size->batCacheid);
-	BBPkeepref(*raux = aux->batCacheid);
+	BBPkeepref(*rheap = heap->batCacheid);
+	BBPkeepref(*rindices = indices->batCacheid);
 	BBPkeepref(*rsort = sort->batCacheid);
 	return MAL_SUCCEED;
 }
diff --git a/sql/scripts/75_storagemodel.sql b/sql/scripts/75_storagemodel.sql
new file mode 100644
--- /dev/null
+++ b/sql/scripts/75_storagemodel.sql
@@ -0,0 +1,154 @@
+-- The contents of this file are subject to the MonetDB Public License
+-- Version 1.1 (the "License"); you may not use this file except in
+-- compliance with the License. You may obtain a copy of the License at
+-- http://www.monetdb.org/Legal/MonetDBLicense
+--
+-- Software distributed under the License is distributed on an "AS IS"
+-- basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+-- License for the specific language governing rights and limitations
+-- under the License.
+--
+-- The Original Code is the MonetDB Database System.
+--
+-- The Initial Developer of the Original Code is CWI.
+-- Copyright August 2008-2012 MonetDB B.V.
+-- All Rights Reserved.
+
+
+-- Author M.Kersten
+-- This script gives the database administrator insight in the actual
+-- footprint of the persistent tables and the maximum playground used
+-- when indices are introduced upon them.
+-- By chancing the storagemodelinput table directly, the footprint for
+-- yet to be loaded databases can be assessed.
+
+-- The actual storage footprint of an existing database can be 
+-- obtained by the table procuding function storage()
+-- It represents the actual state of affairs, i.e. storage on disk
+-- of columns and foreign key indices, and possible temporary hash indices.
+-- For strings we take a sample to determine their average length.
+
+create function storage()
+returns table ("schema" string, "table" string, "column" string, "type" string, location string, "count" bigint, typewidth int, columnsize bigint, heapsize bigint, indices bigint, sorted boolean)
+external name sql.storage;
+
+-- To determine the footprint of an arbitrary database, we first have
+-- to define its schema, followed by an indication of the properties of each column.
+-- A storage model input table for the size prediction is shown below:
+create table storagemodelinput(
+	"schema" string,
+	"table" string,
+	"column" string,
_______________________________________________
checkin-list mailing list
checkin-list at monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list
_______________________________________________
developers-list mailing list
developers-list at monetdb.org
http://mail.monetdb.org/mailman/listinfo/developers-list



More information about the developers-list mailing list