Problems executing custom aggregation function on large datasets

Sebastian Dorok sebastian.dorok at st.ovgu.de
Thu Jul 4 09:34:48 CEST 2013


Okay got it: executing generated MAL explain plan step by step is bad 
idea. Debugging it worked better but lead to the same error even with 
'nthreads' property of my database set to '1' (also without debugging ;).

But I think I found my problem: My aggregation function is a holistic 
one like MEDIAN means I can't compute the final result using 
intermediate results. Luckily I figured out that MITOSIS was enabled in 
the SQL optimizer pipeline (I don't know why). When disabling it using 
NO_MITOSIS_PIPE everything works fine because the computation is not 
splitted horizontally. See attached MCLIENT output for details.

So questions that remain:
- How do I have to announce/ implement my aggregation function so that 
it can be computed even if MITOSIS is enabled (like MEDIAN)?
- And a more general one: How do I implement a non holistic aggregation 
function? When I look at the explain plan for my aggregation function I 
can't see that the optimizer tries to compute intermediate results. How 
to achieve that?


On 03.07.2013 15:11, Lefteris wrote:
> Hi, the error that you are getting has nothing to do in my opinion
> with your function. There are other reasons. Try to run your queries
> through SQL mclient, passing one MAL instruction by hand in general
> will not work.
>
> I see in your MAL explain plan
>
>       X_16:bat[:any,:str]  := udf.subgenotype(X_10,X_14,r1_27,true);
>
> which is your function. X_10 here is the "base" column you want to
> aggregate and X_14 is the result of subgroupby of the "pos" column.
> Try to run monetdb with gdk_nr_threads=1 so you only have 1 thread and
> avoid fragmentation of the base tables, so is simpler to see what is
> going on.
>
>
>
> On Wed, Jul 3, 2013 at 2:48 PM, Sebastian Dorok
> <sebastian.dorok at st.ovgu.de> wrote:
>> Hi all,
>>
>> I'm developing a custom aggregation function. Everything works fine until I
>> use it on large datasets (ca. 270 Mio. rows).
>>
>> Here is what my function signature looks like:
>>
>> str AGGSimpleGenotype(int *ret, int *bid, int *gid, int *eid, bit
>> ignore_nils);
>>
>> I assume that the BATs identified by bid and gid have the same length.
>> That's why my function checks this circumstance. On my test dataset (ca. 200
>> rows) everything works fine. When I use my production dataset my aggregation
>> function returns an error complaining that the BATs have not the same length
>> and stops aggregating (by design)! How can this happen?
>>
>> The SQL query is:
>> select grpgenotype(base) from test.bases group by pos;
>> I attached the MAL plan generated for the large dataset. When I execute it
>> step by step I get an error in line 39 saying:
>>
>> mal>X_106 := sql.delta(X_56,X_70,X_71,X_8);
>> MAPI  = (monetdb) /tmp/.s.monetdb.50000
>> QUERY = X_106 := sql.delta(X_56,X_70,X_71,X_8);
>> ERROR = !TypeException:user.main[1]:'sql.delta' undefined in: X_106:any :=
>> sql.delta(X_56:bat[:oid,:str], X_70:bat[:oid,:oid], X_71:bat[:oid,:str],
>> X_8:bat[:oid,:any])
>>
>> This error occurs before my function is entered but I think it must be
>> related with my problem. Is it an internal MonetDB problem?
>>
>> I'm using the Feb2013-SP2 release.
>>
>> Any ideas? Thx.
>>
>> _______________________________________________
>> users-list mailing list
>> users-list at monetdb.org
>> http://mail.monetdb.org/mailman/listinfo/users-list
>>
> _______________________________________________
> users-list mailing list
> users-list at monetdb.org
> http://mail.monetdb.org/mailman/listinfo/users-list
>
-------------- next part --------------
sql>set optimizer='default_pipe';
operation successful (0.302ms)
sql>select optimizer;                
+--------------+
| single_value |
+==============+
| default_pipe |
+--------------+
1 tuple (0.385ms)
sql>explain select median(base) from genomics.echoli group by pos;
+-------------------------------------------------------------------+
| mal                                                               |
+===================================================================+
| function user.s8_1{autoCommit=true}():void;                       |
|     X_2 := sql.mvc();                                             |
|     X_3:bat[:oid,:oid]  := sql.tid(X_2,"genomics","echoli");      |
|     X_6 := sql.bind(X_2,"genomics","echoli","base",0);            |
|     (X_9,r1_9) := sql.bind(X_2,"genomics","echoli","base",2);     |
|     X_12 := sql.bind(X_2,"genomics","echoli","base",1);           |
|     X_14 := sql.delta(X_6,X_9,r1_9,X_12);                         |
|     X_15 := algebra.leftfetchjoin(X_3,X_14);                      |
|     X_16 := batcalc.bte(X_15);                                    |
|     X_17 := sql.bind(X_2,"genomics","echoli","pos",0);            |
|     (X_19,r1_21) := sql.bind(X_2,"genomics","echoli","pos",2);    |
|     X_21 := sql.bind(X_2,"genomics","echoli","pos",1);            |
|     X_22 := sql.delta(X_17,X_19,r1_21,X_21);                      |
|     X_23 := algebra.leftfetchjoin(X_3,X_22);                      |
|     (X_24,r1_28,r2_28) := group.subgroupdone(X_23);               |
|     X_27:bat[:oid,:bte]  := aggr.submedian(X_16,X_24,r1_28,true); |
|     X_29 := sql.resultSet(1,1,X_27);                              |
|     sql.rsColumn(X_29,"genomics.","L1","tinyint",8,0,X_27);       |
|     X_34 := io.stdout();                                          |
|     sql.exportResult(X_34,X_29);                                  |
| end s8_1;                                                         |
+-------------------------------------------------------------------+
21 tuples (0.757ms)
sql>explain select simple_genotype(base) from genomics.echoli group by pos;
+----------------------------------------------------------------------------------------------------+
| mal                                                                                                |
+====================================================================================================+
| function user.s9_1{autoCommit=true}():void;                                                        |
|     X_2 := sql.mvc();                                                                              |
|     X_37:bat[:oid,:oid]  := sql.tid(X_2,"genomics","echoli",0,2);                                  |
|     X_40:bat[:oid,:str]  := sql.bind(X_2,"genomics","echoli","base",0,0,2);                        |
|     (X_42:bat[:oid,:oid] ,X_43:bat[:oid,:str] ) := sql.bind(X_2,"genomics","echoli","base",2,0,2); |
|     X_52 := sql.delta(X_40,X_42,X_43);                                                             |
|     X_54 := algebra.leftfetchjoin(X_37,X_52);                                                      |
|     X_39:bat[:oid,:oid]  := sql.tid(X_2,"genomics","echoli",1,2);                                  |
|     X_41:bat[:oid,:str]  := sql.bind(X_2,"genomics","echoli","base",0,1,2);                        |
|     (X_44:bat[:oid,:oid] ,X_45:bat[:oid,:str] ) := sql.bind(X_2,"genomics","echoli","base",2,1,2); |
|     X_8 := sql.bind(X_2,"genomics","echoli","base",1);                                             |
|     X_53 := sql.delta(X_41,X_44,X_45,X_8);                                                         |
|     X_55 := algebra.leftfetchjoin(X_39,X_53);                                                      |
|     X_69 := mat.packIncrement(X_54,2);                                                             |
|     X_10 := mat.packIncrement(X_69,X_55);                                                          |
|     X_46:bat[:oid,:int]  := sql.bind(X_2,"genomics","echoli","pos",0,0,2);                         |
|     (X_48:bat[:oid,:oid] ,X_49:bat[:oid,:int] ) := sql.bind(X_2,"genomics","echoli","pos",2,0,2);  |
|     X_56 := sql.delta(X_46,X_48,X_49);                                                             |
|     X_58 := algebra.leftfetchjoin(X_37,X_56);                                                      |
|     (X_60,X_61,X_62) := group.subgroupdone(X_58);                                                  |
|     X_63 := algebra.leftfetchjoin(X_61,X_58);                                                      |
|     X_47:bat[:oid,:int]  := sql.bind(X_2,"genomics","echoli","pos",0,1,2);                         |
|     (X_50:bat[:oid,:oid] ,X_51:bat[:oid,:int] ) := sql.bind(X_2,"genomics","echoli","pos",2,1,2);  |
|     X_12 := sql.bind(X_2,"genomics","echoli","pos",1);                                             |
|     X_57 := sql.delta(X_47,X_50,X_51,X_12);                                                        |
|     X_59 := algebra.leftfetchjoin(X_39,X_57);                                                      |
|     (X_64,X_65,X_66) := group.subgroupdone(X_59);                                                  |
|     X_67 := algebra.leftfetchjoin(X_65,X_59);                                                      |
|     X_72 := mat.packIncrement(X_63,2);                                                             |
|     X_13 := mat.packIncrement(X_72,X_67);                                                          |
|     (X_14,r1_27,X_68) := group.subgroupdone(X_13);                                                 |
|     X_16:bat[:any,:str]  := genomics.subsimple_genotype(X_10,X_14,r1_27,true);                     |
|     X_18 := sql.resultSet(1,1,X_16);                                                               |
|     sql.rsColumn(X_18,"genomics.echoli","L1","clob",1,0,X_16);                                     |
|     X_22 := io.stdout();                                                                           |
|     sql.exportResult(X_22,X_18);                                                                   |
| end s9_1;                                                                                          |
+----------------------------------------------------------------------------------------------------+
37 tuples (1.109ms)
sql>set optimizer='no_mitosis_pipe';
operation successful (0.304ms)
sql>select optimizer;
+-----------------+
| single_value    |
+=================+
| no_mitosis_pipe |
+-----------------+
1 tuple (0.338ms)
sql>explain select median(base) from genomics.echoli group by pos;
+-------------------------------------------------------------------+
| mal                                                               |
+===================================================================+
| function user.s11_1{autoCommit=true}():void;                      |
|     X_2 := sql.mvc();                                             |
|     X_3:bat[:oid,:oid]  := sql.tid(X_2,"genomics","echoli");      |
|     X_6 := sql.bind(X_2,"genomics","echoli","base",0);            |
|     (X_9,r1_9) := sql.bind(X_2,"genomics","echoli","base",2);     |
|     X_12 := sql.bind(X_2,"genomics","echoli","base",1);           |
|     X_14 := sql.delta(X_6,X_9,r1_9,X_12);                         |
|     X_15 := algebra.leftfetchjoin(X_3,X_14);                      |
|     X_16 := batcalc.bte(X_15);                                    |
|     X_17 := sql.bind(X_2,"genomics","echoli","pos",0);            |
|     (X_19,r1_21) := sql.bind(X_2,"genomics","echoli","pos",2);    |
|     X_21 := sql.bind(X_2,"genomics","echoli","pos",1);            |
|     X_22 := sql.delta(X_17,X_19,r1_21,X_21);                      |
|     X_23 := algebra.leftfetchjoin(X_3,X_22);                      |
|     (X_24,r1_28,r2_28) := group.subgroupdone(X_23);               |
|     X_27:bat[:oid,:bte]  := aggr.submedian(X_16,X_24,r1_28,true); |
|     X_29 := sql.resultSet(1,1,X_27);                              |
|     sql.rsColumn(X_29,"genomics.","L1","tinyint",8,0,X_27);       |
|     X_34 := io.stdout();                                          |
|     sql.exportResult(X_34,X_29);                                  |
| end s11_1;                                                        |
+-------------------------------------------------------------------+
21 tuples (0.726ms)
sql>explain select simple_genotype(base) from genomics.echoli group by pos;
+--------------------------------------------------------------------------------+
| mal                                                                            |
+================================================================================+
| function user.s12_1{autoCommit=true}():void;                                   |
|     X_2 := sql.mvc();                                                          |
|     X_3:bat[:oid,:oid]  := sql.tid(X_2,"genomics","echoli");                   |
|     X_6 := sql.bind(X_2,"genomics","echoli","base",0);                         |
|     (X_9,r1_9) := sql.bind(X_2,"genomics","echoli","base",2);                  |
|     X_12 := sql.bind(X_2,"genomics","echoli","base",1);                        |
|     X_14 := sql.delta(X_6,X_9,r1_9,X_12);                                      |
|     X_15 := algebra.leftfetchjoin(X_3,X_14);                                   |
|     X_16 := sql.bind(X_2,"genomics","echoli","pos",0);                         |
|     (X_18,r1_20) := sql.bind(X_2,"genomics","echoli","pos",2);                 |
|     X_20 := sql.bind(X_2,"genomics","echoli","pos",1);                         |
|     X_21 := sql.delta(X_16,X_18,r1_20,X_20);                                   |
|     X_22 := algebra.leftfetchjoin(X_3,X_21);                                   |
|     (X_23,r1_27,r2_27) := group.subgroupdone(X_22);                            |
|     X_26:bat[:any,:str]  := genomics.subsimple_genotype(X_15,X_23,r1_27,true); |
|     X_28 := sql.resultSet(1,1,X_26);                                           |
|     sql.rsColumn(X_28,"genomics.echoli","L1","clob",1,0,X_26);                 |
|     X_32 := io.stdout();                                                       |
|     sql.exportResult(X_32,X_28);                                               |
| end s12_1;                                                                     |
+--------------------------------------------------------------------------------+
20 tuples (0.766ms)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 81_genomics.sql
Type: text/x-sql
Size: 346 bytes
Desc: not available
URL: <http://www.monetdb.org/pipermail/users-list/attachments/20130704/ccb3a2c0/attachment.bin>
-------------- next part --------------
# scalar MAL signatures
module genomics;

command subconcat_bases(b:bat[:oid,:str], e:bat[:oid,:oid], g:bat[:oid,:oid], ignorenils:bit) :bat[:any,:str]
address AGGConcatBases
comment "Concatenates all bases in a group.";

command subsimple_genotype(b:bat[:oid,:str], e:bat[:oid,:oid], g:bat[:oid,:oid], ignorenils:bit) :bat[:any,:str]
address AGGSimpleGenotype
comment "Determines the genotype by examining the relative frequencies in a group.";

command subgenotype_statistics(b:bat[:oid,:str], e:bat[:oid,:oid], g:bat[:oid,:oid], ignorenils:bit) :bat[:any,:str]
address AGGGenotypeStatistics
comment "Returns statistics about the genotype in a group.";


More information about the users-list mailing list