MonetDB: rdf - Generate sample data for all tables

Pham Duc P.Minh.Duc at cwi.nl
Wed Feb 26 15:27:29 CET 2014


Thanks Lefteris, 

I should have looked at that before implementing my own function -_-. 

Best, 

Minh-Duc

----- Original Message -----
From: "Lefteris" <lsidir at gmail.com>
To: "Communication channel for developers of the MonetDB suite." <developers-list at monetdb.org>
Sent: Wednesday, February 26, 2014 3:21:52 PM
Subject: Re: MonetDB: rdf - Generate sample data for all tables

Hi Duc,

I did not read your function in detail, but you can also use
BATsample_ (which is the void-headed version of BATsample).

lefteris

On Wed, Feb 26, 2014 at 2:10 PM, Minh-Duc Pham <commits at monetdb.org> wrote:
> Changeset: e7109fc24610 for MonetDB
> URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e7109fc24610
> Modified Files:
>         monetdb5/extras/rdf/rdfschema.c
>         monetdb5/extras/rdf/rdfschema.h
> Branch: rdf
> Log Message:
>
> Generate sample data for all tables
>
>
> diffs (truncated from 721 to 300 lines):
>
> diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
> --- a/monetdb5/extras/rdf/rdfschema.c
> +++ b/monetdb5/extras/rdf/rdfschema.c
> @@ -695,6 +695,11 @@ char isInfrequentSampleProp(CS freqCS, i
>         if (freqCS.lstPropSupport[propIdx] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1;
>         else return 0;
>  }
> +static
> +char isInfrequentSampleCol(CS freqCS, PropTypes pt){
> +       if (pt.propFreq * 100 <  freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1;
> +       else return 0;
> +}
>
>  static
>  void genCSPropTypesColIdx(CSPropTypes* csPropTypes, int numMergedCS, CSset* freqCSset){
> @@ -4830,6 +4835,211 @@ void initSampleData(CSSample *csSample,B
>
>         }
>  }
> +
> +
> +static
> +void getSubjIdFromTablePosition(int tblIdx, int pos, oid *sOid){
> +       oid id;
> +       id = pos;
> +       id |= (BUN)(tblIdx + 1) << (sizeof(BUN)*8 - NBITS_FOR_CSID);
> +       *sOid = id;
> +}
> +
> +static
> +str getOrigSbt(oid *sbt, oid *origSbt, BAT *lmap, BAT *rmap){
> +       BUN pos;
> +       oid *tmp;
> +       pos = BUNfnd(BATmirror(rmap),sbt);
> +       if (pos == BUN_NONE){
> +               throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded subject must be in rmap");
> +       }
> +       tmp = (oid *) Tloc(lmap, pos);
> +       if (*tmp == BUN_NONE){
> +               throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded subject must be in lmap");
> +       }
> +
> +       *origSbt = *tmp;
> +
> +       return MAL_SUCCEED;
> +}
> +
> +static
> +str getOrigObt(oid *obt, oid *origObt, BAT *lmap, BAT *rmap){
> +       BUN pos;
> +       oid *tmp;
> +       oid     tmporigOid = BUN_NONE;
> +       char objType;
> +       BUN     maxObjectURIOid =  ((oid)1 << (sizeof(BUN)*8 - NBITS_FOR_CSID - 1)) - 1; //Base on getTblIdxFromS
> +
> +       objType = getObjType(*obt);
> +
> +       if (objType == URI || objType == BLANKNODE){
> +               tmporigOid = (*obt) - ((oid)objType << (sizeof(BUN)*8 - 4));
> +       }
> +
> +       if (tmporigOid > maxObjectURIOid){
> +               pos = BUNfnd(BATmirror(rmap),&tmporigOid);
> +               if (pos == BUN_NONE){
> +                       throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded object must be in rmap");
> +               }
> +               tmp = (oid *) Tloc(lmap, pos);
> +               if (*tmp == BUN_NONE){
> +                       throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded object must be in lmap");
> +               }
> +
> +               *origObt = *tmp;
> +       }
> +       else{
> +               *origObt = tmporigOid;
> +       }
> +
> +       return MAL_SUCCEED;
> +}
> +
> +static
> +str initFullSampleData(CSSampleExtend *csSampleEx, int *mTblIdxFreqIdxMapping, CSlabel *label, CStableStat* cstablestat, CSPropTypes *csPropTypes, CSset *freqCSset, int numTables,  bat *lmapbatid, bat *rmapbatid){
> +       int     i, j, k;
> +       int     freqId;
> +       int     tmpNumcand;
> +       oid     tmpCandidate;
> +       int     randValue = 0;
> +       int     ranPosition = 0;        //random position of the instance in a table
> +       int     tmpNumCols;
> +       int     colIdx;
> +       BAT     *tmpbat = NULL;
> +       BATiter tmpi;
> +       BAT     *cursamplebat = NULL;
> +       int     tmpNumRows = 0;
> +       oid     tmpSoid = BUN_NONE, origSoid = BUN_NONE;
> +       oid     origOid = BUN_NONE;
> +       BAT     *lmap = NULL, *rmap = NULL;
> +
> +       if ((lmap = BATdescriptor(*lmapbatid)) == NULL) {
> +               throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
> +       }
> +
> +       if ((rmap = BATdescriptor(*rmapbatid)) == NULL) {
> +               BBPreleaseref(lmap->batCacheid);
> +               throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
> +       }
> +       srand(123456);
> +       for (i = 0; i < numTables; i++){
> +               freqId = mTblIdxFreqIdxMapping[i];
> +               csSampleEx[i].freqIdx = freqId;
> +               tmpNumcand = (NUM_SAMPLE_CANDIDATE > label[freqId].candidatesCount)?label[freqId].candidatesCount:NUM_SAMPLE_CANDIDATE;
> +               csSampleEx[i].name = cstablestat->lstcstable[i].tblname;
> +               csSampleEx[i].candidateCount = tmpNumcand;
> +               csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * tmpNumcand);
> +               for (k = 0; k < tmpNumcand; k++){
> +                       csSampleEx[i].candidates[k] = label[freqId].candidates[k];
> +               }
> +               //Randomly exchange the value, change the position k with a random pos
> +               for (k = 0; k < tmpNumcand; k++){
> +                       randValue = rand() % tmpNumcand;
> +                       tmpCandidate = csSampleEx[i].candidates[k];
> +                       csSampleEx[i].candidates[k] = csSampleEx[i].candidates[randValue];
> +                       csSampleEx[i].candidates[randValue] = tmpCandidate;
> +               }
> +
> +               csSampleEx[i].lstSubjOid = (oid*)malloc(sizeof(oid) * NUM_SAMPLE_INSTANCE);
> +               for (k = 0; k < NUM_SAMPLE_INSTANCE; k++)
> +                       csSampleEx[i].lstSubjOid[k] = BUN_NONE;
> +
> +               tmpNumCols = csPropTypes[i].numProp -  csPropTypes[i].numInfreqProp; //already remove infrequent column;
> +               csSampleEx[i].numProp = tmpNumCols;
> +
> +               assert(tmpNumCols > 0);
> +
> +               csSampleEx[i].lstProp = (oid*)malloc(sizeof(oid) * tmpNumCols);
> +               csSampleEx[i].lstIsInfrequentProp = (char*)malloc(sizeof(char) * tmpNumCols);
> +               csSampleEx[i].lstIsMVCol = (char*)malloc(sizeof(char) * tmpNumCols);
> +               csSampleEx[i].colBats = (BAT**)malloc(sizeof(BAT*) * tmpNumCols);
> +               colIdx = -1;
> +               csSampleEx[i].numInstances = 0;
> +               for(j = 0; j < csPropTypes[i].numProp; j++){
> +                       #if     REMOVE_INFREQ_PROP
> +                       if (csPropTypes[i].lstPropTypes[j].defColIdx == -1)     continue;  //Infrequent prop
> +                       #endif
> +                       colIdx++;
> +                       csSampleEx[i].lstProp[colIdx] = csPropTypes[i].lstPropTypes[j].prop;
> +
> +                       csSampleEx[i].colBats[colIdx] = BATnew(TYPE_void, cstablestat->lstcstable[i].colBats[colIdx]->ttype , NUM_SAMPLE_INSTANCE + 1);
> +
> +                       //Mark whether this col is infrequent sample cols
> +                       if ( isInfrequentSampleCol(freqCSset->items[freqId], csPropTypes[i].lstPropTypes[j])){
> +                               csSampleEx[i].lstIsInfrequentProp[colIdx] = 1;
> +                       }
> +                       else
> +                               csSampleEx[i].lstIsInfrequentProp[colIdx] = 0;
> +
> +                       //Mark whther this col is a MV col
> +                       csSampleEx[i].lstIsMVCol[colIdx] = csPropTypes[i].lstPropTypes[j].isMVProp;
> +
> +                       //if this is a multivalue column, get the data type of the first column
> +
> +               }
> +               assert(colIdx == (tmpNumCols - 1));
> +
> +
> +               // Inserting instances to csSampleEx
> +
> +               tmpNumRows = BATcount(cstablestat->lstcstable[i].colBats[0]);
> +
> +               for (k = 0; k < NUM_SAMPLE_INSTANCE; k++){
> +                       ranPosition = rand() % tmpNumRows;
> +
> +                       getSubjIdFromTablePosition(i, ranPosition, &tmpSoid);
> +
> +                       if (getOrigSbt(&tmpSoid, &origSoid, lmap, rmap) != MAL_SUCCEED){
> +                               throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal sbt ");
> +                       }
> +
> +                       csSampleEx[i].lstSubjOid[k] = origSoid;
> +
> +                       for (j = 0; j < tmpNumCols; j++){
> +                               cursamplebat = csSampleEx[i].colBats[j];
> +
> +                               tmpbat = cstablestat->lstcstable[i].colBats[j];
> +                               tmpi = bat_iterator(tmpbat);
> +
> +                               if (tmpbat->ttype == TYPE_oid && csSampleEx[i].lstIsMVCol[j] == 0){
> +                                       //Get the original object oid
> +                                       oid *tmpOid = (oid *) BUNtail(tmpi, ranPosition);
> +                                       if(*tmpOid != oid_nil){
> +                                               if (getOrigObt(tmpOid, &origOid, lmap, rmap) != MAL_SUCCEED){
> +                                                       throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal obt ");
> +                                               }
> +                                               BUNappend(cursamplebat, &origOid, TRUE);
> +                                       }
> +                                       else{
> +                                               BUNappend(cursamplebat, ATOMnilptr(TYPE_oid), TRUE);
> +                                       }
> +
> +                               }
> +                               else
> +                                       BUNappend(cursamplebat, BUNtail(tmpi, ranPosition), TRUE);
> +
> +
> +
> +                       }
> +                       csSampleEx[i].numInstances++;
> +               }
> +
> +               if (i == 0)
> +                       for (j = 0; j < tmpNumCols; j++){
> +                               //BATprint(cstablestat->lstcstable[i].colBats[j]);
> +                               BATprint(csSampleEx[i].colBats[j]);
> +                       }
> +
> +       }
> +
> +       BBPunfix(lmap->batCacheid);
> +       BBPunfix(rmap->batCacheid);
> +
> +       return MAL_SUCCEED;
> +
> +}
> +
>  static
>  void freeSampleData(CSSample *csSample, int numCand){
>         int i, j;
> @@ -4846,6 +5056,25 @@ void freeSampleData(CSSample *csSample,
>         free(csSample);
>  }
>
> +
> +static
> +void freeSampleExData(CSSampleExtend *csSampleEx, int numCand){
> +       int i, j;
> +       for (i = 0; i < numCand; i++){
> +               free(csSampleEx[i].lstProp);
> +               free(csSampleEx[i].lstIsInfrequentProp);
> +               free(csSampleEx[i].lstIsMVCol);
> +               free(csSampleEx[i].candidates);
> +               free(csSampleEx[i].lstSubjOid);
> +               for (j = 0; j < csSampleEx[i].numProp; j++){
> +                       BBPunfix(csSampleEx[i].colBats[j]->batCacheid);
> +               }
> +               free(csSampleEx[i].colBats);
> +       }
> +
> +       free(csSampleEx);
> +}
> +
>  static
>  void addSampleInstance(oid subj, oid *buffO, oid* buffP, int numP, int sampleIdx, CSSample *csSample){
>         int i,j;
> @@ -5217,6 +5446,295 @@ str printSampleData(CSSample *csSample,
>         return MAL_SUCCEED;
>  }
>
> +#if 0
> +static
> +str printFullSampleData(CSSampleExtend *csSampleEx, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){
> +
> +       int     i,j, k;
> +       FILE    *fout, *fouttb, *foutis;
> +       char    filename[100], filename2[100], filename3[100];
> +       char    tmpStr[20], tmpStr2[20], tmpStr3[20];
> +       int     ret;
> +
> +       str     propStr;
> +       str     subjStr;
> +       char*   schema = "rdf";
> +       CSSample        sample;
> +       CS              freqCS;
> +       char    objType = 0;
> +       str     objStr;
> +       oid     objOid = BUN_NONE;
> +       BATiter mapi;
> +       str     canStr;
> +       char    isTitle = 0;
> +       char    isUrl = 0;
> +       char    isType = 0;
> +       char    isDescription = 0;
> +       char    isImage = 0;
> +       char    isSite = 0;
> +       char    isEmail = 0;
> +       char    isCountry = 0;
> +       char    isLocality = 0;
> +       BAT     *lmap = NULL, *rmap = NULL
> +#if USE_SHORT_NAMES
> +       str     propStrShort = NULL;
> +       char    *pch;
> +#endif
> +
> +
> +
> +       mapi = bat_iterator(mbat);
> +
> +       if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
> +               throw(RDF, "rdf.rdfschema",
> +                               "could not open the tokenizer\n");
> +       }
> _______________________________________________
> checkin-list mailing list
> checkin-list at monetdb.org
> https://www.monetdb.org/mailman/listinfo/checkin-list
_______________________________________________
developers-list mailing list
developers-list at monetdb.org
https://www.monetdb.org/mailman/listinfo/developers-list



More information about the developers-list mailing list