Re: MonetDB: rdf - Generate sample data for all tables
Hi Duc,
I did not read your function in detail, but you can also use BATsample_ (which is the void-headed version of BATsample).
lefteris
On Wed, Feb 26, 2014 at 2:10 PM, Minh-Duc Pham commits@monetdb.org wrote:
Changeset: e7109fc24610 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e7109fc24610 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Generate sample data for all tables
diffs (truncated from 721 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -695,6 +695,11 @@ char isInfrequentSampleProp(CS freqCS, i if (freqCS.lstPropSupport[propIdx] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; else return 0; } +static +char isInfrequentSampleCol(CS freqCS, PropTypes pt){
if (pt.propFreq * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1;
else return 0;
+}
static void genCSPropTypesColIdx(CSPropTypes* csPropTypes, int numMergedCS, CSset* freqCSset){ @@ -4830,6 +4835,211 @@ void initSampleData(CSSample *csSample,B
}
}
/*
 * Build an encoded subject oid from a table index and a row position.
 *
 * The result is pos with (tblIdx + 1) OR-ed in at bit offset
 * sizeof(BUN)*8 - NBITS_FOR_CSID; it is written to *sOid.
 */
+static +void getSubjIdFromTablePosition(int tblIdx, int pos, oid *sOid){
oid id;
id = pos;
// The table index is stored 1-based (tblIdx + 1) in the high bits.
id |= (BUN)(tblIdx + 1) << (sizeof(BUN)*8 - NBITS_FOR_CSID);
*sOid = id;
+}
/*
 * Translate an encoded subject oid back to its original oid.
 *
 * *sbt is looked up through BATmirror(rmap); the value found at the same
 * position in lmap is stored in *origSbt.
 * Returns MAL_SUCCEED on success. Throws an RDF exception when the lookup
 * misses or when the lmap entry equals BUN_NONE.
 * presumably lmap and rmap are position-aligned -- TODO confirm with caller.
 */
+static +str getOrigSbt(oid *sbt, oid *origSbt, BAT *lmap, BAT *rmap){
BUN pos;
oid *tmp;
// Position of the encoded subject in rmap.
pos = BUNfnd(BATmirror(rmap),sbt);
if (pos == BUN_NONE){
throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded subject must be in rmap");
}
// Read the lmap tail value at that same position.
tmp = (oid *) Tloc(lmap, pos);
if (*tmp == BUN_NONE){
throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded subject must be in lmap");
}
*origSbt = *tmp;
return MAL_SUCCEED;
+}
/*
 * Translate an encoded object oid back to its original oid.
 *
 * For URI and BLANKNODE objects the type tag (objType shifted to bit offset
 * sizeof(BUN)*8 - 4) is subtracted first. If the resulting value is larger
 * than maxObjectURIOid it is looked up through rmap/lmap in the same way as
 * getOrigSbt does; otherwise it is stored in *origObt unchanged.
 * Returns MAL_SUCCEED on success. Throws an RDF exception when the lookup
 * misses or when the lmap entry equals BUN_NONE.
 *
 * NOTE(review): for any other object type tmporigOid keeps its initial
 * BUN_NONE value; if BUN_NONE > maxObjectURIOid that value goes into the
 * rmap lookup -- confirm callers only pass URI/BLANKNODE objects.
 */
+static +str getOrigObt(oid *obt, oid *origObt, BAT *lmap, BAT *rmap){
BUN pos;
oid *tmp;
oid tmporigOid = BUN_NONE;
char objType;
BUN maxObjectURIOid = ((oid)1 << (sizeof(BUN)*8 - NBITS_FOR_CSID - 1)) - 1; // Based on getTblIdxFromS
objType = getObjType(*obt);
if (objType == URI || objType == BLANKNODE){
// Strip the object-type tag from the encoded value.
tmporigOid = (*obt) - ((oid)objType << (sizeof(BUN)*8 - 4));
}
if (tmporigOid > maxObjectURIOid){
// Above the threshold: map back through rmap/lmap.
pos = BUNfnd(BATmirror(rmap),&tmporigOid);
if (pos == BUN_NONE){
throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded object must be in rmap");
}
tmp = (oid *) Tloc(lmap, pos);
if (*tmp == BUN_NONE){
throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded object must be in lmap");
}
*origObt = *tmp;
}
else{
// At or below the threshold: the untagged value is returned as is.
*origObt = tmporigOid;
}
return MAL_SUCCEED;
+}
/*
 * Build sample data for every table into csSampleEx[0 .. numTables-1].
 *
 * For table i (frequent-CS index taken from mTblIdxFreqIdxMapping[i]) this:
 *   - copies up to NUM_SAMPLE_CANDIDATE label candidates and shuffles them;
 *   - allocates lstSubjOid, lstProp, lstIsInfrequentProp, lstIsMVCol and
 *     colBats, and creates one sample BAT per retained column;
 *   - appends NUM_SAMPLE_INSTANCE rows, each picked with rand() % row-count,
 *     so the same row can be picked more than once.
 * The subject oid and the values of non-multi-valued oid columns are
 * translated through lmap/rmap (getOrigSbt / getOrigObt) before being stored.
 *
 * lmapbatid / rmapbatid are BAT ids resolved with BATdescriptor.
 * Returns MAL_SUCCEED, or throws when a map BAT is missing or a translation
 * fails.
 *
 * NOTE(review): the throws inside the row loop return without reaching the
 * BBPunfix calls on lmap/rmap at the end of the function, and the message
 * returned by getOrigSbt/getOrigObt is neither propagated nor freed here.
 * NOTE(review): malloc and BATnew results are used without a NULL check.
 */
+static +str initFullSampleData(CSSampleExtend *csSampleEx, int *mTblIdxFreqIdxMapping, CSlabel *label, CStableStat* cstablestat, CSPropTypes *csPropTypes, CSset *freqCSset, int numTables, bat *lmapbatid, bat *rmapbatid){
int i, j, k;
int freqId;
int tmpNumcand;
oid tmpCandidate;
int randValue = 0;
int ranPosition = 0; //random position of the instance in a table
int tmpNumCols;
int colIdx;
BAT *tmpbat = NULL;
BATiter tmpi;
BAT *cursamplebat = NULL;
int tmpNumRows = 0;
oid tmpSoid = BUN_NONE, origSoid = BUN_NONE;
oid origOid = BUN_NONE;
BAT *lmap = NULL, *rmap = NULL;
if ((lmap = BATdescriptor(*lmapbatid)) == NULL) {
throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
}
if ((rmap = BATdescriptor(*rmapbatid)) == NULL) {
// NOTE(review): released with BBPreleaseref here but with BBPunfix at the end -- confirm both are valid for a BATdescriptor reference.
BBPreleaseref(lmap->batCacheid);
throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
}
// Fixed seed for all rand() calls below.
srand(123456);
for (i = 0; i < numTables; i++){
freqId = mTblIdxFreqIdxMapping[i];
csSampleEx[i].freqIdx = freqId;
// Number of candidates kept is capped at NUM_SAMPLE_CANDIDATE.
tmpNumcand = (NUM_SAMPLE_CANDIDATE > label[freqId].candidatesCount)?label[freqId].candidatesCount:NUM_SAMPLE_CANDIDATE;
csSampleEx[i].name = cstablestat->lstcstable[i].tblname;
csSampleEx[i].candidateCount = tmpNumcand;
csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * tmpNumcand);
for (k = 0; k < tmpNumcand; k++){
csSampleEx[i].candidates[k] = label[freqId].candidates[k];
}
// Shuffle the candidates: swap position k with a randomly chosen position
for (k = 0; k < tmpNumcand; k++){
randValue = rand() % tmpNumcand;
tmpCandidate = csSampleEx[i].candidates[k];
csSampleEx[i].candidates[k] = csSampleEx[i].candidates[randValue];
csSampleEx[i].candidates[randValue] = tmpCandidate;
}
// Subject oids of the sampled rows; BUN_NONE marks a slot not filled yet.
csSampleEx[i].lstSubjOid = (oid*)malloc(sizeof(oid) * NUM_SAMPLE_INSTANCE);
for (k = 0; k < NUM_SAMPLE_INSTANCE; k++)
csSampleEx[i].lstSubjOid[k] = BUN_NONE;
tmpNumCols = csPropTypes[i].numProp - csPropTypes[i].numInfreqProp; // infrequent columns are already excluded from this count
csSampleEx[i].numProp = tmpNumCols;
assert(tmpNumCols > 0);
csSampleEx[i].lstProp = (oid*)malloc(sizeof(oid) * tmpNumCols);
csSampleEx[i].lstIsInfrequentProp = (char*)malloc(sizeof(char) * tmpNumCols);
csSampleEx[i].lstIsMVCol = (char*)malloc(sizeof(char) * tmpNumCols);
csSampleEx[i].colBats = (BAT**)malloc(sizeof(BAT*) * tmpNumCols);
// colIdx counts the retained columns; j walks all properties.
colIdx = -1;
csSampleEx[i].numInstances = 0;
for(j = 0; j < csPropTypes[i].numProp; j++){
#if REMOVE_INFREQ_PROP
if (csPropTypes[i].lstPropTypes[j].defColIdx == -1) continue; //Infrequent prop
#endif
colIdx++;
csSampleEx[i].lstProp[colIdx] = csPropTypes[i].lstPropTypes[j].prop;
// The sample BAT takes its tail type from table column colIdx and has room for NUM_SAMPLE_INSTANCE + 1 values.
csSampleEx[i].colBats[colIdx] = BATnew(TYPE_void, cstablestat->lstcstable[i].colBats[colIdx]->ttype , NUM_SAMPLE_INSTANCE + 1);
// Mark whether this column is an infrequent sample column
if ( isInfrequentSampleCol(freqCSset->items[freqId], csPropTypes[i].lstPropTypes[j])){
csSampleEx[i].lstIsInfrequentProp[colIdx] = 1;
}
else
csSampleEx[i].lstIsInfrequentProp[colIdx] = 0;
// Mark whether this column is a multi-valued (MV) column
csSampleEx[i].lstIsMVCol[colIdx] = csPropTypes[i].lstPropTypes[j].isMVProp;
//if this is a multivalue column, get the data type of the first column
}
assert(colIdx == (tmpNumCols - 1));
// Inserting instances to csSampleEx
// The row count is taken from the first column of the table.
tmpNumRows = BATcount(cstablestat->lstcstable[i].colBats[0]);
for (k = 0; k < NUM_SAMPLE_INSTANCE; k++){
// NOTE(review): if tmpNumRows is 0 this modulo divides by zero -- confirm tables are never empty.
ranPosition = rand() % tmpNumRows;
getSubjIdFromTablePosition(i, ranPosition, &tmpSoid);
if (getOrigSbt(&tmpSoid, &origSoid, lmap, rmap) != MAL_SUCCEED){
throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal sbt ");
}
csSampleEx[i].lstSubjOid[k] = origSoid;
// Copy the chosen row, column by column, into the sample BATs.
for (j = 0; j < tmpNumCols; j++){
cursamplebat = csSampleEx[i].colBats[j];
tmpbat = cstablestat->lstcstable[i].colBats[j];
tmpi = bat_iterator(tmpbat);
if (tmpbat->ttype == TYPE_oid && csSampleEx[i].lstIsMVCol[j] == 0){
//Get the original object oid
oid *tmpOid = (oid *) BUNtail(tmpi, ranPosition);
if(*tmpOid != oid_nil){
if (getOrigObt(tmpOid, &origOid, lmap, rmap) != MAL_SUCCEED){
throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal obt ");
}
BUNappend(cursamplebat, &origOid, TRUE);
}
else{
// A nil stays nil in the sample.
BUNappend(cursamplebat, ATOMnilptr(TYPE_oid), TRUE);
}
}
else
// Non-oid or multi-valued column: the stored value is copied untranslated.
BUNappend(cursamplebat, BUNtail(tmpi, ranPosition), TRUE);
}
csSampleEx[i].numInstances++;
}
// Only the sample BATs of the first table are printed.
if (i == 0)
for (j = 0; j < tmpNumCols; j++){
//BATprint(cstablestat->lstcstable[i].colBats[j]);
BATprint(csSampleEx[i].colBats[j]);
}
}
BBPunfix(lmap->batCacheid);
BBPunfix(rmap->batCacheid);
return MAL_SUCCEED;
+}
static void freeSampleData(CSSample *csSample, int numCand){ int i, j; @@ -4846,6 +5056,25 @@ void freeSampleData(CSSample *csSample, free(csSample); }
/*
 * Release an array of numCand CSSampleExtend entries.
 *
 * For each entry this frees lstProp, lstIsInfrequentProp, lstIsMVCol,
 * candidates and lstSubjOid, calls BBPunfix on each of its numProp column
 * BATs and frees the colBats array. Finally the csSampleEx array itself is
 * freed. The name field is not freed here.
 */
+static +void freeSampleExData(CSSampleExtend *csSampleEx, int numCand){
int i, j;
for (i = 0; i < numCand; i++){
free(csSampleEx[i].lstProp);
free(csSampleEx[i].lstIsInfrequentProp);
free(csSampleEx[i].lstIsMVCol);
free(csSampleEx[i].candidates);
free(csSampleEx[i].lstSubjOid);
// Drop the reference on every column BAT before freeing the pointer array.
for (j = 0; j < csSampleEx[i].numProp; j++){
BBPunfix(csSampleEx[i].colBats[j]->batCacheid);
}
free(csSampleEx[i].colBats);
}
free(csSampleEx);
+}
static void addSampleInstance(oid subj, oid *buffO, oid* buffP, int numP, int sampleIdx, CSSample *csSample){ int i,j; @@ -5217,6 +5446,295 @@ str printSampleData(CSSample *csSample, return MAL_SUCCEED; }
+#if 0 +static +str printFullSampleData(CSSampleExtend *csSampleEx, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){
int i,j, k;
FILE *fout, *fouttb, *foutis;
char filename[100], filename2[100], filename3[100];
char tmpStr[20], tmpStr2[20], tmpStr3[20];
int ret;
str propStr;
str subjStr;
char* schema = "rdf";
CSSample sample;
CS freqCS;
char objType = 0;
str objStr;
oid objOid = BUN_NONE;
BATiter mapi;
str canStr;
char isTitle = 0;
char isUrl = 0;
char isType = 0;
char isDescription = 0;
char isImage = 0;
char isSite = 0;
char isEmail = 0;
char isCountry = 0;
char isLocality = 0;
BAT *lmap = NULL, *rmap = NULL
+#if USE_SHORT_NAMES
str propStrShort = NULL;
char *pch;
+#endif
mapi = bat_iterator(mbat);
if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
throw(RDF, "rdf.rdfschema",
"could not open the tokenizer\n");
}
checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
Thanks Lefteris,
I should have looked at that before implementing my own function -_-.
Best,
Minh-Duc
----- Original Message ----- From: "Lefteris" lsidir@gmail.com To: "Communication channel for developers of the MonetDB suite." developers-list@monetdb.org Sent: Wednesday, February 26, 2014 3:21:52 PM Subject: Re: MonetDB: rdf - Generate sample data for all tables
Hi Duc,
I did not read your function in detail, but you can also use BATsample_ (which is the void-headed version of BATsample).
lefteris
On Wed, Feb 26, 2014 at 2:10 PM, Minh-Duc Pham commits@monetdb.org wrote:
Changeset: e7109fc24610 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e7109fc24610 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Generate sample data for all tables
diffs (truncated from 721 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -695,6 +695,11 @@ char isInfrequentSampleProp(CS freqCS, i if (freqCS.lstPropSupport[propIdx] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; else return 0; } +static +char isInfrequentSampleCol(CS freqCS, PropTypes pt){
if (pt.propFreq * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1;
else return 0;
+}
static void genCSPropTypesColIdx(CSPropTypes* csPropTypes, int numMergedCS, CSset* freqCSset){ @@ -4830,6 +4835,211 @@ void initSampleData(CSSample *csSample,B
}
}
/*
 * Build an encoded subject oid from a table index and a row position.
 *
 * The result is pos with (tblIdx + 1) OR-ed in at bit offset
 * sizeof(BUN)*8 - NBITS_FOR_CSID; it is written to *sOid.
 */
+static +void getSubjIdFromTablePosition(int tblIdx, int pos, oid *sOid){
oid id;
id = pos;
// The table index is stored 1-based (tblIdx + 1) in the high bits.
id |= (BUN)(tblIdx + 1) << (sizeof(BUN)*8 - NBITS_FOR_CSID);
*sOid = id;
+}
/*
 * Translate an encoded subject oid back to its original oid.
 *
 * *sbt is looked up through BATmirror(rmap); the value found at the same
 * position in lmap is stored in *origSbt.
 * Returns MAL_SUCCEED on success. Throws an RDF exception when the lookup
 * misses or when the lmap entry equals BUN_NONE.
 * presumably lmap and rmap are position-aligned -- TODO confirm with caller.
 */
+static +str getOrigSbt(oid *sbt, oid *origSbt, BAT *lmap, BAT *rmap){
BUN pos;
oid *tmp;
// Position of the encoded subject in rmap.
pos = BUNfnd(BATmirror(rmap),sbt);
if (pos == BUN_NONE){
throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded subject must be in rmap");
}
// Read the lmap tail value at that same position.
tmp = (oid *) Tloc(lmap, pos);
if (*tmp == BUN_NONE){
throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded subject must be in lmap");
}
*origSbt = *tmp;
return MAL_SUCCEED;
+}
/*
 * Translate an encoded object oid back to its original oid.
 *
 * For URI and BLANKNODE objects the type tag (objType shifted to bit offset
 * sizeof(BUN)*8 - 4) is subtracted first. If the resulting value is larger
 * than maxObjectURIOid it is looked up through rmap/lmap in the same way as
 * getOrigSbt does; otherwise it is stored in *origObt unchanged.
 * Returns MAL_SUCCEED on success. Throws an RDF exception when the lookup
 * misses or when the lmap entry equals BUN_NONE.
 *
 * NOTE(review): for any other object type tmporigOid keeps its initial
 * BUN_NONE value; if BUN_NONE > maxObjectURIOid that value goes into the
 * rmap lookup -- confirm callers only pass URI/BLANKNODE objects.
 */
+static +str getOrigObt(oid *obt, oid *origObt, BAT *lmap, BAT *rmap){
BUN pos;
oid *tmp;
oid tmporigOid = BUN_NONE;
char objType;
BUN maxObjectURIOid = ((oid)1 << (sizeof(BUN)*8 - NBITS_FOR_CSID - 1)) - 1; // Based on getTblIdxFromS
objType = getObjType(*obt);
if (objType == URI || objType == BLANKNODE){
// Strip the object-type tag from the encoded value.
tmporigOid = (*obt) - ((oid)objType << (sizeof(BUN)*8 - 4));
}
if (tmporigOid > maxObjectURIOid){
// Above the threshold: map back through rmap/lmap.
pos = BUNfnd(BATmirror(rmap),&tmporigOid);
if (pos == BUN_NONE){
throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded object must be in rmap");
}
tmp = (oid *) Tloc(lmap, pos);
if (*tmp == BUN_NONE){
throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded object must be in lmap");
}
*origObt = *tmp;
}
else{
// At or below the threshold: the untagged value is returned as is.
*origObt = tmporigOid;
}
return MAL_SUCCEED;
+}
/*
 * Build sample data for every table into csSampleEx[0 .. numTables-1].
 *
 * For table i (frequent-CS index taken from mTblIdxFreqIdxMapping[i]) this:
 *   - copies up to NUM_SAMPLE_CANDIDATE label candidates and shuffles them;
 *   - allocates lstSubjOid, lstProp, lstIsInfrequentProp, lstIsMVCol and
 *     colBats, and creates one sample BAT per retained column;
 *   - appends NUM_SAMPLE_INSTANCE rows, each picked with rand() % row-count,
 *     so the same row can be picked more than once.
 * The subject oid and the values of non-multi-valued oid columns are
 * translated through lmap/rmap (getOrigSbt / getOrigObt) before being stored.
 *
 * lmapbatid / rmapbatid are BAT ids resolved with BATdescriptor.
 * Returns MAL_SUCCEED, or throws when a map BAT is missing or a translation
 * fails.
 *
 * NOTE(review): the throws inside the row loop return without reaching the
 * BBPunfix calls on lmap/rmap at the end of the function, and the message
 * returned by getOrigSbt/getOrigObt is neither propagated nor freed here.
 * NOTE(review): malloc and BATnew results are used without a NULL check.
 */
+static +str initFullSampleData(CSSampleExtend *csSampleEx, int *mTblIdxFreqIdxMapping, CSlabel *label, CStableStat* cstablestat, CSPropTypes *csPropTypes, CSset *freqCSset, int numTables, bat *lmapbatid, bat *rmapbatid){
int i, j, k;
int freqId;
int tmpNumcand;
oid tmpCandidate;
int randValue = 0;
int ranPosition = 0; //random position of the instance in a table
int tmpNumCols;
int colIdx;
BAT *tmpbat = NULL;
BATiter tmpi;
BAT *cursamplebat = NULL;
int tmpNumRows = 0;
oid tmpSoid = BUN_NONE, origSoid = BUN_NONE;
oid origOid = BUN_NONE;
BAT *lmap = NULL, *rmap = NULL;
if ((lmap = BATdescriptor(*lmapbatid)) == NULL) {
throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
}
if ((rmap = BATdescriptor(*rmapbatid)) == NULL) {
// NOTE(review): released with BBPreleaseref here but with BBPunfix at the end -- confirm both are valid for a BATdescriptor reference.
BBPreleaseref(lmap->batCacheid);
throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
}
// Fixed seed for all rand() calls below.
srand(123456);
for (i = 0; i < numTables; i++){
freqId = mTblIdxFreqIdxMapping[i];
csSampleEx[i].freqIdx = freqId;
// Number of candidates kept is capped at NUM_SAMPLE_CANDIDATE.
tmpNumcand = (NUM_SAMPLE_CANDIDATE > label[freqId].candidatesCount)?label[freqId].candidatesCount:NUM_SAMPLE_CANDIDATE;
csSampleEx[i].name = cstablestat->lstcstable[i].tblname;
csSampleEx[i].candidateCount = tmpNumcand;
csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * tmpNumcand);
for (k = 0; k < tmpNumcand; k++){
csSampleEx[i].candidates[k] = label[freqId].candidates[k];
}
// Shuffle the candidates: swap position k with a randomly chosen position
for (k = 0; k < tmpNumcand; k++){
randValue = rand() % tmpNumcand;
tmpCandidate = csSampleEx[i].candidates[k];
csSampleEx[i].candidates[k] = csSampleEx[i].candidates[randValue];
csSampleEx[i].candidates[randValue] = tmpCandidate;
}
// Subject oids of the sampled rows; BUN_NONE marks a slot not filled yet.
csSampleEx[i].lstSubjOid = (oid*)malloc(sizeof(oid) * NUM_SAMPLE_INSTANCE);
for (k = 0; k < NUM_SAMPLE_INSTANCE; k++)
csSampleEx[i].lstSubjOid[k] = BUN_NONE;
tmpNumCols = csPropTypes[i].numProp - csPropTypes[i].numInfreqProp; // infrequent columns are already excluded from this count
csSampleEx[i].numProp = tmpNumCols;
assert(tmpNumCols > 0);
csSampleEx[i].lstProp = (oid*)malloc(sizeof(oid) * tmpNumCols);
csSampleEx[i].lstIsInfrequentProp = (char*)malloc(sizeof(char) * tmpNumCols);
csSampleEx[i].lstIsMVCol = (char*)malloc(sizeof(char) * tmpNumCols);
csSampleEx[i].colBats = (BAT**)malloc(sizeof(BAT*) * tmpNumCols);
// colIdx counts the retained columns; j walks all properties.
colIdx = -1;
csSampleEx[i].numInstances = 0;
for(j = 0; j < csPropTypes[i].numProp; j++){
#if REMOVE_INFREQ_PROP
if (csPropTypes[i].lstPropTypes[j].defColIdx == -1) continue; //Infrequent prop
#endif
colIdx++;
csSampleEx[i].lstProp[colIdx] = csPropTypes[i].lstPropTypes[j].prop;
// The sample BAT takes its tail type from table column colIdx and has room for NUM_SAMPLE_INSTANCE + 1 values.
csSampleEx[i].colBats[colIdx] = BATnew(TYPE_void, cstablestat->lstcstable[i].colBats[colIdx]->ttype , NUM_SAMPLE_INSTANCE + 1);
// Mark whether this column is an infrequent sample column
if ( isInfrequentSampleCol(freqCSset->items[freqId], csPropTypes[i].lstPropTypes[j])){
csSampleEx[i].lstIsInfrequentProp[colIdx] = 1;
}
else
csSampleEx[i].lstIsInfrequentProp[colIdx] = 0;
// Mark whether this column is a multi-valued (MV) column
csSampleEx[i].lstIsMVCol[colIdx] = csPropTypes[i].lstPropTypes[j].isMVProp;
//if this is a multivalue column, get the data type of the first column
}
assert(colIdx == (tmpNumCols - 1));
// Inserting instances to csSampleEx
// The row count is taken from the first column of the table.
tmpNumRows = BATcount(cstablestat->lstcstable[i].colBats[0]);
for (k = 0; k < NUM_SAMPLE_INSTANCE; k++){
// NOTE(review): if tmpNumRows is 0 this modulo divides by zero -- confirm tables are never empty.
ranPosition = rand() % tmpNumRows;
getSubjIdFromTablePosition(i, ranPosition, &tmpSoid);
if (getOrigSbt(&tmpSoid, &origSoid, lmap, rmap) != MAL_SUCCEED){
throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal sbt ");
}
csSampleEx[i].lstSubjOid[k] = origSoid;
// Copy the chosen row, column by column, into the sample BATs.
for (j = 0; j < tmpNumCols; j++){
cursamplebat = csSampleEx[i].colBats[j];
tmpbat = cstablestat->lstcstable[i].colBats[j];
tmpi = bat_iterator(tmpbat);
if (tmpbat->ttype == TYPE_oid && csSampleEx[i].lstIsMVCol[j] == 0){
//Get the original object oid
oid *tmpOid = (oid *) BUNtail(tmpi, ranPosition);
if(*tmpOid != oid_nil){
if (getOrigObt(tmpOid, &origOid, lmap, rmap) != MAL_SUCCEED){
throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal obt ");
}
BUNappend(cursamplebat, &origOid, TRUE);
}
else{
// A nil stays nil in the sample.
BUNappend(cursamplebat, ATOMnilptr(TYPE_oid), TRUE);
}
}
else
// Non-oid or multi-valued column: the stored value is copied untranslated.
BUNappend(cursamplebat, BUNtail(tmpi, ranPosition), TRUE);
}
csSampleEx[i].numInstances++;
}
// Only the sample BATs of the first table are printed.
if (i == 0)
for (j = 0; j < tmpNumCols; j++){
//BATprint(cstablestat->lstcstable[i].colBats[j]);
BATprint(csSampleEx[i].colBats[j]);
}
}
BBPunfix(lmap->batCacheid);
BBPunfix(rmap->batCacheid);
return MAL_SUCCEED;
+}
static void freeSampleData(CSSample *csSample, int numCand){ int i, j; @@ -4846,6 +5056,25 @@ void freeSampleData(CSSample *csSample, free(csSample); }
/*
 * Release an array of numCand CSSampleExtend entries.
 *
 * For each entry this frees lstProp, lstIsInfrequentProp, lstIsMVCol,
 * candidates and lstSubjOid, calls BBPunfix on each of its numProp column
 * BATs and frees the colBats array. Finally the csSampleEx array itself is
 * freed. The name field is not freed here.
 */
+static +void freeSampleExData(CSSampleExtend *csSampleEx, int numCand){
int i, j;
for (i = 0; i < numCand; i++){
free(csSampleEx[i].lstProp);
free(csSampleEx[i].lstIsInfrequentProp);
free(csSampleEx[i].lstIsMVCol);
free(csSampleEx[i].candidates);
free(csSampleEx[i].lstSubjOid);
// Drop the reference on every column BAT before freeing the pointer array.
for (j = 0; j < csSampleEx[i].numProp; j++){
BBPunfix(csSampleEx[i].colBats[j]->batCacheid);
}
free(csSampleEx[i].colBats);
}
free(csSampleEx);
+}
static void addSampleInstance(oid subj, oid *buffO, oid* buffP, int numP, int sampleIdx, CSSample *csSample){ int i,j; @@ -5217,6 +5446,295 @@ str printSampleData(CSSample *csSample, return MAL_SUCCEED; }
+#if 0 +static +str printFullSampleData(CSSampleExtend *csSampleEx, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){
int i,j, k;
FILE *fout, *fouttb, *foutis;
char filename[100], filename2[100], filename3[100];
char tmpStr[20], tmpStr2[20], tmpStr3[20];
int ret;
str propStr;
str subjStr;
char* schema = "rdf";
CSSample sample;
CS freqCS;
char objType = 0;
str objStr;
oid objOid = BUN_NONE;
BATiter mapi;
str canStr;
char isTitle = 0;
char isUrl = 0;
char isType = 0;
char isDescription = 0;
char isImage = 0;
char isSite = 0;
char isEmail = 0;
char isCountry = 0;
char isLocality = 0;
BAT *lmap = NULL, *rmap = NULL
+#if USE_SHORT_NAMES
str propStrShort = NULL;
char *pch;
+#endif
mapi = bat_iterator(mbat);
if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
throw(RDF, "rdf.rdfschema",
"could not open the tokenizer\n");
}
checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
_______________________________________________ developers-list mailing list developers-list@monetdb.org https://www.monetdb.org/mailman/listinfo/developers-list
participants (2)
-
Lefteris
-
Pham Duc