Thank you Mark, 

Actually I managed to solve this issue on late Friday night, even when using TLoc with integer values i was having random errors in the log and the db was shutdown.
Do I need to set tonil and tnil flags when using BATloop/bat_iterator/BUNappend?

Find bellow the complete implementation:

str
UDFyearbracket(str *ret, const date *v)
{
if (*v == date_nil) {
*ret = GDKstrdup(str_nil);
} else {
int y = 0;
fromdate(*v, NULL, NULL, &y);
*ret = (str)GDKmalloc(snprintf(NULL, 0, "%d", y) + 1);
if (*ret == NULL)
throw(MAL, "UDF.yearbracket", "memory allocation failure");
sprintf(*ret, "%d", y);
}
return MAL_SUCCEED;
}

str
UDFBATyearbracket(bat *ret, const bat *bid)
{
BAT *b, *bn;
BATiter bi;
BUN i,n;

if ((b = BATdescriptor(*bid)) == NULL)
throw(MAL, "UDF.BATyearbracket", "Cannot access descriptor");
n = BATcount(b);

bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT);
if (bn == NULL) {
BBPunfix(b->batCacheid);
throw(MAL, "UDF.BATyearbracket", "memory allocation failure");
}

bi = bat_iterator(b);
BATloop(b, i, n) {
char *y = NULL;
const date *t = (const date *) BUNtail(bi, i);
if (*t == date_nil) {
y = GDKstrdup(str_nil);
} else
UDFyearbracket(&y, t);
if (BUNappend(bn, y, 0) != GDK_SUCCEED) {
goto bailout;
}
GDKfree(y);
}

BBPkeepref(*ret = bn->batCacheid);
BBPunfix(b->batCacheid);
return MAL_SUCCEED;
  bailout:
BBPunfix(b->batCacheid);
BBPunfix(bn->batCacheid);
throw(MAL, "UDF.BATyearbracket", MAL_MALLOC_FAIL);
}

Thank You.

On Mon, Jan 2, 2017 at 5:58 PM, Mark Raasveldt <m.raasveldt@cwi.nl> wrote:
Hey Imad,

Apologies, scrolling back I noticed that was actually your first attempt at writing the UDF. The source of your error is not encoding related, the error is misleading.

The problem is that in your bulk version you are using Tloc(bn, i) to assign to a string column. Tloc should only be used with constant-sized columns, such as integers or dates. For variable-sized columns such as strings, you should use BUNappend to add values to the column. The reason for that is that string columns are not stored as an array of character pointers, which your initial implementation assumes. Instead, string columns use integers to point into a heap of strings. You are assigning a pointer to one of these integers, which makes MonetDB think the strings are in some random part of your memory. There’s a high chance that that random part of memory does not contain a valid UTF-8 string, hence you get the encoding error.

Try the following bulk implementation instead, using BUNappend instead of Tloc to assign to your column.

Regards,

Mark

str
UDFBATyearbracket(bat *ret, const bat *bid)
{
        BAT *b, *bn;
        BUN i,n;
        const date *t;

        if ((b = BATdescriptor(*bid)) == NULL)
                throw(MAL, "UDF.BATyearbracket", "Cannot access descriptor");
        n = BATcount(b);

        bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT);
        if (bn == NULL) {
                BBPunfix(b->batCacheid);
                throw(MAL, "UDF.BATyearbracket", "memory allocation failure");
        }
        bn->tnonil = 1;
        bn->tnil = 0;

        t = (const date *) Tloc(b, 0);
        for (i = 0; i < n; i++) {
                if (*t == date_nil) {
                        BUNappend(bn, str_nil, FALSE);
                        bn->tnonil = 0;
                        bn->tnil = 1;
                } else {
                        char* ret;
                        UDFyearbracket(&ret, t);
                        BUNappend(bn, ret, FALSE);
                }
                t++;
        }

        BATsetcount(bn, n);

        bn->tsorted = BATcount(bn)<2;
        bn->trevsorted = BATcount(bn)<2;

        BBPkeepref(*ret = bn->batCacheid);
        BBPunfix(b->batCacheid);
        return MAL_SUCCEED;
}

----- Original Message -----
From: "Mark Raasveldt" <m.raasveldt@cwi.nl>
To: "users-list" <users-list@monetdb.org>
Sent: Monday, January 2, 2017 4:32:32 PM
Subject: Re: C UDF

Hey Imad,

One of the nice things about UTF-8 is that normal ASCII characters are valid UTF-8. Hence “normal strings” in C are already valid UTF-8. Try simply returning the output from sprintf, like this:

str
UDFyearbracket(str *ret, const date *v)
{
        if (*v == date_nil) {
                *ret = GDKstrdup(str_nil);
        } else {
                int year;
                char *buf;
                fromdate(*v, NULL, NULL, &year);
                buf = (char *) GDKmalloc(15);
                sprintf(buf, "%d", year);
                *ret = buf;
        }
        return MAL_SUCCEED;
}

Regards,

Mark

> On 29 Dec 2016, at 14:35, imad hajj chahine <imad.hajj.chahine@gmail.com> wrote:
>
> Hi Sjoerd,
>
> I tried to used iconv with no luck, I am getting always empty string. I assumed the encoding that i am getting from sprintf are in "ISO-8859-1"
> Can you please take a look at the following implementation:
>
> str
> UDFyearbracket(str *ret, const date *v)
> {
>       if (*v == date_nil) {
>               *ret = GDKstrdup(str_nil);
>       } else {
>               iconv_t cv = iconv_open("UTF-8", "ISO-8859-1");
>               int factor = 4;
>               size_t fromlen, tolen;
>
>               int year;
>               char *buf;
>               char *retChar = (char *)*ret;
>               fromdate(*v, NULL, NULL, &year);
>               buf = (char *) GDKmalloc(15);
>               sprintf(buf, "%d", year);
>
>
>               fromlen = strlen(buf);
>               tolen = factor * fromlen + 1;
>               retChar = (char *) GDKmalloc(tolen);
>               iconv(cv, &buf, &fromlen, &retChar, &tolen);
>               iconv_close(cv);
>       }
>       return MAL_SUCCEED;
> }
>
> Thanks
>
_______________________________________________
users-list mailing list
users-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/users-list
_______________________________________________
users-list mailing list
users-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/users-list