MonetDB: DVframework_bam - Added bam folder under sql/backends/m...

Stefan Manegold Stefan.Manegold at cwi.nl
Sat Jun 15 08:50:28 CEST 2013


Thanks Robin!

Eventually, we should of course add a proper configure check for samtools and make building the bam module conditional on it ... ;-)

Stefan

Robin Cijvat <commits at monetdb.org> wrote:

>Changeset: d87b8aafa5b7 for MonetDB
>URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d87b8aafa5b7
>Added Files:
>	sql/backends/monet5/bam/85_bam.mal
>	sql/backends/monet5/bam/85_bam.sql
>	sql/backends/monet5/bam/Makefile.ag
>	sql/backends/monet5/bam/bam.mal
>	sql/backends/monet5/bam/bam_clear.sql
>	sql/backends/monet5/bam/bam_schema.sql
>	sql/backends/monet5/bam/bamloader.c
>	sql/backends/monet5/bam/bamloader.h
>Modified Files:
>	sql/backends/monet5/Makefile.ag
>Branch: DVframework_bam
>Log Message:
>
>Added bam folder under sql/backends/monet5/ containing UDF that
>provides functionality for loading BAM file contents into temporary
>BATs. To run code, samtools has to be installed and referred to in the
>Makefile of the directory.
>
>
>diffs (truncated from 1260 to 300 lines):
>
>diff --git a/sql/backends/monet5/Makefile.ag
>b/sql/backends/monet5/Makefile.ag
>--- a/sql/backends/monet5/Makefile.ag
>+++ b/sql/backends/monet5/Makefile.ag
>@@ -15,7 +15,7 @@
> # Copyright August 2008-2013 MonetDB B.V.
> # All Rights Reserved.
> 
>-SUBDIRS = NOT_WIN32?vaults UDF LSST ENABLE_DATACELL?datacell
>HAVE_MSEED?miniseed
>+SUBDIRS = NOT_WIN32?vaults UDF bam LSST ENABLE_DATACELL?datacell
>HAVE_MSEED?miniseed
> 
> INCLUDES = ../../include ../../common ../../storage ../../server \
> 		   ../../../monetdb5/modules/atoms \
>diff --git a/sql/backends/monet5/bam/85_bam.mal
>b/sql/backends/monet5/bam/85_bam.mal
>new file mode 100644
>--- /dev/null
>+++ b/sql/backends/monet5/bam/85_bam.mal
>@@ -0,0 +1,2 @@
>+library bam;
>+include bam;
>\ No newline at end of file
>diff --git a/sql/backends/monet5/bam/85_bam.sql
>b/sql/backends/monet5/bam/85_bam.sql
>new file mode 100644
>--- /dev/null
>+++ b/sql/backends/monet5/bam/85_bam.sql
>@@ -0,0 +1,2 @@
>+CREATE PROCEDURE bamloader(repo string, mode int, num_threads int)
>+external name bam.bamloader;
>\ No newline at end of file
>diff --git a/sql/backends/monet5/bam/Makefile.ag
>b/sql/backends/monet5/bam/Makefile.ag
>new file mode 100644
>--- /dev/null
>+++ b/sql/backends/monet5/bam/Makefile.ag
>@@ -0,0 +1,62 @@
>+# The contents of this file are subject to the MonetDB Public License
>+# Version 1.1 (the "License"); you may not use this file except in
>+# compliance with the License. You may obtain a copy of the License at
>+# http://www.monetdb.org/Legal/MonetDBLicense
>+#
>+# Software distributed under the License is distributed on an "AS IS"
>+# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
>the
>+# License for the specific language governing rights and limitations
>+# under the License.
>+#
>+# The Original Code is the MonetDB Database System.
>+#
>+# The Initial Developer of the Original Code is CWI.
>+# Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
>+# Copyright August 2008-2013 MonetDB B.V.
>+# All Rights Reserved.
>+
>+INCLUDES = .. \
>+	../../../include \
>+	../../../common \
>+	../../../storage \ 
>+	../../../server \
>+	../../../../monetdb5/modules/atoms \
>+	../../../../monetdb5/modules/kernel \
>+	../../../../monetdb5/mal \
>+	../../../../monetdb5/modules/mal \
>+	../../../../monetdb5/optimizer \
>+	../../../../clients/mapilib \
>+	../../../../common/options \
>+	../../../../common/stream \
>+	../../../../gdk \
>+    -I$(HOME)/Documents/BAM/samtools/include
>+
>+lib__bam = {
>+	MODULE
>+	DIR = libdir/monetdb5
>+	SOURCES = bamloader.c bamloader.h
>+	LIBS = ../../../../monetdb5/tools/libmonetdb5 \
>+		   ../../../../gdk/libbat \
>+           -L$(HOME)/Documents/BAM/samtools/lib -lbam
>+		   
>+}
>+
>+headers_mal = {
>+	HEADERS = mal
>+	DIR = libdir/monetdb5
>+	SOURCES = bam.mal
>+}
>+
>+headers_sql = {
>+	HEADERS = sql
>+	DIR = libdir/monetdb5/createdb
>+	SOURCES = 85_bam.sql
>+}
>+
>+headers_autoload = {
>+	HEADERS = mal
>+	DIR = libdir/monetdb5/autoload
>+	SOURCES = 85_bam.mal
>+}
>+
>+EXTRA_DIST_DIR = Tests
>diff --git a/sql/backends/monet5/bam/bam.mal
>b/sql/backends/monet5/bam/bam.mal
>new file mode 100644
>--- /dev/null
>+++ b/sql/backends/monet5/bam/bam.mal
>@@ -0,0 +1,5 @@
>+module bam;
>+
>+pattern bamloader(entry:str, mode:int, num_threads:int):void
>+address bamloader
>+comment "Read the files in the BAM repository, fill and return a
>temp_container accordingly.";
>\ No newline at end of file
>diff --git a/sql/backends/monet5/bam/bam_clear.sql
>b/sql/backends/monet5/bam/bam_clear.sql
>new file mode 100644
>--- /dev/null
>+++ b/sql/backends/monet5/bam/bam_clear.sql
>@@ -0,0 +1,7 @@
>+DROP TABLE bam.alignments_extra;
>+DROP TABLE bam.alignments;
>+DROP TABLE bam.pg;
>+DROP TABLE bam.rg;
>+DROP TABLE bam.sq;
>+DROP TABLE bam.files;
>+DROP SCHEMA bam;
>diff --git a/sql/backends/monet5/bam/bam_schema.sql
>b/sql/backends/monet5/bam/bam_schema.sql
>new file mode 100644
>--- /dev/null
>+++ b/sql/backends/monet5/bam/bam_schema.sql
>@@ -0,0 +1,91 @@
>+CREATE SCHEMA bam;
>+
>+CREATE TABLE "bam"."files" (
>+    "file_location"                 STRING      NOT NULL      UNIQUE,
>+    "format_version"                REAL,
>+    "sorting_order"                 VARCHAR(10),
>+    "comments"                      STRING,
>+    CONSTRAINT "files_pkey_file_location" PRIMARY KEY (file_location)
>+);
>+
>+CREATE TABLE "bam"."sq" (
>+    "sn"                            STRING      NOT NULL,
>+    "file_location"                 STRING      NOT NULL,
>+    "ln"                            INT         NOT NULL,
>+    "as"                            INT,
>+    "m5"                            STRING,
>+    "sp"                            STRING,
>+    "ur"                            STRING,
>+    CONSTRAINT "sq_pkey_sn_file_location" PRIMARY KEY (sn,
>file_location),
>+    CONSTRAINT "sq_fkey_file_location" FOREIGN KEY (file_location)
>REFERENCES bam.files (file_location)
>+);
>+
>+CREATE TABLE "bam"."rg" (
>+    "id"                            STRING      NOT NULL      UNIQUE,
>+    "file_location"                 STRING      NOT NULL,
>+    "cn"                            STRING,
>+    "ds"                            STRING,
>+    "dt"                            TIMESTAMP,
>+    "fo"                            STRING,
>+    "ks"                            STRING,
>+    "lb"                            STRING,
>+    "pg"                            STRING,
>+    "pi"                            INT,
>+    "PL"                            STRING,
>+    "PU"                            STRING,
>+    "SM"                            STRING,
>+    CONSTRAINT "rg_pkey_id_file_location" PRIMARY KEY (id,
>file_location),
>+    CONSTRAINT "rg_fkey_file_location" FOREIGN KEY (file_location)
>REFERENCES bam.files (file_location)
>+);
>+
>+CREATE TABLE "bam"."pg" (
>+    "id"                            STRING      NOT NULL    UNIQUE,
>+    "file_location"                 STRING      NOT NULL,
>+    "pn"                            STRING,
>+    "cl"                            STRING,
>+    "pp"                            STRING,
>+    "vn"                            REAL,
>+    CONSTRAINT "pg_pkey_id_file_location" PRIMARY KEY (id,
>file_location),
>+    CONSTRAINT "pg_fkey_file_location" FOREIGN KEY (file_location)
>REFERENCES bam.files (file_location)
>+);
>+
>+CREATE TABLE "bam"."alignments" (
>+    "file_location"                 STRING      NOT NULL,
>+    "virtual_offset"                INT         NOT NULL,
>+    "qname"                         STRING      NOT NULL,
>+    "flag_temp_mult_segm"           BOOLEAN     NOT NULL,
>+    "flag_each_segm_prop_alig"      BOOLEAN     NOT NULL,
>+    "flag_segm_unma"                BOOLEAN     NOT NULL,
>+    "flag_next_segm_unma"           BOOLEAN     NOT NULL,
>+    "flag_seq_reve_comp"            BOOLEAN     NOT NULL,
>+    "flag_seq_next_segm_reve"       BOOLEAN     NOT NULL,
>+    "flag_first_segm"               BOOLEAN     NOT NULL,
>+    "flag_last_segm"                BOOLEAN     NOT NULL,
>+    "flag_seco_alig"                BOOLEAN     NOT NULL,
>+    "flag_not_pass_qual_cont"       BOOLEAN     NOT NULL,
>+    "flag_pcr_opti_dupl"            BOOLEAN     NOT NULL,
>+    "rname"                         STRING      NOT NULL,
>+    "pos"                           INT         NOT NULL,
>+    "mapq"                          INT         NOT NULL,
>+    "cigar"                         STRING      NOT NULL,
>+    "rnext"                         STRING      NOT NULL,
>+    "pnext"                         INT         NOT NULL,
>+    "tlen"                          INT         NOT NULL,
>+    "seq"                           STRING      NOT NULL,
>+    "qual"                          STRING      NOT NULL,
>+    CONSTRAINT "alignments_pkey_file_location_virtual_offset" PRIMARY
>KEY (file_location, virtual_offset),
>+    CONSTRAINT "alignments_fkey_file_location" FOREIGN KEY
>(file_location) REFERENCES bam.files (file_location)
>+);
>+
>+CREATE TABLE "bam"."alignments_extra" (
>+    "tag"                           CHAR(2)     NOT NULL,
>+    "file_location"                 STRING      NOT NULL,
>+    "virtual_offset"                INT         NOT NULL,
>+    "type"                          CHAR(1)     NOT NULL,
>+    "value"                         STRING,
>+    CONSTRAINT
>"alignments_extra_pkey_tag_file_location_virtual_offset" PRIMARY KEY
>(tag, file_location, virtual_offset),
>+    CONSTRAINT "alignments_extra_fkey_file_location_virtual_offset"
>FOREIGN KEY (file_location, virtual_offset) REFERENCES bam.alignments
>(file_location, virtual_offset)
>+);
>+
>+
>+
>diff --git a/sql/backends/monet5/bam/bamloader.c
>b/sql/backends/monet5/bam/bamloader.c
>new file mode 100644
>--- /dev/null
>+++ b/sql/backends/monet5/bam/bamloader.c
>@@ -0,0 +1,1013 @@
>+#include <stdio.h>
>+#include "monetdb_config.h"
>+#include "bamloader.h"
>+
>+
>+/*
>+ * NOTE: Copied directly from miniseed/registrar.c
>+ * TODO: Make both miniseed/registrar.c and this file include some
>generic library for these kind of structures
>+ * keeps BAT and other properties of columns of a table.
>+ */
>+typedef struct {
>+	bat *column_bats; /* keeps bats of the columns: lower array */
>+	str *column_names; /* names of columns that are kept in the higher
>array */
>+    str *column_types_strs; /* type strings of columns */
>+} temp_subcontainer;
>+
>+/*
>+ * NOTE: Copied directly from miniseed/registrar.c
>+ * TODO: Make both miniseed/registrar.c and this file include some
>generic library for these kind of structures
>+ * keeps (some) tables of a schema.
>+ */
>+typedef struct {
>+	str schema_name; /* schema or vault name */
>+	temp_subcontainer *tables_columns; /* keeps tables: higher array */
>+	str *table_names; /* names of tables that are kept in the higher
>array */
>+	int *num_columns; /* number of columns in each table in the higher
>array */
>+	int num_tables;
>+} temp_container;
>+
>+/*
>+* File format specific structures
>+*/
>+typedef struct {
>+    str tag;
>+    str value;
>+} bam_header_option;
>+
>+typedef struct {
>+    str header_tag;
>+    bam_header_option *options;
>+    int num_options;
>+} bam_header_line;
>+
>+
>+/* Global vars */
>+FILE *logfile = NULL; //keep logfile file opened in this global var
>while the bam code runs, 
>+                        //since opening and closing every time
>something has to be written turned out to be very slow
>+
>+
>+
>+/* File format specific functions */
>+str init_temp_container(temp_container *ret_tc);
>+str loadfile(str filepath, temp_container *ret_tc); //load file and
>add contents to tc
>+str process_bam_header(str filepath, str header, temp_container
>*ret_tc);
>+int append_to_bat_cond(temp_container *ret_tc, bam_header_option *opt,
>str cmp, int table, int col, int *appendErr, int *flag);
>+str read_bam_header_line(str *header, bam_header_line *ret_hl, int
>*eof);
>+void free_bam_header_line(bam_header_line *hl);
>+str process_bam_alignment(str filepath, int virtual_offset,
>bam_header_t *header, bam1_t *alignment, temp_container *ret_tc);
>+int parse_alignment_str(str *sam_alig, str *dest);
>+int parse_alignment_long(str *sam_alig, long int *dest);
>+
>+/* Generic functions */
>+str init_temp_subcontainer(temp_subcontainer *ret_tsc,
>+    str *col_names, str *col_types_strs, int *col_types, int
>num_cols);
>+str append_to_bat(bat cb, ptr val);
>+int concatenate_strs(str* words_to_concat, int num_words_to_concat,
>str* ret_concatenated);
>+str prepare_insertion(Client cntxt, temp_container* tc);
>+str insert_into_vault(Client cntxt, temp_container* tc);
>+str register_table(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr
>pci);
>+int read_string_until_delim(str *src, str *ret, char *delims, int
>num_delims);
>+int parse_long(str *src, long int *i);
>+int get_kth_bit(unsigned int i, int k);
>+void append_to_log(str mssg);
>+void free_temp_container(temp_container* tc);
>+
>+/* External functions */
>+str SQLstatementIntern(Client c, str *expr, str nme, int execute, bit
>output);
>+
>+
>+
>+/* File format specific functions */
>_______________________________________________
>checkin-list mailing list
>checkin-list at monetdb.org
>http://mail.monetdb.org/mailman/listinfo/checkin-list

-- 
| Stefan.Manegold at CWI.nl | Database Architectures   (DA) |
|  www.CWI.nl/~manegold  | Science Park 123 (L321) |
|   +31 (0)20 592-4212   | 1098 XG Amsterdam  (NL) |
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.monetdb.org/pipermail/developers-list/attachments/20130615/5a4eb5e6/attachment.html>


More information about the developers-list mailing list