Thanks Robin!

Eventually, we should of course add a proper configure check for samtools and make building the bam module conditional on it ... ;-)

Stefan

Robin Cijvat <commits@monetdb.org> wrote:
Changeset: d87b8aafa5b7 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d87b8aafa5b7
Added Files:
sql/backends/monet5/bam/85_bam.mal
sql/backends/monet5/bam/85_bam.sql
sql/backends/monet5/bam/Makefile.ag
sql/backends/monet5/bam/bam.mal
sql/backends/monet5/bam/bam_clear.sql
sql/backends/monet5/bam/bam_schema.sql
sql/backends/monet5/bam/bamloader.c
sql/backends/monet5/bam/bamloader.h
Modified Files:
sql/backends/monet5/Makefile.ag
Branch: DVframework_bam
Log Message:

Added bam folder under sql/backends/monet5/ containing UDF that provides functionality for loading BAM file contents into temporary BATs. To run code, samtools has to be installed and referred to in the Makefile of the directory.


diffs (truncated from 1260 to 300 lines):

diff --git a/sql/backends/monet5/Makefile.ag b/sql/backends/monet5/Makefile.ag
--- a/sql/backends/monet5/Makefile.ag
+++ b/sql/backends/monet5/Makefile.ag
@@ -15,7 +15,7 @@
# Copyright August 2008-2013 MonetDB B.V.
# All Rights Reserved.

-SUBDIRS = NOT_WIN32?vaults UDF LSST ENABLE_DATACELL?datacell HAVE_MSEED?miniseed
+SUBDIRS = NOT_WIN32?vaults UDF bam LSST ENABLE_DATACELL?datacell HAVE_MSEED?miniseed

INCLUDES = ../../include ../../common ../../storage ../../server \
../../../monetdb5/modules/atoms \
diff --git a/sql/backends/monet5/bam/85_bam.mal b/sql/backends/monet5/bam/85_bam.mal
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/85_bam.mal
@@ -0,0 +1,2 @@
+library bam;
+include bam;
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/85_bam.sql b/sql/backends/monet5/bam/85_bam.sql
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/85_bam.sql
@@ -0,0 +1,2 @@
+CREATE PROCEDURE bamloader(repo string, mode int, num_threads int)
+external name bam.bamloader;
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/Makefile.ag b/sql/backends/monet5/bam/Makefile.ag
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/Makefile.ag
@@ -0,0 +1,62 @@
+# The contents of this file are subject to the MonetDB Public License
+# Version 1.1 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http://www.monetdb.org/Legal/MonetDBLicense
+#
+# Software distributed under the License is distributed on an "AS IS"
+# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+# License for the specific language governing rights and limitations
+# under the License.
+#
+# The Original Code is the MonetDB Database System.
+#
+# The Initial Developer of the Original Code is CWI.
+# Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+# Copyright August 2008-2013 MonetDB B.V.
+# All Rights Reserved.
+
+INCLUDES = .. \
+ ../../../include \
+ ../../../common \
+ ../../../storage \
+ ../../../server \
+ ../../../../monetdb5/modules/atoms \
+ ../../../../monetdb5/modules/kernel \
+ ../../../../monetdb5/mal \
+ ../../../../monetdb5/modules/mal \
+ ./../../../monetdb5/optimizer \
+ ../../../../clients/mapilib \
+ ../../../../common/options \
+ ../../../../common/stream \
+ ../../../../gdk \
+ -I$(HOME)/Documents/BAM/samtools/include
+
+lib__bam = {
+ MODULE
+ DIR = libdir/monetdb5
+ SOURCES = bamloader.c bamloader.h
+ LIBS = ../../../../monetdb5/tools/libmonetdb5 \
+ ../../../../gdk/libbat \
+ -L$(HOME)/Documents/BAM/samtools/lib -lbam
+
+}
+
+headers_mal = {
+ HEADERS = mal
+ DIR = libdir/monetdb5
+ SOURCES = bam.mal
+}
+
+headers_sql = {
+ HEADERS = sql
+ DIR = libdir/monetdb5/createdb
+ SOURCES = 85_bam.sql
+}
+
+headers_autoload = {
+ HEADERS = mal
+ DIR = libdir/monetdb5/autoload
+ SOURCES = 85_bam.mal
+}
+
+EXTRA_DIST_DIR = Tests
diff --git a/sql/backends/monet5/bam/bam.mal b/sql/backends/monet5/bam/bam.mal
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam.mal
@@ -0,0 +1,5 @@
+module bam;
+
+pattern bamloader(entry:str, mode:int, num_threads:int):void
+address bamloader
+comment "Read the files in the BAM repository, fill and return a temp_container accordingly.";
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/bam_clear.sql b/sql/backends/monet5/bam/bam_clear.sql
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam_clear.sql
@@ -0,0 +1,7 @@
+DROP TABLE bam.alignments_extra;
+DROP TABLE bam.alignments;
+DROP TABLE bam.pg;
+DROP TABLE bam.rg;
+DROP TABLE bam.sq;
+DROP TABLE bam.files;
+DROP SCHEMA bam;
diff --git a/sql/backends/monet5/bam/bam_schema.sql b/sql/backends/monet5/bam/bam_schema.sql
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam_schema.sql
@@ -0,0 +1,91 @@
+CREATE SCHEMA bam;
+
+CREATE TABLE "bam"."files" (
+ "file_location" STRING NOT NULL UNIQUE,
+ "format_version" REAL,
+ "sorting_order" VARCHAR(10),
+ "comments" STRING,
+ CONSTRAINT "files_pkey_file_location" PRIMARY KEY (file_location)
+);
+
+CREATE TABLE "bam"."sq" (
+ "sn" STRING NOT NULL,
+ "file_location" STRING NOT NULL,
+ "ln" INT NOT NULL,
+ "as" INT,
+ "m5" STRING,
+ "sp" STRING,
+ "ur" STRING,
+ CONSTRAINT "sq_pkey_sn_file_location" PRIMARY KEY (sn, file_location),
+ CONSTRAINT "sq_fkey_file_location" FOREIGN KEY (file_location) REFERENCES bam.files (file_location)
+);
+
+CREATE TABLE "bam"."rg" (
+ "id" STRING NOT NULL UNIQUE,
+ "file_location" STRING NOT NULL,
+ "cn" STRING,
+ "ds" STRING,
+ "dt" TIMESTAMP,
+ "fo" STRING,
+ "ks" STRING,
+ "lb" STRING,
+ "pg" STRING,
+ "pi" INT,
+ "PL" STRING,
+ "PU" STRING,
+ "SM" STRING,
+ CONSTRAINT "rg_pkey_id_file_location" PRIMARY KEY (id, file_location),
+ CONSTRAINT "rg_fkey_file_location" FOREIGN KEY (file_location) REFERENCES bam.files (file_location)
+);
+
+CREATE TABLE "bam"."pg" (
+ "id" STRING NOT NULL UNIQUE,
+ "file_location" STRING NOT NULL,
+ "pn" STRING,
+ "cl" STRING,
+ "pp" STRING,
+ "vn" REAL,
+ CONSTRAINT "pg_pkey_id_file_location" PRIMARY KEY (id, file_location),
+ CONSTRAINT "pg_fkey_file_location" FOREIGN KEY (file_location) REFERENCES bam.files (file_location)
+);
+
+CREATE TABLE "bam"."alignments" (
+ "file_location" STRING NOT NULL,
+ "virtual_offset" INT NOT NULL,
+ "qname" STRING NOT NULL,
+ "flag_temp_mult_segm" BOOLEAN NOT NULL,
+ "flag_each_segm_prop_alig" BOOLEAN NOT NULL,
+ "flag_segm_unma" BOOLEAN NOT NULL,
+ "flag_next_segm_unma" BOOLEAN NOT NULL,
+ "flag_seq_reve_comp" BOOLEAN NOT NULL,
+ "flag_seq_next_segm_reve" BOOLEAN NOT NULL,
+ "flag_first_segm" BOOLEAN NOT NULL,
+ "flag_last_segm" BOOLEAN NOT NULL,
+ "flag_seco_alig" BOOLEAN NOT NULL,
+ "flag_not_pass_qual_cont" BOOLEAN NOT NULL,
+ "flag_pcr_opti_dupl" BOOLEAN NOT NULL,
+ "rname" STRING NOT NULL,
+ "pos" INT NOT NULL,
+ "mapq" INT NOT NULL,
+ "cigar" STRING NOT NULL,
+ "rnext" STRING NOT NULL,
+ "pnext" INT NOT NULL,
+ "tlen" INT NOT NULL,
+ "seq" STRING NOT NULL,
+ "qual" STRING NOT NULL,
+ CONSTRAINT "alignments_pkey_file_location_virtual_offset" PRIMARY KEY (file_location, virtual_offset),
+ CONSTRAINT "alignments_fkey_file_location" FOREIGN KEY (file_location) REFERENCES bam.files (file_location)
+);
+
+CREATE TABLE "bam"."alignments_extra" (
+ "tag" CHAR(2) NOT NULL,
+ "file_location" STRING NOT NULL,
+ "virtual_offset" INT NOT NULL,
+ "type" CHAR(1) NOT NULL,
+ "value" STRING,
+ CONSTRAINT "alignments_extra_pkey_tag_file_location_virtual_offset" PRIMARY KEY (tag, file_location, virtual_offset),
+ CONSTRAINT "alignments_extra_fkey_file_location_virtual_offset" FOREIGN KEY (file_location, virtual_offset) REFERENCES bam.alignments (file_location, virtual_offset)
+);
+
+
+
diff --git a/sql/backends/monet5/bam/bamloader.c b/sql/backends/monet5/bam/bamloader.c
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bamloader.c
@@ -0,0 +1,1013 @@
+#include <stdio.h>
+#include "monetdb_config.h"
+#include "bamloader.h"
+
+
+/*
+ * NOTE: Copied directly from miniseed/registrar.c
+ * TODO: Make both miniseed/registrar.c and this file include some generic library for these kind of structures
+ * keeps BAT and other properties of columns of a table.
+ */
+typedef struct {
+ bat *column_bats; /* keeps bats of the columns: lower array */
+ str *column_names; /* names of columns that are kept in the higher array */
+ str *column_types_strs; /* type strings of columns */
+} temp_subcontainer;
+
+/*
+ * NOTE: Copied directly from miniseed/registrar.c
+ * TODO: Make both miniseed/registrar.c and this file include some generic library for these kind of structures
+ * keeps (some) tables of a schema.
+ */
+typedef struct {
+ str schema_name; /* schema or vault name */
+ temp_subcontainer *tables_columns; /* keeps tables: higher array */
+ str *table_names; /* names of tables that are kept in the higher array */
+ int *num_columns; /* number of columns in each table in the higher array */
+ int num_tables;
+} temp_container;
+
+/*
+* File format specific structures
+*/
+typedef struct {
+ str tag;
+ str value;
+} bam_header_option;
+
+typedef struct {
+ str header_tag;
+ bam_header_option *options;
+ int num_options;
+} bam_header_line;
+
+
+/* Global vars */
+FILE *logfile = NULL; //keep logfile file opened in this global var while the bam code runs,
+ //since opening and closing every time something has to be written turned out to be very slow
+
+
+
+/* File format specific functions */
+str init_temp_container(temp_container *ret_tc);
+str loadfile(str filepath, temp_container *ret_tc); //load file and add contents to tc
+str process_bam_header(str filepath, str header, temp_container *ret_tc);
+int append_to_bat_cond(temp_container *ret_tc, bam_header_option *opt, str cmp, int table, int col, int *appendErr, int *flag);
+str read_bam_header_line(str *header, bam_header_line *ret_hl, int *eof);
+void free_bam_header_line(bam_header_line *hl);
+str process_bam_alignment(str filepath, int virtual_offset, bam_header_t *header, bam1_t *alignment, temp_container *ret_tc);
+int parse_alignment_str(str *sam_alig, str *dest);
+int parse_alignment_long(str *sam_alig, long int *dest);
+
+/* Generic functions */
+str init_temp_subcontainer(temp_subcontainer *ret_tsc,
+ str *col_names, str *col_types_strs, int *col_types, int num_cols);
+str append_to_bat(bat cb, ptr val);
+int concatenate_strs(str* words_to_concat, int num_words_to_concat, str* ret_concatenated);
+str prepare_insertion(Client cntxt, temp_container* tc);
+str insert_into_vault(Client cntxt, temp_container* tc);
+str register_table(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci);
+int read_string_until_delim(str *src, str *ret, char *delims, int num_delims);
+int parse_long(str *src, long int *i);
+int get_kth_bit(unsigned int i, int k);
+void append_to_log(str mssg);
+void free_temp_container(temp_container* tc);
+
+/* External functions */
+str SQLstatementIntern(Client c, str *expr, str nme, int execute, bit output);
+
+
+
+/* File format specific functions */


checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

--
| Stefan.Manegold@CWI.nl | Database Architectures (DA) |
| www.CWI.nl/~manegold | Science Park 123 (L321) |
| +31 (0)20 592-4212 | 1098 XG Amsterdam (NL) |