
qlbridge's Introduction

QLBridge - Go SQL Runtime Engine

A SQL execution engine for embedded use as a library, providing SQL or SQL-like functionality. Hackable: add datasources ("storage" can be REST APIs, or anything) and add functions. See it in use in https://github.com/dataux/dataux, a federated, MySQL-compatible SQL engine with backends (Elasticsearch, Google Datastore, Mongo, Cassandra, Files).


QLBridge Features and Goals

  • expression engine for evaluation of single expressions
  • execution of SQL queries against your data; embeddable, not coupled to a storage layer
  • extend the VM with custom Go functions; a rich basic library of functions is provided
  • provide example backends (csv, elasticsearch, etc.)

Dialects

Example of Expression Evaluation Engine

These expressions can be used stand-alone, embedded in your app, and they are the same expressions that appear as column, where, or group-by clauses in SQL. See the example below.

package main

import (
	"fmt"
	"net/mail"
	"time"

	"github.com/araddon/dateparse"
	u "github.com/araddon/gou"

	"github.com/araddon/qlbridge/datasource"
	"github.com/araddon/qlbridge/expr"
	"github.com/araddon/qlbridge/value"
	"github.com/araddon/qlbridge/vm"
)

func main() {

	// Add a custom function to the VM to make available to expression language
	expr.FuncAdd("email_is_valid", &EmailIsValid{})

	// This is the evaluation context which will be the data-source
	// to be evaluated against the expressions.  There is a very simple
	// interface you can use to create your own.
	evalContext := datasource.NewContextSimpleNative(map[string]interface{}{
		"int5":     5,
		"str5":     "5",
		"created":  dateparse.MustParse("12/18/2015"),
		"bvalt":    true,
		"bvalf":    false,
		"user_id":  "abc",
		"urls":     []string{"http://google.com", "http://nytimes.com"},
		"hits":     map[string]int64{"google.com": 5, "bing.com": 1},
		"email":    "[email protected]",
		"emailbad": "bob",
		"mt": map[string]time.Time{
			"event0": dateparse.MustParse("12/18/2015"),
			"event1": dateparse.MustParse("12/22/2015"),
		},
	})

	// Example list of expressions
	exprs := []string{
		"int5 == 5",
		`6 > 5`,
		`6 > 5.5`,
		`(4 + 5) / 2`,
		`6 == (5 + 1)`,
		`2 * (3 + 5)`,
		`todate("12/12/2012")`,
		`created > "now-1M"`, // Date math
		`created > "now-10y"`,
		`user_id == "abc"`,
		`email_is_valid(email)`,
		`email_is_valid(emailbad)`,
		`email_is_valid("not_an_email")`,
		`EXISTS int5`,
		`!exists(user_id)`,
		`mt.event0 > now()`, // step into child of maps
		`["portland"] LIKE "*land"`,
		`email contains "bob"`,
		`email NOT contains "bob"`,
		`[1,2,3] contains int5`,
		`[1,2,3,5] NOT contains int5`,
		`urls contains "http://google.com"`,
		`split("chicago,portland",",") LIKE "*land"`,
		`10 BETWEEN 1 AND 50`,
		`15.5 BETWEEN 1 AND "55.5"`,
		`created BETWEEN "now-50w" AND "12/18/2020"`,
		`toint(not_a_field) NOT IN ("a","b", 4.5)`,
		`
		OR (
			email != "[email protected]"
			AND (
				NOT EXISTS not_a_field
				int5 == 5 
			)
		)`,
	}

	for _, expression := range exprs {
		// The same AST can be re-used safely and concurrently
		exprAst := expr.MustParse(expression)
		// Evaluate AST in the vm
		val, _ := vm.Eval(evalContext, exprAst)
		v := val.Value()
		u.Debugf("Output: %-35v T:%-15T expr:  %s", v, v, expression)
	}
}

// Example of a custom function that we are making available in the expression VM
type EmailIsValid struct{}

func (m *EmailIsValid) Validate(n *expr.FuncNode) (expr.EvaluatorFunc, error) {
	if len(n.Args) != 1 {
		return nil, fmt.Errorf("Expected 1 arg for EmailIsValid(arg) but got %s", n)
	}
	return func(ctx expr.EvalContext, args []value.Value) (value.Value, bool) {
		if args[0] == nil || args[0].Err() || args[0].Nil() {
			return value.BoolValueFalse, true
		}
		if _, err := mail.ParseAddress(args[0].ToString()); err == nil {
			return value.BoolValueTrue, true
		}

		return value.BoolValueFalse, true
	}, nil
}
func (m *EmailIsValid) Type() value.ValueType { return value.BoolType }

Example SQL Runtime for Reading a CSV via Stdin or File

See the example in the qlcsv folder for a CSV reader, parser, and evaluation engine.

./qlcsv -sql 'select 
		user_id, email, item_count * 2, yy(reg_date) > 10 
	FROM stdin where email_is_valid(email);' < users.csv
package main

import (
	"bytes"
	"database/sql"
	"flag"
	"fmt"
	"net/mail"
	"strings"

	u "github.com/araddon/gou"

	"github.com/araddon/qlbridge/datasource"
	"github.com/araddon/qlbridge/expr"
	"github.com/araddon/qlbridge/expr/builtins"
	_ "github.com/araddon/qlbridge/qlbdriver" // registers the "qlbridge" database/sql driver
	"github.com/araddon/qlbridge/schema"
	"github.com/araddon/qlbridge/value"
)

var sqlText string

func main() {

	flag.StringVar(&sqlText, "sql", "", "select statement to run against stdin csv")
	flag.Parse()

	if sqlText == "" {
		u.Errorf("You must provide a valid select query in argument:    --sql=\"select ...\"")
		return
	}

	// load all of our built-in functions
	builtins.LoadAllBuiltins()

	// Add a custom function to the VM to make available to SQL language
	expr.FuncAdd("email_is_valid", &EmailIsValid{})

	// We are registering the "csv" datasource, to show that
	// the backend/sources can be easily created/added.  This csv
	// reader is an example datasource that is very, very simple.
	exit := make(chan bool)
	src, _ := datasource.NewCsvSource("stdin", 0, bytes.NewReader([]byte("##")), exit)
	schema.RegisterSourceAsSchema("example_csv", src)

	db, err := sql.Open("qlbridge", "example_csv")
	if err != nil {
		panic(err.Error())
	}
	defer db.Close()

	rows, err := db.Query(sqlText)
	if err != nil {
		u.Errorf("could not execute query: %v", err)
		return
	}
	defer rows.Close()
	cols, _ := rows.Columns()

	// build a slice of pointers so rows.Scan can write into an unknown number of columns
	readCols := make([]interface{}, len(cols))
	writeCols := make([]string, len(cols))
	for i := range writeCols {
		readCols[i] = &writeCols[i]
	}
	fmt.Printf("\n\nScanning through CSV: (%v)\n\n", strings.Join(cols, ","))
	for rows.Next() {
		rows.Scan(readCols...)
		fmt.Println(strings.Join(writeCols, ", "))
	}
	fmt.Println("")
}

// Example of a custom function that we are adding into the expression VM
//
//         select
//              user_id AS theuserid, email, item_count * 2, reg_date
//         FROM stdin
//         WHERE email_is_valid(email)
type EmailIsValid struct{}

func (m *EmailIsValid) Validate(n *expr.FuncNode) (expr.EvaluatorFunc, error) {
	if len(n.Args) != 1 {
		return nil, fmt.Errorf("Expected 1 arg for EmailIsValid(arg) but got %s", n)
	}
	return func(ctx expr.EvalContext, args []value.Value) (value.Value, bool) {
		if args[0] == nil || args[0].Err() || args[0].Nil() {
			return value.BoolValueFalse, true
		}
		if _, err := mail.ParseAddress(args[0].ToString()); err == nil {
			return value.BoolValueTrue, true
		}

		return value.BoolValueFalse, true
	}, nil
}
func (m *EmailIsValid) Type() value.ValueType { return value.BoolType }

[x]QL languages are making a comeback: they remain an easy, approachable way of working with data. We also see more and more QLs that are xql-ish but unapologetically non-standard. This matches our observation that data is stored in ever more formats, in tools and services that aren't traditional DBs, yet querying that data should still be easy. Examples: Influx, GitQL, Presto, Hive, CQL, yql, ql.io, etc.

Projects that access non-sql data via [x]ql

Go Script/VM interpreters

qlbridge's People

Contributors

akeustis, allisonmorgan, araddon, ashyablok, bpopadiuk, epsniff, junichif, kyledj, martint, michaelbironneau, prashantv, schmichael, timkaye11


qlbridge's Issues

Aggregations and GroupBys

Implement group-by and aggregation tasks. Just simple built-in aggregation functions (sum, avg, max) to start.

  • GroupBy
    • aliasing on cols, e.g. "select user_id AS uid FROM users GROUP BY user_id"
    • deterministic/sorted ordering of group-by rows (currently non-deterministic because backed by a Go map)
  • Distinct
    • distinct inside of count: select count(distinct(field)) from users
  • recognize non-group by aggregates (count, sum, avg) when in select clause.
  • Aggregation Functions
    • sum
    • count
    • avg
  • other functions
    • max, min, avg, top, bottom

remove legacy namespacing support

Originally, I made a big mistake on left.right namespacing of identities and allowed a bad format. Need to find a way to back this out. A utility to find/remove? (See the sketch below.)

particularly in generators/elasticsearch

# legacy
`key_name.field value` -> "key_name", "field value" 
# correct future state
`key_name.field value` -> "", "key_name.field value" 
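
A minimal sketch of the distinction, using hypothetical helper names (this is not the library's API; requires "strings"):

// Hypothetical helpers contrasting the two behaviors above.
func splitIdentityLegacy(ident string) (left, right string) {
	// Legacy: split on the first "." even when the remainder contains spaces.
	if i := strings.Index(ident, "."); i > 0 {
		return ident[:i], ident[i+1:] // "key_name", "field value"
	}
	return "", ident
}

func splitIdentity(ident string) (left, right string) {
	// Correct future state: only split simple identifier.identifier pairs.
	if i := strings.Index(ident, "."); i > 0 && !strings.ContainsAny(ident, " \t") {
		return ident[:i], ident[i+1:]
	}
	return "", ident // "", "key_name.field value"
}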

Comparing ASTs whose args differ in length can panic

Panic on Equal()

panic(0x7ff8d34a8ba0, 0xc4200100c0) 
	/usr/local/go/src/runtime/panic.go:458 +0x243 
github.com/lytics/lio/vendor/github.com/araddon/qlbridge/expr.(*ArrayNode).Equal(0xc48bbfc390, 0x7ff8d454e780, 0xc48bbfc4b0, 0x1) 
	/home/go/src/github.com/lytics/lio/vendor/github.com/araddon/qlbridge/expr/node.go:1979 +0x11a 
github.com/lytics/lio/vendor/github.com/araddon/qlbridge/expr.(*BinaryNode).Equal(0xc4a2f15da0, 0x7ff8d454e800, 0xc4a2f15740, 0x1) 
	/home/go/src/github.com/lytics/lio/vendor/github.com/araddon/qlbridge/expr/node.go:1387 +0x193 
github.com/lytics/lio/vendor/github.com/araddon/qlbridge/expr.(*BooleanNode).Equal(0xc4a2f15620, 0x7ff8d454e880, 0xc4a2f15c80, 0x1c) 
	/home/go/src/github.com/lytics/lio/vendor/github.com/araddon/qlbridge/expr/node.go:1529 +0x190 
github.com/lytics/lio/vendor/github.com/araddon/qlbridge/rel.(*FilterStatement).Equal(0xc49e9eb290, 0xc49e9ebc30, 0x0) 
	/home/go/src/github.com/lytics/lio/vendor/github.com/araddon/qlbridge/rel/filter.go:147 +0x225 
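
The fix is presumably a length guard before walking children. A minimal sketch (the helper name is illustrative; expr.Node is qlbridge's node interface):

// Guard arg lengths before comparing children; indexing b[i] without
// this check is what panics when the arg slices differ in size.
func argsEqual(a, b []expr.Node) bool {
	if len(a) != len(b) {
		return false
	}
	for i, n := range a {
		if !n.Equal(b[i]) {
			return false
		}
	}
	return true
}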

Generator: Elasticsearch left-right identity expression support

Currently, in the Elasticsearch generator (#158), the right side of a left-right binary expression must be a value, not an identity. We can express right-side identities using ES5 scripting.

  • allow named functions to have their own converters
  • create testing dataset for es and tests for current generators
    • support es5 in put mappings
  • left-right generation
  • usage in dataux for FILTER's

Schema Store cleanup

The schema discovery/setup layer is horrible code; clean it up.

  • reduce complexity / establish a clear API boundary
  • I removed the lazy load, which allowed insert statements to insert into non-existent tables that would lazy-load the next time. Need to decide: do we allow lazy loading? Force CREATE TABLE DDL? Do an inferred create with the insert statement?
  • the schema/SHOW statements aren't working at the moment.

expressions need to pass through to DBs

As a proxy, expressions in SQL could potentially be evaluated at one of several points in the lifecycle:

  1. Initial proxy routing/evaluation context: i.e., the proxy receives SQL and evaluates it before passing through to the datastore.
  2. DataStore: pass expressions through to the DB, for DBs that support them natively, either as a function in the DB or as a translation, such as mutations (i.e. col + 1) or set-type operations in Cassandra, N1QL, Postgres, or Mongo.
  3. Return: after data comes back from the datastore, and before returning to the requester, perform manipulations on it.

How to express these 3 different contexts? Currently I believe only 1 is supported for update/insert/delete, 2 is not supported at all, and 3 is supported for select only.

UPDATE counterks.page_view_counts
  SET counter_value = counter_value + 2
  WHERE url_name='www.datastax.com' AND page_name='home';

CQL set operation

UPDATE users
  SET emails = emails + {'[email protected]'} WHERE user_id = 'frodo';

FilterQl enhancements

A few enhancements

  • support multi-statements in the same query in the parser
  • better FilterQL docs
  • better support for arbitrarily positioned comments in SELECT columns
  • easier debugging / better error messages via line/column positions of tokens

Node Round-Tripping JSON & Identity Cleanup

In implementing the new Node -> Expr -> JSON we should add round-tripping to our tests.

Additionally, we are having problems with left.right identities.

  • Ensure we have a large suite of tests that round-trip
  • fix left.right identities, including in expr
  • replace current Protobuf with expr.ToPb

Function Registries, and enforce function existence errors

Currently the function registry is assumed to be global in scope. This is flawed: each table in a schema should have its own function registry, so that an Elasticsearch-backed table would not have the same available functions as, say, a MySQL one (sketch below).
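
A sketch of the per-schema shape this implies; the FuncRegistry type is an assumption (expr.CustomFunc mirrors the interface implemented by EmailIsValid above):

// Hypothetical per-schema registry instead of the global expr.FuncAdd.
type FuncRegistry struct {
	funcs map[string]expr.CustomFunc
}

func NewFuncRegistry() *FuncRegistry {
	return &FuncRegistry{funcs: make(map[string]expr.CustomFunc)}
}

func (r *FuncRegistry) Add(name string, f expr.CustomFunc) {
	r.funcs[strings.ToLower(name)] = f
}

func (r *FuncRegistry) Get(name string) (expr.CustomFunc, bool) {
	f, ok := r.funcs[strings.ToLower(name)]
	return f, ok
}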

Source Backed Source

We need a way of having a source-backed source, so that a REST API or other high-latency source with a limited query/index feature set can be fronted by a richer query engine.

Support Expression JSON

Expressions need to support to/from JSON to allow JSON API usage.

  • re-implement the custom AST objects (FilterStatement, etc.) as Node interfaces
    • new Include
    • new BooleanNode
    • parser/AST uses the new objects
  • node logic: elide, simplify to easiest form, collapse single booleans/binaries
  • VM supports the new boolean, include
  • new, simpler anonymous embedded Expr for JSON
  • clean up old code in Node, parse tree, VM that is no longer used, no longer makes sense, or is not idiomatic
  • part of #99: allow custom validation for functions, allow an interface for func validation
  • allow different func registries per evaluation context (ES) instead of falling back to global
  • Value, NodeArray: ensure 100% tested and implemented in Protobuf, JSON conversions
  • downstream conversion 100% on queries, expressions
  • test the hell out of this
    • new tests for escaping table.field-name type identities
    • new round-trip tests with both Protobuf and JSON
  • new, simpler anonymous embedded Expr for Protobuf
  • implement ToEs()

Better Error Messaging on Parse

Error messages on parse are not very good.

  • currently, missing functions don't return an error as they should, because validate allows missing functions
  • better error messages
  • create a variety of failed queries, and watch for error messages, create tests for those cases.

Support funcs on context-wrapper/structs

Allow functions in XQL to pass through to structs, instead of only supporting built-in funcs.

For example, FirstName() below would pass through to the context-reader instead of builtins (see the sketch after the query):

SELECT  FirstName("--") FROM user
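
A hedged sketch of one way the pass-through could work: resolve the method by name via reflection on the wrapped struct. User, FirstName, and methodAsFunc are illustrative, not the library's API (requires "reflect"):

type User struct {
	First string
}

func (u *User) FirstName(sep string) string { return u.First + sep }

// methodAsFunc looks up a string-returning method by name, so the VM
// could fall back to it when no builtin function matches.
func methodAsFunc(obj interface{}, name string) (func(args ...string) string, bool) {
	m := reflect.ValueOf(obj).MethodByName(name)
	if !m.IsValid() {
		return nil, false
	}
	return func(args ...string) string {
		in := make([]reflect.Value, len(args))
		for i, a := range args {
			in[i] = reflect.ValueOf(a)
		}
		return m.Call(in)[0].String()
	}, true
}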

Functions should support per-function shared state

Many functions can have per-function shared state, since many arguments are static for that parsed expression.

Maybe Validate can return the func? (See the sketch below.)

// The last arg "," is static, and the function should be able to pre-calculate that value
join(arg1,arg2, ",")  
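
Since Validate already returns a closure (see EmailIsValid above), it can capture static args once at parse time. A sketch, assuming expr.StringNode exposes its literal via a Text field (the Join type is illustrative):

type Join struct{}

func (m *Join) Type() value.ValueType { return value.StringType }

func (m *Join) Validate(n *expr.FuncNode) (expr.EvaluatorFunc, error) {
	if len(n.Args) < 2 {
		return nil, fmt.Errorf("expected 2 or more args for join() but got %s", n)
	}
	// Pre-compute the separator at parse time when the last arg is a
	// static string literal, instead of re-evaluating it per row.
	sep := ","
	if sn, ok := n.Args[len(n.Args)-1].(*expr.StringNode); ok {
		sep = sn.Text
	}
	return func(ctx expr.EvalContext, args []value.Value) (value.Value, bool) {
		vals := make([]string, 0, len(args)-1)
		for _, a := range args[:len(args)-1] {
			vals = append(vals, a.ToString())
		}
		return value.NewStringValue(strings.Join(vals, sep)), true
	}, nil
}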

Collapse logically un-used operations AST

Ensure logically equivalent statements compare Equal() in code
by collapsing unused logical wrappers (sketch below):

FILTER OR (x = "y")    =   FILTER x = "y"
FILTER NOT (x = "y")   =   FILTER x != "y"
  • Equal() for FilterQL
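
A sketch of the collapse pass, assuming BooleanNode holds its children in an Args slice as the other n-ary nodes do:

// collapse rewrites a boolean node wrapping a single child into the
// child itself, so OR(x = "y") compares Equal() to x = "y".
func collapse(n expr.Node) expr.Node {
	if bn, ok := n.(*expr.BooleanNode); ok && len(bn.Args) == 1 {
		return collapse(bn.Args[0])
	}
	return n
}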

Improve Source & Execution Engine

The job/exec engine pipeline/tasks need a complete refactor.

Revamp the Plan/Exec with the following Goals

  • align source and planning between dataux and qlbridge
    • get rid of the SourceTask interface in dataux; move to the Accept visitor pattern w/ poly-fill
  • refactor sources, exec engine to be much cleaner/better/modular code

Tasks

  • Refactor the SubQuery/From to allow folding; use just sources as input to Join: each FROM in a query is its own source & discrete task
    • The TaskSequential/Parallel don't have channel routing (in, out, setup) right
    • rewrite where, source, etc. to support folding
      • Where implemented in each SubSelect
      • bug: sourceproject/sqldrivermessage doesn't match the columns expected in the query
      • extract the join key from Merge into a JoinKey function/Task
        • implement JoinExpr to create the key used in the join expression (i.e., for multi-condition joins, create a composite key similar to CQL)
        • create joinNodes during SQL rewrite
        • are we going to use int? string? for the "key" or "id"? add an update to use a new Key()/Id() or equivalent
      • projection w/ unit test
    • implement sub-query tests; depends on #35
      • fix the lexer, which can't descend into sub-clauses
      • unit test for 3 sources
      • column list for merged sources (two sources feeding into a 3rd)
  • TestSqlDbConnFailure: lock when running two queries
  • Naive parallel planner (see drawing below), single node

(drawing: img_20150919_144144, naive parallel planner sketch)

FilterQL Improvements

Variety of improvements for FilterQL:

  • alias/with were reversed in the parser
  • the WITH clause was not available in String() functions and couldn't round-trip
  • code cleanup
  • doc improvements

expr should support returns of (Value, evaluated, error)

Currently the expression Walk logic and context.Get() are not consistent with respect to return values:

  • some return (nil, true) for "could not be evaluated", some (ErrValue, false), some (NilValue, false); a sketch of a consistent contract follows the TODO below

TODO:

  • Find a test suite from another logic/script language
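
A sketch of the proposed contract (an assumption, not the current API): every lookup returns the same triple so "missing" and "error" stop being conflated:

//   (value, true,  nil) -> evaluated successfully
//   (nil,   false, nil) -> could not be evaluated (e.g. missing field)
//   (nil,   false, err) -> hard error
func getField(ctx expr.EvalContext, name string) (value.Value, bool, error) {
	v, ok := ctx.Get(name)
	if !ok {
		return nil, false, nil // missing is not an error
	}
	if v.Err() {
		return nil, false, fmt.Errorf("error value for %q", name)
	}
	return v, true, nil
}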

Replace SHOW/Describe and internal schema with rewrite, and database

The SHOW/DESCRIBE code is not great. Replace it with query-rewrite, so that SHOW statements become normal SELECT statements that operate against normal data sources.

  • frontends (such as mysql) need to be able to override the default behavior of "Writers" so that dialect-specific output is built in

TODO

  • implement more of the schema in normal datastore-type backends
  • the plan.Context will need to use a different schema from the connection; ensure plan.Context.Schema is used throughout, not any connection-specific one
  • rewrite SHOW statements to SELECT statements to utilize build-select, which has better where/filter support (sketch below)
  • WHERE/LIKE filters: variables LIKE '%xyz'; add where-clause support for show/describe
  • variables (implement session)
  • ensure all downstream SHOW and DESCRIBE statements work in dataux
    • fix the horrible datasource/registry
  • resolve the new required-schema dependency issue that breaks literal queries and a unit test in plan
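
A hedged sketch of the rewrite idea; the internal table names ("tables", "databases") are assumptions:

// rewriteShow turns a dialect-specific SHOW into a plain SELECT so the
// normal select path (with its where/like support) does the work.
func rewriteShow(stmt string) string {
	up := strings.ToUpper(stmt)
	switch {
	case strings.HasPrefix(up, "SHOW TABLES"):
		return "SELECT Table FROM tables"
	case strings.HasPrefix(up, "SHOW DATABASES"):
		return "SELECT Database FROM databases"
	}
	return stmt
}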

Allow distributed execution by allowing planner, executor to be swapped out

Allow a distributed planner (dataux) to replace the built-in qlbridge execution planner (in exec). Currently there is no separation of planning and execution, and no strong interface to allow a distributed executor to replace the native in-process qlbridge executor. There are also no serialization interfaces/capabilities to allow tasks to be deterministically distributed, and no primitives (partitions, tablets) to allow work to be broken down.

Phase 1 use case: query Mongo with 3 nodes participating in a group-by aggregate query: select AVG(CHAR_LENGTH(CAST(title AS CHAR))) as title_avg from article GROUP BY category

  • 1 refactor the exec job builders to support overriding core behavior
    • plan.Visitor with a full suite of plan structs instead of using rel sql structs
    • revamp the task assembly (list of tasks, taskparent.Add(task)) to be interfaces, not implementations
    • allow the job builder to have a TaskMaker factory for overriding built-in task implementations
    • split the core visitor interfaces in exec into plan.Planner and exec.Walk, where the output is a dag of plan.Task
      • plan.PlannerDefault and all new plan types
      • implement the new Walk executor in exec that translates plans -> tasks
    • move all of the plan and exec Task creation into NewTaskName methods
  • 4 Partitionable sources: underlying sources/interfaces need to support some type of partitioning so a source query can be split across nodes
    • hack in quick partition storage (Custom)
    • fix the distributed group-by to pass underlying values (tuple?) allowing finalization to be calculated
  • 5 network/serialization to enable distribution
    • protobuf plan.Tasks
    • protobuf implementation for node
    • protobuf for sql (only select ops) structs

depended on by dataux/dataux#9

moving until later:

  • 6 allow visualization of dag of tasks by supporting MarshalJson() or Explain()?

support CREATE statement in sql

Support the standard CREATE statements:

  • lex CREATE statements, including SOURCE, TABLE, and create cols
  • parse source and table statements into the appropriate AST
  • support in the exec query engine

Performance Regression

Some sources (think JSON files) have sparse columns within a row: compared to the entire set of possible keys within a keyspace, any one row may contain only a small subset. The messages that now require schema force creation of non-sparse rows to represent these sparse rows as they pass through SQL exec tasks. This is really slow for very, very sparse data sets.

  • create a benchmark test for the current exec engine with tests of these methods (sketch below):
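
A sketch of such a benchmark; the dense-materialization loop stands in for the exec-task behavior being measured (requires "testing" and "fmt"):

func BenchmarkSparseRowMaterialize(b *testing.B) {
	// 5 populated fields against a 10,000-key keyspace.
	sparse := map[string]interface{}{"k1": 1, "k42": 2, "k999": 3, "k5000": 4, "k9999": 5}
	keyspace := make([]string, 10000)
	for i := range keyspace {
		keyspace[i] = fmt.Sprintf("k%d", i)
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Dense materialization: one slot per possible column, mostly
		// nil for sparse rows; this is the suspected cost.
		row := make([]interface{}, len(keyspace))
		for j, k := range keyspace {
			row[j] = sparse[k]
		}
		_ = row
	}
}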

Schema Information Layer, @@vars, Create, like

To be useful, qlbridge must expose schema info to tools such as Metabase, Looker, Tableau, etc. Make the currently in-memory structs available as schema, like a traditional DB, i.e. for SHOW, CREATE, and LIKE statements.

  • improve the lexer to support db.table syntax
  • SHOW FULL TABLES FROM temp LIKE '%'
  • variables: @@info
  • SHOW CREATE TABLE dataux.jobstate;
  • SHOW FULL COLUMNS FROM table FROM db;

misc expr node improvements

Misc usability improvements on the underlying expression Nodes:

  • convert the IN statement from MultiArg -> BinaryNode w/ an ArrayNode type
  • cleanup: remove unused interface methods (NodeType())
  • fingerprint() for FilterQL
  • fix the negateable string output for BinaryNode (LIKE)

Dialect Writers

Some dialects escape literals with ', others with "; identities similarly differ. Pass dialect writers through to the Node and Statement types to allow dialect-specific output. This implies that qlbridge is starting to implement sql-builder functionality, for the purpose of being able to take a query and transform it into a different query (most builders instead aid construction of SQL from non-SQL native Go functions). See https://github.com/umisama/go-sqlbuilder and https://github.com/elgris/golang-sql-builder-benchmark. A possible interface shape is sketched after the list below.

  • expr pkg
  • rel sql, filterql lib
  • possibly help support rewrite? knowledge of which functions can be passed through?
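
A sketch of the interface shape this suggests (names are illustrative; the eventual qlbridge API may differ):

// Each dialect supplies its quoting rules; Nodes and Statements write
// themselves through the writer instead of hard-coding quote runes.
type DialectWriter interface {
	LiteralQuote() rune  // e.g. ' for mysql
	IdentityQuote() rune // e.g. ` for mysql, " for postgres
	WriteLiteral(w io.Writer, val string)
	WriteIdentity(w io.Writer, ident string)
}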

memdb source

A datasource that will be good for the files API:

  • memdb source

Simplify Node Interface for non Lexed ast usage

Constructing Nodes directly, not through lexing, should be easier:

  • Get rid of Pos
  • consolidate String and StringAST into just String
  • create helper functions for creating Nodes via New functions (hypothetical usage sketched below)
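
Hypothetical target usage for those helpers (constructor names are assumptions, not the current API): build the AST for user_id == "abc" directly, with no lexer and no Pos bookkeeping:

id := expr.NewIdentity("user_id") // hypothetical: no lex.Token, no Pos
lit := expr.NewString("abc")      // hypothetical
eq := expr.NewBinary("==", id, lit)
fmt.Println(eq.String()) // user_id == "abc"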

Lexer/Parser core language improvements list

  • Allow comment blocks at any location in SQL
  • WHERE clause improvements (see here)
  • CAS Statements from CQL
    • INSERT INTO cas_table (title, revid, last_modified) VALUES (?, ?, ?) IF NOT EXISTS
    • DELETE FROM cas_table WHERE title = ? and revid = ? IF last_modified = ?
  • CQL
  • postgresql
