sqllexer

package module
v0.0.2
Published: Sep 11, 2023 License: MIT Imports: 3 Imported by: 4

README

go-sqllexer

This repository contains a hand-written SQL lexer that tokenizes SQL queries, with a focus on obfuscation and normalization. The lexer is written in Go with no external dependencies. Note: this is NOT a SQL parser; it only tokenizes SQL queries.

Features

  • 🚀 Fast and lightweight tokenization (not regex based)
  • 🔒 Obfuscates sensitive data (e.g. numbers, strings, specific literals like dollar quoted strings in Postgres, etc.)
  • 📖 Even works with truncated queries
  • 🌐 UTF-8 support
  • 🔧 Normalizes obfuscated queries

Installation

go get github.com/DataDog/go-sqllexer

Usage

Tokenize
import "github.com/DataDog/go-sqllexer"

func main() {
    query := "SELECT * FROM users WHERE id = 1"
    lexer := sqllexer.New(query)
    tokens := lexer.ScanAll()
    for _, token := range tokens {
        fmt.Println(token)
    }
}
Obfuscate
import (
    "fmt"
    "github.com/DataDog/go-sqllexer"
)

func main() {
    query := "SELECT * FROM users WHERE id = 1"
    obfuscator := sqllexer.NewObfuscator()
    obfuscated := obfuscator.Obfuscate(query)
    // "SELECT * FROM users WHERE id = ?"
    fmt.Println(obfuscated)
}
Normalize
import (
    "fmt"
    "github.com/DataDog/go-sqllexer"
)

func main() {
    query := "SELECT * FROM users WHERE id in (?, ?)"
    normalizer := sqllexer.NewNormalizer(
        sqllexer.WithCollectComments(true),
        sqllexer.WithCollectCommands(true),
        sqllexer.WithCollectTables(true),
        sqllexer.WithKeepSQLAlias(false),
    )
    normalized, statementMetadata, err := normalizer.Normalize(query)
    if err != nil {
        panic(err)
    }
    // "SELECT * FROM users WHERE id in ( ? )"
    fmt.Println(normalized)
    fmt.Println(statementMetadata)
}

Testing

go test -v ./...

Benchmarks

go test -bench=. -benchmem ./...

License

MIT License

Documentation

Constants

const (
	ArrayPlaceholder   = "( ? )"
	BracketPlaceholder = "[ ? ]"
)

const (
	StringPlaceholder = "?"
	NumberPlaceholder = "?"
)

Variables

var Commands = map[string]bool{
	"SELECT":   true,
	"INSERT":   true,
	"UPDATE":   true,
	"DELETE":   true,
	"CREATE":   true,
	"ALTER":    true,
	"DROP":     true,
	"JOIN":     true,
	"GRANT":    true,
	"REVOKE":   true,
	"COMMIT":   true,
	"BEGIN":    true,
	"TRUNCATE": true,
	"MERGE":    true,
}

Functions

func DiscardSQLAlias

func DiscardSQLAlias(input string) string

DiscardSQLAlias removes any SQL alias from the input string and returns the modified string. It uses a regular expression to match the alias pattern and replace it with an empty string. The function is case-insensitive and matches the pattern "AS <alias_name>". The input string is not modified in place.
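
A hedged usage sketch (the query is illustrative; the exact output depends on the alias pattern the function matches):

input := "SELECT u.id AS user_id FROM users AS u"
fmt.Println(DiscardSQLAlias(input))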

func WithCollectCommands

func WithCollectCommands(collectCommands bool) normalizerOption

func WithCollectComments

func WithCollectComments(collectComments bool) normalizerOption

func WithCollectTables

func WithCollectTables(collectTables bool) normalizerOption

func WithDBMS

func WithDBMS(dbms DBMSType) lexerOption
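
A sketch of passing a DBMS hint when constructing a Lexer; DBMSSQLServer is just an illustrative choice here, and the option presumably tunes dialect-specific tokenization:

lexer := New("SELECT TOP 10 * FROM users", WithDBMS(DBMSSQLServer))
fmt.Println(lexer.ScanAll())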

func WithDollarQuotedFunc

func WithDollarQuotedFunc(dollarQuotedFunc bool) obfuscatorOption

func WithKeepSQLAlias

func WithKeepSQLAlias(keepSQLAlias bool) normalizerOption

func WithReplaceDigits

func WithReplaceDigits(replaceDigits bool) obfuscatorOption

func WithUppercaseKeywords added in v0.0.2

func WithUppercaseKeywords(uppercaseKeywords bool) normalizerOption
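
A sketch of constructing a Normalizer with this option; the exact output is not shown in the docs, so the comment below is an assumption:

normalizer := NewNormalizer(WithUppercaseKeywords(true))
normalized, _, _ := normalizer.Normalize("select * from users where id in (?, ?)")
// keywords such as select/from/where are expected to come back uppercased
fmt.Println(normalized)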

Types

type DBMSType added in v0.0.2

type DBMSType string
const (
	// DBMSSQLServer is a MS SQL Server
	DBMSSQLServer DBMSType = "mssql"
	// DBMSPostgres is a PostgreSQL Server
	DBMSPostgres DBMSType = "postgresql"
	// DBMSMySQL is a MySQL Server
	DBMSMySQL DBMSType = "mysql"
	// DBMSOracle is an Oracle Server
	DBMSOracle DBMSType = "oracle"
)

type Lexer

type Lexer struct {
	// contains filtered or unexported fields
}

SQL Lexer inspired by Rob Pike's talk on Lexical Scanning in Go.

Example
query := "SELECT * FROM users WHERE id = 1"
lexer := New(query)
tokens := lexer.ScanAll()
fmt.Println(tokens)
Output:

[{6 SELECT} {2  } {8 *} {2  } {6 FROM} {2  } {6 users} {2  } {6 WHERE} {2  } {6 id} {2  } {7 =} {2  } {5 1}]

func New

func New(input string, opts ...lexerOption) *Lexer

func (*Lexer) Scan

func (s *Lexer) Scan() Token

Scan scans the next token and returns it.
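
A minimal sketch of scanning token by token, assuming Scan reports an EOF token once the input is exhausted:

lexer := New("SELECT id FROM users")
for {
	token := lexer.Scan()
	if token.Type == EOF {
		break
	}
	fmt.Println(token)
}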

func (*Lexer) ScanAll

func (s *Lexer) ScanAll() []Token

ScanAll scans the entire input string and returns a slice of tokens.

func (*Lexer) ScanAllTokens

func (s *Lexer) ScanAllTokens() <-chan Token

ScanAllTokens scans the entire input string and returns a channel of tokens. Use this if you want to process the tokens as they are scanned.
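
A sketch of draining the channel as tokens arrive, assuming the channel is closed when the input is exhausted:

lexer := New("SELECT id FROM users")
for token := range lexer.ScanAllTokens() {
	fmt.Println(token.Type, token.Value)
}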

type LexerConfig added in v0.0.2

type LexerConfig struct {
	DBMS DBMSType
}

type Normalizer

type Normalizer struct {
	// contains filtered or unexported fields
}
Example
normalizer := NewNormalizer(
	WithCollectComments(true),
	WithCollectCommands(true),
	WithCollectTables(true),
	WithKeepSQLAlias(false),
)

normalizedSQL, statementMetadata, _ := normalizer.Normalize(
	`
		/* this is a comment */
		SELECT * FROM users WHERE id in (?, ?)
		`,
)

fmt.Println(normalizedSQL)
fmt.Println(statementMetadata)
Output:

SELECT * FROM users WHERE id in ( ? )
&{[users] [/* this is a comment */] [SELECT]}

func NewNormalizer

func NewNormalizer(opts ...normalizerOption) *Normalizer

func (*Normalizer) Normalize

func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normalized string, info *StatementMetadata, err error)

Normalize takes an input SQL string and returns a normalized SQL string, a StatementMetadata struct, and an error. The normalizer collapses the input SQL into a compact format, groups obfuscated values into a single placeholder, and collects metadata such as table names, comments, and commands.
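
Because Normalize also accepts lexer options, a DBMS hint can be passed through per call; a hedged sketch:

normalizer := NewNormalizer(WithCollectTables(true))
normalized, meta, err := normalizer.Normalize("SELECT * FROM users WHERE id = 1", WithDBMS(DBMSPostgres))
if err == nil {
	fmt.Println(normalized, meta.Tables)
}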

type Obfuscator

type Obfuscator struct {
	// contains filtered or unexported fields
}
Example
obfuscator := NewObfuscator()
obfuscated := obfuscator.Obfuscate("SELECT * FROM users WHERE id = 1")
fmt.Println(obfuscated)
Output:

SELECT * FROM users WHERE id = ?

func NewObfuscator

func NewObfuscator(opts ...obfuscatorOption) *Obfuscator

func (*Obfuscator) Obfuscate

func (o *Obfuscator) Obfuscate(input string, lexerOpts ...lexerOption) string

Obfuscate takes an input SQL string and returns an obfuscated SQL string. The obfuscator replaces all literal values with a single placeholder.
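
A sketch of passing obfuscator options, assuming WithReplaceDigits also masks digits that appear inside identifiers:

obfuscator := NewObfuscator(WithReplaceDigits(true))
obfuscated := obfuscator.Obfuscate("SELECT * FROM orders_2023 WHERE id = 42")
// with digit replacement enabled, the digits in orders_2023 are expected to be
// masked as well, in addition to the literal 42
fmt.Println(obfuscated)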

type StatementMetadata

type StatementMetadata struct {
	Tables   []string
	Comments []string
	Commands []string
}
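
The fields are populated by Normalize, presumably only when the corresponding WithCollect* options are enabled; a short access sketch:

_, meta, err := NewNormalizer(
	WithCollectTables(true),
	WithCollectCommands(true),
).Normalize("SELECT * FROM users")
if err == nil {
	fmt.Println(meta.Tables, meta.Commands)
}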

type Token

type Token struct {
	Type  TokenType
	Value string
}

Token represents a SQL token with its type and value.

type TokenType

type TokenType int
const (
	ERROR TokenType = iota
	EOF
	WS                     // whitespace
	STRING                 // string literal
	INCOMPLETE_STRING      // illegal string literal so that we can obfuscate it, e.g. 'abc
	NUMBER                 // number literal
	IDENT                  // identifier
	OPERATOR               // operator
	WILDCARD               // wildcard *
	COMMENT                // comment
	MULTILINE_COMMENT      // multiline comment
	PUNCTUATION            // punctuation
	DOLLAR_QUOTED_FUNCTION // dollar quoted function
	DOLLAR_QUOTED_STRING   // dollar quoted string
	NUMBERED_PARAMETER     // numbered parameter
	UNKNOWN                // unknown token
)
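
Token types can be used to filter the lexer output; a sketch that collects IDENT tokens (note from the ScanAll example above that keywords are also reported as IDENT):

var idents []string
for _, token := range New("SELECT id, name FROM users").ScanAll() {
	if token.Type == IDENT {
		idents = append(idents, token.Value)
	}
}
fmt.Println(idents)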
