72 changes: 71 additions & 1 deletion parser.go
@@ -338,6 +338,51 @@ func (p *parsing) parseNumber() bool {
return true
}

// ignoreCaseEquals reports whether a and b are equal, ignoring ASCII letter case.
func ignoreCaseEquals(a byte, b byte) bool {
return upperCaseAlphabet(a) == upperCaseAlphabet(b)
}

// upperCaseAlphabet returns the ASCII uppercase form of c for lowercase letters
// and leaves every other byte unchanged.
func upperCaseAlphabet(c byte) byte {
if c >= 'a' && c <= 'z' {
c -= 'a' - 'A'
}
return c
}

// isAlphabet reports whether c is an ASCII letter.
func isAlphabet(c byte) bool {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}

// ignoreCaseMatch compares the next bytes from the data with `r`, ignoring ASCII letter case.
//
// It is a copy of the match method with case-insensitive comparison added.
func (p *parsing) ignoreCaseMatch(r []byte, seek bool) bool {
if ignoreCaseEquals(r[0], p.curr) {
if len(r) > 1 {
if p.ensureBytes(len(r) - 1) {
var i = 1
for ; i < len(r); i++ {
if !ignoreCaseEquals(r[i], p.str[p.pos+i]) {
return false
}
}
if seek {
p.pos += i - 1
p.next()
}
return true
}
return false
}
if seek {
p.next()
}
return true
}
return false
}
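
// Illustrative sketch of the difference (not part of the original diff): with the
// parser positioned at the start of "SeLeCt *", ignoreCaseMatch([]byte("SELECT"), true)
// succeeds and advances past the keyword, whereas the case-sensitive
// match([]byte("SELECT"), true) fails on the same input.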

// match compares next bytes from data with `r`
func (p *parsing) match(r []byte, seek bool) bool {
if r[0] == p.curr {
@@ -432,16 +477,41 @@ func (p *parsing) parseQuote() bool {
func (p *parsing) parseToken() bool {
if p.curr != 0 {
toks := p.t.index[p.curr]
if isAlphabet(p.curr) {
upCaseByte := upperCaseAlphabet(p.curr)
c := p.t.icIndex[upCaseByte]
toks = append(toks, c...)
}
if toks != nil {
start := p.pos
for _, t := range toks {
if p.match(t.Token, true) {

var matchFn func(r []byte, seek bool) bool
if t.IgnoreCase {
matchFn = p.ignoreCaseMatch
} else {
matchFn = p.match
}

if matchFn(t.Token, true) {
// Alone tokens must not be followed directly by an identifier character.
if t.Alone && len(p.str) > start+len(t.Token) {
nt := p.str[start+len(t.Token)]
if (nt >= '0' && nt <= '9') || (nt >= 'a' && nt <= 'z') || (nt >= 'A' && nt <= 'Z') || nt == '_' {
// roll back the position and try the next candidate token
p.pos = p.pos - len(t.Token)
p.curr = p.str[p.pos]
continue
}
}
p.token.key = t.Key
p.token.offset = p.offset + start
p.token.value = t.Token
p.emmitToken()
return true
}

}
}
}
56 changes: 45 additions & 11 deletions tokenizer.go
@@ -70,6 +70,10 @@ type tokenRef struct {
Key TokenKey
// Token value as is. Should be unique.
Token []byte
// Alone indicates the token matches only when it is not immediately followed
// by a letter, digit, or underscore.
Alone bool
// IgnoreCase indicates the token is matched case-insensitively (ASCII only).
IgnoreCase bool
}
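
// Illustrative behaviour of the two flags (an assumption drawn from the parser
// change and the test in this PR, not text from the original files): a token
// "role" defined with Alone does not match inside "roles" but does match in
// "role and"; a token "hello" defined with IgnoreCase also matches "HeLLo".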

// QuoteInjectSettings describes open injection token and close injection token.
@@ -129,8 +133,10 @@ type Tokenizer struct {
stopOnUnknown bool
allowNumberUnderscore bool
// all defined custom tokens {key: [token1, token2, ...], ...}
tokens map[TokenKey][]*tokenRef
index map[byte][]*tokenRef
tokens map[TokenKey][]*tokenRef
index map[byte][]*tokenRef
// index of case-insensitive tokens, keyed by the uppercased first byte of the token
icIndex map[byte][]*tokenRef
quotes []*StringSettings
wSpaces []byte
kwMajorSymbols []rune
@@ -144,6 +150,7 @@ func New() *Tokenizer {
// flags: 0,
tokens: map[TokenKey][]*tokenRef{},
index: map[byte][]*tokenRef{},
icIndex: map[byte][]*tokenRef{},
quotes: []*StringSettings{},
wSpaces: DefaultWhiteSpaces,
}
@@ -206,27 +213,54 @@ func (t *Tokenizer) AllowNumberUnderscore() *Tokenizer {
return t
}

// DefineTokenOption configures a token reference created by DefineTokens.
type DefineTokenOption func(*tokenRef)

// AloneTokenOption marks the token as Alone: it only matches when not followed
// by a letter, digit, or underscore.
func AloneTokenOption(ref *tokenRef) {
ref.Alone = true
}

// IgnoreCaseTokenOption makes the token match case-insensitively (ASCII only).
func IgnoreCaseTokenOption(ref *tokenRef) {
ref.IgnoreCase = true
}
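
// A minimal usage sketch of the new options (keys and token strings below are
// examples, not taken from this PR):
//
//	t := New()
//	t.DefineTokens(TokenKey(1), []string{"select"}, IgnoreCaseTokenOption)
//	t.DefineTokens(TokenKey(2), []string{"and"}, IgnoreCaseTokenOption, AloneTokenOption)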

// DefineTokens adds custom tokens.
// The `key` is the identifier of `tokens`; `tokens` is a slice of token strings.
// If the key already exists, its tokens are overwritten.
// Optional DefineTokenOption values (such as AloneTokenOption or IgnoreCaseTokenOption) configure each token.
func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string, options ...DefineTokenOption) *Tokenizer {
var tks []*tokenRef
if key < 1 {
return t
}
for _, token := range tokens {
ref := tokenRef{
Key: key,
Token: s2b(token),
Key: key,
Token: s2b(token),
Alone: false,
IgnoreCase: false,
}
for _, option := range options {
option(&ref)
}
head := ref.Token[0]

tks = append(tks, &ref)
if t.index[head] == nil {
t.index[head] = []*tokenRef{}
var index map[byte][]*tokenRef
var head byte

if ref.IgnoreCase {
index = t.icIndex
head = upperCaseAlphabet(ref.Token[0])
} else {
index = t.index
head = ref.Token[0]
}
if index[head] == nil {
index[head] = []*tokenRef{}
}
t.index[head] = append(t.index[head], &ref)
sort.Slice(t.index[head], func(i, j int) bool {
return len(t.index[head][i].Token) > len(t.index[head][j].Token)
index[head] = append(index[head], &ref)
sort.Slice(index[head], func(i, j int) bool {
return len(index[head][i].Token) > len(index[head][j].Token)
})
}
t.tokens[key] = tks
26 changes: 25 additions & 1 deletion tokenizer_test.go
@@ -2,11 +2,34 @@ package tokenizer

import (
"bytes"
"github.com/stretchr/testify/require"
"strings"
"testing"

"github.com/stretchr/testify/require"
)

func TestTokenizeParse(t *testing.T) {
THello := TokenKey(100)
TWorld := TokenKey(101)
TRoleKey := TokenKey(105)
TAndKey := TokenKey(106)

tokenizer := New()
// case-insensitive and alone tokens
tokenizer.DefineTokens(THello, []string{"hello"}, IgnoreCaseTokenOption)
tokenizer.DefineTokens(TRoleKey, []string{"Role"}, AloneTokenOption)
tokenizer.DefineTokens(TWorld, []string{"world"}, IgnoreCaseTokenOption, AloneTokenOption)
tokenizer.DefineTokens(TAndKey, []string{"and"})
input := "HeLLoWoRlD can match,prefixWorld role and roles both not match,but Role and WorLd is match will"
stream := tokenizer.ParseString(input)
for stream.IsValid() {
token := stream.CurrentToken()
t.Logf("[%d:%d] %s %v", token.Line(), token.Offset(), token.ValueString(), token.Key())
stream.GoNext()
}

}

func TestTokenize(t *testing.T) {
type item struct {
value interface{}
@@ -17,6 +40,7 @@ func TestTokenize(t *testing.T) {
wordTokenKey := TokenKey(11)
dquoteKey := TokenKey(14)
tokenizer.AllowNumberUnderscore()

tokenizer.DefineTokens(condTokenKey, []string{">=", "<=", "==", ">", "<"})
tokenizer.DefineTokens(wordTokenKey, []string{"or", "или"})
tokenizer.SetWhiteSpaces([]byte{' ', '\t', '\n'})