72 changes: 71 additions & 1 deletion parser.go
@@ -338,6 +338,51 @@ func (p *parsing) parseNumber() bool {
return true
}

// ignoreCaseEquals reports whether a and b are equal, ignoring ASCII letter case.
func ignoreCaseEquals(a byte, b byte) bool {
return upperCaseAlphabet(a) == upperCaseAlphabet(b)
}

// upperCaseAlphabet returns the ASCII uppercase form of c for lowercase letters
// and leaves every other byte unchanged.
func upperCaseAlphabet(c byte) byte {
if c >= 'a' && c <= 'z' {
c -= 'a' - 'A'
}
return c
}

// isAlphabet reports whether c is an ASCII letter.
func isAlphabet(c byte) bool {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}

// ignoreCaseMatch compares the next bytes from the data with `r`, ignoring ASCII letter case.
//
// It is a copy of the match method with case-insensitive comparison added.
func (p *parsing) ignoreCaseMatch(r []byte, seek bool) bool {
if ignoreCaseEquals(r[0], p.curr) {
if len(r) > 1 {
if p.ensureBytes(len(r) - 1) {
var i = 1
for ; i < len(r); i++ {
if !ignoreCaseEquals(r[i], p.str[p.pos+i]) {
return false
}
}
if seek {
p.pos += i - 1
p.next()
}
return true
}
return false
}
if seek {
p.next()
}
return true
}
return false
}
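
// Illustrative sketch of the difference (not part of the original diff): with the
// parser positioned at the start of "SeLeCt *", ignoreCaseMatch([]byte("SELECT"), true)
// succeeds and advances past the keyword, whereas the case-sensitive
// match([]byte("SELECT"), true) fails on the same input.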

// match compares next bytes from data with `r`
func (p *parsing) match(r []byte, seek bool) bool {
if r[0] == p.curr {
@@ -432,16 +477,41 @@ func (p *parsing) parseQuote() bool {
func (p *parsing) parseToken() bool {
if p.curr != 0 {
toks := p.t.index[p.curr]
if isAlphabet(p.curr) {
upCaseByte := upperCaseAlphabet(p.curr)
c := p.t.icIndex[upCaseByte]
toks = append(toks, c...)
}
if toks != nil {
start := p.pos
for _, t := range toks {
if p.match(t.Token, true) {

var matchFn func(r []byte, seek bool) bool
if t.IgnoreCase {
matchFn = p.ignoreCaseMatch
} else {
matchFn = p.match
}

if matchFn(t.Token, true) {
// Alone tokens must not be followed directly by an identifier character.
if t.Alone && len(p.str) > start+len(t.Token) {
nt := p.str[start+len(t.Token)]
if (nt >= '0' && nt <= '9') || (nt >= 'a' && nt <= 'z') || (nt >= 'A' && nt <= 'Z') || nt == '_' {
// roll back the position and try the next candidate token
p.pos = p.pos - len(t.Token)
p.curr = p.str[p.pos]
continue
}
}
p.token.key = t.Key
p.token.offset = p.offset + start
p.token.value = t.Token
p.emmitToken()
return true
}

}
}
}
56 changes: 45 additions & 11 deletions tokenizer.go
@@ -70,6 +70,10 @@ type tokenRef struct {
Key TokenKey
// Token value as is. Should be unique.
Token []byte
// Alone indicates the token matches only when it is not immediately followed
// by a letter, digit, or underscore.
Alone bool
// IgnoreCase indicates the token is matched case-insensitively (ASCII only).
IgnoreCase bool
}
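
// Illustrative behaviour of the two flags (an assumption drawn from the parser
// change and the test in this PR, not text from the original files): a token
// "role" defined with Alone does not match inside "roles" but does match in
// "role and"; a token "hello" defined with IgnoreCase also matches "HeLLo".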

// QuoteInjectSettings describes open injection token and close injection token.
@@ -129,8 +133,10 @@ type Tokenizer struct {
stopOnUnknown bool
allowNumberUnderscore bool
// all defined custom tokens {key: [token1, token2, ...], ...}
tokens map[TokenKey][]*tokenRef
index map[byte][]*tokenRef
tokens map[TokenKey][]*tokenRef
index map[byte][]*tokenRef
// index of case-insensitive tokens, keyed by the uppercased first byte of the token
icIndex map[byte][]*tokenRef
quotes []*StringSettings
wSpaces []byte
kwMajorSymbols []rune
@@ -144,6 +150,7 @@ func New() *Tokenizer {
// flags: 0,
tokens: map[TokenKey][]*tokenRef{},
index: map[byte][]*tokenRef{},
icIndex: map[byte][]*tokenRef{},
quotes: []*StringSettings{},
wSpaces: DefaultWhiteSpaces,
}
@@ -206,27 +213,54 @@ func (t *Tokenizer) AllowNumberUnderscore() *Tokenizer {
return t
}

// DefineTokenOption configures a token reference created by DefineTokens.
type DefineTokenOption func(*tokenRef)

// AloneTokenOption marks the token as Alone: it only matches when not followed
// by a letter, digit, or underscore.
func AloneTokenOption(ref *tokenRef) {
ref.Alone = true
}

// IgnoreCaseTokenOption makes the token match case-insensitively (ASCII only).
func IgnoreCaseTokenOption(ref *tokenRef) {
ref.IgnoreCase = true
}
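
// A minimal usage sketch of the new options (keys and token strings below are
// examples, not taken from this PR):
//
//	t := New()
//	t.DefineTokens(TokenKey(1), []string{"select"}, IgnoreCaseTokenOption)
//	t.DefineTokens(TokenKey(2), []string{"and"}, IgnoreCaseTokenOption, AloneTokenOption)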

// DefineTokens adds custom tokens.
// The `key` is the identifier of `tokens`; `tokens` is a slice of token strings.
// If the key already exists, its tokens are overwritten.
// Optional DefineTokenOption values (such as AloneTokenOption or IgnoreCaseTokenOption) configure each token.
func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string, options ...DefineTokenOption) *Tokenizer {
var tks []*tokenRef
if key < 1 {
return t
}
for _, token := range tokens {
ref := tokenRef{
Key: key,
Token: s2b(token),
Key: key,
Token: s2b(token),
Alone: false,
IgnoreCase: false,
}
for _, option := range options {
option(&ref)
}
head := ref.Token[0]

tks = append(tks, &ref)
if t.index[head] == nil {
t.index[head] = []*tokenRef{}
var index map[byte][]*tokenRef
var head byte

if ref.IgnoreCase {
index = t.icIndex
head = upperCaseAlphabet(ref.Token[0])
} else {
index = t.index
head = ref.Token[0]
}
if index[head] == nil {
index[head] = []*tokenRef{}
}
t.index[head] = append(t.index[head], &ref)
sort.Slice(t.index[head], func(i, j int) bool {
return len(t.index[head][i].Token) > len(t.index[head][j].Token)
index[head] = append(index[head], &ref)
sort.Slice(index[head], func(i, j int) bool {
return len(index[head][i].Token) > len(index[head][j].Token)
})
}
t.tokens[key] = tks
26 changes: 25 additions & 1 deletion tokenizer_test.go
@@ -2,11 +2,34 @@ package tokenizer

import (
"bytes"
"github.com/stretchr/testify/require"
"strings"
"testing"

"github.com/stretchr/testify/require"
)

func TestTokenizeParse(t *testing.T) {
THello := TokenKey(100)
TWorld := TokenKey(101)
TRoleKey := TokenKey(105)
TAndKey := TokenKey(106)

tokenizer := New()
// case-insensitive and alone tokens
tokenizer.DefineTokens(THello, []string{"hello"}, IgnoreCaseTokenOption)
tokenizer.DefineTokens(TRoleKey, []string{"Role"}, AloneTokenOption)
tokenizer.DefineTokens(TWorld, []string{"world"}, IgnoreCaseTokenOption, AloneTokenOption)
tokenizer.DefineTokens(TAndKey, []string{"and"})
input := "HeLLoWoRlD can match,prefixWorld role and roles both not match,but Role and WorLd is match will"
stream := tokenizer.ParseString(input)
for stream.IsValid() {
token := stream.CurrentToken()
t.Logf("[%d:%d] %s %v", token.Line(), token.Offset(), token.ValueString(), token.Key())
stream.GoNext()
}

}

func TestTokenize(t *testing.T) {
type item struct {
value interface{}
@@ -17,6 +40,7 @@ func TestTokenize(t *testing.T) {
wordTokenKey := TokenKey(11)
dquoteKey := TokenKey(14)
tokenizer.AllowNumberUnderscore()

tokenizer.DefineTokens(condTokenKey, []string{">=", "<=", "==", ">", "<"})
tokenizer.DefineTokens(wordTokenKey, []string{"or", "или"})
tokenizer.SetWhiteSpaces([]byte{' ', '\t', '\n'})