diff --git a/parser.go b/parser.go
index 003608e..002835e 100644
--- a/parser.go
+++ b/parser.go
@@ -338,6 +338,51 @@ func (p *parsing) parseNumber() bool {
 	return true
 }
 
+// ignoreCaseEquals reports whether a and b are the same byte, ignoring ASCII letter case.
+func ignoreCaseEquals(a byte, b byte) bool {
+	return upperCaseAlphabet(a) == upperCaseAlphabet(b)
+}
+
+func upperCaseAlphabet(c byte) byte {
+	if c >= 'a' && c <= 'z' {
+		c -= 32
+	}
+	return c
+}
+
+func isAlphabet(c byte) bool {
+	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
+}
+
+// ignoreCaseMatch compares next bytes from data with `r`, ignoring ASCII letter case.
+//
+// It mirrors the match method, with case-insensitive comparison added.
+func (p *parsing) ignoreCaseMatch(r []byte, seek bool) bool {
+	if ignoreCaseEquals(r[0], p.curr) {
+		if len(r) > 1 {
+			if p.ensureBytes(len(r) - 1) {
+				var i = 1
+				for ; i < len(r); i++ {
+					if !ignoreCaseEquals(r[i], p.str[p.pos+i]) {
+						return false
+					}
+				}
+				if seek {
+					p.pos += i - 1
+					p.next()
+				}
+				return true
+			}
+			return false
+		}
+		if seek {
+			p.next()
+		}
+		return true
+	}
+	return false
+}
+
 // match compares next bytes from data with `r`
 func (p *parsing) match(r []byte, seek bool) bool {
 	if r[0] == p.curr {
@@ -432,16 +477,41 @@ func (p *parsing) parseQuote() bool {
 func (p *parsing) parseToken() bool {
 	if p.curr != 0 {
 		toks := p.t.index[p.curr]
+		// also collect case-insensitive tokens, indexed by the upper-cased head byte
+		if isAlphabet(p.curr) {
+			upCaseByte := upperCaseAlphabet(p.curr)
+			c := p.t.icIndex[upCaseByte]
+			toks = append(toks, c...)
+		}
 		if toks != nil {
 			start := p.pos
 			for _, t := range toks {
-				if p.match(t.Token, true) {
+
+				var matchFn func(r []byte, seek bool) bool
+				if t.IgnoreCase {
+					matchFn = p.ignoreCaseMatch
+				} else {
+					matchFn = p.match
+				}
+
+				if matchFn(t.Token, true) {
+					// an Alone token must not run into a following word character
+					if t.Alone && len(p.str) > start+len(t.Token) {
+						nt := p.str[start+len(t.Token)]
+						if nt >= '0' && nt <= '9' || nt >= 'a' && nt <= 'z' || nt >= 'A' && nt <= 'Z' || nt == '_' {
+							// roll back the position and try the next candidate token
+							p.pos -= len(t.Token)
+							p.curr = p.str[p.pos]
+							continue
+						}
+					}
 					p.token.key = t.Key
 					p.token.offset = p.offset + start
 					p.token.value = t.Token
 					p.emmitToken()
 					return true
 				}
+
 			}
 		}
 	}
diff --git a/tokenizer.go b/tokenizer.go
index 22092c6..db291ad 100644
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -70,6 +70,10 @@ type tokenRef struct {
 	Key TokenKey
 	// Token value as is. Should be unique.
 	Token []byte
+	// Alone, when true, means the token only matches as a standalone word: it is skipped when directly followed by a letter, digit or underscore.
+	Alone bool
+	// IgnoreCase makes the token match regardless of ASCII letter case.
+	IgnoreCase bool
 }
 
 // QuoteInjectSettings describes open injection token and close injection token.
@@ -129,8 +133,10 @@ type Tokenizer struct {
 	stopOnUnknown         bool
 	allowNumberUnderscore bool
 	// all defined custom tokens {key: [token1, token2, ...], ...}
-	tokens map[TokenKey][]*tokenRef
-	index  map[byte][]*tokenRef
+	tokens  map[TokenKey][]*tokenRef
+	index   map[byte][]*tokenRef
+	// index of case-insensitive tokens, keyed by the upper-cased head byte
+	icIndex map[byte][]*tokenRef
 	quotes  []*StringSettings
 	wSpaces []byte
 	kwMajorSymbols []rune
@@ -144,6 +150,7 @@ func New() *Tokenizer {
 		// flags:  0,
 		tokens:  map[TokenKey][]*tokenRef{},
 		index:   map[byte][]*tokenRef{},
+		icIndex: map[byte][]*tokenRef{},
 		quotes:  []*StringSettings{},
 		wSpaces: DefaultWhiteSpaces,
 	}
@@ -206,27 +213,57 @@ func (t *Tokenizer) AllowNumberUnderscore() *Tokenizer {
 	return t
 }
 
+// DefineTokenOption configures a tokenRef created by DefineTokens.
+type DefineTokenOption func(*tokenRef)
+
+// AloneTokenOption marks the token as standalone: it does not match inside a longer word.
+func AloneTokenOption(ref *tokenRef) {
+	ref.Alone = true
+}
+
+// IgnoreCaseTokenOption makes the token match case-insensitively.
+func IgnoreCaseTokenOption(ref *tokenRef) {
+	ref.IgnoreCase = true
+}
+
 // DefineTokens add custom token.
 // The `key` is the identifier of `tokens`, `tokens` — slice of tokens as string.
 // If a key already exists, tokens will be rewritten.
-func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
+func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string, options ...DefineTokenOption) *Tokenizer {
 	var tks []*tokenRef
 	if key < 1 {
 		return t
 	}
 	for _, token := range tokens {
 		ref := tokenRef{
-			Key:   key,
-			Token: s2b(token),
+			Key:        key,
+			Token:      s2b(token),
+			Alone:      false,
+			IgnoreCase: false,
+		}
+		if len(options) > 0 {
+			for _, option := range options {
+				option(&ref)
+			}
 		}
-		head := ref.Token[0]
+
 		tks = append(tks, &ref)
-		if t.index[head] == nil {
-			t.index[head] = []*tokenRef{}
+		var index map[byte][]*tokenRef
+		var head byte
+
+		if ref.IgnoreCase {
+			index = t.icIndex
+			head = upperCaseAlphabet(ref.Token[0])
+		} else {
+			index = t.index
+			head = ref.Token[0]
+		}
+		if index[head] == nil {
+			index[head] = []*tokenRef{}
 		}
-		t.index[head] = append(t.index[head], &ref)
-		sort.Slice(t.index[head], func(i, j int) bool {
-			return len(t.index[head][i].Token) > len(t.index[head][j].Token)
+		index[head] = append(index[head], &ref)
+		sort.Slice(index[head], func(i, j int) bool {
+			return len(index[head][i].Token) > len(index[head][j].Token)
 		})
 	}
 	t.tokens[key] = tks
diff --git a/tokenizer_test.go b/tokenizer_test.go
index 55eeadf..50444b1 100644
--- a/tokenizer_test.go
+++ b/tokenizer_test.go
@@ -2,11 +2,34 @@ package tokenizer
 
 import (
 	"bytes"
-	"github.com/stretchr/testify/require"
 	"strings"
 	"testing"
+
+	"github.com/stretchr/testify/require"
 )
 
+func TestTokenizeParse(t *testing.T) {
+	THello := TokenKey(100)
+	TWorld := TokenKey(101)
+	TRoleKey := TokenKey(105)
+	TAndKey := TokenKey(106)
+
+	tokenizer := New()
+	// "hello" and "world" match case-insensitively; "Role" and "world" must stand alone
+	tokenizer.DefineTokens(THello, []string{"hello"}, IgnoreCaseTokenOption)
+	tokenizer.DefineTokens(TRoleKey, []string{"Role"}, AloneTokenOption)
+	tokenizer.DefineTokens(TWorld, []string{"world"}, IgnoreCaseTokenOption, AloneTokenOption)
+	tokenizer.DefineTokens(TAndKey, []string{"and"})
+	input := "HeLLoWoRlD can match,prefixWorld role and roles both not match,but Role and WorLd is match will"
+	stream := tokenizer.ParseString(input)
+	for stream.IsValid() {
+		token := stream.CurrentToken()
+		t.Logf("[%d:%d] %s %v", token.Line(), token.Offset(), token.ValueString(), token.Key())
+		stream.GoNext()
+	}
+
+}
+
 func TestTokenize(t *testing.T) {
 	type item struct {
 		value interface{}
@@ -17,6 +40,7 @@ func TestTokenize(t *testing.T) {
 	wordTokenKey := TokenKey(11)
 	dquoteKey := TokenKey(14)
 	tokenizer.AllowNumberUnderscore()
+
 	tokenizer.DefineTokens(condTokenKey, []string{">=", "<=", "==", ">", "<"})
 	tokenizer.DefineTokens(wordTokenKey, []string{"or", "или"})
 	tokenizer.SetWhiteSpaces([]byte{' ', '\t', '\n'})
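
For reference, this is how the two new options are meant to be combined from calling code. It is a usage sketch, not part of the patch: it relies only on the API exercised in the test above (DefineTokens, IgnoreCaseTokenOption, AloneTokenOption, ParseString and the stream accessors); the import path and the expected token keys in the comments are assumptions, not asserted behaviour.

```go
package main

import (
	"fmt"

	"github.com/bzick/tokenizer" // assumed import path of this fork
)

const (
	THello tokenizer.TokenKey = 100
	TWorld tokenizer.TokenKey = 101
)

func main() {
	tok := tokenizer.New()
	// "hello" matches in any letter case; "world" additionally must not be
	// directly followed by a letter, digit or underscore (Alone).
	tok.DefineTokens(THello, []string{"hello"}, tokenizer.IgnoreCaseTokenOption)
	tok.DefineTokens(TWorld, []string{"world"}, tokenizer.IgnoreCaseTokenOption, tokenizer.AloneTokenOption)

	stream := tok.ParseString("HeLLo WoRLd worlds")
	for stream.IsValid() {
		t := stream.CurrentToken()
		// Expected keys: "HeLLo" -> THello, "WoRLd" -> TWorld; "worlds" should
		// fall through to an ordinary keyword token because of the Alone rule.
		fmt.Println(t.ValueString(), t.Key())
		stream.GoNext()
	}
}
```

Passing the options variadically keeps existing DefineTokens(key, tokens) call sites compiling unchanged, which is presumably why the functional-option shape was chosen over extra boolean parameters.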