From f28e4655ce64d2569141a73c0d5707b6d3670747 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=88=B1=E5=9B=A0=E6=96=AF=E5=94=90?= <1745525+einsitang@users.noreply.github.com>
Date: Mon, 30 Jun 2025 02:04:15 +0000
Subject: [PATCH 1/4] feat: token case insensitivity and support for
 standalone (alone) token matching

---
 parser.go    | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 tokenizer.go | 27 +++++++++++++++++++---
 2 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/parser.go b/parser.go
index 003608e..2de550d 100644
--- a/parser.go
+++ b/parser.go
@@ -338,6 +338,48 @@ func (p *parsing) parseNumber() bool {
 	return true
 }
 
+// ignoreCaseEquals compares two bytes ignoring ASCII case
+func ignoreCaseEquals(a byte, b byte) bool {
+	if a >= 'a' && a <= 'z' {
+		a -= 32
+	}
+
+	if b >= 'a' && b <= 'z' {
+		b -= 32
+	}
+
+	return a == b
+}
+
+// ignoreCaseMatch compares next bytes from data with `r`, ignoring ASCII case
+//
+// a copy of the match method with case-insensitive comparison
+func (p *parsing) ignoreCaseMatch(r []byte, seek bool) bool {
+	if ignoreCaseEquals(r[0], p.curr) {
+		if len(r) > 1 {
+			if p.ensureBytes(len(r) - 1) {
+				var i = 1
+				for ; i < len(r); i++ {
+					if !ignoreCaseEquals(r[i], p.str[p.pos+i]) {
+						return false
+					}
+				}
+				if seek {
+					p.pos += i - 1
+					p.next()
+				}
+				return true
+			}
+			return false
+		}
+		if seek {
+			p.next()
+		}
+		return true
+	}
+	return false
+}
+
 // match compares next bytes from data with `r`
 func (p *parsing) match(r []byte, seek bool) bool {
 	if r[0] == p.curr {
@@ -435,13 +477,32 @@ func (p *parsing) parseToken() bool {
 	if toks != nil {
 		start := p.pos
 		for _, t := range toks {
-			if p.match(t.Token, true) {
+
+				var matchFn func(r []byte, seek bool) bool
+				if t.IgnoreCase {
+					matchFn = p.ignoreCaseMatch
+				} else {
+					matchFn = p.match
+				}
+
+				if matchFn(t.Token, true) {
+					// alone token: require a non-identifier byte after the match
+					if t.Alone && len(p.str) > start+len(t.Token) {
+						nt := p.str[start+len(t.Token)]
+						if nt >= '0' && nt <= '9' || nt >= 'a' && nt <= 'z' || nt >= 'A' && nt <= 'Z' || nt == '_' {
+							// roll back the position and try the next token
+							p.pos = p.pos - (len(t.Token))
+							p.curr = p.str[p.pos]
+							continue
+						}
+					}
 				p.token.key = t.Key
 				p.token.offset = p.offset + start
 				p.token.value = t.Token
 				p.emmitToken()
 				return true
 			}
+
 			}
 		}
 	}
diff --git a/tokenizer.go b/tokenizer.go
index 22092c6..99d0e8b 100644
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -70,6 +70,10 @@ type tokenRef struct {
 	Key TokenKey
 	// Token value as is. Should be unique.
 	Token []byte
+	// Alone, when true, matches the token only when it is not followed by an identifier byte
+	Alone bool
+	// IgnoreCase, when true, matches the token case-insensitively
+	IgnoreCase bool
 }
 
 // QuoteInjectSettings describes open injection token and close injection token.
@@ -206,18 +210,35 @@ func (t *Tokenizer) AllowNumberUnderscore() *Tokenizer {
 	return t
 }
 
+type DefineTokenOption func(*tokenRef)
+
+func AloneTokenOption(ref *tokenRef) {
+	ref.Alone = true
+}
+
+func InsensitiveTokenOption(ref *tokenRef) {
+	ref.IgnoreCase = true
+}
+
 // DefineTokens add custom token.
 // The `key` is the identifier of `tokens`, `tokens` — slice of tokens as string.
 // If a key already exists, tokens will be rewritten.
-func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
+func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string, options ...DefineTokenOption) *Tokenizer {
 	var tks []*tokenRef
 	if key < 1 {
 		return t
 	}
 	for _, token := range tokens {
 		ref := tokenRef{
-			Key:   key,
-			Token: s2b(token),
+			Key:        key,
+			Token:      s2b(token),
+			Alone:      false,
+			IgnoreCase: false,
+		}
+		if len(options) > 0 {
+			for _, option := range options {
+				option(&ref)
+			}
 		}
 		head := ref.Token[0]
 		tks = append(tks, &ref)

From 296e054a608559967c4197568c686a5192797f76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=88=B1=E5=9B=A0=E6=96=AF=E5=94=90?= <1745525+einsitang@users.noreply.github.com>
Date: Mon, 30 Jun 2025 02:12:52 +0000
Subject: [PATCH 2/4] Initial commit

From 982a7a9de7189567bfcce8d22607850b9eb8e81e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=88=B1=E5=9B=A0=E6=96=AF=E5=94=90?= <1745525+einsitang@users.noreply.github.com>
Date: Mon, 30 Jun 2025 11:05:57 +0000
Subject: [PATCH 3/4] feat: add ignore-case support for token parsing and
 enhance tests

refactor: rename ignoreCaseAlphabet to upperCaseAlphabet for clarity

---
 parser.go         | 24 ++++++++++++++--------
 tokenizer.go      | 31 ++++++++++++++++++++---------
 tokenizer_test.go | 24 +++++++++++++++++++++++-
 3 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/parser.go b/parser.go
index 2de550d..002835e 100644
--- a/parser.go
+++ b/parser.go
@@ -338,17 +338,20 @@ func (p *parsing) parseNumber() bool {
 	return true
 }
 
-// ignoreCaseEquals compares two bytes ignoring ASCII case
+// ignoreCaseEquals reports whether a and b are equal ignoring ASCII case
 func ignoreCaseEquals(a byte, b byte) bool {
-	if a >= 'a' && a <= 'z' {
-		a -= 32
-	}
+	return upperCaseAlphabet(a) == upperCaseAlphabet(b)
+}
 
-	if b >= 'a' && b <= 'z' {
-		b -= 32
+func upperCaseAlphabet(c byte) byte {
+	if c >= 'a' && c <= 'z' {
+		c -= 32
 	}
+	return c
+}
 
-	return a == b
+func isAlphabet(c byte) bool {
+	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 }
 
 // ignoreCaseMatch compares next bytes from data with `r`, ignoring ASCII case
@@ -474,10 +477,15 @@ func (p *parsing) parseQuote() bool {
 func (p *parsing) parseToken() bool {
 	if p.curr != 0 {
 		toks := p.t.index[p.curr]
+		if isAlphabet(p.curr) {
+			upCaseByte := upperCaseAlphabet(p.curr)
+			c := p.t.icIndex[upCaseByte]
+			toks = append(toks, c...)
+		}
 		if toks != nil {
 			start := p.pos
 			for _, t := range toks {
-
+
 				var matchFn func(r []byte, seek bool) bool
 				if t.IgnoreCase {
 					matchFn = p.ignoreCaseMatch
diff --git a/tokenizer.go b/tokenizer.go
index 99d0e8b..db291ad 100644
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -133,8 +133,10 @@ type Tokenizer struct {
 	stopOnUnknown         bool
 	allowNumberUnderscore bool
 	// all defined custom tokens {key: [token1, token2, ...], ...}
-	tokens map[TokenKey][]*tokenRef
-	index  map[byte][]*tokenRef
+	tokens  map[TokenKey][]*tokenRef
+	index   map[byte][]*tokenRef
+	// index of case-insensitive tokens, keyed by upper-cased first byte
+	icIndex map[byte][]*tokenRef
 	quotes []*StringSettings
 	wSpaces []byte
 	kwMajorSymbols []rune
@@ -148,6 +150,7 @@ func New() *Tokenizer {
 		// flags: 0,
 		tokens: map[TokenKey][]*tokenRef{},
 		index:  map[byte][]*tokenRef{},
+		icIndex: map[byte][]*tokenRef{},
 		quotes: []*StringSettings{},
 		wSpaces: DefaultWhiteSpaces,
 	}
@@ -216,7 +219,7 @@ func AloneTokenOption(ref *tokenRef) {
 	ref.Alone = true
 }
 
-func InsensitiveTokenOption(ref *tokenRef) {
+func IgnoreCaseTokenOption(ref *tokenRef) {
 	ref.IgnoreCase = true
 }
 
@@ -240,14 +243,24 @@ func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string, options ...DefineTokenOption) *Tokenizer {
 				option(&ref)
 			}
 		}
-		head := ref.Token[0]
+
 		tks = append(tks, &ref)
-		if t.index[head] == nil {
-			t.index[head] = []*tokenRef{}
+		var index map[byte][]*tokenRef
+		var head byte
+
+		if ref.IgnoreCase {
+			index = t.icIndex
+			head = upperCaseAlphabet(ref.Token[0])
+		} else {
+			index = t.index
+			head = ref.Token[0]
+		}
+		if index[head] == nil {
+			index[head] = []*tokenRef{}
 		}
-		t.index[head] = append(t.index[head], &ref)
-		sort.Slice(t.index[head], func(i, j int) bool {
-			return len(t.index[head][i].Token) > len(t.index[head][j].Token)
+		index[head] = append(index[head], &ref)
+		sort.Slice(index[head], func(i, j int) bool {
+			return len(index[head][i].Token) > len(index[head][j].Token)
 		})
 	}
 	t.tokens[key] = tks
diff --git a/tokenizer_test.go b/tokenizer_test.go
index 55eeadf..a209573 100644
--- a/tokenizer_test.go
+++ b/tokenizer_test.go
@@ -2,11 +2,32 @@ package tokenizer
 
 import (
 	"bytes"
-	"github.com/stretchr/testify/require"
 	"strings"
 	"testing"
+
+	"github.com/stretchr/testify/require"
 )
 
+func TestTokenizeParse(t *testing.T) {
+	TXKey := TokenKey(100)
+	TRoleKey := TokenKey(101)
+	TAndKey := TokenKey(102)
+
+	tokenizer := New()
+	// ignore case
+	tokenizer.DefineTokens(TXKey, []string{"hello"}, IgnoreCaseTokenOption)
+	tokenizer.DefineTokens(TRoleKey, []string{"Role"}, AloneTokenOption)
+	tokenizer.DefineTokens(TAndKey, []string{"and"})
+	input := "heLlOHhellox and 1 == 0.5+0.5 Role xRolex xandx"
+	stream := tokenizer.ParseString(input)
+	for stream.IsValid() {
+		token := stream.CurrentToken()
+		t.Logf("[%d:%d] %s %v", token.Line(), token.Offset(), token.ValueString(), token.Key())
+		stream.GoNext()
+	}
+
+}
+
 func TestTokenize(t *testing.T) {
 	type item struct {
 		value interface{}
@@ -17,6 +38,7 @@ func TestTokenize(t *testing.T) {
 	wordTokenKey := TokenKey(11)
 	dquoteKey := TokenKey(14)
 	tokenizer.AllowNumberUnderscore()
+
 	tokenizer.DefineTokens(condTokenKey, []string{">=", "<=", "==", ">", "<"})
 	tokenizer.DefineTokens(wordTokenKey, []string{"or", "или"})
 	tokenizer.SetWhiteSpaces([]byte{' ', '\t', '\n'})

From 547b4ffb49cdf32ec48add746be6f2cf9b37a7c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=88=B1=E5=9B=A0=E6=96=AF=E5=94=90?= <1745525+einsitang@users.noreply.github.com>
Date: Mon, 30 Jun 2025 11:39:06 +0000
Subject: [PATCH 4/4] test: update TestTokenizeParse cases for ignore-case and
 alone tokens

---
 tokenizer_test.go | 12 +++++++-----
 1 file changed, 7
insertions(+), 5 deletions(-) diff --git a/tokenizer_test.go b/tokenizer_test.go index a209573..50444b1 100644 --- a/tokenizer_test.go +++ b/tokenizer_test.go @@ -9,16 +9,18 @@ import ( ) func TestTokenizeParse(t *testing.T) { - TXKey := TokenKey(100) - TRoleKey := TokenKey(101) - TAndKey := TokenKey(102) + THello := TokenKey(100) + TWorld := TokenKey(101) + TRoleKey := TokenKey(105) + TAndKey := TokenKey(106) tokenizer := New() // ignore case - tokenizer.DefineTokens(TXKey, []string{"hello"}, IgnoreCaseTokenOption) + tokenizer.DefineTokens(THello, []string{"hello"}, IgnoreCaseTokenOption) tokenizer.DefineTokens(TRoleKey, []string{"Role"}, AloneTokenOption) + tokenizer.DefineTokens(TWorld, []string{"world"}, IgnoreCaseTokenOption, AloneTokenOption) tokenizer.DefineTokens(TAndKey, []string{"and"}) - input := "heLlOHhellox and 1 == 0.5+0.5 Role xRolex xandx" + input := "HeLLoWoRlD can match,prefixWorld role and roles both not match,but Role and WorLd is match will" stream := tokenizer.ParseString(input) for stream.IsValid() { token := stream.CurrentToken()
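---

A minimal usage sketch of the two new DefineTokens options, mirroring the API
exercised by TestTokenizeParse above. The import path (github.com/bzick/tokenizer)
and the stream.Close() call are assumptions about the surrounding library rather
than anything this series adds; adjust the path to wherever the patched fork lives.

package main

import (
	"fmt"

	"github.com/bzick/tokenizer" // assumed import path for the patched library
)

const (
	THello tokenizer.TokenKey = iota + 1 // DefineTokens ignores keys < 1
	TRole
)

func main() {
	parser := tokenizer.New()

	// IgnoreCaseTokenOption: "hello" matches regardless of case (hello, HELLO, heLlO, ...).
	parser.DefineTokens(THello, []string{"hello"}, tokenizer.IgnoreCaseTokenOption)

	// AloneTokenOption: "Role" is emitted only when the byte that follows it is not
	// an identifier byte ([0-9a-zA-Z_]), so the "Role" inside "xRolex" is skipped.
	parser.DefineTokens(TRole, []string{"Role"}, tokenizer.AloneTokenOption)

	stream := parser.ParseString("heLlO Role xRolex")
	defer stream.Close()
	for stream.IsValid() {
		token := stream.CurrentToken()
		fmt.Printf("key=%d value=%q\n", token.Key(), token.ValueString())
		stream.GoNext()
	}
}

Both options compose: passing IgnoreCaseTokenOption and AloneTokenOption together,
as patch 4 does for "world", yields a case-insensitive token that still requires a
non-identifier byte after the match.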