diff --git a/tools/tokenizer/tokenizer.go b/tools/tokenizer/tokenizer.go
index 54a67d86..573ed8b0 100644
--- a/tools/tokenizer/tokenizer.go
+++ b/tools/tokenizer/tokenizer.go
@@ -34,9 +34,10 @@ func NewFromBytes(b []byte) *Tokenizer {
 // New creates new Tokenizer from the provided reader with DefaultSeparators.
 func New(r io.Reader) *Tokenizer {
 	return &Tokenizer{
-		r:             bufio.NewReader(r),
-		separators:    DefaultSeparators,
-		keepSeparator: false,
+		r:                 bufio.NewReader(r),
+		separators:        DefaultSeparators,
+		keepSeparator:     false,
+		ignoreParenthesis: false,
 	}
 }
 
@@ -45,54 +46,61 @@ func New(r io.Reader) *Tokenizer {
 type Tokenizer struct {
 	r *bufio.Reader
 
-	separators    []rune
-	keepSeparator bool
+	separators        []rune
+	keepSeparator     bool
+	ignoreParenthesis bool
 }
 
 // Separators defines the provided separatos of the current Tokenizer.
-func (s *Tokenizer) Separators(separators ...rune) {
-	s.separators = separators
+func (t *Tokenizer) Separators(separators ...rune) {
+	t.separators = separators
 }
 
 // KeepSeparator defines whether to keep the separator rune as part
 // of the token (default to false).
-func (s *Tokenizer) KeepSeparator(state bool) {
-	s.keepSeparator = state
+func (t *Tokenizer) KeepSeparator(state bool) {
+	t.keepSeparator = state
+}
+
+// IgnoreParenthesis defines whether to ignore the parenthesis boundaries
+// and to treat the '(' and ')' as regular characters.
+func (t *Tokenizer) IgnoreParenthesis(state bool) {
+	t.ignoreParenthesis = state
 }
 
 // Scan reads and returns the next available token from the Tokenizer's buffer (trimmed).
 //
 // Returns [io.EOF] error when there are no more tokens to scan.
-func (s *Tokenizer) Scan() (string, error) {
-	ch := s.read()
+func (t *Tokenizer) Scan() (string, error) {
+	ch := t.read()
 
 	if ch == eof {
 		return "", io.EOF
 	}
 
 	if isWhitespaceRune(ch) {
-		s.readWhiteSpaces()
+		t.readWhiteSpaces()
 	} else {
-		s.unread()
+		t.unread()
 	}
 
-	token, err := s.readToken()
+	token, err := t.readToken()
 	if err != nil {
 		return "", err
 	}
 
 	// read all remaining whitespaces
-	s.readWhiteSpaces()
+	t.readWhiteSpaces()
 
 	return token, err
 }
 
 // ScanAll reads the entire Tokenizer's buffer and return all found tokens.
-func (s *Tokenizer) ScanAll() ([]string, error) {
+func (t *Tokenizer) ScanAll() ([]string, error) {
 	tokens := []string{}
 
 	for {
-		token, err := s.Scan()
+		token, err := t.Scan()
 		if err != nil {
 			if err == io.EOF {
 				break
@@ -108,35 +116,35 @@ func (s *Tokenizer) ScanAll() ([]string, error) {
 }
 
 // readToken reads a single token from the buffer and returns it.
-func (s *Tokenizer) readToken() (string, error) {
+func (t *Tokenizer) readToken() (string, error) {
 	var buf bytes.Buffer
 
 	var parenthesis int
 	var quoteCh rune
 	var prevCh rune
 
 	for {
-		ch := s.read()
+		ch := t.read()
 		if ch == eof {
 			break
 		}
 
 		if !isEscapeRune(prevCh) {
-			if ch == '(' && quoteCh == eof {
-				parenthesis++
-			} else if ch == ')' && parenthesis > 0 && quoteCh == eof {
-				parenthesis--
+			if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
+				parenthesis++ // opening parenthesis
+			} else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
+				parenthesis-- // closing parenthesis
 			} else if isQuoteRune(ch) {
 				if quoteCh == ch {
-					quoteCh = eof // reached closing quote
+					quoteCh = eof // closing quote
 				} else if quoteCh == eof {
 					quoteCh = ch // opening quote
 				}
 			}
 		}
 
-		if s.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
-			if s.keepSeparator {
+		if t.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
+			if t.keepSeparator {
 				buf.WriteRune(ch)
 			}
 			break
@@ -154,16 +162,16 @@ func (s *Tokenizer) readToken() (string, error) {
 }
 
 // readWhiteSpaces consumes all contiguous whitespace runes.
-func (s *Tokenizer) readWhiteSpaces() {
+func (t *Tokenizer) readWhiteSpaces() {
 	for {
-		ch := s.read()
+		ch := t.read()
 		if ch == eof {
 			break
 		}
 
-		if !s.isSeperatorRune(ch) {
-			s.unread()
+		if !t.isSeperatorRune(ch) {
+			t.unread()
 			break
 		}
 	}
@@ -171,8 +179,8 @@
 
 // read reads the next rune from the buffered reader.
 // Returns the `rune(0)` if an error or `io.EOF` occurs.
-func (s *Tokenizer) read() rune {
-	ch, _, err := s.r.ReadRune()
+func (t *Tokenizer) read() rune {
+	ch, _, err := t.r.ReadRune()
 	if err != nil {
 		return eof
 	}
@@ -181,13 +189,13 @@
 }
 
 // unread places the previously read rune back on the reader.
-func (s *Tokenizer) unread() error {
-	return s.r.UnreadRune()
+func (t *Tokenizer) unread() error {
+	return t.r.UnreadRune()
 }
 
 // isSeperatorRune checks if a rune is a token part separator.
-func (s *Tokenizer) isSeperatorRune(ch rune) bool {
-	for _, r := range s.separators {
+func (t *Tokenizer) isSeperatorRune(ch rune) bool {
+	for _, r := range t.separators {
 		if ch == r {
 			return true
 		}
diff --git a/tools/tokenizer/tokenizer_test.go b/tools/tokenizer/tokenizer_test.go
index d728ba89..3b2f634b 100644
--- a/tools/tokenizer/tokenizer_test.go
+++ b/tools/tokenizer/tokenizer_test.go
@@ -28,32 +28,38 @@ func TestFactories(t *testing.T) {
 	}
 
 	for _, s := range scenarios {
-		content, _ := s.tk.r.ReadString(0)
+		t.Run(s.name, func(t *testing.T) {
+			content, _ := s.tk.r.ReadString(0)
 
-		if content != expectedContent {
-			t.Fatalf("[%s] Expected reader with content %q, got %q", s.name, expectedContent, content)
-		}
+			if content != expectedContent {
+				t.Fatalf("Expected reader with content %q, got %q", expectedContent, content)
+			}
 
-		if s.tk.keepSeparator != false {
-			t.Fatalf("[%s] Expected false, got true", s.name)
-		}
+			if s.tk.keepSeparator != false {
+				t.Fatal("Expected keepSeparator false, got true")
+			}
 
-		if len(s.tk.separators) != len(DefaultSeparators) {
-			t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, DefaultSeparators, s.tk.separators)
-		}
+			if s.tk.ignoreParenthesis != false {
+				t.Fatal("Expected ignoreParenthesis false, got true")
+			}
 
-		for _, r := range s.tk.separators {
-			exists := false
-			for _, def := range s.tk.separators {
-				if r == def {
-					exists = true
-					break
+			if len(s.tk.separators) != len(DefaultSeparators) {
+				t.Fatalf("Expected \n%v, \ngot \n%v", DefaultSeparators, s.tk.separators)
+			}
+
+			for _, r := range s.tk.separators {
+				exists := false
+				for _, def := range s.tk.separators {
+					if r == def {
+						exists = true
+						break
+					}
+				}
+				if !exists {
+					t.Fatalf("Unexpected sepator %s", string(r))
 				}
 			}
-			if !exists {
-				t.Fatalf("[%s] Unexpected sepator %s", s.name, string(r))
-			}
-		}
+		})
 	}
 }
 
@@ -85,54 +91,58 @@ func TestScan(t *testing.T) {
 
 func TestScanAll(t *testing.T) {
 	scenarios := []struct {
-		name          string
-		content       string
-		separators    []rune
-		keepSeparator bool
-		expectError   bool
-		expectTokens  []string
+		name              string
+		content           string
+		separators        []rune
+		keepSeparator     bool
+		ignoreParenthesis bool
+		expectError       bool
+		expectTokens      []string
 	}{
 		{
-			"empty string",
-			"",
-			DefaultSeparators,
-			false,
-			false,
-			nil,
+			name:              "empty string",
+			content:           "",
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens:      nil,
 		},
 		{
-			"unbalanced parenthesis",
-			`(a,b() c`,
-			DefaultSeparators,
-			false,
-			true,
-			[]string{},
+			name:              "unbalanced parenthesis",
+			content:           `(a,b() c`,
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			ignoreParenthesis: false,
+			expectError:       true,
+			expectTokens:      []string{},
 		},
 		{
-			"unmatching quotes",
-			`'asd"`,
-			DefaultSeparators,
-			false,
-			true,
-			[]string{},
+			name:              "unmatching quotes",
+			content:           `'asd"`,
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			ignoreParenthesis: false,
+			expectError:       true,
+			expectTokens:      []string{},
 		},
 		{
-			"no separators",
-			`a, b, c, d, e 123, "abc"`,
-			nil,
-			false,
-			false,
-			[]string{
-				`a, b, c, d, e 123, "abc"`,
-			},
+			name:              "no separators",
+			content:           `a, b, c, d, e 123, "abc"`,
+			separators:        nil,
+			keepSeparator:     false,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens:      []string{`a, b, c, d, e 123, "abc"`},
 		},
 		{
-			"default separators",
-			`a, b, c, d e, "a,b, c ", (123, 456)`,
-			DefaultSeparators,
-			false,
-			false,
-			[]string{
+			name:              "default separators",
+			content:           `a, b, c, d e, "a,b, c ", (123, 456)`,
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens: []string{
 				"a",
 				"b",
 				"c",
@@ -142,12 +152,13 @@ func TestScanAll(t *testing.T) {
 			},
 		},
 		{
-			"default separators (with preserve)",
-			`a, b, c, d e, "a,b, c ", (123, 456)`,
-			DefaultSeparators,
-			true,
-			false,
-			[]string{
+			name:              "default separators (with preserve)",
+			content:           `a, b, c, d e, "a,b, c ", (123, 456)`,
+			separators:        DefaultSeparators,
+			keepSeparator:     true,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens: []string{
 				"a,",
 				"b,",
 				"c,",
@@ -157,14 +168,15 @@ func TestScanAll(t *testing.T) {
 			},
 		},
 		{
-			"custom separators",
-			` a , 123.456, b, c d, (
+			name: "custom separators",
+			content: ` a , 123.456, b, c d, (
 test (a,b,c) " 123 "
 ),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
-			[]rune{',', ' ', '\t', '\n'},
-			false,
-			false,
-			[]string{
+			separators:        []rune{',', ' ', '\t', '\n'},
+			keepSeparator:     false,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens: []string{
 				"a",
 				"123.456",
 				"b",
@@ -178,14 +190,15 @@ func TestScanAll(t *testing.T) {
 			},
 		},
 		{
-			"custom separators (with preserve)",
-			` a , 123.456, b, c d, (
+			name: "custom separators (with preserve)",
+			content: ` a , 123.456, b, c d, (
 test (a,b,c) " 123 "
 ),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
-			[]rune{',', ' ', '\t', '\n'},
-			true,
-			false,
-			[]string{
+			separators:        []rune{',', ' ', '\t', '\n'},
+			keepSeparator:     true,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens: []string{
 				"a ",
 				"123.456,",
 				"b,",
@@ -198,36 +211,53 @@ func TestScanAll(t *testing.T) {
 			},
 		},
+		{
+			name:              "ignoring parenthesis",
+			content:           `a, b, (c,d)`,
+			separators:        []rune{','},
+			keepSeparator:     false,
+			ignoreParenthesis: true,
+			expectError:       false,
+			expectTokens: []string{
+				"a",
+				"b",
+				"(c",
+				"d)",
+			},
+		},
 	}
 
 	for _, s := range scenarios {
-		tk := NewFromString(s.content)
+		t.Run(s.name, func(t *testing.T) {
+			tk := NewFromString(s.content)
 
-		tk.Separators(s.separators...)
-		tk.KeepSeparator(s.keepSeparator)
+			tk.Separators(s.separators...)
+			tk.KeepSeparator(s.keepSeparator)
+			tk.IgnoreParenthesis(s.ignoreParenthesis)
 
-		tokens, err := tk.ScanAll()
+			tokens, err := tk.ScanAll()
 
-		hasErr := err != nil
-		if hasErr != s.expectError {
-			t.Fatalf("[%s] Expected hasErr %v, got %v (%v)", s.name, s.expectError, hasErr, err)
-		}
+			hasErr := err != nil
+			if hasErr != s.expectError {
+				t.Fatalf("Expected hasErr %v, got %v (%v)", s.expectError, hasErr, err)
+			}
 
-		if len(tokens) != len(s.expectTokens) {
-			t.Fatalf("[%s] Expected \n%v (%d), \ngot \n%v (%d)", s.name, s.expectTokens, len(s.expectTokens), tokens, len(tokens))
-		}
+			if len(tokens) != len(s.expectTokens) {
+				t.Fatalf("Expected \n%v (%d), \ngot \n%v (%d)", s.expectTokens, len(s.expectTokens), tokens, len(tokens))
+			}
 
-		for _, tok := range tokens {
-			exists := false
-			for _, def := range s.expectTokens {
-				if tok == def {
-					exists = true
-					break
+			for _, tok := range tokens {
+				exists := false
+				for _, def := range s.expectTokens {
+					if tok == def {
+						exists = true
+						break
+					}
+				}
+				if !exists {
+					t.Fatalf("Unexpected token %s", tok)
 				}
 			}
-			if !exists {
-				t.Fatalf("[%s] Unexpected token %s", s.name, tok)
-			}
-		}
+		})
 	}
 }
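
Usage note (not part of the patch): a minimal sketch of the new IgnoreParenthesis option. It feeds the same input as the "ignoring parenthesis" test scenario above, so the expected tokens mirror that case. The import path is a placeholder; adjust it to wherever the tools/tokenizer package lives in your module.

package main

import (
	"fmt"

	"example.com/yourmodule/tools/tokenizer" // placeholder import path for the tools/tokenizer package
)

func main() {
	tk := tokenizer.NewFromString(`a, b, (c,d)`)

	tk.Separators(',')         // split only on commas
	tk.IgnoreParenthesis(true) // treat '(' and ')' as regular characters

	tokens, err := tk.ScanAll()
	if err != nil {
		panic(err)
	}

	fmt.Println(tokens) // [a b (c d)], matching the "ignoring parenthesis" test scenario
}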