added generic tokenizer helper

2023-02-05 20:53:48 +02:00 · 2023-02-05 20:53:48 +02:00 · 23dfa9c634
parent 1b21e86be6
commit 23dfa9c634
2 changed files with 386 additions and 0 deletions
--- a/tools/tokenizer/tokenizer.go
+++ b/tools/tokenizer/tokenizer.go
@ -0,0 +1,201 @@
 // Package tokenizer implements a rudimentary token parser for a buffered
 // io.Reader that respects quote and parenthesis boundaries.
 //
 // Example
 //
 //	tk := tokenizer.NewFromString("a, b, (c, d)")
 //	result, _ := tk.ScanAll() // ["a", "b", "(c, d)"]
 package tokenizer
 import (
 	"bufio"
 	"bytes"
 	"fmt"
 	"io"
 	"strings"
 )
 // eof is the sentinel rune used to signal that the reader has nothing
 // more to return.
 const eof rune = 0
 // DefaultSeparators lists the separator runes assigned to a newly created
 // Tokenizer: comma, space, tab and newline.
 var DefaultSeparators = []rune{
 	',',
 	' ',
 	'\t',
 	'\n',
 }
 // NewFromString returns a Tokenizer that reads from str
 // (wrapped in a strings.Reader).
 func NewFromString(str string) *Tokenizer {
 	reader := strings.NewReader(str)
 	return New(reader)
 }
 // NewFromBytes returns a Tokenizer that reads from the b slice
 // (wrapped in a bytes.Reader).
 func NewFromBytes(b []byte) *Tokenizer {
 	reader := bytes.NewReader(b)
 	return New(reader)
 }
 // New returns a Tokenizer that reads from r through a bufio.Reader
 // and splits on DefaultSeparators.
 func New(r io.Reader) *Tokenizer {
 	tk := new(Tokenizer)
 	tk.r = bufio.NewReader(r)
 	tk.separators = DefaultSeparators
 	return tk
 }
 // Tokenizer splits the content of a reader into tokens, keeping
 // quoted and parenthesized expressions together as a single token.
 type Tokenizer struct {
 	// r is the buffered reader the runes are read from.
 	r *bufio.Reader
 	// separators holds the runes that terminate a token.
 	separators []rune
 }
 // SetSeparators replaces the separators of the current Tokenizer
 // with the provided ones.
 //
 // Calling it without arguments leaves the Tokenizer with no separators.
 func (s *Tokenizer) SetSeparators(separators ...rune) {
 	s.separators = separators
 }
 // Scan reads and returns the next token from the Tokenizer's buffer.
 //
 // If the first rune is a whitespace it is dropped together with any
 // separator runes directly after it. Once the token is read, the
 // separator runes that follow it are consumed too.
 //
 // Returns [io.EOF] error when the reader has nothing left to read.
 func (s *Tokenizer) Scan() (string, error) {
 	first := s.read()
 	switch {
 	case first == eof:
 		return "", io.EOF
 	case isWhitespaceRune(first):
 		s.readWhiteSpaces()
 	default:
 		// the rune belongs to the token, so put it back
 		s.unread()
 	}
 	token, err := s.readToken()
 	if err != nil {
 		return "", err
 	}
 	// consume the separators trailing the token
 	s.readWhiteSpaces()
 	return token, nil
 }
 // ScanAll calls Scan repeatedly until it reports [io.EOF] and returns
 // all tokens collected along the way.
 //
 // Any other Scan error stops the loop and is returned with nil tokens.
 func (s *Tokenizer) ScanAll() ([]string, error) {
 	result := []string{}
 	for {
 		tok, err := s.Scan()
 		switch {
 		case err == nil:
 			result = append(result, tok)
 		case err == io.EOF:
 			// nothing more to read
 			return result, nil
 		default:
 			return nil, err
 		}
 	}
 }
 // readToken reads runes from the buffer until an unenclosed separator or
 // the end of the reader is reached and returns them as a single token.
 //
 // Separators placed inside parenthesis or quotes (', " and `) are kept
 // as part of the token. A parenthesis or a quote preceded by an unescaped
 // backslash is treated as a plain character and doesn't open or close
 // a group. The terminating separator is consumed but it is not included
 // in the returned token.
 //
 // Returns an error if the reading stops with an unclosed parenthesis
 // or quote.
 func (s *Tokenizer) readToken() (string, error) {
 	var buf bytes.Buffer
 	var parenthesis int
 	var quoteCh rune
 	// escaped reports whether the current rune is preceded by an
 	// unescaped backslash. Tracking the state (instead of looking only
 	// at the previous rune) ensures that an escaped backslash (`\\`)
 	// doesn't escape the rune after it, e.g. the closing quote of "abc\\".
 	var escaped bool
 	for {
 		ch := s.read()
 		if ch == eof {
 			break
 		}
 		if !escaped {
 			switch {
 			case ch == '(' && quoteCh == eof:
 				parenthesis++ // opening parenthesis
 			case ch == ')' && parenthesis > 0 && quoteCh == eof:
 				parenthesis-- // closing parenthesis
 			case isQuoteRune(ch):
 				if quoteCh == ch {
 					quoteCh = eof // reached closing quote
 				} else if quoteCh == eof {
 					quoteCh = ch // opening quote
 				}
 			}
 		}
 		if s.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
 			break
 		}
 		// a backslash escapes the next rune only if it isn't escaped itself
 		escaped = !escaped && isEscapeRune(ch)
 		buf.WriteRune(ch)
 	}
 	if parenthesis > 0 || quoteCh != eof {
 		return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
 	}
 	return buf.String(), nil
 }
 // readWhiteSpaces consumes runes for as long as they are among the
 // Tokenizer's separators. The first non-separator rune is put back
 // on the reader.
 func (s *Tokenizer) readWhiteSpaces() {
 	for ch := s.read(); ch != eof; ch = s.read() {
 		if s.isSeperatorRune(ch) {
 			continue
 		}
 		s.unread()
 		return
 	}
 }
 // read returns the next rune from the buffered reader.
 //
 // Any read error, including `io.EOF`, is reported as eof (`rune(0)`).
 func (s *Tokenizer) read() rune {
 	if ch, _, err := s.r.ReadRune(); err == nil {
 		return ch
 	}
 	return eof
 }
 // unread steps the reader back by one rune using UnreadRune
 // and returns the error reported by it (if any).
 func (s *Tokenizer) unread() error {
 	err := s.r.UnreadRune()
 	return err
 }
 // isSeperatorRune reports whether ch is one of the Tokenizer's
 // configured separators.
 func (s *Tokenizer) isSeperatorRune(ch rune) bool {
 	for i := 0; i < len(s.separators); i++ {
 		if s.separators[i] == ch {
 			return true
 		}
 	}
 	return false
 }
 // isWhitespaceRune reports whether ch is a space, a tab or a newline.
 func isWhitespaceRune(ch rune) bool {
 	switch ch {
 	case ' ', '\t', '\n':
 		return true
 	default:
 		return false
 	}
 }
 // isQuoteRune reports whether ch is a single quote, a double quote
 // or a backtick.
 func isQuoteRune(ch rune) bool {
 	switch ch {
 	case '\'', '"', '`':
 		return true
 	default:
 		return false
 	}
 }
 // isEscapeRune reports whether ch is the backslash escape character.
 func isEscapeRune(ch rune) bool {
 	const backslash = '\\'
 	return ch == backslash
 }
--- a/tools/tokenizer/tokenizer_test.go
+++ b/tools/tokenizer/tokenizer_test.go
@ -0,0 +1,185 @@
 package tokenizer
 import (
 	"io"
 	"strings"
 	"testing"
 )
 // TestFactories checks that every constructor returns a Tokenizer that
 // reads the provided content and starts with the default separators.
 func TestFactories(t *testing.T) {
 	expectedContent := "test"
 	scenarios := []struct {
 		name string
 		tk   *Tokenizer
 	}{
 		{
 			"New()",
 			New(strings.NewReader(expectedContent)),
 		},
 		{
 			"NewFromString()",
 			NewFromString(expectedContent),
 		},
 		{
 			"NewFromBytes()",
 			NewFromBytes([]byte(expectedContent)),
 		},
 	}
 	for _, s := range scenarios {
 		// the content has no NUL delimiter, so ReadString returns everything
 		// up to the end of the reader together with an expected io.EOF error
 		content, _ := s.tk.r.ReadString(0)
 		if content != expectedContent {
 			t.Fatalf("[%s] Expected reader with content %q, got %q", s.name, expectedContent, content)
 		}
 		if len(s.tk.separators) != len(DefaultSeparators) {
 			t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, DefaultSeparators, s.tk.separators)
 		}
 		for _, r := range s.tk.separators {
 			exists := false
 			// compare against the defaults (not against the tokenizer's own
 			// separators, otherwise the check can never fail)
 			for _, def := range DefaultSeparators {
 				if r == def {
 					exists = true
 					break
 				}
 			}
 			if !exists {
 				t.Fatalf("[%s] Unexpected separator %q", s.name, r)
 			}
 		}
 	}
 }
 // TestScan checks that successive Scan calls return the tokens in order
 // and that an empty token with io.EOF is returned once they are exhausted.
 func TestScan(t *testing.T) {
 	tk := NewFromString("abc 123.456 (abc)")
 	expectedTokens := []string{"abc", "123.456", "(abc)"}
 	for _, token := range expectedTokens {
 		result, err := tk.Scan()
 		if err != nil {
 			t.Fatalf("Expected token %q, got error %v", token, err)
 		}
 		if result != token {
 			t.Fatalf("Expected token %q, got %q", token, result)
 		}
 	}
 	// all tokens are consumed, so the next scan must report EOF
 	token, err := tk.Scan()
 	if err != io.EOF {
 		t.Fatalf("Expected EOF error, got %v", err)
 	}
 	if token != "" {
 		t.Fatalf("Expected empty token, got %q", token)
 	}
 }
 // TestScanAllWithDefaultSeparators runs ScanAll against a table of inputs
 // (with default, custom and no separators) and checks both the error state
 // and the exact sequence of the returned tokens.
 func TestScanAllWithDefaultSeparators(t *testing.T) {
 	scenarios := []struct {
 		name         string
 		content      string
 		separators   []rune
 		expectError  bool
 		expectTokens []string
 	}{
 		{
 			"empty string",
 			"",
 			DefaultSeparators,
 			false,
 			nil,
 		},
 		{
 			"unbalanced parenthesis",
 			`(a,b() c`,
 			DefaultSeparators,
 			true,
 			[]string{},
 		},
 		{
 			"unmatching quotes",
 			`'asd"`,
 			DefaultSeparators,
 			true,
 			[]string{},
 		},
 		{
 			"no separators",
 			`a, b, c, d, e 123, "abc"`,
 			nil,
 			false,
 			[]string{
 				`a, b, c, d, e 123, "abc"`,
 			},
 		},
 		{
 			"default separators",
 			`   a   , 123.456, b, c d, (
 				test (a,b,c) " 123 "
 			),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
 			DefaultSeparators,
 			false,
 			[]string{
 				"a",
 				"123.456",
 				"b",
 				"c",
 				"d",
 				"(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t)",
 				`"(abc d"`,
 				`"abc) d"`,
 				`"(abc) d \" "`,
 				`'abc "'`,
 			},
 		},
 		{
 			"custom separators",
 			`a, b, c, d e, "a,b,  c  ", (123, 456)`,
 			[]rune{','},
 			false,
 			[]string{
 				"a",
 				"b",
 				"c",
 				"d e",
 				`"a,b,  c  "`,
 				`(123, 456)`,
 			},
 		},
 	}
 	for _, s := range scenarios {
 		tk := NewFromString(s.content)
 		tk.SetSeparators(s.separators...)
 		tokens, err := tk.ScanAll()
 		hasErr := err != nil
 		if hasErr != s.expectError {
 			t.Fatalf("[%s] Expected hasErr %v, got %v (%v)", s.name, s.expectError, hasErr, err)
 		}
 		if len(tokens) != len(s.expectTokens) {
 			t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, s.expectTokens, tokens)
 		}
 		// compare position by position so that misordered or duplicated
 		// tokens are detected too (the lengths are already known to match)
 		for i, tok := range tokens {
 			if tok != s.expectTokens[i] {
 				t.Fatalf("[%s] Expected token %q at index %d, got %q", s.name, s.expectTokens[i], i, tok)
 			}
 		}
 	}
 }