synced with master

2023-10-27 22:28:15 +03:00 · 2023-10-27 22:28:15 +03:00 · f889a3fcb3
parent 1d67a35acf 34fed679fd
commit f889a3fcb3
5 changed files with 173 additions and 89 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -28,6 +28,15 @@
    ```


+## v0.19.1
+
+- Fixed `tokenizer.Scan()/ScanAll()` to ignore the separators from the default trim cutset.
+  An option to also return the empty tokens was added via `Tokenizer.KeepEmptyTokens(true)`.
+  _This should fix the parsing of whitespace characters around view query column names when no quotes are used ([#3616](https://github.com/pocketbase/pocketbase/discussions/3616#discussioncomment-7398564))._
+
+- Fixed the `:excerpt(max, withEllipsis?)` `field` query param modifier to properly add space to the generated text fragment after block tags.
+
+
 ## v0.19.0

 - Added Patreon OAuth2 provider ([#3323](https://github.com/pocketbase/pocketbase/pull/3323); thanks @ghostdevv).
--- a/tools/rest/excerpt_modifier.go
+++ b/tools/rest/excerpt_modifier.go
@ -78,8 +78,7 @@ func (m *excerptModifier) Modify(value any) (any, error) {
 		return "", err
 	}

-	var isNotEmpty bool
-	var needSpace bool
+	var hasPrevSpace bool

 	// for all node types and more details check
 	// https://pkg.go.dev/golang.org/x/net/html#Parse
@ -87,37 +86,47 @@ func (m *excerptModifier) Modify(value any) (any, error) {
 	stripTags = func(n *html.Node) {
 		switch n.Type {
 		case html.TextNode:
-			if txt := strings.TrimSpace(whitespaceRegex.ReplaceAllString(n.Data, " ")); txt != "" {
-				if isNotEmpty && needSpace {
-					needSpace = false
-					builder.WriteString(" ")
+			// collapse multiple spaces into one
+			txt := whitespaceRegex.ReplaceAllString(n.Data, " ")
+
+			if hasPrevSpace {
+				txt = strings.TrimLeft(txt, " ")
 			}

+			if txt != "" {
+				hasPrevSpace = strings.HasSuffix(txt, " ")
+
 				builder.WriteString(txt)
-
-				if !isNotEmpty {
-					isNotEmpty = true
-				}
-			}
-		case html.ElementNode:
-			if !needSpace && !list.ExistInSlice(n.Data, inlineTags) {
-				needSpace = true
 			}
 		}

-		if builder.Len() > m.max {
+		// excerpt max has been reached => no need to further iterate
+		// (+2 for the extra whitespace suffix/prefix that will be trimmed later)
+		if builder.Len() > m.max+2 {
 			return
 		}

 		for c := n.FirstChild; c != nil; c = c.NextSibling {
 			if c.Type != html.ElementNode || !list.ExistInSlice(c.Data, excludeTags) {
+				isBlock := c.Type == html.ElementNode && !list.ExistInSlice(c.Data, inlineTags)
+
+				if isBlock && !hasPrevSpace {
+					builder.WriteString(" ")
+					hasPrevSpace = true
+				}
+
 				stripTags(c)
+
+				if isBlock && !hasPrevSpace {
+					builder.WriteString(" ")
+					hasPrevSpace = true
+				}
 			}
 		}
 	}
 	stripTags(doc)

-	result := builder.String()
+	result := strings.TrimSpace(builder.String())

 	if len(result) > m.max {
 		result = strings.TrimSpace(result[:m.max])
--- a/tools/rest/excerpt_modifier_test.go
+++ b/tools/rest/excerpt_modifier_test.go
@ -84,11 +84,10 @@ func TestNewExcerptModifier(t *testing.T) {
 }

 func TestExcerptModifierModify(t *testing.T) {
-	// plain text value: "Hello t est12 3 word"
 	html := ` <script>var a = 123;</script>   <p>Hello</p><div id="test_id">t   est<b>12
-	3</b></div> <h1>word  </h1> `
+	3</b><span>456</span></div><span>word <b>7</b> 89<span>!<b>?</b><b> a </b><b>b </b>c</span>#<h1>title</h1>`

-	plainText := "Hello t est12 3 word"
+	plainText := "Hello t est12 3456 word 7 89!? a b c# title"

 	scenarios := []struct {
 		name     string
--- a/tools/tokenizer/tokenizer.go
+++ b/tools/tokenizer/tokenizer.go
@ -21,6 +21,8 @@ const eof = rune(0)
 // DefaultSeparators is a list with the default token separator characters.
 var DefaultSeparators = []rune{','}

+var whitespaceChars = []rune{'\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0}
+
 // NewFromString creates new Tokenizer from the provided string.
 func NewFromString(str string) *Tokenizer {
 	return New(strings.NewReader(str))
@ -33,12 +35,11 @@ func NewFromBytes(b []byte) *Tokenizer {

 // New creates new Tokenizer from the provided reader with DefaultSeparators.
 func New(r io.Reader) *Tokenizer {
-	return &Tokenizer{
-		r:                 bufio.NewReader(r),
-		separators:        DefaultSeparators,
-		keepSeparator:     false,
-		ignoreParenthesis: false,
-	}
+	t := &Tokenizer{r: bufio.NewReader(r)}
+
+	t.Separators(DefaultSeparators...)
+
+	return t
 }

 // Tokenizer defines a struct that parses a reader into tokens while
@ -46,14 +47,18 @@ func New(r io.Reader) *Tokenizer {
 type Tokenizer struct {
 	r *bufio.Reader

+	trimCutset        string
 	separators        []rune
 	keepSeparator     bool
+	keepEmptyTokens   bool
 	ignoreParenthesis bool
 }

 // Separators defines the provided separators of the current Tokenizer.
 func (t *Tokenizer) Separators(separators ...rune) {
 	t.separators = separators
+
+	t.rebuildTrimCutset()
 }

 // KeepSeparator defines whether to keep the separator rune as part
@ -62,35 +67,37 @@ func (t *Tokenizer) KeepSeparator(state bool) {
 	t.keepSeparator = state
 }

+// KeepEmptyTokens defines whether to keep empty tokens on Scan() (defaults to false).
+func (t *Tokenizer) KeepEmptyTokens(state bool) {
+	t.keepEmptyTokens = state
+}
+
 // IgnoreParenthesis defines whether to ignore the parenthesis boundaries
 // and to treat the '(' and ')' as regular characters.
 func (t *Tokenizer) IgnoreParenthesis(state bool) {
 	t.ignoreParenthesis = state
 }

-// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed).
+// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed!).
+//
+// Empty tokens are skipped if t.keepEmptyTokens is not set (which is the default).
 //
 // Returns [io.EOF] error when there are no more tokens to scan.
 func (t *Tokenizer) Scan() (string, error) {
 	ch := t.read()
-
 	if ch == eof {
 		return "", io.EOF
 	}
-
-	if isWhitespaceRune(ch) {
-		t.readWhiteSpaces()
-	} else {
 	t.unread()
-	}

 	token, err := t.readToken()
 	if err != nil {
 		return "", err
 	}

-	// read all remaining whitespaces
-	t.readWhiteSpaces()
+	if !t.keepEmptyTokens && token == "" {
+		return t.Scan()
+	}

 	return token, err
 }
@ -129,12 +136,12 @@ func (t *Tokenizer) readToken() (string, error) {
 			break
 		}

-		if !isEscapeRune(prevCh) {
+		if !t.isEscapeRune(prevCh) {
 			if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
 				parenthesis++ // opening parenthesis
 			} else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
 				parenthesis-- // closing parenthesis
-			} else if isQuoteRune(ch) {
+			} else if t.isQuoteRune(ch) {
 				if quoteCh == ch {
 					quoteCh = eof // closing quote
 				} else if quoteCh == eof {
@ -158,7 +165,7 @@ func (t *Tokenizer) readToken() (string, error) {
 		return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
 	}

-	return buf.String(), nil
+	return strings.Trim(buf.String(), t.trimCutset), nil
 }

 // readWhiteSpaces consumes all contiguous whitespace runes.
@ -170,7 +177,7 @@ func (t *Tokenizer) readWhiteSpaces() {
 			break
 		}

-		if !t.isSeperatorRune(ch) {
+		if !t.isWhitespaceRune(ch) {
 			t.unread()
 			break
 		}
@ -193,6 +200,20 @@ func (t *Tokenizer) unread() error {
 	return t.r.UnreadRune()
 }

+// rebuildTrimCutset rebuilds the tokenizer trimCutset based on its separator runes.
+func (t *Tokenizer) rebuildTrimCutset() {
+	var cutset strings.Builder
+
+	for _, w := range whitespaceChars {
+		if t.isSeperatorRune(w) {
+			continue
+		}
+		cutset.WriteRune(w)
+	}
+
+	t.trimCutset = cutset.String()
+}
+
 // isSeperatorRune checks if a rune is a token part separator.
 func (t *Tokenizer) isSeperatorRune(ch rune) bool {
 	for _, r := range t.separators {
@ -204,17 +225,23 @@ func (t *Tokenizer) isSeperatorRune(ch rune) bool {
 	return false
 }

-// isWhitespaceRune checks if a rune is a space, tab, or newline.
-func isWhitespaceRune(ch rune) bool {
-	return ch == ' ' || ch == '\t' || ch == '\n'
+// isWhitespaceRune checks if a rune is a whitespace character (e.g. space, tab, new line).
+func (t *Tokenizer) isWhitespaceRune(ch rune) bool {
+	for _, c := range whitespaceChars {
+		if c == ch {
+			return true
+		}
+	}
+
+	return false
 }

 // isQuoteRune checks if a rune is a quote.
-func isQuoteRune(ch rune) bool {
+func (t *Tokenizer) isQuoteRune(ch rune) bool {
 	return ch == '\'' || ch == '"' || ch == '`'
 }

 // isEscapeRune checks if a rune is an escape character.
-func isEscapeRune(ch rune) bool {
+func (t *Tokenizer) isEscapeRune(ch rune) bool {
 	return ch == '\\'
 }
--- a/tools/tokenizer/tokenizer_test.go
+++ b/tools/tokenizer/tokenizer_test.go
@ -95,6 +95,7 @@ func TestScanAll(t *testing.T) {
 		content           string
 		separators        []rune
 		keepSeparator     bool
+		keepEmptyTokens   bool
 		ignoreParenthesis bool
 		expectError       bool
 		expectTokens      []string
@ -104,6 +105,7 @@ func TestScanAll(t *testing.T) {
 			content:           "",
 			separators:        DefaultSeparators,
 			keepSeparator:     false,
+			keepEmptyTokens:   false,
 			ignoreParenthesis: false,
 			expectError:       false,
 			expectTokens:      nil,
@ -113,6 +115,7 @@ func TestScanAll(t *testing.T) {
 			content:           `(a,b() c`,
 			separators:        DefaultSeparators,
 			keepSeparator:     false,
+			keepEmptyTokens:   false,
 			ignoreParenthesis: false,
 			expectError:       true,
 			expectTokens:      []string{},
@ -122,6 +125,7 @@ func TestScanAll(t *testing.T) {
 			content:           `'asd"`,
 			separators:        DefaultSeparators,
 			keepSeparator:     false,
+			keepEmptyTokens:   false,
 			ignoreParenthesis: false,
 			expectError:       true,
 			expectTokens:      []string{},
@ -131,15 +135,18 @@ func TestScanAll(t *testing.T) {
 			content:           `a, b, c, d, e 123, "abc"`,
 			separators:        nil,
 			keepSeparator:     false,
+			keepEmptyTokens:   false,
 			ignoreParenthesis: false,
 			expectError:       false,
 			expectTokens:      []string{`a, b, c, d, e 123, "abc"`},
 		},
 		{
 			name: "default separators",
-			content:           `a, b, c, d e, "a,b,  c  ", (123, 456)`,
+			content: `a, b , c  , d e  , "a,b,  c  " , ,, ,	  (123, 456)
+			`,
 			separators:        DefaultSeparators,
 			keepSeparator:     false,
+			keepEmptyTokens:   false,
 			ignoreParenthesis: false,
 			expectError:       false,
 			expectTokens: []string{
@ -152,70 +159,49 @@ func TestScanAll(t *testing.T) {
 			},
 		},
 		{
-			name:              "default separators (with preserve)",
+			name: "keep separators",
 			content: `a, b, c, d  e, "a,b,  c  ",	(123, 456)`,
-			separators:        DefaultSeparators,
+			separators:        []rune{',', ' '}, // the space should be removed from the cutset
 			keepSeparator:     true,
+			keepEmptyTokens:   true,
 			ignoreParenthesis: false,
 			expectError:       false,
 			expectTokens: []string{
 				"a,",
+				" ",
 				"b,",
+				" ",
 				"c,",
-				"d e,",
+				" ",
+				"d ",
+				" ",
+				"e,",
+				" ",
 				`"a,b,  c  ",`,
 				`(123, 456)`,
 			},
 		},
 		{
 			name:              "custom separators",
-			content: `   a   , 123.456, b, c d, (
-				test (a,b,c) " 123 "
-			),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
-			separators:        []rune{',', ' ', '\t', '\n'},
+			content:           `a | b c  d &(e + f) &  "g & h" & & &`,
+			separators:        []rune{'|', '&'},
 			keepSeparator:     false,
+			keepEmptyTokens:   false,
 			ignoreParenthesis: false,
 			expectError:       false,
 			expectTokens: []string{
 				"a",
-				"123.456",
-				"b",
-				"c",
-				"d",
-				"(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t)",
-				`"(abc d"`,
-				`"abc) d"`,
-				`"(abc) d \" "`,
-				`'abc "'`,
-			},
-		},
-		{
-			name: "custom separators (with preserve)",
-			content: `   a   , 123.456, b, c d, (
-				test (a,b,c) " 123 "
-			),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
-			separators:        []rune{',', ' ', '\t', '\n'},
-			keepSeparator:     true,
-			ignoreParenthesis: false,
-			expectError:       false,
-			expectTokens: []string{
-				"a ",
-				"123.456,",
-				"b,",
-				"c ",
-				"d,",
-				"(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t),",
-				`"(abc d",`,
-				`"abc) d",`,
-				`"(abc) d \" " `,
-				`'abc "'`,
+				"b c  d",
+				"(e + f)",
+				`"g & h"`,
 			},
 		},
 		{
 			name:              "ignoring parenthesis",
 			content:           `a, b, (c,d)`,
-			separators:        []rune{','},
+			separators:        DefaultSeparators,
 			keepSeparator:     false,
+			keepEmptyTokens:   false,
 			ignoreParenthesis: true,
 			expectError:       false,
 			expectTokens: []string{
@ -225,6 +211,26 @@ func TestScanAll(t *testing.T) {
 				"d)",
 			},
 		},
+		{
+			name:              "keep empty tokens",
+			content:           `a, b, (c, d), ,, , e, , f`,
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			keepEmptyTokens:   true,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens: []string{
+				"a",
+				"b",
+				"(c, d)",
+				"",
+				"",
+				"",
+				"e",
+				"",
+				"f",
+			},
+		},
 	}

 	for _, s := range scenarios {
@ -233,6 +239,7 @@ func TestScanAll(t *testing.T) {

 			tk.Separators(s.separators...)
 			tk.KeepSeparator(s.keepSeparator)
+			tk.KeepEmptyTokens(s.keepEmptyTokens)
 			tk.IgnoreParenthesis(s.ignoreParenthesis)

 			tokens, err := tk.ScanAll()
@ -255,9 +262,42 @@ func TestScanAll(t *testing.T) {
 					}
 				}
 				if !exists {
-					t.Fatalf("Unexpected token %s", tok)
+					t.Fatalf("Unexpected token %q", tok)
 				}
 			}
 		})
 	}
 }
+
+func TestTrimCutset(t *testing.T) {
+	scenarios := []struct {
+		name           string
+		separators     []rune
+		expectedCutset string
+	}{
+		{
+			"default factory separators",
+			nil,
+			"\t\n\v\f\r \u0085\u00a0",
+		},
+		{
+			"custom separators",
+			[]rune{'\t', ' ', '\r', ','},
+			"\n\v\f\u0085\u00a0",
+		},
+	}
+
+	for _, s := range scenarios {
+		t.Run(s.name, func(t *testing.T) {
+			tk := NewFromString("")
+
+			if len(s.separators) > 0 {
+				tk.Separators(s.separators...)
+			}
+
+			if tk.trimCutset != s.expectedCutset {
+				t.Fatalf("Expected cutset %q, got %q", s.expectedCutset, tk.trimCutset)
+			}
+		})
+	}
+}