diff --git a/CHANGELOG.md b/CHANGELOG.md
index 027c36ef..228533cb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,15 @@
```
+## v0.19.1
+
+- Fixed `tokenizer.Scan()/ScanAll()` to ignore the separators from the default trim cutset.
+ An option to return also the empty found tokens was also added via `Tokenizer.KeepEmptyTokens(true)`.
+ _This should fix the parsing of whitespace characters around view query column names when no quotes are used ([#3616](https://github.com/pocketbase/pocketbase/discussions/3616#discussioncomment-7398564))._
+
+- Fixed the `:excerpt(max, withEllipsis?)` `field` query param modifier to properly add space to the generated text fragment after block tags.
+
+
## v0.19.0
- Added Patreon OAuth2 provider ([#3323](https://github.com/pocketbase/pocketbase/pull/3323); thanks @ghostdevv).
diff --git a/tools/rest/excerpt_modifier.go b/tools/rest/excerpt_modifier.go
index 107a8fcf..07da1a06 100644
--- a/tools/rest/excerpt_modifier.go
+++ b/tools/rest/excerpt_modifier.go
@@ -78,8 +78,7 @@ func (m *excerptModifier) Modify(value any) (any, error) {
return "", err
}
- var isNotEmpty bool
- var needSpace bool
+ var hasPrevSpace bool
// for all node types and more details check
// https://pkg.go.dev/golang.org/x/net/html#Parse
@@ -87,37 +86,47 @@ func (m *excerptModifier) Modify(value any) (any, error) {
stripTags = func(n *html.Node) {
switch n.Type {
case html.TextNode:
- if txt := strings.TrimSpace(whitespaceRegex.ReplaceAllString(n.Data, " ")); txt != "" {
- if isNotEmpty && needSpace {
- needSpace = false
- builder.WriteString(" ")
- }
+ // collapse multiple spaces into one
+ txt := whitespaceRegex.ReplaceAllString(n.Data, " ")
+
+ if hasPrevSpace {
+ txt = strings.TrimLeft(txt, " ")
+ }
+
+ if txt != "" {
+ hasPrevSpace = strings.HasSuffix(txt, " ")
builder.WriteString(txt)
-
- if !isNotEmpty {
- isNotEmpty = true
- }
- }
- case html.ElementNode:
- if !needSpace && !list.ExistInSlice(n.Data, inlineTags) {
- needSpace = true
}
}
- if builder.Len() > m.max {
+ // excerpt max has been reached => no need to further iterate
+ // (+2 for the extra whitespace suffix/prefix that will be trimmed later)
+ if builder.Len() > m.max+2 {
return
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type != html.ElementNode || !list.ExistInSlice(c.Data, excludeTags) {
+ isBlock := c.Type == html.ElementNode && !list.ExistInSlice(c.Data, inlineTags)
+
+ if isBlock && !hasPrevSpace {
+ builder.WriteString(" ")
+ hasPrevSpace = true
+ }
+
stripTags(c)
+
+ if isBlock && !hasPrevSpace {
+ builder.WriteString(" ")
+ hasPrevSpace = true
+ }
}
}
}
stripTags(doc)
- result := builder.String()
+ result := strings.TrimSpace(builder.String())
if len(result) > m.max {
result = strings.TrimSpace(result[:m.max])
diff --git a/tools/rest/excerpt_modifier_test.go b/tools/rest/excerpt_modifier_test.go
index 67c69850..47a87a66 100644
--- a/tools/rest/excerpt_modifier_test.go
+++ b/tools/rest/excerpt_modifier_test.go
@@ -84,11 +84,10 @@ func TestNewExcerptModifier(t *testing.T) {
}
func TestExcerptModifierModify(t *testing.T) {
- // plain text value: "Hello t est12 3 word"
html := `
Hello
t est12
- 3
word
`
+ 3456word 7 89!? a b c#title
`
- plainText := "Hello t est12 3 word"
+ plainText := "Hello t est12 3456 word 7 89!? a b c# title"
scenarios := []struct {
name string
diff --git a/tools/tokenizer/tokenizer.go b/tools/tokenizer/tokenizer.go
index 573ed8b0..f7111a27 100644
--- a/tools/tokenizer/tokenizer.go
+++ b/tools/tokenizer/tokenizer.go
@@ -21,6 +21,8 @@ const eof = rune(0)
// DefaultSeparators is a list with the default token separator characters.
var DefaultSeparators = []rune{','}
+var whitespaceChars = []rune{'\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0}
+
// NewFromString creates new Tokenizer from the provided string.
func NewFromString(str string) *Tokenizer {
return New(strings.NewReader(str))
@@ -33,12 +35,11 @@ func NewFromBytes(b []byte) *Tokenizer {
// New creates new Tokenizer from the provided reader with DefaultSeparators.
func New(r io.Reader) *Tokenizer {
- return &Tokenizer{
- r: bufio.NewReader(r),
- separators: DefaultSeparators,
- keepSeparator: false,
- ignoreParenthesis: false,
- }
+ t := &Tokenizer{r: bufio.NewReader(r)}
+
+ t.Separators(DefaultSeparators...)
+
+ return t
}
// Tokenizer defines a struct that parses a reader into tokens while
@@ -46,14 +47,18 @@ func New(r io.Reader) *Tokenizer {
type Tokenizer struct {
r *bufio.Reader
+ trimCutset string
separators []rune
keepSeparator bool
+ keepEmptyTokens bool
ignoreParenthesis bool
}
// Separators defines the provided separatos of the current Tokenizer.
func (t *Tokenizer) Separators(separators ...rune) {
t.separators = separators
+
+ t.rebuildTrimCutset()
}
// KeepSeparator defines whether to keep the separator rune as part
@@ -62,35 +67,37 @@ func (t *Tokenizer) KeepSeparator(state bool) {
t.keepSeparator = state
}
+// KeepEmptyTokens defines whether to keep empty tokens on Scan() (defaults to false).
+func (t *Tokenizer) KeepEmptyTokens(state bool) {
+ t.keepEmptyTokens = state
+}
+
// IgnoreParenthesis defines whether to ignore the parenthesis boundaries
// and to treat the '(' and ')' as regular characters.
func (t *Tokenizer) IgnoreParenthesis(state bool) {
t.ignoreParenthesis = state
}
-// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed).
+// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed!).
+//
+// Empty tokens are skipped if t.keepEmptyTokens is not set (which is the default).
//
// Returns [io.EOF] error when there are no more tokens to scan.
func (t *Tokenizer) Scan() (string, error) {
ch := t.read()
-
if ch == eof {
return "", io.EOF
}
-
- if isWhitespaceRune(ch) {
- t.readWhiteSpaces()
- } else {
- t.unread()
- }
+ t.unread()
token, err := t.readToken()
if err != nil {
return "", err
}
- // read all remaining whitespaces
- t.readWhiteSpaces()
+ if !t.keepEmptyTokens && token == "" {
+ return t.Scan()
+ }
return token, err
}
@@ -129,12 +136,12 @@ func (t *Tokenizer) readToken() (string, error) {
break
}
- if !isEscapeRune(prevCh) {
+ if !t.isEscapeRune(prevCh) {
if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
parenthesis++ // opening parenthesis
} else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
parenthesis-- // closing parenthesis
- } else if isQuoteRune(ch) {
+ } else if t.isQuoteRune(ch) {
if quoteCh == ch {
quoteCh = eof // closing quote
} else if quoteCh == eof {
@@ -158,7 +165,7 @@ func (t *Tokenizer) readToken() (string, error) {
return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
}
- return buf.String(), nil
+ return strings.Trim(buf.String(), t.trimCutset), nil
}
// readWhiteSpaces consumes all contiguous whitespace runes.
@@ -170,7 +177,7 @@ func (t *Tokenizer) readWhiteSpaces() {
break
}
- if !t.isSeperatorRune(ch) {
+ if !t.isWhitespaceRune(ch) {
t.unread()
break
}
@@ -193,6 +200,20 @@ func (t *Tokenizer) unread() error {
return t.r.UnreadRune()
}
+// rebuildTrimCutset rebuilds the tokenizer trimCutset based on its separator runes.
+func (t *Tokenizer) rebuildTrimCutset() {
+ var cutset strings.Builder
+
+ for _, w := range whitespaceChars {
+ if t.isSeperatorRune(w) {
+ continue
+ }
+ cutset.WriteRune(w)
+ }
+
+ t.trimCutset = cutset.String()
+}
+
// isSeperatorRune checks if a rune is a token part separator.
func (t *Tokenizer) isSeperatorRune(ch rune) bool {
for _, r := range t.separators {
@@ -204,17 +225,23 @@ func (t *Tokenizer) isSeperatorRune(ch rune) bool {
return false
}
-// isWhitespaceRune checks if a rune is a space, tab, or newline.
-func isWhitespaceRune(ch rune) bool {
- return ch == ' ' || ch == '\t' || ch == '\n'
+// isWhitespaceRune checks if a rune is a space character (e.g. space, tab, new line).
+func (t *Tokenizer) isWhitespaceRune(ch rune) bool {
+ for _, c := range whitespaceChars {
+ if c == ch {
+ return true
+ }
+ }
+
+ return false
}
// isQuoteRune checks if a rune is a quote.
-func isQuoteRune(ch rune) bool {
+func (t *Tokenizer) isQuoteRune(ch rune) bool {
return ch == '\'' || ch == '"' || ch == '`'
}
// isEscapeRune checks if a rune is an escape character.
-func isEscapeRune(ch rune) bool {
+func (t *Tokenizer) isEscapeRune(ch rune) bool {
return ch == '\\'
}
diff --git a/tools/tokenizer/tokenizer_test.go b/tools/tokenizer/tokenizer_test.go
index 3b2f634b..801ba946 100644
--- a/tools/tokenizer/tokenizer_test.go
+++ b/tools/tokenizer/tokenizer_test.go
@@ -95,6 +95,7 @@ func TestScanAll(t *testing.T) {
content string
separators []rune
keepSeparator bool
+ keepEmptyTokens bool
ignoreParenthesis bool
expectError bool
expectTokens []string
@@ -104,6 +105,7 @@ func TestScanAll(t *testing.T) {
content: "",
separators: DefaultSeparators,
keepSeparator: false,
+ keepEmptyTokens: false,
ignoreParenthesis: false,
expectError: false,
expectTokens: nil,
@@ -113,6 +115,7 @@ func TestScanAll(t *testing.T) {
content: `(a,b() c`,
separators: DefaultSeparators,
keepSeparator: false,
+ keepEmptyTokens: false,
ignoreParenthesis: false,
expectError: true,
expectTokens: []string{},
@@ -122,6 +125,7 @@ func TestScanAll(t *testing.T) {
content: `'asd"`,
separators: DefaultSeparators,
keepSeparator: false,
+ keepEmptyTokens: false,
ignoreParenthesis: false,
expectError: true,
expectTokens: []string{},
@@ -131,15 +135,18 @@ func TestScanAll(t *testing.T) {
content: `a, b, c, d, e 123, "abc"`,
separators: nil,
keepSeparator: false,
+ keepEmptyTokens: false,
ignoreParenthesis: false,
expectError: false,
expectTokens: []string{`a, b, c, d, e 123, "abc"`},
},
{
- name: "default separators",
- content: `a, b, c, d e, "a,b, c ", (123, 456)`,
+ name: "default separators",
+ content: `a, b , c , d e , "a,b, c " , ,, , (123, 456)
+ `,
separators: DefaultSeparators,
keepSeparator: false,
+ keepEmptyTokens: false,
ignoreParenthesis: false,
expectError: false,
expectTokens: []string{
@@ -152,70 +159,49 @@ func TestScanAll(t *testing.T) {
},
},
{
- name: "default separators (with preserve)",
- content: `a, b, c, d e, "a,b, c ", (123, 456)`,
- separators: DefaultSeparators,
+ name: "keep separators",
+ content: `a, b, c, d e, "a,b, c ", (123, 456)`,
+ separators: []rune{',', ' '}, // the space should be removed from the cutset
keepSeparator: true,
+ keepEmptyTokens: true,
ignoreParenthesis: false,
expectError: false,
expectTokens: []string{
"a,",
+ " ",
"b,",
+ " ",
"c,",
- "d e,",
+ " ",
+ "d ",
+ " ",
+ "e,",
+ " ",
`"a,b, c ",`,
`(123, 456)`,
},
},
{
- name: "custom separators",
- content: ` a , 123.456, b, c d, (
- test (a,b,c) " 123 "
- ),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
- separators: []rune{',', ' ', '\t', '\n'},
+ name: "custom separators",
+ content: `a | b c d &(e + f) & "g & h" & & &`,
+ separators: []rune{'|', '&'},
keepSeparator: false,
+ keepEmptyTokens: false,
ignoreParenthesis: false,
expectError: false,
expectTokens: []string{
"a",
- "123.456",
- "b",
- "c",
- "d",
- "(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t)",
- `"(abc d"`,
- `"abc) d"`,
- `"(abc) d \" "`,
- `'abc "'`,
- },
- },
- {
- name: "custom separators (with preserve)",
- content: ` a , 123.456, b, c d, (
- test (a,b,c) " 123 "
- ),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
- separators: []rune{',', ' ', '\t', '\n'},
- keepSeparator: true,
- ignoreParenthesis: false,
- expectError: false,
- expectTokens: []string{
- "a ",
- "123.456,",
- "b,",
- "c ",
- "d,",
- "(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t),",
- `"(abc d",`,
- `"abc) d",`,
- `"(abc) d \" " `,
- `'abc "'`,
+ "b c d",
+ "(e + f)",
+ `"g & h"`,
},
},
{
name: "ignoring parenthesis",
content: `a, b, (c,d)`,
- separators: []rune{','},
+ separators: DefaultSeparators,
keepSeparator: false,
+ keepEmptyTokens: false,
ignoreParenthesis: true,
expectError: false,
expectTokens: []string{
@@ -225,6 +211,26 @@ func TestScanAll(t *testing.T) {
"d)",
},
},
+ {
+ name: "keep empty tokens",
+ content: `a, b, (c, d), ,, , e, , f`,
+ separators: DefaultSeparators,
+ keepSeparator: false,
+ keepEmptyTokens: true,
+ ignoreParenthesis: false,
+ expectError: false,
+ expectTokens: []string{
+ "a",
+ "b",
+ "(c, d)",
+ "",
+ "",
+ "",
+ "e",
+ "",
+ "f",
+ },
+ },
}
for _, s := range scenarios {
@@ -233,6 +239,7 @@ func TestScanAll(t *testing.T) {
tk.Separators(s.separators...)
tk.KeepSeparator(s.keepSeparator)
+ tk.KeepEmptyTokens(s.keepEmptyTokens)
tk.IgnoreParenthesis(s.ignoreParenthesis)
tokens, err := tk.ScanAll()
@@ -255,9 +262,42 @@ func TestScanAll(t *testing.T) {
}
}
if !exists {
- t.Fatalf("Unexpected token %s", tok)
+ t.Fatalf("Unexpected token %q", tok)
}
}
})
}
}
+
+func TestTrimCutset(t *testing.T) {
+ scenarios := []struct {
+ name string
+ separators []rune
+ expectedCutset string
+ }{
+ {
+ "default factory separators",
+ nil,
+ "\t\n\v\f\r \u0085\u00a0",
+ },
+ {
+ "custom separators",
+ []rune{'\t', ' ', '\r', ','},
+ "\n\v\f\u0085\u00a0",
+ },
+ }
+
+ for _, s := range scenarios {
+ t.Run(s.name, func(t *testing.T) {
+ tk := NewFromString("")
+
+ if len(s.separators) > 0 {
+ tk.Separators(s.separators...)
+ }
+
+ if tk.trimCutset != s.expectedCutset {
+ t.Fatalf("Expected cutset %q, got %q", s.expectedCutset, tk.trimCutset)
+ }
+ })
+ }
+}