diff --git a/CHANGELOG.md b/CHANGELOG.md index 027c36ef..228533cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,15 @@ ``` +## v0.19.1 + +- Fixed `tokenizer.Scan()/ScanAll()` to ignore the separators from the default trim cutset. + An option to also return the empty tokens was added via `Tokenizer.KeepEmptyTokens(true)`. + _This should fix the parsing of whitespace characters around view query column names when no quotes are used ([#3616](https://github.com/pocketbase/pocketbase/discussions/3616#discussioncomment-7398564))._ + +- Fixed the `:excerpt(max, withEllipsis?)` `field` query param modifier to properly add space to the generated text fragment after block tags. + + ## v0.19.0 - Added Patreon OAuth2 provider ([#3323](https://github.com/pocketbase/pocketbase/pull/3323); thanks @ghostdevv). diff --git a/tools/rest/excerpt_modifier.go b/tools/rest/excerpt_modifier.go index 107a8fcf..07da1a06 100644 --- a/tools/rest/excerpt_modifier.go +++ b/tools/rest/excerpt_modifier.go @@ -78,8 +78,7 @@ func (m *excerptModifier) Modify(value any) (any, error) { return "", err } - var isNotEmpty bool - var needSpace bool + var hasPrevSpace bool // for all node types and more details check // https://pkg.go.dev/golang.org/x/net/html#Parse @@ -87,37 +86,47 @@ func (m *excerptModifier) Modify(value any) (any, error) { stripTags = func(n *html.Node) { switch n.Type { case html.TextNode: - if txt := strings.TrimSpace(whitespaceRegex.ReplaceAllString(n.Data, " ")); txt != "" { - if isNotEmpty && needSpace { - needSpace = false - builder.WriteString(" ") - } + // collapse multiple spaces into one + txt := whitespaceRegex.ReplaceAllString(n.Data, " ") + + if hasPrevSpace { + txt = strings.TrimLeft(txt, " ") + } + + if txt != "" { + hasPrevSpace = strings.HasSuffix(txt, " ") builder.WriteString(txt) - - if !isNotEmpty { - isNotEmpty = true - } - } - case html.ElementNode: - if !needSpace && !list.ExistInSlice(n.Data, inlineTags) { - needSpace = true } } - if 
builder.Len() > m.max { + // excerpt max has been reached => no need to further iterate + // (+2 for the extra whitespace suffix/prefix that will be trimmed later) + if builder.Len() > m.max+2 { return } for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type != html.ElementNode || !list.ExistInSlice(c.Data, excludeTags) { + isBlock := c.Type == html.ElementNode && !list.ExistInSlice(c.Data, inlineTags) + + if isBlock && !hasPrevSpace { + builder.WriteString(" ") + hasPrevSpace = true + } + stripTags(c) + + if isBlock && !hasPrevSpace { + builder.WriteString(" ") + hasPrevSpace = true + } } } } stripTags(doc) - result := builder.String() + result := strings.TrimSpace(builder.String()) if len(result) > m.max { result = strings.TrimSpace(result[:m.max]) diff --git a/tools/rest/excerpt_modifier_test.go b/tools/rest/excerpt_modifier_test.go index 67c69850..47a87a66 100644 --- a/tools/rest/excerpt_modifier_test.go +++ b/tools/rest/excerpt_modifier_test.go @@ -84,11 +84,10 @@ func TestNewExcerptModifier(t *testing.T) { } func TestExcerptModifierModify(t *testing.T) { - // plain text value: "Hello t est12 3 word" html := `

Hello

t est12 - 3

word

` + 3456word 7 89!? a b c#

title

` - plainText := "Hello t est12 3 word" + plainText := "Hello t est12 3456 word 7 89!? a b c# title" scenarios := []struct { name string diff --git a/tools/tokenizer/tokenizer.go b/tools/tokenizer/tokenizer.go index 573ed8b0..f7111a27 100644 --- a/tools/tokenizer/tokenizer.go +++ b/tools/tokenizer/tokenizer.go @@ -21,6 +21,8 @@ const eof = rune(0) // DefaultSeparators is a list with the default token separator characters. var DefaultSeparators = []rune{','} +var whitespaceChars = []rune{'\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0} + // NewFromString creates new Tokenizer from the provided string. func NewFromString(str string) *Tokenizer { return New(strings.NewReader(str)) @@ -33,12 +35,11 @@ func NewFromBytes(b []byte) *Tokenizer { // New creates new Tokenizer from the provided reader with DefaultSeparators. func New(r io.Reader) *Tokenizer { - return &Tokenizer{ - r: bufio.NewReader(r), - separators: DefaultSeparators, - keepSeparator: false, - ignoreParenthesis: false, - } + t := &Tokenizer{r: bufio.NewReader(r)} + + t.Separators(DefaultSeparators...) + + return t } // Tokenizer defines a struct that parses a reader into tokens while @@ -46,14 +47,18 @@ func New(r io.Reader) *Tokenizer { type Tokenizer struct { r *bufio.Reader + trimCutset string separators []rune keepSeparator bool + keepEmptyTokens bool ignoreParenthesis bool } // Separators defines the provided separatos of the current Tokenizer. func (t *Tokenizer) Separators(separators ...rune) { t.separators = separators + + t.rebuildTrimCutset() } // KeepSeparator defines whether to keep the separator rune as part @@ -62,35 +67,37 @@ func (t *Tokenizer) KeepSeparator(state bool) { t.keepSeparator = state } +// KeepEmptyTokens defines whether to keep empty tokens on Scan() (default to false). +func (t *Tokenizer) KeepEmptyTokens(state bool) { + t.keepEmptyTokens = state +} + // IgnoreParenthesis defines whether to ignore the parenthesis boundaries // and to treat the '(' and ')' as regular characters. 
func (t *Tokenizer) IgnoreParenthesis(state bool) { t.ignoreParenthesis = state } -// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed). +// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed!). +// +// Empty tokens are skipped if t.keepEmptyTokens is not set (which is the default). // // Returns [io.EOF] error when there are no more tokens to scan. func (t *Tokenizer) Scan() (string, error) { ch := t.read() - if ch == eof { return "", io.EOF } - - if isWhitespaceRune(ch) { - t.readWhiteSpaces() - } else { - t.unread() - } + t.unread() token, err := t.readToken() if err != nil { return "", err } - // read all remaining whitespaces - t.readWhiteSpaces() + if !t.keepEmptyTokens && token == "" { + return t.Scan() + } return token, err } @@ -129,12 +136,12 @@ func (t *Tokenizer) readToken() (string, error) { break } - if !isEscapeRune(prevCh) { + if !t.isEscapeRune(prevCh) { if !t.ignoreParenthesis && ch == '(' && quoteCh == eof { parenthesis++ // opening parenthesis } else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof { parenthesis-- // closing parenthesis - } else if isQuoteRune(ch) { + } else if t.isQuoteRune(ch) { if quoteCh == ch { quoteCh = eof // closing quote } else if quoteCh == eof { @@ -158,7 +165,7 @@ func (t *Tokenizer) readToken() (string, error) { return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String()) } - return buf.String(), nil + return strings.Trim(buf.String(), t.trimCutset), nil } // readWhiteSpaces consumes all contiguous whitespace runes. @@ -170,7 +177,7 @@ func (t *Tokenizer) readWhiteSpaces() { break } - if !t.isSeperatorRune(ch) { + if !t.isWhitespaceRune(ch) { t.unread() break } @@ -193,6 +200,20 @@ func (t *Tokenizer) unread() error { return t.r.UnreadRune() } +// rebuildTrimCutset rebuilds the tokenizer trimCutset based on its separator runes. 
+func (t *Tokenizer) rebuildTrimCutset() { + var cutset strings.Builder + + for _, w := range whitespaceChars { + if t.isSeperatorRune(w) { + continue + } + cutset.WriteRune(w) + } + + t.trimCutset = cutset.String() +} + // isSeperatorRune checks if a rune is a token part separator. func (t *Tokenizer) isSeperatorRune(ch rune) bool { for _, r := range t.separators { @@ -204,17 +225,23 @@ func (t *Tokenizer) isSeperatorRune(ch rune) bool { return false } -// isWhitespaceRune checks if a rune is a space, tab, or newline. -func isWhitespaceRune(ch rune) bool { - return ch == ' ' || ch == '\t' || ch == '\n' +// isWhitespaceRune checks if a rune is a space character (eg. space, tab, new line). +func (t *Tokenizer) isWhitespaceRune(ch rune) bool { + for _, c := range whitespaceChars { + if c == ch { + return true + } + } + + return false } // isQuoteRune checks if a rune is a quote. -func isQuoteRune(ch rune) bool { +func (t *Tokenizer) isQuoteRune(ch rune) bool { return ch == '\'' || ch == '"' || ch == '`' } // isEscapeRune checks if a rune is an escape character. 
-func isEscapeRune(ch rune) bool { +func (t *Tokenizer) isEscapeRune(ch rune) bool { return ch == '\\' } diff --git a/tools/tokenizer/tokenizer_test.go b/tools/tokenizer/tokenizer_test.go index 3b2f634b..801ba946 100644 --- a/tools/tokenizer/tokenizer_test.go +++ b/tools/tokenizer/tokenizer_test.go @@ -95,6 +95,7 @@ func TestScanAll(t *testing.T) { content string separators []rune keepSeparator bool + keepEmptyTokens bool ignoreParenthesis bool expectError bool expectTokens []string @@ -104,6 +105,7 @@ func TestScanAll(t *testing.T) { content: "", separators: DefaultSeparators, keepSeparator: false, + keepEmptyTokens: false, ignoreParenthesis: false, expectError: false, expectTokens: nil, @@ -113,6 +115,7 @@ func TestScanAll(t *testing.T) { content: `(a,b() c`, separators: DefaultSeparators, keepSeparator: false, + keepEmptyTokens: false, ignoreParenthesis: false, expectError: true, expectTokens: []string{}, @@ -122,6 +125,7 @@ func TestScanAll(t *testing.T) { content: `'asd"`, separators: DefaultSeparators, keepSeparator: false, + keepEmptyTokens: false, ignoreParenthesis: false, expectError: true, expectTokens: []string{}, @@ -131,15 +135,18 @@ func TestScanAll(t *testing.T) { content: `a, b, c, d, e 123, "abc"`, separators: nil, keepSeparator: false, + keepEmptyTokens: false, ignoreParenthesis: false, expectError: false, expectTokens: []string{`a, b, c, d, e 123, "abc"`}, }, { - name: "default separators", - content: `a, b, c, d e, "a,b, c ", (123, 456)`, + name: "default separators", + content: `a, b , c , d e , "a,b, c " , ,, , (123, 456) + `, separators: DefaultSeparators, keepSeparator: false, + keepEmptyTokens: false, ignoreParenthesis: false, expectError: false, expectTokens: []string{ @@ -152,70 +159,49 @@ func TestScanAll(t *testing.T) { }, }, { - name: "default separators (with preserve)", - content: `a, b, c, d e, "a,b, c ", (123, 456)`, - separators: DefaultSeparators, + name: "keep separators", + content: `a, b, c, d e, "a,b, c ", (123, 456)`, + 
separators: []rune{',', ' '}, // the space should be removed from the cutset keepSeparator: true, + keepEmptyTokens: true, ignoreParenthesis: false, expectError: false, expectTokens: []string{ "a,", + " ", "b,", + " ", "c,", - "d e,", + " ", + "d ", + " ", + "e,", + " ", `"a,b, c ",`, `(123, 456)`, }, }, { - name: "custom separators", - content: ` a , 123.456, b, c d, ( - test (a,b,c) " 123 " - ),"(abc d", "abc) d", "(abc) d \" " 'abc "'`, - separators: []rune{',', ' ', '\t', '\n'}, + name: "custom separators", + content: `a | b c d &(e + f) & "g & h" & & &`, + separators: []rune{'|', '&'}, keepSeparator: false, + keepEmptyTokens: false, ignoreParenthesis: false, expectError: false, expectTokens: []string{ "a", - "123.456", - "b", - "c", - "d", - "(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t)", - `"(abc d"`, - `"abc) d"`, - `"(abc) d \" "`, - `'abc "'`, - }, - }, - { - name: "custom separators (with preserve)", - content: ` a , 123.456, b, c d, ( - test (a,b,c) " 123 " - ),"(abc d", "abc) d", "(abc) d \" " 'abc "'`, - separators: []rune{',', ' ', '\t', '\n'}, - keepSeparator: true, - ignoreParenthesis: false, - expectError: false, - expectTokens: []string{ - "a ", - "123.456,", - "b,", - "c ", - "d,", - "(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t),", - `"(abc d",`, - `"abc) d",`, - `"(abc) d \" " `, - `'abc "'`, + "b c d", + "(e + f)", + `"g & h"`, }, }, { name: "ignoring parenthesis", content: `a, b, (c,d)`, - separators: []rune{','}, + separators: DefaultSeparators, keepSeparator: false, + keepEmptyTokens: false, ignoreParenthesis: true, expectError: false, expectTokens: []string{ @@ -225,6 +211,26 @@ func TestScanAll(t *testing.T) { "d)", }, }, + { + name: "keep empty tokens", + content: `a, b, (c, d), ,, , e, , f`, + separators: DefaultSeparators, + keepSeparator: false, + keepEmptyTokens: true, + ignoreParenthesis: false, + expectError: false, + expectTokens: []string{ + "a", + "b", + "(c, d)", + "", + "", + "", + "e", + "", + "f", + }, + }, } for _, s := range 
scenarios { @@ -233,6 +239,7 @@ func TestScanAll(t *testing.T) { tk.Separators(s.separators...) tk.KeepSeparator(s.keepSeparator) + tk.KeepEmptyTokens(s.keepEmptyTokens) tk.IgnoreParenthesis(s.ignoreParenthesis) tokens, err := tk.ScanAll() @@ -255,9 +262,42 @@ func TestScanAll(t *testing.T) { } } if !exists { - t.Fatalf("Unexpected token %s", tok) + t.Fatalf("Unexpected token %q", tok) } } }) } } + +func TestTrimCutset(t *testing.T) { + scenarios := []struct { + name string + separators []rune + expectedCutset string + }{ + { + "default factory separators", + nil, + "\t\n\v\f\r \u0085\u00a0", + }, + { + "custom separators", + []rune{'\t', ' ', '\r', ','}, + "\n\v\f\u0085\u00a0", + }, + } + + for _, s := range scenarios { + t.Run(s.name, func(t *testing.T) { + tk := NewFromString("") + + if len(s.separators) > 0 { + tk.Separators(s.separators...) + } + + if tk.trimCutset != s.expectedCutset { + t.Fatalf("Expected cutset %q, got %q", s.expectedCutset, tk.trimCutset) + } + }) + } +}