synced with master

This commit is contained in:
Gani Georgiev 2023-10-27 22:28:15 +03:00
commit f889a3fcb3
5 changed files with 173 additions and 89 deletions

View File

@ -28,6 +28,15 @@
``` ```
## v0.19.1
- Fixed `tokenizer.Scan()/ScanAll()` to ignore the separators from the default trim cutset.
An option to also return the empty tokens was added via `Tokenizer.KeepEmptyTokens(true)`.
_This should fix the parsing of whitespace characters around view query column names when no quotes are used ([#3616](https://github.com/pocketbase/pocketbase/discussions/3616#discussioncomment-7398564))._
- Fixed the `:excerpt(max, withEllipsis?)` `field` query param modifier to properly add space to the generated text fragment after block tags.
## v0.19.0 ## v0.19.0
- Added Patreon OAuth2 provider ([#3323](https://github.com/pocketbase/pocketbase/pull/3323); thanks @ghostdevv). - Added Patreon OAuth2 provider ([#3323](https://github.com/pocketbase/pocketbase/pull/3323); thanks @ghostdevv).

View File

@ -78,8 +78,7 @@ func (m *excerptModifier) Modify(value any) (any, error) {
return "", err return "", err
} }
var isNotEmpty bool var hasPrevSpace bool
var needSpace bool
// for all node types and more details check // for all node types and more details check
// https://pkg.go.dev/golang.org/x/net/html#Parse // https://pkg.go.dev/golang.org/x/net/html#Parse
@ -87,37 +86,47 @@ func (m *excerptModifier) Modify(value any) (any, error) {
stripTags = func(n *html.Node) { stripTags = func(n *html.Node) {
switch n.Type { switch n.Type {
case html.TextNode: case html.TextNode:
if txt := strings.TrimSpace(whitespaceRegex.ReplaceAllString(n.Data, " ")); txt != "" { // collapse multiple spaces into one
if isNotEmpty && needSpace { txt := whitespaceRegex.ReplaceAllString(n.Data, " ")
needSpace = false
builder.WriteString(" ") if hasPrevSpace {
} txt = strings.TrimLeft(txt, " ")
}
if txt != "" {
hasPrevSpace = strings.HasSuffix(txt, " ")
builder.WriteString(txt) builder.WriteString(txt)
if !isNotEmpty {
isNotEmpty = true
}
}
case html.ElementNode:
if !needSpace && !list.ExistInSlice(n.Data, inlineTags) {
needSpace = true
} }
} }
if builder.Len() > m.max { // excerpt max has been reached => no need to further iterate
// (+2 for the extra whitespace suffix/prefix that will be trimmed later)
if builder.Len() > m.max+2 {
return return
} }
for c := n.FirstChild; c != nil; c = c.NextSibling { for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type != html.ElementNode || !list.ExistInSlice(c.Data, excludeTags) { if c.Type != html.ElementNode || !list.ExistInSlice(c.Data, excludeTags) {
isBlock := c.Type == html.ElementNode && !list.ExistInSlice(c.Data, inlineTags)
if isBlock && !hasPrevSpace {
builder.WriteString(" ")
hasPrevSpace = true
}
stripTags(c) stripTags(c)
if isBlock && !hasPrevSpace {
builder.WriteString(" ")
hasPrevSpace = true
}
} }
} }
} }
stripTags(doc) stripTags(doc)
result := builder.String() result := strings.TrimSpace(builder.String())
if len(result) > m.max { if len(result) > m.max {
result = strings.TrimSpace(result[:m.max]) result = strings.TrimSpace(result[:m.max])

View File

@ -84,11 +84,10 @@ func TestNewExcerptModifier(t *testing.T) {
} }
func TestExcerptModifierModify(t *testing.T) { func TestExcerptModifierModify(t *testing.T) {
// plain text value: "Hello t est12 3 word"
html := ` <script>var a = 123;</script> <p>Hello</p><div id="test_id">t est<b>12 html := ` <script>var a = 123;</script> <p>Hello</p><div id="test_id">t est<b>12
3</b></div> <h1>word </h1> ` 3</b><span>456</span></div><span>word <b>7</b> 89<span>!<b>?</b><b> a </b><b>b </b>c</span>#<h1>title</h1>`
plainText := "Hello t est12 3 word" plainText := "Hello t est12 3456 word 7 89!? a b c# title"
scenarios := []struct { scenarios := []struct {
name string name string

View File

@ -21,6 +21,8 @@ const eof = rune(0)
// DefaultSeparators is a list with the default token separator characters. // DefaultSeparators is a list with the default token separator characters.
var DefaultSeparators = []rune{','} var DefaultSeparators = []rune{','}
var whitespaceChars = []rune{'\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0}
// NewFromString creates new Tokenizer from the provided string. // NewFromString creates new Tokenizer from the provided string.
func NewFromString(str string) *Tokenizer { func NewFromString(str string) *Tokenizer {
return New(strings.NewReader(str)) return New(strings.NewReader(str))
@ -33,12 +35,11 @@ func NewFromBytes(b []byte) *Tokenizer {
// New creates new Tokenizer from the provided reader with DefaultSeparators. // New creates new Tokenizer from the provided reader with DefaultSeparators.
func New(r io.Reader) *Tokenizer { func New(r io.Reader) *Tokenizer {
return &Tokenizer{ t := &Tokenizer{r: bufio.NewReader(r)}
r: bufio.NewReader(r),
separators: DefaultSeparators, t.Separators(DefaultSeparators...)
keepSeparator: false,
ignoreParenthesis: false, return t
}
} }
// Tokenizer defines a struct that parses a reader into tokens while // Tokenizer defines a struct that parses a reader into tokens while
@ -46,14 +47,18 @@ func New(r io.Reader) *Tokenizer {
type Tokenizer struct { type Tokenizer struct {
r *bufio.Reader r *bufio.Reader
trimCutset string
separators []rune separators []rune
keepSeparator bool keepSeparator bool
keepEmptyTokens bool
ignoreParenthesis bool ignoreParenthesis bool
} }
// Separators defines the provided separators of the current Tokenizer. // Separators defines the provided separators of the current Tokenizer.
func (t *Tokenizer) Separators(separators ...rune) { func (t *Tokenizer) Separators(separators ...rune) {
t.separators = separators t.separators = separators
t.rebuildTrimCutset()
} }
// KeepSeparator defines whether to keep the separator rune as part // KeepSeparator defines whether to keep the separator rune as part
@ -62,35 +67,37 @@ func (t *Tokenizer) KeepSeparator(state bool) {
t.keepSeparator = state t.keepSeparator = state
} }
// KeepEmptyTokens defines whether Scan() should return empty tokens
// instead of skipping them (defaults to false, i.e. empty tokens are skipped).
func (t *Tokenizer) KeepEmptyTokens(state bool) {
t.keepEmptyTokens = state
}
// IgnoreParenthesis defines whether to ignore the parenthesis boundaries // IgnoreParenthesis defines whether to ignore the parenthesis boundaries
// and to treat the '(' and ')' as regular characters. // and to treat the '(' and ')' as regular characters.
func (t *Tokenizer) IgnoreParenthesis(state bool) { func (t *Tokenizer) IgnoreParenthesis(state bool) {
t.ignoreParenthesis = state t.ignoreParenthesis = state
} }
// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed). // Scan reads and returns the next available token from the Tokenizer's buffer (trimmed!).
//
// Empty tokens are skipped if t.keepEmptyTokens is not set (which is the default).
// //
// Returns [io.EOF] error when there are no more tokens to scan. // Returns [io.EOF] error when there are no more tokens to scan.
func (t *Tokenizer) Scan() (string, error) { func (t *Tokenizer) Scan() (string, error) {
ch := t.read() ch := t.read()
if ch == eof { if ch == eof {
return "", io.EOF return "", io.EOF
} }
t.unread()
if isWhitespaceRune(ch) {
t.readWhiteSpaces()
} else {
t.unread()
}
token, err := t.readToken() token, err := t.readToken()
if err != nil { if err != nil {
return "", err return "", err
} }
// read all remaining whitespaces if !t.keepEmptyTokens && token == "" {
t.readWhiteSpaces() return t.Scan()
}
return token, err return token, err
} }
@ -129,12 +136,12 @@ func (t *Tokenizer) readToken() (string, error) {
break break
} }
if !isEscapeRune(prevCh) { if !t.isEscapeRune(prevCh) {
if !t.ignoreParenthesis && ch == '(' && quoteCh == eof { if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
parenthesis++ // opening parenthesis parenthesis++ // opening parenthesis
} else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof { } else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
parenthesis-- // closing parenthesis parenthesis-- // closing parenthesis
} else if isQuoteRune(ch) { } else if t.isQuoteRune(ch) {
if quoteCh == ch { if quoteCh == ch {
quoteCh = eof // closing quote quoteCh = eof // closing quote
} else if quoteCh == eof { } else if quoteCh == eof {
@ -158,7 +165,7 @@ func (t *Tokenizer) readToken() (string, error) {
return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String()) return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
} }
return buf.String(), nil return strings.Trim(buf.String(), t.trimCutset), nil
} }
// readWhiteSpaces consumes all contiguous whitespace runes. // readWhiteSpaces consumes all contiguous whitespace runes.
@ -170,7 +177,7 @@ func (t *Tokenizer) readWhiteSpaces() {
break break
} }
if !t.isSeperatorRune(ch) { if !t.isWhitespaceRune(ch) {
t.unread() t.unread()
break break
} }
@ -193,6 +200,20 @@ func (t *Tokenizer) unread() error {
return t.r.UnreadRune() return t.r.UnreadRune()
} }
// rebuildTrimCutset recomputes t.trimCutset from the package whitespace
// characters, leaving out every whitespace rune that is currently
// registered as a token separator.
func (t *Tokenizer) rebuildTrimCutset() {
	var sb strings.Builder

	for _, ws := range whitespaceChars {
		// only whitespace runes that are not separators go into the cutset
		if !t.isSeperatorRune(ws) {
			sb.WriteRune(ws)
		}
	}

	t.trimCutset = sb.String()
}
// isSeperatorRune checks if a rune is a token part separator. // isSeperatorRune checks if a rune is a token part separator.
func (t *Tokenizer) isSeperatorRune(ch rune) bool { func (t *Tokenizer) isSeperatorRune(ch rune) bool {
for _, r := range t.separators { for _, r := range t.separators {
@ -204,17 +225,23 @@ func (t *Tokenizer) isSeperatorRune(ch rune) bool {
return false return false
} }
// isWhitespaceRune checks if a rune is a space, tab, or newline. // isWhitespaceRune checks if a rune is a space character (eg. space, tab, new line).
func isWhitespaceRune(ch rune) bool { func (t *Tokenizer) isWhitespaceRune(ch rune) bool {
return ch == ' ' || ch == '\t' || ch == '\n' for _, c := range whitespaceChars {
if c == ch {
return true
}
}
return false
} }
// isQuoteRune checks if a rune is a quote. // isQuoteRune checks if a rune is a quote.
func isQuoteRune(ch rune) bool { func (t *Tokenizer) isQuoteRune(ch rune) bool {
return ch == '\'' || ch == '"' || ch == '`' return ch == '\'' || ch == '"' || ch == '`'
} }
// isEscapeRune checks if a rune is an escape character. // isEscapeRune checks if a rune is an escape character.
func isEscapeRune(ch rune) bool { func (t *Tokenizer) isEscapeRune(ch rune) bool {
return ch == '\\' return ch == '\\'
} }

View File

@ -95,6 +95,7 @@ func TestScanAll(t *testing.T) {
content string content string
separators []rune separators []rune
keepSeparator bool keepSeparator bool
keepEmptyTokens bool
ignoreParenthesis bool ignoreParenthesis bool
expectError bool expectError bool
expectTokens []string expectTokens []string
@ -104,6 +105,7 @@ func TestScanAll(t *testing.T) {
content: "", content: "",
separators: DefaultSeparators, separators: DefaultSeparators,
keepSeparator: false, keepSeparator: false,
keepEmptyTokens: false,
ignoreParenthesis: false, ignoreParenthesis: false,
expectError: false, expectError: false,
expectTokens: nil, expectTokens: nil,
@ -113,6 +115,7 @@ func TestScanAll(t *testing.T) {
content: `(a,b() c`, content: `(a,b() c`,
separators: DefaultSeparators, separators: DefaultSeparators,
keepSeparator: false, keepSeparator: false,
keepEmptyTokens: false,
ignoreParenthesis: false, ignoreParenthesis: false,
expectError: true, expectError: true,
expectTokens: []string{}, expectTokens: []string{},
@ -122,6 +125,7 @@ func TestScanAll(t *testing.T) {
content: `'asd"`, content: `'asd"`,
separators: DefaultSeparators, separators: DefaultSeparators,
keepSeparator: false, keepSeparator: false,
keepEmptyTokens: false,
ignoreParenthesis: false, ignoreParenthesis: false,
expectError: true, expectError: true,
expectTokens: []string{}, expectTokens: []string{},
@ -131,15 +135,18 @@ func TestScanAll(t *testing.T) {
content: `a, b, c, d, e 123, "abc"`, content: `a, b, c, d, e 123, "abc"`,
separators: nil, separators: nil,
keepSeparator: false, keepSeparator: false,
keepEmptyTokens: false,
ignoreParenthesis: false, ignoreParenthesis: false,
expectError: false, expectError: false,
expectTokens: []string{`a, b, c, d, e 123, "abc"`}, expectTokens: []string{`a, b, c, d, e 123, "abc"`},
}, },
{ {
name: "default separators", name: "default separators",
content: `a, b, c, d e, "a,b, c ", (123, 456)`, content: `a, b , c , d e , "a,b, c " , ,, , (123, 456)
`,
separators: DefaultSeparators, separators: DefaultSeparators,
keepSeparator: false, keepSeparator: false,
keepEmptyTokens: false,
ignoreParenthesis: false, ignoreParenthesis: false,
expectError: false, expectError: false,
expectTokens: []string{ expectTokens: []string{
@ -152,70 +159,49 @@ func TestScanAll(t *testing.T) {
}, },
}, },
{ {
name: "default separators (with preserve)", name: "keep separators",
content: `a, b, c, d e, "a,b, c ", (123, 456)`, content: `a, b, c, d e, "a,b, c ", (123, 456)`,
separators: DefaultSeparators, separators: []rune{',', ' '}, // the space should be removed from the cutset
keepSeparator: true, keepSeparator: true,
keepEmptyTokens: true,
ignoreParenthesis: false, ignoreParenthesis: false,
expectError: false, expectError: false,
expectTokens: []string{ expectTokens: []string{
"a,", "a,",
" ",
"b,", "b,",
" ",
"c,", "c,",
"d e,", " ",
"d ",
" ",
"e,",
" ",
`"a,b, c ",`, `"a,b, c ",`,
`(123, 456)`, `(123, 456)`,
}, },
}, },
{ {
name: "custom separators", name: "custom separators",
content: ` a , 123.456, b, c d, ( content: `a | b c d &(e + f) & "g & h" & & &`,
test (a,b,c) " 123 " separators: []rune{'|', '&'},
),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
separators: []rune{',', ' ', '\t', '\n'},
keepSeparator: false, keepSeparator: false,
keepEmptyTokens: false,
ignoreParenthesis: false, ignoreParenthesis: false,
expectError: false, expectError: false,
expectTokens: []string{ expectTokens: []string{
"a", "a",
"123.456", "b c d",
"b", "(e + f)",
"c", `"g & h"`,
"d",
"(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t)",
`"(abc d"`,
`"abc) d"`,
`"(abc) d \" "`,
`'abc "'`,
},
},
{
name: "custom separators (with preserve)",
content: ` a , 123.456, b, c d, (
test (a,b,c) " 123 "
),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
separators: []rune{',', ' ', '\t', '\n'},
keepSeparator: true,
ignoreParenthesis: false,
expectError: false,
expectTokens: []string{
"a ",
"123.456,",
"b,",
"c ",
"d,",
"(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t),",
`"(abc d",`,
`"abc) d",`,
`"(abc) d \" " `,
`'abc "'`,
}, },
}, },
{ {
name: "ignoring parenthesis", name: "ignoring parenthesis",
content: `a, b, (c,d)`, content: `a, b, (c,d)`,
separators: []rune{','}, separators: DefaultSeparators,
keepSeparator: false, keepSeparator: false,
keepEmptyTokens: false,
ignoreParenthesis: true, ignoreParenthesis: true,
expectError: false, expectError: false,
expectTokens: []string{ expectTokens: []string{
@ -225,6 +211,26 @@ func TestScanAll(t *testing.T) {
"d)", "d)",
}, },
}, },
{
name: "keep empty tokens",
content: `a, b, (c, d), ,, , e, , f`,
separators: DefaultSeparators,
keepSeparator: false,
keepEmptyTokens: true,
ignoreParenthesis: false,
expectError: false,
expectTokens: []string{
"a",
"b",
"(c, d)",
"",
"",
"",
"e",
"",
"f",
},
},
} }
for _, s := range scenarios { for _, s := range scenarios {
@ -233,6 +239,7 @@ func TestScanAll(t *testing.T) {
tk.Separators(s.separators...) tk.Separators(s.separators...)
tk.KeepSeparator(s.keepSeparator) tk.KeepSeparator(s.keepSeparator)
tk.KeepEmptyTokens(s.keepEmptyTokens)
tk.IgnoreParenthesis(s.ignoreParenthesis) tk.IgnoreParenthesis(s.ignoreParenthesis)
tokens, err := tk.ScanAll() tokens, err := tk.ScanAll()
@ -255,9 +262,42 @@ func TestScanAll(t *testing.T) {
} }
} }
if !exists { if !exists {
t.Fatalf("Unexpected token %s", tok) t.Fatalf("Unexpected token %q", tok)
} }
} }
}) })
} }
} }
// TestTrimCutset verifies the trim cutset a Tokenizer ends up with,
// both for the factory defaults and after custom separators are applied.
func TestTrimCutset(t *testing.T) {
	type testCase struct {
		name           string
		separators     []rune
		expectedCutset string
	}

	cases := []testCase{
		{
			name:           "default factory separators",
			separators:     nil,
			expectedCutset: "\t\n\v\f\r \u0085\u00a0",
		},
		{
			name:           "custom separators",
			separators:     []rune{'\t', ' ', '\r', ','},
			expectedCutset: "\n\v\f\u0085\u00a0",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			tokenizer := NewFromString("")

			// keep the factory separators when the scenario doesn't define its own
			if len(tc.separators) > 0 {
				tokenizer.Separators(tc.separators...)
			}

			if got := tokenizer.trimCutset; got != tc.expectedCutset {
				t.Fatalf("Expected cutset %q, got %q", tc.expectedCutset, got)
			}
		})
	}
}