bookstack/app/Search/SearchTextTokenizer.php

<?php

namespace BookStack\Search;

/**
 * A custom text tokenizer which records & provides insight needed for our search indexing.
 * We used to use basic strtok() but this class does the following which that lacked:
 * - Tracks and provides the current/previous delimiter that we've stopped at.
 * - Returns empty tokens upon parsing a delimiter.
 *
 * The text is scanned byte-by-byte (strlen() and string offsets), with each byte
 * checked against the delimiter list, so delimiters are expected to be
 * single-byte characters.
 */
class SearchTextTokenizer
{
    /** Byte offset in the text at which the next call to next() starts scanning. */
    protected int $currentIndex = 0;

    /** Length of the text, in bytes. */
    protected int $length;

    /** Delimiter that ended the most recently returned token ('' if none). */
    protected string $currentDelimiter = '';

    /** Value held by $currentDelimiter before the most recent token was returned. */
    protected string $previousDelimiter = '';

    /**
     * @param string $text       The text to split into tokens.
     * @param string $delimiters Each character in this string is treated as a delimiter.
     */
    public function __construct(
        protected string $text,
        protected string $delimiters = ' '
    ) {
        $this->length = strlen($this->text);
    }

    /**
     * Get the delimiter which ended the most recently returned token.
     * This is an empty string before any token has been read, and when the
     * most recent token was ended by the end of the text instead of a delimiter.
     */
    public function currentDelimiter(): string
    {
        return $this->currentDelimiter;
    }

    /**
     * Get the delimiter which was current before the most recent token was returned.
     * In effect, this is the delimiter sitting in front of the most recently
     * returned token, or an empty string if there was none.
     */
    public function previousDelimiter(): string
    {
        return $this->previousDelimiter;
    }

    /**
     * Get the next token between delimiters.
     * An empty string is returned when a delimiter is hit without any content
     * before it (For example, for each of multiple consecutive delimiters).
     * Returns false if there's no further tokens, in which case the
     * current/previous delimiter state is left unchanged.
     */
    public function next(): string|false
    {
        $token = '';

        for ($i = $this->currentIndex; $i < $this->length; $i++) {
            $char = $this->text[$i];
            if (str_contains($this->delimiters, $char)) {
                $this->previousDelimiter = $this->currentDelimiter;
                $this->currentDelimiter = $char;
                // Resume scanning after the delimiter on the next call.
                $this->currentIndex = $i + 1;
                return $token;
            }

            $token .= $char;
        }

        // Reached the end of the text without hitting a delimiter.
        // An explicit comparison against '' is required here since a plain
        // truthiness check would discard a trailing "0" token ("0" is falsy in PHP).
        if ($token !== '') {
            $this->currentIndex = $this->length;
            $this->previousDelimiter = $this->currentDelimiter;
            $this->currentDelimiter = '';
            return $token;
        }

        return false;
    }
}