71 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			PHP
		
	
	
	
			
		
		
	
	
			71 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			PHP
		
	
	
	
<?php
 | 
						|
 | 
						|
namespace BookStack\Search;
 | 
						|
 | 
						|
/**
 * A custom text tokenizer which records & provides the insight needed for our search indexing.
 * This replaces a basic strtok() approach, adding the following behaviour which strtok() lacks:
 * - Tracks and provides the current/previous delimiter that we've stopped at.
 * - Returns empty tokens upon parsing a delimiter (consecutive delimiters are not collapsed).
 *
 * The text is scanned byte-by-byte, and each byte is checked against the delimiter
 * string, so delimiters are effectively matched as single bytes.
 */
class SearchTextTokenizer
{
    /** Byte offset in the text at which the next call to next() starts scanning. */
    protected int $currentIndex = 0;

    /** Length of the text in bytes. */
    protected int $length;

    /** Delimiter the last successful next() call stopped at; empty when it stopped at the end of the text. */
    protected string $currentDelimiter = '';

    /** Value held by $currentDelimiter before the last successful next() call. */
    protected string $previousDelimiter = '';

    /**
     * @param string $text       The text to split into tokens.
     * @param string $delimiters The set of characters, each of which ends a token when found.
     */
    public function __construct(
        protected string $text,
        protected string $delimiters = ' '
    ) {
        $this->length = strlen($this->text);
    }

    /**
     * Get the delimiter which ended the most recently returned token.
     * This is an empty string before any token has been read, or when
     * the last token was ended by reaching the end of the text.
     */
    public function currentDelimiter(): string
    {
        return $this->currentDelimiter;
    }

    /**
     * Get the delimiter which was current before the most recently returned token,
     * which is the delimiter that token followed.
     * This is an empty string where there was no such delimiter.
     */
    public function previousDelimiter(): string
    {
        return $this->previousDelimiter;
    }

    /**
     * Get the next token between delimiters.
     * An empty string is returned where a delimiter is found immediately at the
     * current position (text starting with a delimiter, or consecutive delimiters).
     * Returns false if there's no further tokens, in which case the
     * tracked delimiters are left unchanged.
     */
    public function next(): string|false
    {
        $token = '';

        for ($i = $this->currentIndex; $i < $this->length; $i++) {
            $char = $this->text[$i];
            if (str_contains($this->delimiters, $char)) {
                $this->previousDelimiter = $this->currentDelimiter;
                $this->currentDelimiter = $char;
                // Resume after the delimiter on the next call.
                $this->currentIndex = $i + 1;

                return $token;
            }

            $token .= $char;
        }

        // Reached the end of the text without finding a delimiter.
        // Compare against '' explicitly, since a plain truthiness check would
        // treat a trailing token of "0" as empty and wrongly discard it.
        if ($token !== '') {
            $this->currentIndex = $this->length;
            $this->previousDelimiter = $this->currentDelimiter;
            $this->currentDelimiter = '';

            return $token;
        }

        return false;
    }
}
 |