<?php

namespace BookStack\Search;

/**
 * A custom text tokenizer which records & provides insight needed for our search indexing.
 * We used to use basic strtok() but this class does the following which that lacked:
 * - Tracks and provides the current/previous delimiter that we've stopped at.
 * - Returns empty tokens upon parsing a delimiter.
 *
 * The text is scanned byte-by-byte, and the delimiter list is treated as a set
 * of single-byte characters (any one of them ends a token).
 */
class SearchTextTokenizer
{
    /** Byte offset within the text from which the next token starts. */
    protected int $currentIndex = 0;

    /** Length of the text in bytes, computed once at construction. */
    protected int $length;

    /** Delimiter that ended the most recently returned token ('' if it ran to the end of the text). */
    protected string $currentDelimiter = '';

    /** Value that $currentDelimiter held before the most recent token was returned. */
    protected string $previousDelimiter = '';

    /**
     * @param string $text       The text to split into tokens.
     * @param string $delimiters Characters which each, on their own, end a token.
     */
    public function __construct(
        protected string $text,
        protected string $delimiters = ' '
    ) {
        $this->length = strlen($this->text);
    }

    /**
     * Get the delimiter which ended the most recently returned token.
     * Empty before the first token is read, and when the latest token
     * ran up to the end of the text rather than stopping at a delimiter.
     */
    public function currentDelimiter(): string
    {
        return $this->currentDelimiter;
    }

    /**
     * Get the delimiter that was current before the most recent token was returned.
     */
    public function previousDelimiter(): string
    {
        return $this->previousDelimiter;
    }

    /**
     * Get the next token between delimiters.
     * Returns false if there's no further tokens.
     *
     * An empty string is a valid token (produced when a delimiter is found
     * immediately at the current position), so callers must compare the
     * result against false strictly.
     */
    public function next(): string|false
    {
        // Nothing left to read; state is left untouched.
        if ($this->currentIndex >= $this->length) {
            return false;
        }

        // Count the bytes from the current position up to the next delimiter (or end of text).
        $tokenLength = strcspn($this->text, $this->delimiters, $this->currentIndex);
        $token = substr($this->text, $this->currentIndex, $tokenLength);
        $delimiterIndex = $this->currentIndex + $tokenLength;

        $this->previousDelimiter = $this->currentDelimiter;

        if ($delimiterIndex < $this->length) {
            // Stopped at a delimiter: record it and resume just after it next time.
            $this->currentDelimiter = $this->text[$delimiterIndex];
            $this->currentIndex = $delimiterIndex + 1;
        } else {
            // Trailing token running to the end of the text. This is decided by
            // position rather than truthiness so that a final "0" token is not lost.
            $this->currentDelimiter = '';
            $this->currentIndex = $this->length;
        }

        return $token;
    }
}