/**
 * Build a map of indexable terms to their occurrence counts for the given text.
 *
 * The text is split on every character in static::$delimiters and each
 * non-empty token is counted. In addition, whenever a non-empty token was
 * preceded by one of static::$softDelimiters, it is appended (together with
 * that delimiter) to a running "extended" token, so that a term such as
 * "super.duper" is counted in its joined form as well as "super" and "duper".
 *
 * @return array<string, int> {TextToken => OccurrenceCount}
 */
protected function textToTermCountMap(string $text): array
{
    $tokenMap = []; // {TextToken => OccurrenceCount}
    $softDelims = static::$softDelimiters;
    $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
    $extendedToken = '';
    $extendedLen = 0;

    $token = $tokenizer->next();

    while ($token !== false) {
        $delim = $tokenizer->previousDelimiter();

        // The explicit empty check on $delim is required: str_contains() reports
        // true for an empty needle, which would wrongly treat "no delimiter" as soft.
        if ($delim !== '' && str_contains($softDelims, $delim) && $token !== '') {
            $extendedToken .= $delim . $token;
            $extendedLen++;
        } else {
            // The soft-delimited run has ended; record it only when it was
            // built from more than one piece, then start a new run.
            if ($extendedLen > 1) {
                $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
            }
            $extendedToken = $token;
            $extendedLen = 1;
        }

        // Strict comparison instead of truthiness: the string "0" is falsy in
        // PHP but is a legitimate term which must still be counted.
        if ($token !== '') {
            $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
        }

        $token = $tokenizer->next();
    }

    // Flush a multi-piece extended token left over at the end of the text.
    if ($extendedLen > 1) {
        $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
    }

    return $tokenMap;
}
/**
 * Get the next token between delimiters.
 *
 * Scans forward from the current index until a delimiter character is hit,
 * records that delimiter (shifting the previously current one to "previous")
 * and returns the characters gathered before it. The returned token may be an
 * empty string when a delimiter is met before any other character has been
 * gathered, such as with adjacent delimiters.
 *
 * Returns false if there are no further tokens.
 */
public function next(): string|false
{
    $token = '';

    for ($i = $this->currentIndex; $i < $this->length; $i++) {
        $char = $this->text[$i];
        if (str_contains($this->delimiters, $char)) {
            $this->previousDelimiter = $this->currentDelimiter;
            $this->currentDelimiter = $char;
            $this->currentIndex = $i + 1;
            return $token;
        }

        $token .= $char;
    }

    // End of text reached without a closing delimiter. Compare against ''
    // rather than relying on truthiness, since a trailing token of "0" is
    // falsy in PHP and would otherwise be dropped as if nothing remained.
    if ($token !== '') {
        $this->currentIndex = $this->length;
        $this->previousDelimiter = $this->currentDelimiter;
        $this->currentDelimiter = '';
        return $token;
    }

    return false;
}
super.duper awesome-beans big- barry cheese.
biscuits
a-bs
']); + + $scoreByTerm = $page->searchTerms()->pluck('score', 'term'); + $expected = ['super', 'duper', 'super.duper', 'awesome-beans', 'awesome', 'beans', 'big', 'barry', 'cheese', 'biscuits', 'a-bs', 'a', 'bs']; + foreach ($expected as $term) { + $this->assertNotNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is indexed"); + } + + $nonExpected = ['big-', 'big-barry', 'cheese.', 'cheese.biscuits']; + foreach ($nonExpected as $term) { + $this->assertNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is not indexed"); + } + } }