Skip to content

Commit 08e8a6d

Browse files
authored
Performance enhancements. Rebuild the word parser and replace the whitespace checker in the match finder.
Performance enhancements. Rebuild the word parser and replace the whitespace checker in the match finder.
2 parents 6cbc63a + a16330a commit 08e8a6d

File tree

3 files changed

+148
-233
lines changed

3 files changed

+148
-233
lines changed

lib/Caxy/HtmlDiff/AbstractDiff.php

+67-129
Original file line numberDiff line numberDiff line change
@@ -398,8 +398,8 @@ protected function purifyHtml($html)
398398

399399
/**
 * Tokenizes both inputs and registers the resulting word lists.
 *
 * The old text is converted and passed to setOldWords() first, then the new
 * text is converted and passed to setNewWords().
 */
protected function splitInputsToWords()
{
    $oldWordList = $this->convertHtmlToListOfWords($this->oldText);
    $this->setOldWords($oldWordList);

    $newWordList = $this->convertHtmlToListOfWords($this->newText);
    $this->setNewWords($newWordList);
}
404404

405405
/**
@@ -421,146 +421,84 @@ protected function setNewWords(array $newWords)
421421
}
422422

423423
/**
 * Splits an HTML string into the flat list of tokens used for diffing.
 *
 * Each HTML tag becomes a single token. The text between tags is first passed
 * through normalizeWhitespaceInHtmlSentence() and then broken up into words,
 * single whitespace characters and single non-word characters.
 *
 * @param string $text HTML to tokenize.
 *
 * @return string[] Tokens in the order they appear in $text.
 */
protected function convertHtmlToListOfWords(string $text) : array
{
    $words = [];
    $sentencesAndTags = [];

    // Backslash-escape every configured special-case character so it can be placed inside a character class.
    $specialCharacters = '';

    foreach ($this->config->getSpecialCaseChars() as $char) {
        $specialCharacters .= '\\' . $char;
    }

    // This regex splits up every word by separating it at every non alpha-numerical, it allows the specialChars
    // in the middle of a word, but not at the beginning or the end of a word.
    // Split regex compiles to this (in default config case);
    // /\s|[\.\,\(\)\']|[a-zA-Z0-9\.\,\(\)'\pL]+[a-zA-Z0-9\pL]|[^\s]/mu
    // It does not depend on the sentence, so it is built once instead of once per sentence.
    $wordRegex = sprintf('/\s|[%s]|[a-zA-Z0-9%s\pL]+[a-zA-Z0-9\pL]|[^\s]/mu', $specialCharacters, $specialCharacters);

    // Normalize no-break-spaces to regular spaces
    $text = str_replace("\xc2\xa0", ' ', $text);

    // Alternate between "a tag" and "a run of text up to the next tag".
    // The "s" modifier lets "." match newlines, so a tag that is wrapped over several lines is still
    // captured as one token instead of losing its "<" and being tokenized as text.
    preg_match_all('/<.+?>|[^<]+/su', $text, $sentencesAndTags, PREG_PATTERN_ORDER);

    foreach ($sentencesAndTags[0] as $sentenceOrHtmlTag) {
        // Tags are kept verbatim as one token.
        if ($sentenceOrHtmlTag[0] === '<') {
            $words[] = $sentenceOrHtmlTag;

            continue;
        }

        $sentence = $this->normalizeWhitespaceInHtmlSentence($sentenceOrHtmlTag);

        $sentenceSplitIntoWords = [];

        preg_match_all($wordRegex, $sentence, $sentenceSplitIntoWords, PREG_PATTERN_ORDER);

        foreach ($sentenceSplitIntoWords[0] as $word) {
            $words[] = $word;
        }
    }

    return $words;
}
515480

516-
/**
517-
* @param string $val
518-
*
519-
* @return bool
520-
*/
521-
protected function isStartOfTag($val)
481+
/**
 * Collapses whitespace in a piece of text taken from between two HTML tags.
 *
 * When the configuration asks to keep new lines the text is returned untouched.
 * Otherwise every run of whitespace (spaces, tabs, line breaks) is replaced by
 * a single regular space, so leading and trailing whitespace is reduced to at
 * most one space each.
 *
 * @param string $sentence Text without HTML tags.
 *
 * @return string The text with its whitespace normalized.
 */
protected function normalizeWhitespaceInHtmlSentence(string $sentence) : string
{
    if ($this->config->isKeepNewLines() === true) {
        return $sentence;
    }

    // The "u" modifier makes PCRE treat the subject as UTF-8 instead of as single bytes.
    $normalized = preg_replace('/\s+/u', ' ', $sentence);

    // preg_replace() returns null on a PCRE error (for example malformed UTF-8);
    // fall back to the unmodified input so the declared string return type always holds.
    return $normalized ?? $sentence;
}
566504
}

0 commit comments

Comments
 (0)