@@ -398,8 +398,8 @@ protected function purifyHtml($html)
398
398
399
399
/**
 * Tokenizes the old and the new text and stores both word lists on this instance.
 *
 * Each text is run through convertHtmlToListOfWords() and the result is handed to the matching setter.
 */
protected function splitInputsToWords()
{
    $oldWords = $this->convertHtmlToListOfWords($this->oldText);
    $this->setOldWords($oldWords);

    $newWords = $this->convertHtmlToListOfWords($this->newText);
    $this->setNewWords($newWords);
}
404
404
405
405
/**
@@ -421,146 +421,84 @@ protected function setNewWords(array $newWords)
421
421
}
422
422
423
423
/**
 * Splits an HTML string into a flat list of tokens: whole tags, whitespace characters and words.
 *
 * Everything matched as "<...>" is kept as a single token. Text between tags is passed through
 * normalizeWhitespaceInHtmlSentence() and then split into single whitespace characters, words and
 * single non-whitespace characters. The configured special-case characters may appear inside a word,
 * but never as its first or last character; in those positions they become tokens of their own.
 *
 * @param string $text
 *
 * @return string[]
 */
protected function convertHtmlToListOfWords(string $text) : array
{
    $words = [];

    // Escape the special-case characters for use inside a regex character class. preg_quote() escapes
    // only real metacharacters, so an alphanumeric entry cannot turn into an escape sequence such as
    // "\d", and entries longer than one character are escaped in full.
    $specialCharacters = '';

    foreach ($this->config->getSpecialCaseChars() as $char) {
        $specialCharacters .= preg_quote((string) $char, '/');
    }

    // PCRE reads "[]|[" as one class starting with "]", so the stand-alone special-character branch is
    // only emitted when there is at least one character to put in it.
    $specialCharacterBranch = $specialCharacters === '' ? '' : '[' . $specialCharacters . ']|';

    // This regex splits up every word by separating it at every non alpha-numerical, it allows the specialChars
    // in the middle of a word, but not at the beginning or the end of a word.
    // Split regex compiles to this (in default config case);
    // /\s|[\.,\(\)']|[a-zA-Z0-9\.,\(\)'\pL]+[a-zA-Z0-9\pL]|[^\s]/mu
    // It does not depend on the fragment being split, so it is built once, outside the loop.
    $regex = sprintf(
        '/\s|%s[a-zA-Z0-9%s\pL]+[a-zA-Z0-9\pL]|[^\s]/mu',
        $specialCharacterBranch,
        $specialCharacters
    );

    // Normalize no-break-spaces to regular spaces
    $text = str_replace("\xc2\xa0", ' ', $text);

    // NOTE(review): "." does not match a newline here, so a "<" whose closing ">" is on a later line
    // is matched by neither alternative and is skipped - confirm whether the "s" modifier is wanted.
    $sentencesAndTags = [];
    preg_match_all('/<.+?>|[^<]+/mu', $text, $sentencesAndTags, PREG_PATTERN_ORDER);

    // Index 0 holds the full matches; fall back to an empty list if it is not set.
    foreach (($sentencesAndTags[0] ?? []) as $sentenceOrHtmlTag) {
        if ($sentenceOrHtmlTag === '') {
            continue;
        }

        // Tags are kept verbatim as one token.
        if ($sentenceOrHtmlTag[0] === '<') {
            $words[] = $sentenceOrHtmlTag;

            continue;
        }

        $sentenceOrHtmlTag = $this->normalizeWhitespaceInHtmlSentence($sentenceOrHtmlTag);

        $sentenceSplitIntoWords = [];

        preg_match_all(
            $regex,
            $sentenceOrHtmlTag . ' ', // Inject a space at the end to make sure the last word is found by having a space behind it.
            $sentenceSplitIntoWords,
            PREG_PATTERN_ORDER
        );

        $sentenceWords = $sentenceSplitIntoWords[0] ?? [];

        // Remove the last space, since that was added by us for the regex matcher
        array_pop($sentenceWords);

        foreach ($sentenceWords as $word) {
            $words[] = $word;
        }
    }

    return $words;
}
515
480
516
- /**
517
- * @param string $val
518
- *
519
- * @return bool
520
- */
521
- protected function isStartOfTag ($ val )
481
/**
 * Collapses whitespace in a text fragment, unless the configuration asks to keep new lines.
 *
 * When new lines are kept the fragment is returned untouched. Otherwise every run of two or more
 * whitespace characters and every run of carriage returns or line feeds is replaced by one space.
 * Leading whitespace is then reduced to a single space, and trailing whitespace as well when the
 * fragment is longer than one character.
 *
 * @param string $sentence
 *
 * @return string
 */
protected function normalizeWhitespaceInHtmlSentence(string $sentence) : string
{
    if ($this->config->isKeepNewLines() === true) {
        return $sentence;
    }

    $collapsed = preg_replace('/\s\s+|\r+|\n+|\r\n+/', ' ', $sentence);

    // Both boundary characters are read before either end is rewritten.
    $length    = $this->stringUtil->strlen($collapsed);
    $firstChar = $this->stringUtil->substr($collapsed, 0, 1);
    $lastChar  = $this->stringUtil->substr($collapsed, $length - 1, 1);

    $boundaryWhitespace = [' ', "\r", "\n"];

    if (in_array($firstChar, $boundaryWhitespace, true)) {
        $collapsed = ' ' . ltrim($collapsed);
    }

    // A one-character fragment was already handled by the leading check above.
    if ($length > 1 && in_array($lastChar, $boundaryWhitespace, true)) {
        $collapsed = rtrim($collapsed) . ' ';
    }

    return $collapsed;
}
566
504
}
0 commit comments