Astrotomic · mallardduck · Oct 2, 2022 · Oct 2, 2022 · Oct 2, 2022 · Oct 2, 2022
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 /composer.lock
 /vendor/
+.phpunit.result.cache
diff --git a/composer.json b/composer.json
@@ -22,13 +22,16 @@
         "ext-mbstring": "*"
     },
     "require-dev": {
-        "pestphp/pest": "^0.3.0",
+        "pestphp/pest": "^1.21",
         "s9e/regexp-builder": "^1.4",
         "spatie/emoji": "^2.3.0",
-        "spatie/pest-plugin-snapshots": "^1.0"
+        "spatie/pest-plugin-snapshots": "^1.0",
+        "wa72/htmlpagedom": "^2.0 || ^3.0"
     },
     "suggest": {
-        "spatie/emoji": "*"
+        "ext-dom": "*",
+        "spatie/emoji": "*",
+        "wa72/htmlpagedom": "*"
     },
     "minimum-stability": "dev",
     "prefer-stable": true,
@@ -38,7 +41,10 @@
         }
     },
     "config": {
-        "sort-packages": true
+        "sort-packages": true,
+        "allow-plugins": {
+            "pestphp/pest-plugin": true
+        }
     },
     "scripts": {
         "generate": "php ./generate.php",

diff --git a/src/Exceptions/NoTextChildrenException.php b/src/Exceptions/NoTextChildrenException.php
@@ -0,0 +1,12 @@
+<?php
+
+namespace Astrotomic\Twemoji\Exceptions;
+
+use Exception;
+
+/**
+ * Signals that the searched HTML holds no element with non-blank text to transform.
+ */
+class NoTextChildrenException extends Exception
+{
+}
diff --git a/src/HtmlReplacer.php b/src/HtmlReplacer.php
@@ -0,0 +1,178 @@
+<?php
+
+namespace Astrotomic\Twemoji;
+
+use Astrotomic\Twemoji\Concerns\Configurable;
+use Astrotomic\Twemoji\Exceptions\NoTextChildrenException;
+use DOMDocument;
+use RuntimeException;
+use Wa72\HtmlPageDom\HtmlPageCrawler;
+
+/**
+ * Runs the text-bearing elements of an HTML document or fragment through EmojiText.
+ *
+ * @internal This class is experimental; its API may change until this warning is removed.
+ */
+class HtmlReplacer
+{
+    use Configurable;
+
+    /**
+     * Meta tag appended to <head> so the serialised document declares UTF-8.
+     * It is self-closing because it is loaded through DOMDocumentFragment::appendXML().
+     */
+    private const UTF8_META = '<meta http-equiv="content-type" content="text/html; charset=utf-8" />';
+
+    /**
+     * Document skeleton a fragment is wrapped in before parsing; %s receives the fragment.
+     */
+    private const FRAGMENT_TEMPLATE = <<<'HTML'
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <meta http-equiv="content-type" content="text/html; charset=utf-8">
+    </head>
+    <body id="wrapper-template">
+%s
+    </body>
+</html>
+HTML;
+
+    /**
+     * @throws RuntimeException When the `wa72/htmlpagedom` package is not installed.
+     */
+    public function __construct()
+    {
+        if (! class_exists(HtmlPageCrawler::class)) {
+            throw new RuntimeException(
+                sprintf('Cannot use %s method unless `wa72/htmlpagedom` is installed.', __METHOD__)
+            );
+        }
+    }
+
+    /**
+     * Convert the emoji in an HTML document or fragment.
+     *
+     * Input that is not a complete HTML document is handed to parseFragment(). For a document,
+     * only the <body> is transformed and <head> is made to declare a UTF-8 content type.
+     *
+     * @param  string  $html  HTML document or fragment.
+     * @return string The transformed HTML, or the input unchanged when no element holds text.
+     */
+    public function parse(string $html): string
+    {
+        $document = HtmlPageCrawler::create($html);
+
+        if (! $document->isHtmlDocument()) {
+            return $this->parseFragment($html);
+        }
+
+        try {
+            // Only the body is transformed; the head is handled separately below.
+            $this->findAndTwemojifyTextNodes($document->filter('body'));
+        } catch (NoTextChildrenException $e) {
+            return $html;
+        }
+
+        $this->ensureUtf8MetaTag($document->filter('head'));
+
+        return $document->saveHTML();
+    }
+
+    /**
+     * Convert the emoji in an HTML fragment.
+     *
+     * The fragment is wrapped in FRAGMENT_TEMPLATE for parsing and the inner HTML of the
+     * wrapping <body> is returned afterwards.
+     *
+     * @param  string  $html  HTML fragment.
+     * @return string The trimmed, transformed fragment, or the input unchanged when no element holds text.
+     */
+    public function parseFragment(string $html): string
+    {
+        $document = HtmlPageCrawler::create(sprintf(self::FRAGMENT_TEMPLATE, $html));
+        $body = $document->filter('body');
+
+        try {
+            $this->findAndTwemojifyTextNodes($body);
+        } catch (NoTextChildrenException $e) {
+            return $html;
+        }
+
+        return trim($body->getInnerHtml());
+    }
+
+    /**
+     * Pass the inner HTML of every element with a non-blank text() through EmojiText.
+     *
+     * @param  HtmlPageCrawler  $htmlContent  Element(s) to search.
+     * @return HtmlPageCrawler The elements that were rewritten.
+     *
+     * @throws NoTextChildrenException When no element matches.
+     */
+    private function findAndTwemojifyTextNodes(HtmlPageCrawler $htmlContent): HtmlPageCrawler
+    {
+        // Select the elements (not the text nodes) whose text() is non-blank after whitespace
+        // normalisation. NOTE(review): XPath 1.0 stringifies only the first text() child here,
+        // so mixed content that starts with whitespace may be skipped - confirm this is intended.
+        $textNodes = $htmlContent->filterXPath('.//*[normalize-space(text())]');
+
+        // Nothing to rewrite: let the caller hand back its input untouched.
+        if ($textNodes->count() === 0) {
+            throw new NoTextChildrenException();
+        }
+
+        $textNodes->each(function (HtmlPageCrawler $node) {
+            $twemojiContent = (new EmojiText($node->getInnerHtml()))
+                ->base($this->base)
+                ->type($this->type)
+                ->toHtml();
+            $node->makeEmpty()->setInnerHtml($twemojiContent);
+
+            return $node;
+        });
+
+        return $textNodes;
+    }
+
+    /**
+     * Make sure the document head declares a UTF-8 content type.
+     *
+     * A content-type meta tag with any other value is replaced. A document without a <head>
+     * element is left as it is, since there is nowhere to put the tag.
+     */
+    private function ensureUtf8MetaTag(HtmlPageCrawler $htmlHead): void
+    {
+        $headNode = $htmlHead->getNode(0);
+        if ($headNode === null) {
+            return;
+        }
+
+        if (! $headNode->hasChildNodes()) {
+            $this->addUtf8MetaTag($htmlHead);
+
+            return;
+        }
+
+        $contentTypeMeta = $htmlHead->children('meta[http-equiv="content-type"][content]');
+        $metaNode = $contentTypeMeta->getNode(0);
+        if ($metaNode !== null && $metaNode->getAttribute('content') === 'text/html; charset=utf-8') {
+            return;
+        }
+
+        // Add the UTF-8 tag first, then drop every previously matched content-type tag.
+        $this->addUtf8MetaTag($htmlHead);
+        $contentTypeMeta->remove();
+    }
+
+    /**
+     * Append UTF8_META to the given <head> element.
+     */
+    private function addUtf8MetaTag(HtmlPageCrawler $htmlHead): void
+    {
+        $doc = new DOMDocument();
+        $setUtf8Meta = $doc->createDocumentFragment();
+        $setUtf8Meta->appendXML(self::UTF8_META);
+        $htmlHead->append($setUtf8Meta);
+    }
+}
diff --git a/src/Twemoji.php b/src/Twemoji.php
@@ -25,7 +25,7 @@ public function __construct(array $codepoints)
 
     public static function emoji(string $emoji): self
     {
-        $chars = preg_split('//u', $emoji, null, PREG_SPLIT_NO_EMPTY);
+        $chars = preg_split('//u', $emoji, -1, PREG_SPLIT_NO_EMPTY);
 
         $codepoints = array_map(
             fn (string $code): string => dechex(mb_ord($code)),
@@ -58,6 +58,7 @@ public function url(): string
         );
     }
 
+    #[\ReturnTypeWillChange]
     public function jsonSerialize()
     {
         return $this->url();

diff --git a/src/emoji_bytes.regexp b/src/emoji_bytes.regexp
diff --git a/tests/Pest.php b/tests/Pest.php
@@ -0,0 +1,14 @@
+<?php
+
+use Astrotomic\Twemoji\HtmlReplacer;
+
+/**
+ * Test helper: run the given HTML through an HtmlReplacer configured via png().
+ *
+ * @param  string  $html  HTML document or fragment to transform.
+ * @return string Whatever HtmlReplacer::parse() returns for the input.
+ */
+function htmlReplacerPngParser(string $html): string
+{
+    return (new HtmlReplacer())->png()->parse($html);
+}
diff --git a/tests/Unit/HtmlReplacerFragmentTest.php b/tests/Unit/HtmlReplacerFragmentTest.php
@@ -0,0 +1,92 @@
+<?php
+
+use function Spatie\Snapshots\assertMatchesTextSnapshot;
+
+// Snapshot tests: each case feeds an HTML fragment to htmlReplacerPngParser().
+
+it('can convert a single emoji paragraph', function () {
+    $converted = htmlReplacerPngParser('<p>🚀</p>');
+
+    assertMatchesTextSnapshot($converted);
+});
+
+it('will not convert an emoji within HTML attributes', function () {
+    $converted = htmlReplacerPngParser('<img src="" alt="🎉"/>');
+
+    assertMatchesTextSnapshot($converted);
+});
+
+it('will not convert an emoji within SCRIPT tags', function () {
+    $converted = htmlReplacerPngParser("<script>document.innerHTML = '🤷‍♂️';</script>");
+
+    assertMatchesTextSnapshot($converted);
+});
+
+it('can convert many Emoji in an HTML comment section', function () {
+    // Nested comment boxes: emoji-only paragraphs next to plain-text headings.
+    $commentSection = <<<'HTML'
+<section class="comment-box">
+    <div class="comment-content">
+        <h2>Time for a ElePHPant RAVE!</h2>
+        <p>🐘🐘🐘🐘</p>
+        <p>🐘🐘🐘</p>
+        <p>🐘🐘🐘🐘🐘</p>
+        <p>🐘🐘</p>
+    </div>
+    <section class="sub-comments">
+        <section class="comment-box">
+            <div class="comment-content">
+                <h2>Time for a cRUSTation RAVE!</h2>
+                <p>🦀🦀🦀🦀</p>
+                <p>🦀🦀</p>
+                <p>🦀🦀🦀🦀</p>
+                <p>🦀</p>
+            </div>
+        </section>
+        <section class="comment-box">
+            <div class="comment-content">
+                <p>but what if the crabs and elephants rave together?!</p>
+            </div>
+        </section>
+    </section>
+</section>
+HTML;
+    assertMatchesTextSnapshot(htmlReplacerPngParser($commentSection));
+});
+
+it('can convert many Emoji in an HTML article', function () {
+    // Prose with emoji (including ZWJ and skin-tone sequences) mixed into the words.
+    $article = <<<'HTML'
+<article>
+    <p>Lorem 😂😂 ipsum 🕵️‍♂️dolor sit✍️ amet, consectetur adipiscing😇😇🤙 elit, sed do eiusmod🥰 tempor 😤😤🏳️‍🌈incididunt ut 👏labore 👏et👏 dolore 👏magna👏 aliqua.</p>
+    <p>Ut enim ad minim 🐵✊🏿veniam,❤️😤😫😩💦💦 quis nostrud 👿🤮exercitation ullamco 🧠👮🏿‍♀️🅱️laboris nisi ut aliquip❗️ ex ea commodo consequat.</p>
+    <p>💯Duis aute💦😂😂😂 irure dolor 👳🏻‍♂️🗿in reprehenderit 🤖👻👎in voluptate velit esse cillum dolore 🙏🙏eu fugiat🤔 nulla pariatur.</p>
+    <p>🙅‍♀️🙅‍♀️Excepteur sint occaecat🤷‍♀️🤦‍♀️ cupidatat💅 non💃 proident,👨‍👧 sunt🤗 in culpa😥😰😨 qui officia🤩🤩 deserunt mollit 🧐anim id est laborum.🤔🤔</p>
+</article>
+HTML;
+    assertMatchesTextSnapshot(htmlReplacerPngParser($article));
+});
+
+it('can handle text with an outer P tag', function () {
+    $fragment = '<p>This is some fancy-💃 Markdown/WYSIWYG text with surrounding &lt;p&gt; tags enabled. 🎉</p>';
+
+    assertMatchesTextSnapshot(htmlReplacerPngParser($fragment));
+});
+
+it('can handle text with an outer P tag and CODE tag', function () {
+    $fragment = '<p>This is some fancy-💃 Markdown/WYSIWYG text with surrounding <code>&lt;p&gt;</code> tags enabled. 🎉</p>';
+
+    assertMatchesTextSnapshot(htmlReplacerPngParser($fragment));
+});
+
+it('can handle text without outer P tag and escaped HTML', function () {
+    $fragment = 'This is some fancy-💃 Markdown/WYSIWYG text with surrounding &lt;p&gt; tags disabled. 🎉';
+
+    assertMatchesTextSnapshot(htmlReplacerPngParser($fragment));
+});
+
+it('can handle text without outer P tag but inner HTML', function () {
+    $fragment = 'This is some fancy-💃 Markdown/WYSIWYG text with surrounding <code><p></code> tags disabled. 🎉';
+
+    assertMatchesTextSnapshot(htmlReplacerPngParser($fragment));
+})->skip('Fails: Mutates the code content to close the p tag');