-
-
Notifications
You must be signed in to change notification settings - Fork 13
Add HTML parsing features #11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
67cd354
8c15b01
6a8cb4d
b8af7e5
b8bedbf
bb84284
eb121c0
92d9136
4538750
cd6e190
60c987f
f7616c0
e818b5c
b1f83c7
29f7d0a
eeb5f0a
fcdd93d
2d77cdc
e0f2540
bc61a6a
b3a57be
62fdc48
3dd8fb5
b2bc8bb
2e3278d
55badec
c446d6f
1b5b3e0
2ee59bf
891c25f
2b77947
d5b6869
894b79f
590ac97
4468c8e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
/composer.lock | ||
/vendor/ | ||
.phpunit.result.cache |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
<?php | ||
|
||
namespace Astrotomic\Twemoji\Exceptions; | ||
|
||
use Exception; | ||
|
||
/**
 * Marker exception raised when the HTML being processed contains no elements
 * with text content, so there is nothing to transform.
 *
 * It carries no extra state; it only needs to be distinguishable from other
 * exceptions so that callers can catch it specifically.
 */
class NoTextChildrenException extends Exception | ||
{ | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
<?php | ||
|
||
namespace Astrotomic\Twemoji; | ||
|
||
use Astrotomic\Twemoji\Concerns\Configurable; | ||
use Astrotomic\Twemoji\Exceptions\NoTextChildrenException; | ||
use DOMDocument; | ||
use RuntimeException; | ||
use Wa72\HtmlPageDom\HtmlPageCrawler; | ||
|
||
/** | ||
* @internal This class is marked as Internal as it is considered Experimental. Code subject to change until warning removed. | ||
*/ | ||
class HtmlReplacer | ||
{ | ||
use Configurable; | ||
|
||
private const UTF8_META = '<meta http-equiv="content-type" content="text/html; charset=utf-8" />'; | ||
|
||
private const FRAGMENT_TEMPLATE = <<<'HTML' | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<meta http-equiv="content-type" content="text/html; charset=utf-8"> | ||
</head> | ||
<body id="wrapper-template"> | ||
%s | ||
</body> | ||
</html> | ||
HTML; | ||
|
||
public function __construct() | ||
{ | ||
if (! class_exists(HtmlPageCrawler::class)) { | ||
throw new RuntimeException( | ||
sprintf('Cannot use %s method unless `wa72/htmlpagedom` is installed.', __METHOD__) | ||
); | ||
} | ||
} | ||
|
||
public function parse(string $html): string | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we need to support full HTML docs and HTML fragments, then this method should:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that in PHP partial HTML is more common than a full document, unless you are implementing it as some kind of middleware to parse the whole HTML response. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Addressed this by using the more general HTML parser, then adding a step where we check if the input HTML is a Page/Doc and selecting the |
||
{ | ||
// Parse the HTML page or fragment... | ||
$parsedHtmlRoot = HtmlPageCrawler::create($html); | ||
|
||
if ($parsedHtmlRoot->isHtmlDocument()) { | ||
// We will only transform the body... | ||
$parsedHtml = $parsedHtmlRoot->filter('body'); | ||
} else { | ||
return $this->parseFragment($html); | ||
} | ||
|
||
try { | ||
$this->findAndTwmojifyTextNodes($parsedHtml); | ||
} catch (NoTextChildrenException $e) { | ||
return $html; | ||
} | ||
|
||
// Find the page head and check if meta header should be added | ||
$htmlHead = $parsedHtmlRoot->filter('head'); | ||
$addHeader = false; | ||
if ($htmlHead->getNode(0)->hasChildNodes()) { | ||
$contentTypeMeta = $htmlHead->children('meta[http-equiv="content-type"][content]'); | ||
$metaNode = $contentTypeMeta->getNode(0); | ||
if ( | ||
$metaNode === null || | ||
iterator_to_array($metaNode->attributes)['content']->textContent !== 'text/html; charset=utf-8' | ||
) { | ||
$this->addUtf8MetaTag($htmlHead); | ||
$contentTypeMeta->remove(); | ||
} | ||
} else { | ||
$this->addUtf8MetaTag($htmlHead); | ||
} | ||
|
||
return $parsedHtmlRoot->saveHTML(); | ||
} | ||
|
||
public function parseFragment(string $html): string | ||
{ | ||
$wrappedFragment = sprintf(static::FRAGMENT_TEMPLATE, $html); | ||
|
||
$parsedHtmlRoot = HtmlPageCrawler::create($wrappedFragment); | ||
$parsedHtml = $parsedHtmlRoot->filter('body'); | ||
|
||
try { | ||
$this->findAndTwmojifyTextNodes($parsedHtml); | ||
} catch (NoTextChildrenException $e) { | ||
return $html; | ||
} | ||
|
||
return trim($parsedHtmlRoot->filter('body')->getInnerHtml()); | ||
} | ||
|
||
/** | ||
* @throws NoTextChildrenException | ||
*/ | ||
private function findAndTwmojifyTextNodes(HtmlPageCrawler $htmlContent): HtmlPageCrawler | ||
{ | ||
// Use xpath to filter only the "TextNodes" within every "Element" | ||
$textNodes = $htmlContent->filterXPath('.//*[normalize-space(text())]'); | ||
|
||
// If the filtered DOM fragment doesn't have TextNode children, return the input HTML. | ||
if ($textNodes->count() === 0) { | ||
throw new NoTextChildrenException(); | ||
} | ||
|
||
$textNodes->each(function (HtmlPageCrawler $node) { | ||
$twemojiContent = (new EmojiText($node->getInnerHtml())) | ||
->base($this->base) | ||
->type($this->type) | ||
->toHtml(); | ||
$node->makeEmpty()->setInnerHtml($twemojiContent); | ||
|
||
return $node; | ||
}); | ||
|
||
return $textNodes; | ||
} | ||
|
||
private function addUtf8MetaTag($htmlHead): void | ||
{ | ||
$doc = new DOMDocument(); | ||
$setUtf8Meta = $doc->createDocumentFragment(); | ||
$setUtf8Meta->appendXML(static::UTF8_META); | ||
$htmlHead->append($setUtf8Meta); | ||
} | ||
} |
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
<?php | ||
|
||
use Astrotomic\Twemoji\HtmlReplacer; | ||
|
||
/**
 * Test helper: run the given HTML through a fresh HtmlReplacer after calling png() on it.
 */
function htmlReplacerPngParser(string $html): string
{
    return (new HtmlReplacer())
        ->png()
        ->parse($html);
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
<?php | ||
|
||
use function Spatie\Snapshots\assertMatchesTextSnapshot; | ||
|
||
// A lone emoji inside a paragraph.
it('can convert a single emoji paragraph', function () {
    $html = '<p>🚀</p>';

    assertMatchesTextSnapshot(htmlReplacerPngParser($html));
});

// An emoji that only appears inside an attribute value.
it('will not convert an emoji within HTML attributes', function () {
    $html = '<img src="" alt="🎉"/>';

    assertMatchesTextSnapshot(htmlReplacerPngParser($html));
});
|
||
it('will not convert an emoji within SCRIPT tags', function () { | ||
assertMatchesTextSnapshot(htmlReplacerPngParser("<script>document.innerHTML = '🤷♂️';</script>")); | ||
}); | ||
|
||
it('can convert many Emoji in an HTML comment section', function () { | ||
$commentsHtml = <<<'HTML' | ||
<section class="comment-box"> | ||
<div class="comment-content"> | ||
<h2>Time for a ElePHPant RAVE!</h2> | ||
<p>🐘🐘🐘🐘</p> | ||
<p>🐘🐘🐘</p> | ||
<p>🐘🐘🐘🐘🐘</p> | ||
<p>🐘🐘</p> | ||
</div> | ||
<section class="sub-comments"> | ||
<section class="comment-box"> | ||
<div class="comment-content"> | ||
<h2>Time for a cRUSTation RAVE!</h2> | ||
<p>🦀🦀🦀🦀</p> | ||
<p>🦀🦀</p> | ||
<p>🦀🦀🦀🦀</p> | ||
<p>🦀</p> | ||
</div> | ||
</section> | ||
<section class="comment-box"> | ||
<div class="comment-content"> | ||
<p>but what if the crabs and elephants rave together?!</p> | ||
</div> | ||
</section> | ||
</section> | ||
</section> | ||
HTML; | ||
assertMatchesTextSnapshot(htmlReplacerPngParser($commentsHtml)); | ||
}); | ||
|
||
it('can convert many Emoji in an HTML article', function () { | ||
$commentsHtml = <<<'HTML' | ||
<article> | ||
<p>Lorem 😂😂 ipsum 🕵️♂️dolor sit✍️ amet, consectetur adipiscing😇😇🤙 elit, sed do eiusmod🥰 tempor 😤😤🏳️🌈incididunt ut 👏labore 👏et👏 dolore 👏magna👏 aliqua.</p> | ||
<p>Ut enim ad minim 🐵✊🏿veniam,❤️😤😫😩💦💦 quis nostrud 👿🤮exercitation ullamco 🧠👮🏿♀️🅱️laboris nisi ut aliquip❗️ ex ea commodo consequat.</p> | ||
<p>💯Duis aute💦😂😂😂 irure dolor 👳🏻♂️🗿in reprehenderit 🤖👻👎in voluptate velit esse cillum dolore 🙏🙏eu fugiat🤔 nulla pariatur.</p> | ||
<p>🙅♀️🙅♀️Excepteur sint occaecat🤷♀️🤦♀️ cupidatat💅 non💃 proident,👨👧 sunt🤗 in culpa😥😰😨 qui officia🤩🤩 deserunt mollit 🧐anim id est laborum.🤔🤔</p> | ||
</article> | ||
HTML; | ||
assertMatchesTextSnapshot(htmlReplacerPngParser($commentsHtml)); | ||
}); | ||
|
||
// Paragraph-wrapped text containing an escaped <p> entity.
it('can handle text with an outer P tag', function () {
    assertMatchesTextSnapshot(htmlReplacerPngParser(
        '<p>This is some fancy-💃 Markdown/WYSIWYG text with surrounding &lt;p&gt; tags enabled. 🎉</p>'
    ));
});

// Paragraph-wrapped text where the escaped <p> entity sits inside a <code> element.
it('can handle text with an outer P tag and CODE tag', function () {
    assertMatchesTextSnapshot(htmlReplacerPngParser(
        '<p>This is some fancy-💃 Markdown/WYSIWYG text with surrounding <code>&lt;p&gt;</code> tags enabled. 🎉</p>'
    ));
});

// Bare text (no wrapping element) containing an escaped <p> entity.
it('can handle text without outer P tag and escaped HTML', function () {
    assertMatchesTextSnapshot(htmlReplacerPngParser(
        'This is some fancy-💃 Markdown/WYSIWYG text with surrounding &lt;p&gt; tags disabled. 🎉'
    ));
});

// Bare text with an inner <code> element; currently skipped (see skip reason).
it('can handle text without outer P tag but inner HTML', function () {
    assertMatchesTextSnapshot(htmlReplacerPngParser(
        'This is some fancy-💃 Markdown/WYSIWYG text with surrounding <code>&lt;p&gt;</code> tags disabled. 🎉'
    ));
})->skip('Fails: Mutates the code content to close the p tag');
Uh oh!
There was an error while loading. Please reload this page.