From 32e386f1a55f76d827e264d9637870e9bc424dff Mon Sep 17 00:00:00 2001 From: osapon Date: Mon, 25 Nov 2024 11:58:17 +0900 Subject: [PATCH] Improved charset tag recognition accuracy. --- src/Document.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Document.php b/src/Document.php index 6db07663..65c299a1 100644 --- a/src/Document.php +++ b/src/Document.php @@ -28,11 +28,11 @@ public function __construct(Extractor $extractor) $encoding = null; $contentType = $extractor->getResponse()->getHeaderLine('content-type'); - preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $contentType, $match); + preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $contentType, $match); if (!empty($match[1])) { $encoding = trim($match[1], ','); } elseif (!empty($html)) { - preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $html, $match); + preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $html, $match); if (!empty($match[1])) { $encoding = trim($match[1], ','); }