diff --git a/OpenGraph.php b/OpenGraph.php index af2e7b6..b7072c7 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -13,9 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - + Original can be found at https://github.com/scottmac/opengraph/blob/master/OpenGraph.php - + */ class OpenGraph implements Iterator @@ -50,25 +50,40 @@ class OpenGraph implements Iterator * @return OpenGraph */ static public function fetch($URI) { - $curl = curl_init($URI); + $cookie_path = 'cookie.txt'; + if ( defined('COOKIE_PATH_FOR_CURL') && !empty(COOKIE_PATH_FOR_CURL) ){ + $cookie_path = COOKIE_PATH_FOR_CURL; + } + $curl = curl_init($URI); - curl_setopt($curl, CURLOPT_FAILONERROR, true); - curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); - curl_setopt($curl, CURLOPT_TIMEOUT, 15); - curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false); - curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false); - curl_setopt($curl, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']); + curl_setopt($curl, CURLOPT_FAILONERROR, true); + curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curl, CURLOPT_TIMEOUT, 15); + curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false); + curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($curl, CURLOPT_USERAGENT, "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"); + //The following 2 set up lines work with sites like www.nytimes.com + curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie_path); //you can change this path to whetever you want. + curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie_path); //you can change this path to whetever you want. - $response = curl_exec($curl); + $response = mb_convert_encoding(curl_exec($curl), 'HTML-ENTITIES', 'UTF-8'); - curl_close($curl); + curl_close($curl); - if (!empty($response)) { - return self::_parse($response); - } else { - return false; - } + if (!empty($response)) { + return self::_parse($response); + } else { + return false; + } + } + + static public function parse($HTML){ + if ( empty( $HTML ) ){ + return false; + } + $response = mb_convert_encoding($HTML, 'HTML-ENTITIES', 'UTF-8'); + return self::_parse($response); } /** @@ -83,7 +98,7 @@ static private function _parse($HTML) { $doc = new DOMDocument(); $doc->loadHTML($HTML); - + libxml_use_internal_errors($old_libxml_error); $tags = $doc->getElementsByTagName('meta'); @@ -94,16 +109,23 @@ static private function _parse($HTML) { $page = new self(); $nonOgDescription = null; - + foreach ($tags AS $tag) { - if ($tag->hasAttribute('property') && - strpos($tag->getAttribute('property'), 'og:') === 0) { + if ($tag->hasAttribute('property') && strpos($tag->getAttribute('property'), 'og:') === 0) { $key = strtr(substr($tag->getAttribute('property'), 3), '-', '_'); - $page->_values[$key] = $tag->getAttribute('content'); + + if( array_key_exists($key, $page->_values) ){ + if ( !array_key_exists($key.'_additional', $page->_values) ){ + $page->_values[$key.'_additional'] = array(); + } + $page->_values[$key.'_additional'][] = $tag->getAttribute('content'); + }else{ + $page->_values[$key] = $tag->getAttribute('content'); + } } - - //Added this if loop to retrieve description values from sites like the New York Times who have malformed it. - if ($tag ->hasAttribute('value') && $tag->hasAttribute('property') && + + //Added this if loop to retrieve description values from sites like the New York Times who have malformed it. + if ($tag->hasAttribute('value') && $tag->hasAttribute('property') && strpos($tag->getAttribute('property'), 'og:') === 0) { $key = strtr(substr($tag->getAttribute('property'), 3), '-', '_'); $page->_values[$key] = $tag->getAttribute('value'); @@ -112,8 +134,46 @@ static private function _parse($HTML) { if ($tag->hasAttribute('name') && $tag->getAttribute('name') === 'description') { $nonOgDescription = $tag->getAttribute('content'); } - + + if ($tag->hasAttribute('property') && + strpos($tag->getAttribute('property'), 'twitter:') === 0) { + $key = strtr($tag->getAttribute('property'), '-:', '__'); + $page->_values[$key] = $tag->getAttribute('content'); + } + + if ($tag->hasAttribute('name') && + strpos($tag->getAttribute('name'), 'twitter:') === 0) { + $key = strtr($tag->getAttribute('name'), '-:', '__'); + if( array_key_exists($key, $page->_values) ){ + if (!array_key_exists($key.'_additional', $page->_values)){ + $page->_values[$key.'_additional'] = array(); + } + $page->_values[$key.'_additional'][] = $tag->getAttribute('content'); + } else { + $page->_values[$key] = $tag->getAttribute('content'); + } + } + + // Notably this will not work if you declare type after you declare type values on a page. + if ( array_key_exists('type', $page->_values) ){ + $meta_key = $page->_values['type'].':'; + if ($tag->hasAttribute('property') && strpos($tag->getAttribute('property'), $meta_key) === 0) { + $meta_key_len = strlen($meta_key); + $key = strtr(substr($tag->getAttribute('property'), $meta_key_len), '-', '_'); + $key = $page->_values['type'].'_'.$key; + + if( array_key_exists($key, $page->_values) ){ + if ( !array_key_exists($key.'_additional', $page->_values) ){ + $page->_values[$key.'_additional'] = array(); + } + $page->_values[$key.'_additional'][] = $tag->getAttribute('content'); + }else{ + $page->_values[$key] = $tag->getAttribute('content'); + } + } + } } + //Based on modifications at https://github.com/bashofmann/opengraph/blob/master/src/OpenGraph/OpenGraph.php if (!isset($page->_values['title'])) { $titles = $doc->getElementsByTagName('title'); @@ -126,7 +186,7 @@ static private function _parse($HTML) { } //Fallback to use image_src if ogp::image isn't set. - if (!isset($page->values['image'])) { + if (!isset($page->_values['image'])) { $domxpath = new DOMXPath($doc); $elements = $domxpath->query("//link[@rel='image_src']"); @@ -136,11 +196,21 @@ static private function _parse($HTML) { $page->_values['image'] = $domattr->value; $page->_values['image_src'] = $domattr->value; } - } + } else if (!empty($page->_values['twitter_image'])){ + $page->_values['image'] = $page->_values['twitter_image']; + } else { + $elements = $doc->getElementsByTagName("img"); + foreach ( $elements as $tag ){ + if ($tag->hasAttribute('width') && ( ($tag->getAttribute('width') > 300) || ($tag->getAttribute('width') == '100%') ) ){ + $page->_values['image'] = $tag->getAttribute('src'); + break; + } + } + } } if (empty($page->_values)) { return false; } - + return $page; } @@ -155,7 +225,7 @@ public function __get($key) { if (array_key_exists($key, $this->_values)) { return $this->_values[$key]; } - + if ($key === 'schema') { foreach (self::$TYPES AS $schema => $types) { if (array_search($this->_values['type'], $types)) { @@ -192,7 +262,7 @@ public function hasLocation() { if (array_key_exists('latitude', $this->_values) && array_key_exists('longitude', $this->_values)) { return true; } - + $address_keys = array('street_address', 'locality', 'region', 'postal_code', 'country_name'); $valid_address = true; foreach ($address_keys AS $key) {