Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Running upstream opengraph PHP library maintenance #30

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 101 additions & 31 deletions OpenGraph.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Original can be found at https://github.com/scottmac/opengraph/blob/master/OpenGraph.php

*/

class OpenGraph implements Iterator
Expand Down Expand Up @@ -50,25 +50,40 @@ class OpenGraph implements Iterator
* @return OpenGraph
*/
static public function fetch($URI) {
    // Downloads $URI with cURL (redirects followed, 15 second timeout) and
    // passes the response body to self::_parse(). Returns whatever _parse()
    // returns, or false when the transfer fails or the body is empty.

    // Cookie jar location; define COOKIE_PATH_FOR_CURL to override the default.
    $cookie_path = 'cookie.txt';
    if ( defined('COOKIE_PATH_FOR_CURL') && !empty(COOKIE_PATH_FOR_CURL) ){
        $cookie_path = COOKIE_PATH_FOR_CURL;
    }

    // Create exactly one handle so nothing is left open and only one
    // request is sent.
    $curl = curl_init($URI);
    if ($curl === false) {
        return false;
    }

    // FAILONERROR makes curl_exec() return false for HTTP status >= 400.
    curl_setopt($curl, CURLOPT_FAILONERROR, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_TIMEOUT, 15);
    // NOTE(review): TLS host and peer verification are switched off here, so
    // the fetched content cannot be trusted to come from the named host.
    // Kept as-is to preserve existing behaviour; consider enabling.
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_USERAGENT, "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)");
    // The following 2 set up lines work with sites like www.nytimes.com
    curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie_path); // you can change this path to whatever you want.
    curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie_path);  // you can change this path to whatever you want.

    $body = curl_exec($curl);
    curl_close($curl);

    // curl_exec() yields false on failure; bail out before trying to
    // re-encode a non-string.
    if (empty($body)) {
        return false;
    }

    // Re-encode the UTF-8 body as HTML entities before parsing.
    // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2.
    $response = mb_convert_encoding($body, 'HTML-ENTITIES', 'UTF-8');

    if (empty($response)) {
        return false;
    }

    return self::_parse($response);
}

static public function parse($HTML){
    // Builds an OpenGraph result from markup the caller already holds,
    // without making any HTTP request. Empty input yields false.
    if ( !$HTML ){
        return false;
    }

    // Re-encode the UTF-8 markup as HTML entities and hand it straight to
    // the shared parser.
    return self::_parse(mb_convert_encoding($HTML, 'HTML-ENTITIES', 'UTF-8'));
}

/**
Expand All @@ -83,7 +98,7 @@ static private function _parse($HTML) {

$doc = new DOMDocument();
$doc->loadHTML($HTML);

libxml_use_internal_errors($old_libxml_error);

$tags = $doc->getElementsByTagName('meta');
Expand All @@ -94,16 +109,23 @@ static private function _parse($HTML) {
$page = new self();

$nonOgDescription = null;

foreach ($tags AS $tag) {
if ($tag->hasAttribute('property') &&
strpos($tag->getAttribute('property'), 'og:') === 0) {
if ($tag->hasAttribute('property') && strpos($tag->getAttribute('property'), 'og:') === 0) {
$key = strtr(substr($tag->getAttribute('property'), 3), '-', '_');
$page->_values[$key] = $tag->getAttribute('content');

if( array_key_exists($key, $page->_values) ){
if ( !array_key_exists($key.'_additional', $page->_values) ){
$page->_values[$key.'_additional'] = array();
}
$page->_values[$key.'_additional'][] = $tag->getAttribute('content');
}else{
$page->_values[$key] = $tag->getAttribute('content');
}
}
//Added this if loop to retrieve description values from sites like the New York Times who have malformed it.
if ($tag ->hasAttribute('value') && $tag->hasAttribute('property') &&

// Also accept og: tags that carry their data in a "value" attribute instead of "content" (malformed markup seen on sites like the New York Times).
if ($tag->hasAttribute('value') && $tag->hasAttribute('property') &&
strpos($tag->getAttribute('property'), 'og:') === 0) {
$key = strtr(substr($tag->getAttribute('property'), 3), '-', '_');
$page->_values[$key] = $tag->getAttribute('value');
Expand All @@ -112,8 +134,46 @@ static private function _parse($HTML) {
if ($tag->hasAttribute('name') && $tag->getAttribute('name') === 'description') {
$nonOgDescription = $tag->getAttribute('content');
}


if ($tag->hasAttribute('property') &&
strpos($tag->getAttribute('property'), 'twitter:') === 0) {
$key = strtr($tag->getAttribute('property'), '-:', '__');
$page->_values[$key] = $tag->getAttribute('content');
}

if ($tag->hasAttribute('name') &&
strpos($tag->getAttribute('name'), 'twitter:') === 0) {
$key = strtr($tag->getAttribute('name'), '-:', '__');
if( array_key_exists($key, $page->_values) ){
if (!array_key_exists($key.'_additional', $page->_values)){
$page->_values[$key.'_additional'] = array();
}
$page->_values[$key.'_additional'][] = $tag->getAttribute('content');
} else {
$page->_values[$key] = $tag->getAttribute('content');
}
}

// NOTE: type-specific tags (e.g. "article:author" when og:type is "article") are only picked up if the og:type tag appears before them in the page; any that come earlier are skipped.
if ( array_key_exists('type', $page->_values) ){
$meta_key = $page->_values['type'].':';
if ($tag->hasAttribute('property') && strpos($tag->getAttribute('property'), $meta_key) === 0) {
$meta_key_len = strlen($meta_key);
$key = strtr(substr($tag->getAttribute('property'), $meta_key_len), '-', '_');
$key = $page->_values['type'].'_'.$key;

if( array_key_exists($key, $page->_values) ){
if ( !array_key_exists($key.'_additional', $page->_values) ){
$page->_values[$key.'_additional'] = array();
}
$page->_values[$key.'_additional'][] = $tag->getAttribute('content');
}else{
$page->_values[$key] = $tag->getAttribute('content');
}
}
}
}

//Based on modifications at https://github.com/bashofmann/opengraph/blob/master/src/OpenGraph/OpenGraph.php
if (!isset($page->_values['title'])) {
$titles = $doc->getElementsByTagName('title');
Expand All @@ -126,7 +186,7 @@ static private function _parse($HTML) {
}

//Fallback to use image_src if og:image isn't set.
if (!isset($page->values['image'])) {
if (!isset($page->_values['image'])) {
$domxpath = new DOMXPath($doc);
$elements = $domxpath->query("//link[@rel='image_src']");

Expand All @@ -136,11 +196,21 @@ static private function _parse($HTML) {
$page->_values['image'] = $domattr->value;
$page->_values['image_src'] = $domattr->value;
}
}
} else if (!empty($page->_values['twitter_image'])){
$page->_values['image'] = $page->_values['twitter_image'];
} else {
$elements = $doc->getElementsByTagName("img");
foreach ( $elements as $tag ){
if ($tag->hasAttribute('width') && ( ($tag->getAttribute('width') > 300) || ($tag->getAttribute('width') == '100%') ) ){
$page->_values['image'] = $tag->getAttribute('src');
break;
}
}
}
}

if (empty($page->_values)) { return false; }

return $page;
}

Expand All @@ -155,7 +225,7 @@ public function __get($key) {
if (array_key_exists($key, $this->_values)) {
return $this->_values[$key];
}

if ($key === 'schema') {
foreach (self::$TYPES AS $schema => $types) {
if (array_search($this->_values['type'], $types)) {
Expand Down Expand Up @@ -192,7 +262,7 @@ public function hasLocation() {
if (array_key_exists('latitude', $this->_values) && array_key_exists('longitude', $this->_values)) {
return true;
}

$address_keys = array('street_address', 'locality', 'region', 'postal_code', 'country_name');
$valid_address = true;
foreach ($address_keys AS $key) {
Expand Down