diff --git a/src/Enum/SearchEngineEnum.php b/src/Enum/SearchEngineEnum.php index 58e2922..835d85e 100644 --- a/src/Enum/SearchEngineEnum.php +++ b/src/Enum/SearchEngineEnum.php @@ -14,8 +14,6 @@ class SearchEngineEnum const PC_SOU_GOU = 'pc-sou-gou'; -// const M_SOU_GOU = 'm-sou-gou'; - const M_SHEN_MA = 'm-shen-ma'; const M_TOU_TIAO = 'm-tou-tiao'; @@ -24,7 +22,6 @@ class SearchEngineEnum self::PC_BAI_DU => 'https://www.baidu.com/s', self::M_BAI_DU => 'https://m.baidu.com/s', self::PC_SOU_GOU => 'https://www.sogou.com/web', -// self::M_SOU_GOU => '', self::PC_360 => 'https://www.so.com/s', self::M_360 => 'https://m.so.com/s', self::M_SHEN_MA => 'https://m.sm.cn/s', diff --git a/src/MatchUrlAndGetRank.php b/src/MatchUrlAndGetRank.php index 0ec3b5e..1182a5c 100644 --- a/src/MatchUrlAndGetRank.php +++ b/src/MatchUrlAndGetRank.php @@ -106,16 +106,13 @@ public static function getPcBaiDuRank($html, $url, $page, $proxy) try { $snap_shoot = '//*[@id="content_left"]//*[@id=' . '"' . $i . '"' . ']//a[@data-click="{\'rsv_snapshot\':\'1\'}"]//@href'; $snap_shootUrl = $crawler->filterXPath($snap_shoot)->text(); - } catch (\Exception $e) { - break; - } - if (!empty($snap_shootUrl)) { - $snap_shootHtml = self::getUrl($snap_shootUrl, SearchEngineEnum::PC_BAI_DU, $proxy); - if (!empty($snap_shootHtml)) { - $query1 = '//*[@id="bd_snap_note"]/a'; - $crawler1 = new Crawler(); - $crawler1->addHtmlContent($snap_shootHtml); - try { + if (!empty($snap_shootUrl)) { + $snap_shootHtml = self::getUrl($snap_shootUrl, SearchEngineEnum::PC_BAI_DU, $proxy); + if (!empty($snap_shootHtml)) { + $query1 = '//*[@id="bd_snap_note"]/a'; + $crawler1 = new Crawler(); + $crawler1->addHtmlContent($snap_shootHtml); + $match = $crawler1->filterXPath($query1)->text(); if (!empty($match)) { $match = self::verifyUrlLastStr($match); @@ -123,10 +120,9 @@ public static function getPcBaiDuRank($html, $url, $page, $proxy) array_unshift($ranks, ($page - 1) * 10 + $i); } } - } catch (\Exception $e) { - break; } } + } catch (\Exception $e) { } $i++; } @@ -154,13 +150,13 @@ public static function getMBaiDuRank($html, $url, $page) try { $snap_shoot = '//*[@id="results"]//*[@order=' . '"' . $i . '"' . ']//@data-log'; $snap_shootUrl = $crawler->filterXPath($snap_shoot)->text(); - } catch (\Exception $e) { - break; - } - if (!empty($snap_shootUrl)) { - if (strstr($snap_shootUrl, $url)) { - array_unshift($ranks, ($page - 1) * 10 + $i); + + if (!empty($snap_shootUrl)) { + if (strstr($snap_shootUrl, $url)) { + array_unshift($ranks, ($page - 1) * 10 + $i); + } } + } catch (\Exception $e) { } $i++; } @@ -188,13 +184,14 @@ public static function getPc360Rank($html, $url, $page) try { $snap_shoot = '//*[@class="result"]/li[' . $i . ']'; $snap_shootUrl = $crawler->filterXPath($snap_shoot)->text(); - } catch (\Exception $e) { - break; - } - if (!empty($snap_shootUrl)) { - if (strstr($snap_shootUrl, $url)) { - array_unshift($ranks, ($page - 1) * 10 + $i); + + if (!empty($snap_shootUrl)) { + if (strstr($snap_shootUrl, $url)) { + array_unshift($ranks, ($page - 1) * 10 + $i); + } } + + } catch (\Exception $e) { } $i++; } @@ -208,32 +205,30 @@ public static function getPcSouGouRank($html, $url, $page) $ranks = []; $crawler = new Crawler(); $crawler->addHtmlContent($html); - var_dump($html); - exit(); - $query = '//*[@class="result"]/li'; + $query = '//*[@class="results"]/div'; $num = $crawler->filterXPath($query)->count(); $i = 1; if (!empty($num) && $num > 1) { while ($i <= $num) { try { - $snap_shoot = '//*[@class="result"]/li[' . $i . ']'; + $snap_shoot = '//*[@class="results"]/div[' . $i . ']//*[@class="fb"]//@href'; $snap_shootUrl = $crawler->filterXPath($snap_shoot)->text(); - } catch (\Exception $e) { - break; - } - if (!empty($snap_shootUrl)) { - if (strstr($snap_shootUrl, $url)) { - array_unshift($ranks, ($page - 1) * 10 + $i); + if (!empty($snap_shootUrl)) { + $snap_shootUrl = (urldecode($snap_shootUrl)); + if (strstr($snap_shootUrl, $url)) { + array_unshift($ranks, ($page - 1) * 10 + $i); + } } + } catch (\Exception $e) { } $i++; } + } return $ranks; } - /** * @param $url * @param $searchEngineType @@ -275,6 +270,12 @@ private static function getUrl($url, $searchEngineType, $proxy = '') } } + /** + * @param $url + * @param $searchEngineType + * @return mixed + * @throws InvalidArgumentException + */ public static function verifyUrl($url, $searchEngineType) { $pregUrl = "/^((ht|f)tps?):\/\/([\w\-]+(\.[\w\-]+)*\/)*[\w\-]+(\.[\w\-]+)*\/?(\?([\w\-\.,@?^=%&:\/~\+#]*)+)?/"; @@ -284,31 +285,25 @@ public static function verifyUrl($url, $searchEngineType) switch ($searchEngineType) { case SearchEngineEnum::PC_BAI_DU: $preg = "/^http(s)?:\\/\\/.+/"; - if (preg_match($preg, $url)) { - return $url; - } else { + if (!preg_match($preg, $url)) { throw new InvalidArgumentException('链接缺少http://或https://'); } break; case SearchEngineEnum::M_BAI_DU: $preg = "/^http(s)?:\\/\\/.+/"; - if (preg_match($preg, $url)) { - return $url; - } else { + if (!preg_match($preg, $url)) { throw new InvalidArgumentException('链接缺少http://或https://'); } break; case SearchEngineEnum::PC_360: - $urls = explode('://', $url); - if (!empty($urls)) { - return $urls[1]; - } else { - return $url; - } + $url = self::explodeUrl($url); + break; + case SearchEngineEnum::PC_SOU_GOU: + $url = self::explodeUrl($url); break; - } + return $url; } /** @@ -323,4 +318,17 @@ private static function verifyUrlLastStr($url) } return $url; } + + /** + * @param $url + * @return mixed + */ + private static function explodeUrl($url) + { + $urls = explode('://', $url); + if (!empty($urls)) { + return $urls[1]; + } + return $url; + } } \ No newline at end of file diff --git a/src/SearchEngineRank.php b/src/SearchEngineRank.php index b0f980e..f7e1cda 100644 --- a/src/SearchEngineRank.php +++ b/src/SearchEngineRank.php @@ -49,7 +49,6 @@ public static function getRank($searchEngineType, $keyWord, $currentPage = 1, $p $m = false; break; } - var_dump($url);exit(); if (!empty($url)) { $ql = QueryList::get($url, null, [