Skip to content

Commit

Permalink
add sougou
Browse files Browse the repository at this point in the history
  • Loading branch information
suppermoment committed Jun 5, 2020
1 parent 5ba5205 commit e5a8961
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 52 deletions.
3 changes: 0 additions & 3 deletions src/Enum/SearchEngineEnum.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ class SearchEngineEnum

const PC_SOU_GOU = 'pc-sou-gou';

// const M_SOU_GOU = 'm-sou-gou';

const M_SHEN_MA = 'm-shen-ma';

const M_TOU_TIAO = 'm-tou-tiao';
Expand All @@ -24,7 +22,6 @@ class SearchEngineEnum
self::PC_BAI_DU => 'https://www.baidu.com/s',
self::M_BAI_DU => 'https://m.baidu.com/s',
self::PC_SOU_GOU => 'https://www.sogou.com/web',
// self::M_SOU_GOU => '',
self::PC_360 => 'https://www.so.com/s',
self::M_360 => 'https://m.so.com/s',
self::M_SHEN_MA => 'https://m.sm.cn/s',
Expand Down
104 changes: 56 additions & 48 deletions src/MatchUrlAndGetRank.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,27 +106,23 @@ public static function getPcBaiDuRank($html, $url, $page, $proxy)
try {
$snap_shoot = '//*[@id="content_left"]//*[@id=' . '"' . $i . '"' . ']//a[@data-click="{\'rsv_snapshot\':\'1\'}"]//@href';
$snap_shootUrl = $crawler->filterXPath($snap_shoot)->text();
} catch (\Exception $e) {
break;
}
if (!empty($snap_shootUrl)) {
$snap_shootHtml = self::getUrl($snap_shootUrl, SearchEngineEnum::PC_BAI_DU, $proxy);
if (!empty($snap_shootHtml)) {
$query1 = '//*[@id="bd_snap_note"]/a';
$crawler1 = new Crawler();
$crawler1->addHtmlContent($snap_shootHtml);
try {
if (!empty($snap_shootUrl)) {
$snap_shootHtml = self::getUrl($snap_shootUrl, SearchEngineEnum::PC_BAI_DU, $proxy);
if (!empty($snap_shootHtml)) {
$query1 = '//*[@id="bd_snap_note"]/a';
$crawler1 = new Crawler();
$crawler1->addHtmlContent($snap_shootHtml);

$match = $crawler1->filterXPath($query1)->text();
if (!empty($match)) {
$match = self::verifyUrlLastStr($match);
if (strstr($match, $url)) {
array_unshift($ranks, ($page - 1) * 10 + $i);
}
}
} catch (\Exception $e) {
break;
}
}
} catch (\Exception $e) {
}
$i++;
}
Expand Down Expand Up @@ -154,13 +150,13 @@ public static function getMBaiDuRank($html, $url, $page)
try {
$snap_shoot = '//*[@id="results"]//*[@order=' . '"' . $i . '"' . ']//@data-log';
$snap_shootUrl = $crawler->filterXPath($snap_shoot)->text();
} catch (\Exception $e) {
break;
}
if (!empty($snap_shootUrl)) {
if (strstr($snap_shootUrl, $url)) {
array_unshift($ranks, ($page - 1) * 10 + $i);

if (!empty($snap_shootUrl)) {
if (strstr($snap_shootUrl, $url)) {
array_unshift($ranks, ($page - 1) * 10 + $i);
}
}
} catch (\Exception $e) {
}
$i++;
}
Expand Down Expand Up @@ -188,13 +184,14 @@ public static function getPc360Rank($html, $url, $page)
try {
$snap_shoot = '//*[@class="result"]/li[' . $i . ']';
$snap_shootUrl = $crawler->filterXPath($snap_shoot)->text();
} catch (\Exception $e) {
break;
}
if (!empty($snap_shootUrl)) {
if (strstr($snap_shootUrl, $url)) {
array_unshift($ranks, ($page - 1) * 10 + $i);

if (!empty($snap_shootUrl)) {
if (strstr($snap_shootUrl, $url)) {
array_unshift($ranks, ($page - 1) * 10 + $i);
}
}

} catch (\Exception $e) {
}
$i++;
}
Expand All @@ -208,32 +205,30 @@ public static function getPcSouGouRank($html, $url, $page)
$ranks = [];
$crawler = new Crawler();
$crawler->addHtmlContent($html);
var_dump($html);
exit();
$query = '//*[@class="result"]/li';
$query = '//*[@class="results"]/div';
$num = $crawler->filterXPath($query)->count();
$i = 1;
if (!empty($num) && $num > 1) {
while ($i <= $num) {
try {
$snap_shoot = '//*[@class="result"]/li[' . $i . ']';
$snap_shoot = '//*[@class="results"]/div[' . $i . ']//*[@class="fb"]//@href';
$snap_shootUrl = $crawler->filterXPath($snap_shoot)->text();
} catch (\Exception $e) {
break;
}
if (!empty($snap_shootUrl)) {
if (strstr($snap_shootUrl, $url)) {
array_unshift($ranks, ($page - 1) * 10 + $i);
if (!empty($snap_shootUrl)) {
$snap_shootUrl = (urldecode($snap_shootUrl));
if (strstr($snap_shootUrl, $url)) {
array_unshift($ranks, ($page - 1) * 10 + $i);
}
}
} catch (\Exception $e) {
}
$i++;
}

}

return $ranks;
}


/**
* @param $url
* @param $searchEngineType
Expand Down Expand Up @@ -275,6 +270,12 @@ private static function getUrl($url, $searchEngineType, $proxy = '')
}
}

/**
* @param $url
* @param $searchEngineType
* @return mixed
* @throws InvalidArgumentException
*/
public static function verifyUrl($url, $searchEngineType)
{
$pregUrl = "/^((ht|f)tps?):\/\/([\w\-]+(\.[\w\-]+)*\/)*[\w\-]+(\.[\w\-]+)*\/?(\?([\w\-\.,@?^=%&:\/~\+#]*)+)?/";
Expand All @@ -284,31 +285,25 @@ public static function verifyUrl($url, $searchEngineType)
switch ($searchEngineType) {
case SearchEngineEnum::PC_BAI_DU:
$preg = "/^http(s)?:\\/\\/.+/";
if (preg_match($preg, $url)) {
return $url;
} else {
if (!preg_match($preg, $url)) {
throw new InvalidArgumentException('链接缺少http://或https://');
}
break;
case SearchEngineEnum::M_BAI_DU:
$preg = "/^http(s)?:\\/\\/.+/";
if (preg_match($preg, $url)) {
return $url;
} else {
if (!preg_match($preg, $url)) {
throw new InvalidArgumentException('链接缺少http://或https://');
}
break;
case SearchEngineEnum::PC_360:
$urls = explode('://', $url);
if (!empty($urls)) {
return $urls[1];
} else {
return $url;
}
$url = self::explodeUrl($url);
break;
case SearchEngineEnum::PC_SOU_GOU:
$url = self::explodeUrl($url);
break;

}

return $url;
}

/**
Expand All @@ -323,4 +318,17 @@ private static function verifyUrlLastStr($url)
}
return $url;
}

/**
 * Strip the scheme prefix (everything up to and including the first "://")
 * from a URL.
 *
 * The input is split on the first "://" only, so a later "://" inside the
 * path or query string (e.g. a redirect parameter) is kept intact.
 * If the input contains no "://" at all, it is returned unchanged.
 *
 * @param string $url URL with or without a scheme.
 * @return string The URL without its scheme prefix, or the original value
 *                when no scheme separator is present.
 */
private static function explodeUrl($url)
{
    // Limit of 2: only the first "://" separates the scheme from the rest.
    $parts = explode('://', $url, 2);

    // explode() always returns at least one element, so an emptiness check
    // on the array can never detect a missing scheme; test the second
    // element explicitly to avoid reading an undefined offset.
    if (isset($parts[1])) {
        return $parts[1];
    }

    return $url;
}
}
1 change: 0 additions & 1 deletion src/SearchEngineRank.php
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ public static function getRank($searchEngineType, $keyWord, $currentPage = 1, $p
$m = false;
break;
}
var_dump($url);exit();
if (!empty($url)) {
$ql = QueryList::get($url, null,
[
Expand Down

0 comments on commit e5a8961

Please sign in to comment.