Spider
Spider 組件可以方便用戶快速搭建分布式多協程爬蟲,用戶只需關心 product 和 consume;product 中對 DOM 的解析推薦使用 QueryList。
安裝
composer require easyswoole/spider
快速使用
以百度搜索為例,根據搜索關鍵詞爬出每次檢索結果前幾頁的特定數據
純屬教學目的,如有冒犯貴公司還請及時通知,會及時調整
Product
<?php
namespace App\Spider;
use EasySwoole\HttpClient\HttpClient;
use EasySwoole\Spider\Config\ProductConfig;
use EasySwoole\Spider\Hole\ProductAbstract;
use EasySwoole\Spider\ProductResult;
use QL\QueryList;
use EasySwoole\FastCache\Cache;
/**
 * Producer for the Baidu search example.
 *
 * Fetches the search-result page named by the current job, extracts the link
 * and title of every result for the consumer, and schedules follow-up jobs
 * (the remaining result pages of the same keyword plus the first page of the
 * next queued keyword).
 */
class ProductTest extends ProductAbstract
{
    /** Name of the FastCache queue that holds the keywords still to be searched. */
    private const SEARCH_WORDS = 'SEARCH_WORDS';

    /** Number of result pages crawled for each keyword. */
    private const PAGES_PER_WORD = 5;

    /** Step of the `pn` offset between two consecutive result pages. */
    private const RESULTS_PER_PAGE = 10;

    /**
     * Crawl one search-result page.
     *
     * @return ProductResult the extracted rows (consume data) and the next batch of job configs
     */
    public function product(): ProductResult
    {
        // Request the page the current job points at.
        $httpClient = new HttpClient($this->productConfig->getUrl());
        $httpClient->setHeader('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36');
        $body = $httpClient->get()->getBody();

        // First collect the title of every search result, keeping its <a> tag
        // so that href and text can be read from it below.
        $rules = [
            'search_result' => ['.c-container .t', 'text', 'a'],
        ];
        $searchResult = QueryList::rules($rules)->html($body)->query()->getData();

        $data = [];
        foreach ($searchResult as $result) {
            // Parse the fragment once and read both values from the same node.
            $anchor = QueryList::html($result['search_result'])->find('a');
            $data[] = [
                'href' => $anchor->attr('href'),
                'text' => $anchor->text(),
            ];
        }

        $productJobOtherInfo = $this->productConfig->getOtherInfo();

        // Next batch of jobs. Only the first page of a keyword fans out;
        // otherwise every page would schedule the same follow-ups again.
        $productJobConfigs = [];
        if ($productJobOtherInfo['page'] === 1) {
            for ($page = 2; $page <= self::PAGES_PER_WORD; $page++) {
                $productJobConfigs[] = $this->buildSearchJob((string) $productJobOtherInfo['word'], $page);
            }

            // Start the next keyword, if one is still queued. A strict check is
            // used instead of empty() so that the keyword "0" is not dropped.
            $word = Cache::getInstance()->deQueue(self::SEARCH_WORDS);
            if (is_string($word) && $word !== '') {
                $productJobConfigs[] = $this->buildSearchJob($word, 1);
            }
        }

        $result = new ProductResult();
        $result->setProductJobConfigs($productJobConfigs)->setConsumeData($data);

        return $result;
    }

    /**
     * Build the job config for one result page of one keyword.
     *
     * @param string $word search keyword (unencoded)
     * @param int    $page 1-based result page number
     *
     * @return array job config with the keys `url` and `otherInfo`
     */
    private function buildSearchJob(string $word, int $page): array
    {
        $pn = ($page - 1) * self::RESULTS_PER_PAGE;

        return [
            // The keyword is URL-encoded so spaces, '&', '#' and non-ASCII
            // characters do not corrupt the query string.
            'url' => 'https://www.baidu.com/s?wd=' . urlencode($word) . "&pn={$pn}",
            'otherInfo' => [
                'word' => $word,
                'page' => $page,
            ],
        ];
    }
}
Consume
我這裡直接存文件了,可按照需求自己定制
<?php
namespace App\Spider;
use EasySwoole\Spider\ConsumeJob;
use EasySwoole\Spider\Hole\ConsumeAbstract;
/**
 * Consumer for the Baidu search example: appends every row it receives to a
 * local text file.
 */
class ConsumeTest extends ConsumeAbstract
{
    /**
     * Write the rows of the current job to `baidu.txt`, one row per line with
     * the fields separated by tabs.
     */
    public function consume()
    {
        $lines = [];
        foreach ($this->getJobData() as $row) {
            $lines[] = implode("\t", $row) . "\n";
        }

        // FILE_APPEND keeps the output of earlier jobs in the file.
        file_put_contents('baidu.txt', implode('', $lines), FILE_APPEND);
    }
}
註冊爬蟲組件
/**
 * Configure the spider component and attach its process to the main Swoole
 * server.
 *
 * @param EventRegister $register event register passed in by the framework (not used here)
 */
public static function mainServerCreate(EventRegister $register)
{
    $spiderConfig = [
        // Required: producer class.
        'product' => ProductTest::class,
        // Required: consumer class.
        'consume' => ConsumeTest::class,
        // Queue type. The default fast-cache queue does not support
        // distributed crawling; use SpiderConfig::QUEUE_TYPE_REDIS or your
        // own queue implementation when that is needed.
        'queueType' => SpiderConfig::QUEUE_TYPE_FAST_CACHE,
        // Custom queue; placeholder text, not needed with a built-in queue.
        'queue' => '自定義隊列,如使用組件自帶則不需要',
        // Custom queue config; placeholder text, currently only needed for
        // SpiderConfig::QUEUE_TYPE_REDIS.
        'queueConfig' => '自定義隊列配置,目前只有SpiderConfig::QUEUE_TYPE_REDIS需要',
        // Maximum number of concurrent coroutines (per machine).
        'maxCurrency' => 128,
    ];

    $spiderServer = SpiderServer::getInstance()->setSpiderConfig($spiderConfig);
    $swooleServer = ServerManager::getInstance()->getSwooleServer();
    $spiderServer->attachProcess($swooleServer);
}
投遞任務
// Queue every keyword that should be searched.
$words = [
    'php',
    'java',
    'go',
];
foreach ($words as $word) {
    Cache::getInstance()->enQueue('SEARCH_WORDS', $word);
}

// Seed the crawl with the keyword at the head of the queue. The URL is built
// from the dequeued keyword (URL-encoded) so that the request and the `word`
// carried in the job info can never disagree, and nothing is submitted when
// the queue turned out to be empty.
$wd = Cache::getInstance()->deQueue('SEARCH_WORDS');
if (is_string($wd) && $wd !== '') {
    SpiderClient::getInstance()->addJob(
        'https://www.baidu.com/s?wd=' . urlencode($wd) . '&pn=0',
        [
            'page' => 1,
            'word' => $wd,
        ]
    );
}