目标:抓取 http://www.cnbaowen.net/news/list-3720-1.html 页面右侧栏目中的文章,并保存到名为 6.1 的数据库的 spider_baowen 表中。
本文使用的是 Windows 系统。
下载
1) https://github.com/owner888/phpspider
2) https://pan.baidu.com/s/10n9ZOUQBlrJzOQx0ShOmMQ 提取码:b2zc
创建数据库与相关表
-- Destination table for the crawler: one row per scraped article page.
CREATE TABLE `spider_baowen` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(200) CHARACTER SET utf8mb4 DEFAULT NULL,
  -- The crawler script keeps up to 1000 characters of article HTML, which
  -- does not fit in varchar(200); TEXT stores it without truncation.
  `content` text CHARACTER SET utf8mb4,
  -- Filled with the URL of the scraped page, so the default is NULL rather
  -- than the placeholder '0'.
  `site_url` varchar(200) CHARACTER SET utf8mb4 DEFAULT NULL COMMENT '文章来源页面URL',
  `site_id` int(5) DEFAULT NULL COMMENT '站点id',
  PRIMARY KEY (`id`) USING BTREE
  -- No AUTO_INCREMENT seed: a freshly created table starts numbering at 1.
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
PHP 代码:将下面的脚本放在 demo 目录下,并命名为 test_baowenwang.php。
<?php
// composer下载方式
// 先使用composer命令下载:
// composer require owner888/phpspider
// 引入加载器
//require './vendor/autoload.php';
// GitHub下载方式
require_once __DIR__ . '/../autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\log;
use phpspider\core\selector;
/* Do NOT delete this comment */
/* 不要删除这段注释 */
/**
 * Crawler configuration passed to the phpspider constructor.
 *
 * - scan_urls: entry page(s) of the crawl.
 * - list_url_regexes / content_url_regexes: URL patterns for list pages and
 *   article pages. Dots are escaped so that "." only matches a literal dot.
 *   NOTE(review): the patterns carry no delimiters; presumably the library
 *   wraps them itself - confirm against the phpspider documentation.
 * - fields: values extracted per article page. `site_url` and `site_id` have
 *   no selector; they are filled in by the on_extract_field hook.
 * - export / db_config: rows are written to table `spider_baowen` of database
 *   `6.1`. The credentials below are local example values; replace them with
 *   your own.
 */
$configs = [
    'name' => '保温网',
    'domains' => [
        'www.cnbaowen.net',
    ],
    'log_file' => 'data/test_baowenwang.log',
    'log_type' => 'warn,debug',
    'tasknum' => 5,
    'max_depth' => 1,
    'export' => [
        'type' => 'db',
        'table' => 'spider_baowen',
    ],
    'db_config' => [
        'host' => '127.0.0.1',
        'port' => '3306',
        'user' => 'root',
        'pass' => 'root',
        'name' => '6.1',
    ],
    'scan_urls' => [
        'http://www.cnbaowen.net/news/list-3720-1.html',
    ],
    'content_url_regexes' => [
        'http://www\.cnbaowen\.net/news/show-\d+\.html',
    ],
    'list_url_regexes' => [
        'http://www\.cnbaowen\.net/news/list-3720-\d+\.html',
    ],
    'fields' => [
        [
            'name' => 'title',
            'selector' => "//h1[@id='title']",
            'required' => true,
        ],
        [
            'name' => 'content',
            'selector' => "//div[@id='content']",
            'required' => true,
        ],
        [
            'name' => 'site_url',
        ],
        [
            'name' => 'site_id',
        ],
    ],
];
// Build the crawler from the $configs array.
$spider = new phpspider($configs);

/**
 * Startup hook: queues list pages 1 and 2 of category 3720 via add_url().
 *
 * @param phpspider $spider the crawler instance handed to the hook
 */
$spider->on_start = function ($spider) {
    foreach (range(1, 2) as $pageNo) {
        $spider->add_url("http://www.cnbaowen.net/news/list-3720-{$pageNo}.html");
    }
};
/**
 * List-page hook: collects article links from the right-hand box of a list
 * page and queues them for crawling.
 *
 * @param mixed     $page    page information supplied by the crawler (unused here)
 * @param string    $content raw HTML of the list page
 * @param phpspider $spider  the crawler instance
 * @return bool always false, telling the crawler not to discover further
 *              URLs from this page on its own
 */
$spider->on_list_page = function ($page, $content, $spider) {
    // Right-hand box. For the middle column use "//span[@class='f_r']" instead.
    $sidebar = selector::select($content, "//div[@class='box_body thumb']");
    if (is_array($sidebar)) {
        $sidebar = implode('', $sidebar);
    }

    // Nothing selected (or an unexpected type): there are no links to queue.
    if (!is_string($sidebar) || $sidebar === '') {
        return false;
    }

    // Dots are escaped so that "." only matches a literal dot in the URL.
    $regex = '#http://www\.cnbaowen\.net/news/show-\d+\.html#';
    if (preg_match_all($regex, $sidebar, $matches) > 0) {
        foreach (array_unique($matches[0]) as $url) {
            $spider->add_url($url);
        }
    }

    return false;
};
/**
 * Content-page hook: does no work and always returns false.
 * NOTE(review): presumably, as with the list-page hook, false tells the
 * crawler not to discover further URLs from article pages - confirm against
 * the phpspider documentation.
 */
$spider->on_content_page = function ($pageInfo, $html, $crawler) {
    return false;
};
/**
 * Field hook: post-processes each extracted field before it is exported.
 *
 * - content:  strips the right-floated <div>, neutralises link targets,
 *             removes images, then keeps the first 1000 characters.
 * - site_url: the URL of the page being processed.
 * - site_id:  the constant 1.
 * - any other field is returned unchanged.
 *
 * @param string $fieldname name of the field being extracted
 * @param mixed  $data      value produced by the field's selector
 * @param array  $page      page information; only $page['url'] is read here
 * @return mixed the value to store for this field
 */
$spider->on_extract_field = function ($fieldname, $data, $page) {
    if ($fieldname == 'content') {
        // Drop the right-floated block embedded in the article body.
        $html = preg_replace("/<div style=\"float:right[\s\S]*?div>/", "", $data);
        // Replace every opening <a ...> tag with a placeholder link.
        $html = preg_replace('/<a .*?href="(.*?)".*?>/is', "<a href='#'>", $html);
        // Remove all <img> tags.
        $html = preg_replace('/<img.*?>/is', "", $html);

        return mb_substr($html, 0, 1000);
    }

    if ($fieldname == 'site_url') {
        return $page['url'];
    }

    if ($fieldname == 'site_id') {
        return 1;
    }

    return $data;
};
// Start the crawl using the configuration and hooks set on $spider.
$spider->start();
打开命令行,用 cd /d 切换到 demo 目录(即 test_baowenwang.php 所在的目录),然后执行:
php -f test_baowenwang.php
运行结束后,查看数据库中的 spider_baowen 表,确认数据已写入,完毕。