php采集html

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/zchqq/article/details/82789070
/**
 * Locate every element opened by "<{$tag}" in an HTML string.
 *
 * $tag may be a bare tag name ("div") or a tag name followed by an
 * attribute fragment ("article class=\"intro\""). Matching is
 * case-insensitive. The text up to the first space is taken as the tag name
 * and is used to find the matching "</name>", counting nested elements of
 * the same name so that the correct closing tag is chosen.
 *
 * @param string $html Markup to scan.
 * @param string $tag  Tag name, optionally followed by a space and attribute text.
 *
 * @return array List of matches in document order. Each entry is
 *               array('s' => start byte offset, 'e' => byte length), so it can
 *               be passed straight to substr($html, $m['s'], $m['e']).
 *               An empty array is returned when nothing matches.
 */
public function getTags($html, $tag){
        $matches  = [];
        // strtolower() is byte-wise, so offsets found in the lowered copy are
        // valid offsets into the caller's original $html as well.
        $tag      = strtolower((string) $tag);
        $haystack = strtolower((string) $html);

        // Same rule as before for deriving the tag name: only a space at a
        // position > 0 splits the tag from its attribute fragment.
        $spacePos = strpos($tag, " ");
        $tagName  = $spacePos ? substr($tag, 0, $spacePos) : $tag;
        if ($tagName === '') {
            return $matches;
        }

        $open     = "<{$tag}";
        $openAny  = "<{$tagName}";
        $close    = "</{$tagName}>";
        $openLen  = strlen($open);
        $closeLen = strlen($close);
        // A bare tag name must be followed by a delimiter so that "a" does not
        // match "<article"; an attribute fragment is matched as a plain prefix.
        $bareName = ($tag === $tagName);

        // Returns the position of the next $needle at or after $from, or false.
        // With $wholeName the character after the needle must end the tag name.
        $findTagStart = function ($needle, $from, $wholeName) use ($haystack) {
            $needleLen = strlen($needle);
            while (($pos = strpos($haystack, $needle, $from)) !== false) {
                if (!$wholeName) {
                    return $pos;
                }
                $after = $pos + $needleLen;
                if (!isset($haystack[$after]) || strpos(" \t\r\n/>", $haystack[$after]) !== false) {
                    return $pos;
                }
                $from = $after;
            }
            return false;
        };

        $offset = 0;
        while (($start = $findTagStart($open, $offset, $bareName)) !== false) {
            // Continue the outer scan just past this opening, so nested
            // occurrences of the requested tag are reported too.
            $offset = $start + $openLen;

            // Walk forward, tracking nesting depth, until this element closes.
            $depth   = 1;
            $cursor  = $offset;
            $closeAt = false;
            while ($depth > 0) {
                $nextClose = strpos($haystack, $close, $cursor);
                if ($nextClose === false) {
                    break; // unbalanced markup: no closing tag left
                }
                $nextOpen = $findTagStart($openAny, $cursor, true);
                if ($nextOpen !== false && $nextOpen < $nextClose) {
                    $depth++;
                    $cursor = $nextOpen + strlen($openAny);
                } else {
                    $depth--;
                    $closeAt = $nextClose;
                    $cursor  = $nextClose + $closeLen;
                }
            }

            if ($depth !== 0) {
                // This opening is never closed; skip it instead of
                // reporting a meaningless length.
                continue;
            }

            $matches[] = [
                's' => $start,
                'e' => $closeAt + $closeLen - $start,
            ];
        }

        return $matches;
    }

}

// Usage: download the page, then keep only the first matching <article> element.
// NOTE(review): getTags() is declared with `public` in this snippet, i.e. as a
// class method; call it on an instance of its class if it is not also
// available as a plain function.
$html = _request($url);
$trackContent = getTags($html, "article class=\"kxSk intro \"");
if (!isset($trackContent[0]['s'])) {
    // Nothing matched: fall back to an empty string.
    $html = '';
} else {
    // 's' is the start offset and 'e' the length, so they map directly onto substr().
    $html = substr($html, $trackContent[0]['s'], $trackContent[0]['e']);
}

/**
 * Fetch a URL with cURL and return the response body.
 *
 * No cookie handling and no CURLOPT_TIMEOUT are configured here.
 *
 * @param string     $curl   URL to request.
 * @param bool       $https  When truthy, both TLS checks (peer certificate and
 *                           host name) are switched off for the transfer.
 * @param string     $method Only 'POST' changes the request; any other value
 *                           leaves cURL's default method in place.
 * @param mixed      $data   Handed to CURLOPT_POSTFIELDS for POST requests.
 * @param array|null $header Handed to CURLOPT_HTTPHEADER when truthy.
 *
 * @return string|bool The value returned by curl_exec().
 */
function _request($curl, $https = true, $method = 'GET', $data = null, $header = null) {
    // Collect the options first; they are applied in insertion order below.
    $options = array(
        CURLOPT_URL            => $curl,  // target address
        CURLOPT_HEADER         => false,  // do not include response headers in the output
        CURLOPT_RETURNTRANSFER => true,   // return the body instead of printing it
    );

    if ($header) {
        $options[CURLOPT_HTTPHEADER] = $header;
    }

    if ($https) {
        // NOTE(review): disabling these removes protection against
        // man-in-the-middle attacks; acceptable only for trusted targets.
        $options[CURLOPT_SSL_VERIFYPEER] = false; // skip certificate verification
        $options[CURLOPT_SSL_VERIFYHOST] = false; // skip host name verification
    }

    if ('POST' == $method) {
        $options[CURLOPT_POST]       = true;
        $options[CURLOPT_POSTFIELDS] = $data;
    }

    $handle = curl_init();
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}

猜你喜欢

转载自blog.csdn.net/zchqq/article/details/82789070