<?php
// Note: each scraped web page contains exactly one article.
// (This note must stay inside the PHP tag: as literal text above the opening
// tag it is sent to the client as output, and header() below can then fail
// with "headers already sent" when output buffering is off.)
header('Content-type:text/html;charset=utf-8');

// Connect to the database.
// NOTE(review): the mysql_* extension is deprecated since PHP 5.5 and was
// removed in PHP 7.0; the rest of this file depends on it (mysql_query), so
// it is kept here. Credentials are hard-coded - consider moving them to
// configuration.
$link = @mysql_connect('localhost','root','root');

// mysql_connect() returns false on failure; test the return value directly
// instead of inferring the failure from mysql_errno().
if ($link === false) {
    exit('数据库连接失败'.mysql_error());
}

// Select the database and stop early when it is not available; otherwise
// every later insert would fail without any visible reason.
if (!mysql_select_db('555', $link)) {
    exit('选择数据库失败'.mysql_error($link));
}

// Base name of this script without its extension. pathinfo() gives the same
// result as the former backslash-based substr() arithmetic on Windows paths
// and also works with forward-slash paths.
$jid = pathinfo(__FILE__, PATHINFO_FILENAME);

// Import every page found below the download directory.
showDir("F:/FeigeDownload/2922");
/**
 * Walk a directory tree and hand every file in it to getDBData().
 *
 * When $filedir is not a directory it is treated as a single file and
 * imported directly. Each processed path is echoed.
 *
 * @param string $filedir Directory (or single file) to import.
 * @return void
 */
function showDir( $filedir ){
    if (!is_dir($filedir)) {
        // Single file: getDBData() needs the bare file name as its second
        // argument to extract the numeric id, so derive it from the path.
        getDBData($filedir, basename($filedir));
        echo $filedir;
        return;
    }

    // dir() returns false when the directory cannot be opened; calling
    // read() on that value would be a fatal error.
    $dir = @dir($filedir);
    if (!$dir) {
        echo '无法打开目录: ' . $filedir . '<br>';
        return;
    }

    while (($file = $dir->read()) !== false) {
        // Skip the self and parent entries.
        if ($file === '.' || $file === '..') {
            continue;
        }

        $path = $filedir . '/' . $file;
        if (is_dir($path)) {
            // Descend into sub-directories.
            showDir($path);
        } else {
            getDBData($path, $file);
            echo $path . '<br>';
        }
    }
    $dir->close();
}
/**
 * Parse one downloaded HTML page and insert its fields into table `55`.
 *
 * Uses the default MySQL link (mysql_query() is called without an explicit
 * link identifier).
 *
 * @param string      $filename Full path of the HTML file to read.
 * @param string|null $file     Bare file name, e.g. 100000.html; the digits
 *                              before ".html" become the `ids` column. When
 *                              omitted it is derived from $filename.
 * @return void
 */
function getDBData($filename, $file = null)
{
    if ($file === null) {
        $file = basename($filename);
    }

    // Read the whole page source; skip the file when it cannot be read
    // instead of inserting a row of empty values.
    $urls = file_get_contents($filename);
    if ($urls === false) {
        echo '无法读取文件: ' . $filename . '<br>';
        return;
    }

    // Numeric id taken from the file name; empty when the name does not
    // look like "<digits>.html".
    $id = preg_match('/(\d{0,20})\.html/', $file, $ids) ? $ids[1] : '';

    // Column name => value, in the column order of the insert statement.
    $row = array(
        'ids'        => $id,
        'title'      => gettitle($urls),
        'img_href'   => getimg_href($urls),
        'Geo'        => getGeo($urls),
        'img_title'  => getimg_title($urls),
        'Veg'        => getVeg($urls),
        'Family'     => getFamily($urls),
        'Addname'    => getAddname($urls),
        'Plant'      => getPlant($urls),
        'Flowering'  => getFlowering($urls),
        'Leaf'       => getLeaf($urls),
        'Habitat'    => getHabitat($urls),
        'book_title' => getbook_title($urls),
        'book_href'  => getbook_href($urls),
        'page_url'   => getpage_url($urls),
    );

    $values = array();
    foreach ($row as $column => $value) {
        // getimg_title() already returns its result escaped through clean(),
        // so escaping it again would store literal backslashes. Every other
        // value is raw scraped text and must be escaped before it is placed
        // inside a quoted SQL literal.
        if ($column !== 'img_title') {
            $value = mysql_real_escape_string((string) $value);
        }
        $values[] = "'" . $value . "'";
    }

    $sql = 'insert into `55`(' . implode(',', array_keys($row)) . ') values('
         . implode(',', $values) . ')';

    // Report failed inserts instead of dropping them silently.
    if (!mysql_query($sql)) {
        echo '插入失败: ' . mysql_error() . '<br>';
    }
}
/**
 * Extract the text between <title> and </title>.
 *
 * @param string $data Raw HTML of the page.
 * @return string The title text, or "" when the page has no <title> element.
 */
function gettitle($data)
{
    // Only read the capture group when the pattern matched; otherwise the
    // offset does not exist.
    if (preg_match('/<title>([\s\S]*?)<\/title>/', $data, $title)) {
        return $title[1];
    }
    return "";
}
/**
 * Extract the content of the page's og:url <meta> tag.
 *
 * @param string $data Raw HTML of the page.
 * @return string The URL, or "" when the tag is not present.
 */
function getpage_url($data)
{
    // Only read the capture group when the pattern matched.
    if (preg_match('/<meta property="og:url" content="([\s\S]*?)" \/>/', $data, $page_url)) {
        return $page_url[1];
    }
    return "";
}
/**
 * Geo: collect the link texts of the list that follows the
 * "Geogr. District" heading.
 *
 * Works in two passes: first isolate the section up to the closing </ul>,
 * then collect every anchor text inside it.
 *
 * @param string $data Raw HTML of the page.
 * @return string Each link text followed by the separator "@@$$||", or ""
 *                when the section or its links are missing.
 */
function getGeo($data)
{
    $separator = '@@$$||';

    // Always return a string: a missing section yields "".
    if (!preg_match('/<h3>Geogr. District<\/h3>([\s\S]*?)<\/ul>/', $data, $section)) {
        return "";
    }
    if (!preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $section[1], $links)) {
        return "";
    }

    $joined = "";
    foreach ($links[1] as $text) {
        $joined .= $text . $separator;
    }
    return $joined;
}
/**
 * Veg: collect the link texts of the list that follows the
 * "Vegetation Units" heading.
 *
 * @param string $data Raw HTML of the page.
 * @return string Each link text followed by the separator "@@$$||", or ""
 *                when the section or its links are missing.
 */
function getVeg($data)
{
    $separator = '@@$$||';

    // Always return a string: a missing section yields "".
    if (!preg_match('/<h3>Vegetation Units<\/h3>([\s\S]*?)<\/ul>/', $data, $section)) {
        return "";
    }
    if (!preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $section[1], $links)) {
        return "";
    }

    // Append the separator after every captured link text.
    $joined = "";
    foreach ($links[1] as $text) {
        $joined .= $text . $separator;
    }
    return $joined;
}
/**
 * img_href: collect the href of the first link inside every
 * <div class="slide"> element.
 *
 * @param string $data Raw HTML of the page.
 * @return string Each href followed by the separator "@@$$||"; "" when no
 *                slide link is found.
 */
function getimg_href($data)
{
    $separator = '@@$$||';
    $pattern   = '/<div class="slide">\s+<a href="([\s\S]*?)"\s+title/';

    preg_match_all($pattern, $data, $found);

    // Nothing captured: the result is the empty string.
    if (empty($found[1])) {
        return "";
    }

    // Joining with the separator and appending one more gives every value a
    // trailing separator.
    return implode($separator, $found[1]) . $separator;
}
/**
 * img_title: collect the title attribute that precedes "rel" inside every
 * <div class="slide"> element.
 *
 * The titles are concatenated, each followed by the separator "@@$$||", and
 * the combined string is passed through clean() before it is returned.
 *
 * @param string $data Raw HTML of the page.
 * @return string Cleaned, separator-terminated list of titles.
 */
function getimg_title($data)
{
    $separator = '@@$$||';
    $pattern   = '/<div class="slide">[\s\S]*?title="([\s\S]*?)"\s+rel/';

    preg_match_all($pattern, $data, $found);
    if (empty($found)) {
        return "";
    }

    $joined = "";
    if ($found[1]) {
        // Every title gets a trailing separator.
        $joined = implode($separator, $found[1]) . $separator;
    }
    return clean($joined);
}
/**
 * Family name: text of the first link in the list under the
 * <h2>Family</h2> heading.
 *
 * @param string $data Raw HTML of the page.
 * @return string The family name, or "" when the section is missing.
 */
function getFamily($data)
{
    $pattern = '/<h2>Family<\/h2>\s+<ul>\s+<li><a href="[\s\S]*?">([\s\S]*?)<\/a>/';

    // Only read the capture group when the pattern matched.
    if (preg_match($pattern, $data, $Family)) {
        return $Family[1];
    }
    return "";
}
/**
 * Additional name: content of the "Hebrew with Vowels:" entry in the
 * "Additional Names" section.
 *
 * @param string $data Raw HTML of the page.
 * @return string The name, or "" when the entry is missing.
 */
function getAddname($data)
{
    $pattern = '/<h2>Additional Names<\/h2>\s+<dl>\s+<dt>Hebrew with Vowels:<\/dt><dd class="dir-rtl">([\s\S]*?)<\/dd>/';

    // Only read the capture group when the pattern matched.
    if (preg_match($pattern, $data, $Addname)) {
        return $Addname[1];
    }
    return "";
}
/**
 * Plant (first of the four tab panels): collect the link texts inside the
 * <dl class="info"> list of <div id="tab1">.
 *
 * preg_match_all() yields a two-level array: index 0 holds the full anchor
 * matches and index 1 holds the captured link texts (for example
 * "chamaephyte", "absent", "leaf succulent", "perennating"); only index 1
 * is used.
 *
 * @param string $data Raw HTML of the page.
 * @return string Each link text followed by the separator "@@$$||", or ""
 *                when the panel or its links are missing.
 */
function getPlant($data)
{
    $separator = '@@$$||';

    // Always return a string: a missing panel yields "".
    if (!preg_match('/<div id="tab1">\s+<dl class="info">([\s\S]*?)<\/dl>/', $data, $panel)) {
        return "";
    }
    if (!preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $panel[1], $links)) {
        return "";
    }

    $joined = "";
    foreach ($links[1] as $text) {
        $joined .= $text . $separator;
    }
    return $joined;
}
/**
 * Flowering: collect the link texts inside the <dl class="info"> list of
 * <div id="tab2">.
 *
 * @param string $data Raw HTML of the page.
 * @return string Each link text followed by the separator "@@$$||", or ""
 *                when the panel or its links are missing.
 */
function getFlowering($data)
{
    $separator = '@@$$||';

    // Always return a string: a missing panel yields "".
    if (!preg_match('/<div id="tab2">\s+<dl class="info">([\s\S]*?)<\/dl>/', $data, $panel)) {
        return "";
    }
    // Check the result of the link search itself, not the outer match.
    if (!preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $panel[1], $links)) {
        return "";
    }

    $joined = "";
    foreach ($links[1] as $text) {
        $joined .= $text . $separator;
    }
    return $joined;
}
/**
 * Leaf: collect the link texts inside the <dl class="info"> list of
 * <div id="tab3">.
 *
 * @param string $data Raw HTML of the page.
 * @return string Each link text followed by the separator "@@$$||", or ""
 *                when the panel or its links are missing.
 */
function getLeaf($data)
{
    $separator = '@@$$||';

    // Always return a string: a missing panel yields "".
    if (!preg_match('/<div id="tab3">\s+<dl class="info">([\s\S]*?)<\/dl>/', $data, $panel)) {
        return "";
    }
    if (!preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $panel[1], $links)) {
        return "";
    }

    $joined = "";
    foreach ($links[1] as $text) {
        $joined .= $text . $separator;
    }
    return $joined;
}
/**
 * Habitat: collect the link texts inside the <dl class="info"> list of
 * <div id="tab4">.
 *
 * @param string $data Raw HTML of the page.
 * @return string Each link text followed by the separator "@@$$||", or ""
 *                when the panel or its links are missing.
 */
function getHabitat($data)
{
    $separator = '@@$$||';

    // Always return a string: a missing panel yields "".
    if (!preg_match('/<div id="tab4">\s+<dl class="info">([\s\S]*?)<\/dl>/', $data, $panel)) {
        return "";
    }
    if (!preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $panel[1], $links)) {
        return "";
    }

    $joined = "";
    foreach ($links[1] as $text) {
        $joined .= $text . $separator;
    }
    return $joined;
}
/**
 * book_title: titles of the references, i.e. the link texts of the list
 * that follows the "Books" heading (h3 with class "floraBookPart").
 *
 * The matching hrefs are returned by getbook_href() in the same order.
 *
 * @param string $data Raw HTML of the page.
 * @return string Each link text followed by the separator "@@$$||", or ""
 *                when the section or its links are missing.
 */
function getbook_title($data)
{
    $separator = '@@$$||';

    // Always return a string: a missing section yields "".
    if (!preg_match('/<h3 class="floraBookPart">Books <\/h3>\s+<ul>([\s\S]*?)<\/ul>/', $data, $section)) {
        return "";
    }
    if (!preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $section[1], $links)) {
        return "";
    }

    $joined = "";
    foreach ($links[1] as $text) {
        $joined .= $text . $separator;
    }
    return $joined;
}
/**
 * book_href: hrefs of the references, i.e. the href attribute of every link
 * in the list that follows the "Books" heading (h3 with class
 * "floraBookPart").
 *
 * @param string $data Raw HTML of the page.
 * @return string Each href followed by the separator "@@$$||", or "" when
 *                the section or its links are missing.
 */
function getbook_href($data)
{
    $separator = '@@$$||';

    // Always return a string: a missing section yields "".
    if (!preg_match('/<h3 class="floraBookPart">Books <\/h3>\s+<ul>([\s\S]*?)<\/ul>/', $data, $section)) {
        return "";
    }

    // Capture the href attribute itself. The capture group used to sit on
    // the link text, which made this function return the same titles as
    // getbook_title(). [^"]* keeps the capture inside the attribute even
    // when the anchor carries further attributes after href.
    if (!preg_match_all('/<a href="([^"]*)"[^>]*>[\s\S]*?<\/a>/', $section[1], $links)) {
        return "";
    }

    $joined = "";
    foreach ($links[1] as $href) {
        $joined .= $href . $separator;
    }
    return $joined;
}
/**
 * Strip HTML tags from a string, normalise its whitespace and escape it for
 * use inside a single-quoted SQL literal.
 *
 * The earlier version replaced "'" twice, so the second replacement
 * overwrote the first and every apostrophe ended up as backslashes followed
 * by a curly quote. Backslashes themselves were not escaped at all.
 *
 * @param string $str Raw text, possibly containing markup.
 * @return string Tag-free, trimmed text with quotes and backslashes
 *                backslash-escaped.
 */
function clean($str){
    // Remove markup first so that whitespace left behind by removed tags is
    // normalised as well.
    $str = strip_tags($str);
    // Collapse every run of two or more whitespace characters to one space.
    $str = preg_replace('/\s\s+/', ' ', $str);
    $str = trim($str);
    // Escape last, so nothing after this step can disturb the escaping.
    // addslashes() escapes backslash, single quote, double quote and NUL.
    // NOTE(review): mysql_real_escape_string() would be the
    // connection-aware choice, but it needs an open MySQL link.
    return addslashes($str);
}
?>