假期学习【十一】Python爬取百度词条写入csv格式 python 2020.2.10

今天主要完成了:根据爬取的txt文档,从百度百科的信息科学分类中爬取词条信息,并写入CSV格式文件。

txt格式文件如图:

 代码如下:

 1 import requests
 2 from bs4 import BeautifulSoup
 3 import csv
 4 import io
 5 import re
 6 
 7 url="https://baike.baidu.com/item/"
 8 id=1
 9 
10 patton=re.compile(r'.*信息科学分类.*|.*软件.*|.*科技产品.*|.*公司.*|.*互联网人物.*|.*互联网.*|.*科技术语.*|.*技术.*|.*网站.*')
11 
def Head(path='E:/bdbk.csv'):
    """Create the CSV output file and write the header row.

    Overwrites any existing file at *path*.

    Args:
        path: Destination CSV file (default kept for backward compatibility
              with the original hard-coded location).
    """
    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["序号", "名称", "属性", "内容", "网址"])
17 
18 
def Href(url):
    """Fetch one Baidu Baike page and append it to the CSV if it matches.

    Scrapes the page title (h1), its first attribute heading (h2), the
    open-tag category line and the summary block.  The row is written only
    when the page is not the Baike error page, has a real attribute heading,
    and its category tag matches the `patton` keyword filter.

    Args:
        url: Full lemma URL ("https://baike.baidu.com/item/<name>").

    Side effects: increments the global row counter `id` and appends one
    row via write() for every accepted page.
    """
    global id
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        # Timeout so one dead page cannot hang the whole crawl.
        r = requests.get(url, headers=kv, timeout=10)
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "html.parser")
        print(url)
        # IndexError here means the page lacks the expected structure
        # (e.g. a disambiguation or error page) — treated as a failed fetch.
        tag = soup.find_all("dd", {"id": "open-tag-item"})[0].get_text().strip().replace("\n", "")
        name = soup.find_all("h1")[0].get_text().strip()
        nature = soup.find_all("h2")[0].get_text().strip()
        if nature == '目录':
            # No real attribute heading; fall back to the category tag.
            nature = tag
        # Rewrite relative lemma links to absolute ones and drop the
        # surrounding list brackets from the stringified summary.
        content = str(soup.find_all("div", {"class": "lemma-summary"})).replace("/item", "https://baike.baidu.com/item").strip().rstrip("]").lstrip("[")
    except (requests.RequestException, IndexError, AttributeError):
        # Original bare except fell through and reused stale globals from the
        # previous page (or raised NameError on the first failure); bail out
        # instead so a failed fetch can never be written to the CSV.
        print("出错!")
        return
    if name != "百度百科错误页" and nature != "目录" and len(patton.findall(tag)) != 0:
        print("序号:" + str(id))
        print("名称:" + name)
        print("属性:" + nature)
        print("内容:" + content)
        print("网址:" + url)
        write(id, name, nature, content, url)
        id += 1
45 
def read():
    """Read one lemma name per line from the word list and crawl each page.

    Builds the full lemma URL from the module-level base `url` and delegates
    the actual scraping to Href().
    """
    # 'r' instead of the original 'r+': the file is only read, never written.
    with open("E:/word4.txt", 'r', encoding="utf-8") as f:
        for line in f:
            # Strip the trailing newline; the base URL itself is left intact
            # (the original reset the mutated global after every call).
            Href(url + line.rstrip("\n"))
54 
def write(id, name, nature, content, url, path='E:/bdbk.csv'):
    """Append one record row to the CSV file.

    Args:
        id: Row number (written as a string).
        name: Lemma title.
        nature: Attribute heading (or category tag fallback).
        content: Summary HTML string.
        url: Source page URL.
        path: CSV file to append to (default kept for backward compatibility
              with the original hard-coded location).
    """
    # `with` guarantees the handle is closed even if the writer raises;
    # the original leaked the handle on error.
    with open(path, 'a+', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow([str(id), name, nature, content, url])
60 
if __name__ == "__main__":
    # Create the output CSV with its header row, then crawl every lemma
    # listed in the word file.
    Head()
    # Single-page smoke test, kept for convenience:
    # Href("https://baike.baidu.com/item/python")
    read()

猜你喜欢

转载自www.cnblogs.com/zlc364624/p/12292892.html