一,语法
1.读:reader = csv.reader(csvFile) # 返回的是迭代类型
2.写:writer = csv.writer(csvFile2)
二,小实例
(1)
import csv
import json
def f1(path="./files/mycsv.csv"):
    """Write a small sample CSV: one header row plus three data rows.

    path: destination file. Defaults to the original hard-coded location,
    so existing callers are unaffected.
    """
    # newline="" is required by the csv module docs; without it every row
    # is followed by a blank line on Windows.
    with open(path, "w", encoding="utf-8", newline="") as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(["sid", "sname", "sage"])
        csv_writer.writerows([[1, "a", 20], [2, "a", 20], [3, "a", 20]])
def f2(json_path="./files/tecent.json", csv_path="./files/tecent.csv"):
    """Convert a JSON array of flat objects into a CSV file.

    The header row is taken from the keys of the first object; each object
    contributes one data row. Assumes every object shares the same keys in
    the same order — TODO confirm against the actual data file.

    json_path / csv_path default to the original hard-coded locations so
    existing callers are unaffected.
    """
    with open(json_path, "r", encoding="utf-8") as file:
        content = json.load(file)
    keys = content[0].keys()
    values = [item.values() for item in content]
    # newline="" avoids the blank line between rows on Windows.
    with open(csv_path, "w", encoding="utf-8", newline="") as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(keys)
        csv_writer.writerows(values)
def f3():
    """Reserved for a future example; currently a no-op."""
    return None
if __name__ == '__main__':
    # f1()  # uncomment to regenerate the sample CSV first
    f2()
用 Excel 等软件打开生成的 csv 文件时要选择合适的编码格式(如 UTF-8),否则会有乱码
(2)
# coding:utf-8
import csv

# --- Read a csv file, style 1: collect every row into a list. ---
# newline="" is the csv-module-recommended mode for both reading and writing.
data = []
with open("csvData.csv", "r", newline="") as csv_in:
    reader = csv.reader(csv_in)  # the reader yields one list per row
    for item in reader:
        print(item)
        data.append(item)
print(data)

# --- Read a csv file, style 2: iterate directly inside the with-block. ---
# (The original code re-closed the *first* file here by mistake; a
# with-block closes its own file automatically, so no close() is needed.)
with open("csvData.csv", "r", newline="") as csvfile:
    reader2 = csv.reader(csvfile)
    for item2 in reader2:
        print(item2)

# --- Write the rows collected above to a new csv file. ---
# newline="" prevents the blank line after every row on Windows.
with open('csvFile2.csv', 'w', newline='') as csv_out:
    csv.writer(csv_out).writerows(data)

# --- Write key/value pairs from a dict, one row per entry. ---
dic = {'张三': 123, '李四': 456, '王二娃': 789}
with open('csvFile3.csv', 'w', newline='') as csv_out:
    writer2 = csv.writer(csv_out)
    for key, value in dic.items():
        writer2.writerow([key, value])
三,运用到实战中
爬取糗事百科并将数据保存到 csv 中:
import requests
from lxml import etree
import csv
class QiubaiSpdier:
    """Scrape qiushibaike.com listing pages and append the posts to a CSV file."""

    def __init__(self):
        """Initialize the URL template, request headers, and header-row flag."""
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        # True until the CSV header row has been written once.
        self.flag = True

    def get_url_list(self):
        """Return the URLs of all 13 listing pages."""
        return [self.url_temp.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        """GET *url* and return the body decoded as UTF-8, or "" on any error."""
        try:
            response = requests.get(url, headers=self.headers)
            return response.content.decode("utf-8")
        except Exception as ex:
            # Best-effort scraping: log the error and return an empty page.
            print(ex)
            return ""

    def get_page_content_list(self, html_str):
        """Parse one listing page's HTML and return a list of dicts, one per post.

        Returns [] when *html_str* is empty (e.g. after a failed request);
        the original crashed here because etree.HTML("") returns None.
        """
        if not html_str:
            return []
        html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")  # one div per post
        content_list = []
        for div in div_list:
            item = {}
            # Post text, with newlines removed.
            item["content"] = "".join(div.xpath(".//div[@class='content']/span/text()")).replace("\n", "")
            # Gender is encoded in the class attribute, e.g. "articleGender womenIcon".
            gender_cls = div.xpath(".//div[contains(@class,'articleGender')]/@class")
            item["author_gender"] = gender_cls[0].split(" ")[-1].replace("Icon", "") if gender_cls else None
            # Author age; CSV column name fixed from the original "auhtor_age" typo.
            ages = div.xpath(".//div[contains(@class,'articleGender')]/text()")
            item["author_age"] = ages[0] if ages else None
            # Post image, if any (protocol-relative src → absolute https URL).
            imgs = div.xpath(".//div[@class='thumb']/a/img/@src")
            item["content_img"] = "https:" + imgs[0] if imgs else None
            # Author avatar.
            avatars = div.xpath(".//div[@class='author clearfix']//img/@src")
            item["author_img"] = "https:" + avatars[0] if avatars else None
            # Vote ("smile") count.
            votes = div.xpath(".//span[@class='stats-vote']/i/text()")
            item["stats_vote"] = votes[0] if votes else None
            content_list.append(item)
        return content_list

    def save_page_content_list(self, content_list):
        """Append *content_list* ([{...}, ...]) to the CSV file.

        Writes the header row (the dict keys) only on the first call; the
        two near-identical branches of the original are merged into one.
        """
        if not content_list:
            return  # nothing to save; also avoids IndexError on content_list[0]
        # newline="" prevents blank lines between rows on Windows.
        with open("./files/臭事百科.csv", "a", encoding="utf-8", newline="") as file:
            csv_writer = csv.writer(file)
            if self.flag:
                csv_writer.writerow(content_list[0].keys())
                self.flag = False
            csv_writer.writerows(item.values() for item in content_list)

    def run(self):
        """Main flow: build the URL list, then fetch, parse, and save each page."""
        # 1. URL list
        url_list = self.get_url_list()
        # 2. Fetch each page
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. Extract the posts
            content_list = self.get_page_content_list(html_str)
            # 4. Save them
            self.save_page_content_list(content_list)
if __name__ == '__main__':
    # Run the full scrape-and-save pipeline.
    spider = QiubaiSpdier()
    spider.run()