I recently scraped the contact details of a batch of YouTube creators, as a web-scraping exercise.
Install Scrapy (under Python 3; the code below relies on urllib.parse)
pip install Scrapy
Create a Scrapy project
scrapy startproject youtuber_spider D:\Projects\work\YoutuberSpider
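For reference, the generated layout should look roughly like this. The package name has to be youtuber_spider, because settings.py below references module paths such as youtuber_spider.middlewares; youtuber_spider.py and csv_item_exporter.py are the files added in the rest of this post:

D:\Projects\work\YoutuberSpider
├── scrapy.cfg
└── youtuber_spider
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── csv_item_exporter.py
        └── youtuber_spider.py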
The spider script
Since I only have one IP that can get through the firewall, the script sleeps a lot to lower the chance of the IP getting banned (not that it helped much).
youtuber_spider.py
import scrapy
import re
import os
import time
from urllib.parse import unquote

from ..items import YoutuberSpiderItem


class YoutuberSpider(scrapy.Spider):
    name = "Youtuber"

    def __init__(self, pagecount, keyword, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pagecount = int(pagecount)
        self.keyword = keyword
        self.namelist = []
        self.urllist = []
        self.sleep_t = 0
        # Reload creator names collected in earlier runs so they are skipped
        if os.path.exists('_name.csv'):
            with open('_name.csv', 'r', encoding='utf-8') as fp:
                for line in fp:
                    line = line.strip()
                    if line != '':
                        self.namelist.append(line)
            print(self.namelist)
        # Reload result-page URLs already recorded for this keyword
        if os.path.exists('_url.csv'):
            with open('_url.csv', 'r', encoding='utf-8') as fp:
                for line in fp:
                    line = line.strip()
                    if line != '':
                        li = line.split(',')
                        if li[0] == self.keyword:
                            self.urllist.append(li[2].strip())
            print(self.urllist)

    def start_requests(self):
        start_urls = [self.get_curators_url(query=self.keyword)]
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def get_curators_url(self, query=""):
        '''
        e.g. https://www.youtube.com/results?search_query=video+editor&pbj=1
        '''
        url = "https://www.youtube.com/results?"
        url += "search_query=" + str(query)
        url += "&pbj=1"
        print('add url: ' + url)
        return url

    def make_link(self, name, url):
        # Wrap the pair in a spreadsheet HYPERLINK formula for the CSV cell
        name = name.replace('"', '')
        return '=HYPERLINK("%s","%s")' % (url, name)

    def build_item(self, item):
        youtuberitem = YoutuberSpiderItem()
        youtuberitem['KeyWord'] = item['keyword']
        youtuberitem['Name'] = item['name']
        youtuberitem['About'] = self.make_link('About', item['aboutpage'])
        youtuberitem['Video'] = self.make_link(item['videotitle'], item['videolink'])
        youtuberitem['Followers'] = item['followers']
        youtuberitem['Country'] = item['country']
        youtuberitem['Email'] = item['email']
        for i in range(item['webcount']):
            tag = 'Weblink' + str(i)
            youtuberitem[tag] = item[tag]
        return youtuberitem

    def has_sleep(self):
        # Pause 15s after every 50 calls to lower the chance of a ban
        self.sleep_t += 1
        if self.sleep_t > 50:
            self.sleep_t = 0
            # Console banner so the pause is obvious in the log
            print('#' * 80)
            print('####' + 'S L E E P'.center(72) + '####')
            print('#' * 80)
            time.sleep(15)

    def save_url(self, page, url):
        if url not in self.urllist:
            self.urllist.append(url)
            with open('_url.csv', 'a+', encoding='utf-8') as fp:
                fp.write('%s,%d,%s\n' % (self.keyword, page, url))

    def parse(self, response):
        self.has_sleep()
        page = int(response.meta.get('page', 1))
        retry = int(response.meta.get('retry', 0))
        print('##############################')
        print('#### KeyWord: %s #### Page: %d Retry: %d' % (str(unquote(self.keyword)), page, retry))
        print('##############################')
        pagelink = response.xpath('//div[@class="branded-page-box search-pager spf-link "]/a/@href').extract()
        nextpage = None if len(pagelink) <= 0 else 'https://www.youtube.com' + pagelink[-1]
        print('Next page: ' + str(nextpage))
        if self.pagecount > page:
            if nextpage is not None:
                self.save_url(page + 1, nextpage)
                yield scrapy.Request(url=nextpage, meta={'page': page + 1, 'retry': 0}, callback=self.parse)
            elif retry < 10:
                # No pager found: likely a bad response, so retry the same page
                print("url: " + response.request.url)
                time.sleep(10)
                yield scrapy.Request(url=str(response.request.url), meta={'page': page, 'retry': retry + 1},
                                     callback=self.parse, dont_filter=True)
        videotitle = response.xpath('//h3[@class="yt-lockup-title "]/a/text()').extract()
        videolink = response.xpath('//h3[@class="yt-lockup-title "]/a/@href').extract()
        userpages = response.xpath('//div[@class="yt-lockup-byline "]/a/@href').extract()
        usernames = response.xpath('//a[@class="yt-uix-sessionlink spf-link "]/text()').extract()
        skip = 0  # entries skipped on this page (ads and already-seen names)
        for index, user in enumerate(userpages):
            # Skip promoted videos
            if 'adurl' in str(videolink[index]):
                skip += 1
                continue
            name = str(usernames[index]).replace('\n', '').replace('"', '').strip()
            if name in self.namelist:
                skip += 1
                print('#SKIP# name: ' + name)
                continue
            self.namelist.append(name)
            with open('_name.csv', 'a+', encoding='utf-8') as fp:
                fp.write(name + '\n')
            print('  name: ' + name)
            item = {
                'name': name,
                'keyword': str(unquote(self.keyword)),
                'videotitle': str(videotitle[index]).replace('\n', '').replace('"', '').strip(),
                'videolink': 'https://www.youtube.com' + videolink[index],
                'aboutpage': 'https://www.youtube.com' + user + '/about',
            }
            yield scrapy.Request(url=item['aboutpage'], meta={'item': item}, callback=self.parse_about)

    def parse_about(self, response):
        item = response.meta['item']
        print('------------- get user: ' + str(item['name']) + ' ----------------------------')
        f = response.xpath('//span[@class="yt-subscription-button-subscriber-count-branded-horizontal subscribed yt-uix-tooltip"]/text()').extract()
        item['followers'] = 0 if len(f) <= 0 else int(f[0].replace(',', ''))
        country = response.xpath('//span[@class="country-inline"]/text()').extract()
        item['country'] = '' if len(country) == 0 else country[0].replace('\n', '').strip()
        # The channel description sits in a <pre> tag; grep it for an email address
        text = response.xpath('//pre/text()').extract()
        if len(text) > 0:
            mail = re.search(r'[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.[a-zA-Z0-9]{2,6}', text[0])
            item['email'] = mail.group(0) if mail else ''
        else:
            item['email'] = ''
        webname = response.xpath('//ul[@class="about-custom-links"]/li/a/span/text()').extract()
        weblink = response.xpath('//ul[@class="about-custom-links"]/li/a/@href').extract()
        item['webcount'] = 0
        for i in range(len(webname)):
            if item['webcount'] < 6:
                title = str(webname[i]).replace('\r', '').replace('\t', '').replace('\n', '').strip()
                if '/redirect?' in weblink[i]:
                    # Unwrap YouTube's /redirect links: the real target is in the q= parameter
                    rex = re.search(r'q=([^&]+)', weblink[i])
                    if rex:
                        weblink[i] = unquote(rex.group(1))
                item['Weblink' + str(item['webcount'])] = self.make_link(title, weblink[i])
                item['webcount'] += 1
        self.has_sleep()
        return self.build_item(item)
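A side note on make_link: it emits an Excel/Google Sheets HYPERLINK formula, so the About, Video, and Weblink columns become clickable once the CSV is opened in a spreadsheet. A cell ends up looking like this (the URL here is a made-up example):

=HYPERLINK("https://www.youtube.com/user/somechannel/about","About")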
settings.py
Append the following options at the end:
DOWNLOAD_DELAY = 3
DOWNLOADER_STATS = False
FEED_EXPORT_ENCODING = 'utf-8'
REDIRECT_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'youtuber_spider.middlewares.YoutuberSpiderDownloaderMiddleware': 543,
    # Disable the built-in UserAgentMiddleware in favour of the random one below
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'youtuber_spider.middlewares.MyUserAgentMiddleware': 400,
}

FEED_EXPORTERS = {
    'csv': 'youtuber_spider.spiders.csv_item_exporter.MyProjectCsvItemExporter',
}

FIELDS_TO_EXPORT = [
    'KeyWord', 'Name', 'About', 'Video', 'Followers', 'Country', 'Email',
    'Weblink0', 'Weblink1', 'Weblink2', 'Weblink3', 'Weblink4', 'Weblink5',
]

# USER_AGENT is normally a single string; here it is a list that
# MyUserAgentMiddleware picks from at random on every request.
USER_AGENT = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
]
items.py
import scrapy


class YoutuberSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    Name = scrapy.Field()
    KeyWord = scrapy.Field()
    About = scrapy.Field()
    Video = scrapy.Field()
    Followers = scrapy.Field()
    Country = scrapy.Field()
    Email = scrapy.Field()
    Weblink0 = scrapy.Field()
    Weblink1 = scrapy.Field()
    Weblink2 = scrapy.Field()
    Weblink3 = scrapy.Field()
    Weblink4 = scrapy.Field()
    Weblink5 = scrapy.Field()
middlewares.py
Append a MyUserAgentMiddleware class at the end; it swaps in a random User-Agent header on every request.
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class MyUserAgentMiddleware(UserAgentMiddleware):
    '''
    Pick a random User-Agent for every request
    '''
    def __init__(self, user_agent):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            user_agent=crawler.settings.get('USER_AGENT')
        )

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agent)
csv_item_exporter.py
Add this file under the spiders folder to enforce the CSV column order (defined by the FIELDS_TO_EXPORT list in settings.py). Registering it under the 'csv' key of FEED_EXPORTERS makes Scrapy use it whenever you export with -o something.csv.
from scrapy.exporters import CsvItemExporter
from scrapy.utils.project import get_project_settings


class MyProjectCsvItemExporter(CsvItemExporter):
    def __init__(self, *args, **kwargs):
        settings = get_project_settings()
        kwargs['delimiter'] = settings.get('CSV_DELIMITER', ',')
        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            # Force the CSV column order defined in settings.py
            kwargs['fields_to_export'] = fields_to_export
        super().__init__(*args, **kwargs)
Running the spider
Everything is now in place, and data can be scraped with the command
scrapy crawl Youtuber -o output.csv -a pagecount=10 -a keyword="news"
where:
- -o filename: the output file
- pagecount=10: the number of result pages to crawl
- keyword="news": the search keyword
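Given the FIELDS_TO_EXPORT order configured above, the header row of output.csv should come out as:

KeyWord,Name,About,Video,Followers,Country,Email,Weblink0,Weblink1,Weblink2,Weblink3,Weblink4,Weblink5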
Extra: looping over keywords, merging and deduplicating
I also wrote a run.py script that crawls a list of keywords and merges the results, deduplicating by creator name.
import os
import time
from urllib.parse import quote


def format_data(filename):
    '''Drop blank lines and repeated header rows in place; return the row count.'''
    tempfile = filename + '.tmp'
    if not os.path.exists(filename):
        return 0
    fp = open(filename, 'r', encoding='utf-8')
    op = open(tempfile, 'w', encoding='utf-8')
    first = fp.readline()
    lines = fp.readlines()
    op.write(first)
    index = 0
    for line in lines:
        if line == '\n' or line == first:
            continue
        index += 1
        op.write(line)
    fp.close()
    op.close()
    os.remove(filename)
    os.renames(tempfile, filename)
    return index


def add_to_output(datafile, outputfile):
    '''Append one keyword's rows to the merged file, writing the header only once.'''
    index = 0
    fp = open(datafile, 'r', encoding='utf-8')
    first = fp.readline()
    if not os.path.exists(outputfile):
        op = open(outputfile, 'w', encoding='utf-8')
        op.write(first)
    else:
        op = open(outputfile, 'a', encoding='utf-8')
    lines = fp.readlines()
    for line in lines:
        if line == '\n' or line == first:
            continue
        index += 1
        op.write(line)
    fp.close()
    op.close()
    return index


def make_file_name(keyword):
    return '_' + keyword.replace(' ', '_') + '.csv'


if __name__ == '__main__':
    # max is ???
    pagecount = 33
    keywords = [
        'how to cook',
        'how to edit',
        'how to beauty',
        'how to slim',
        'how to lose',
        'how to play',
        'how to fix',
        'how to learn',
        'how to travel',
    ]
    outputfile = '_output.csv'
    for keyword in keywords:
        count = 0
        filename = make_file_name(keyword)
        times = 0
        # Re-run the spider until 95% of the expected rows (20 per page) arrive,
        # up to 10 attempts per keyword
        while times < 10 and count / (pagecount * 20) < 0.95:
            times += 1
            print('###########################################')
            print('#### times: %2d #### %s : %.2f%% ' % (times, keyword, 100 * count / (pagecount * 20)))
            print('###########################################')
            cmd = "scrapy crawl Youtuber -o %s -a pagecount=%d -a keyword=%s" % (filename, pagecount, quote(keyword))
            os.system(cmd)
            count = format_data(filename)
            time.sleep(30)
        print('###########################################')
        print('#### %s : %.2f%% Finished' % (keyword, 100 * count / (pagecount * 20)))
        print('###########################################')
        time.sleep(300)
    print('----------------------------')
    total = 0
    for keyword in keywords:
        filename = make_file_name(keyword)
        count = add_to_output(filename, outputfile)
        print('%s: %d' % (filename, count))
        total += count
    print('----------------------------')
    print('total: %d' % total)
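Kick off the whole batch with:

python run.py

Each keyword's rows end up in their own file (e.g. _how_to_cook.csv, per make_file_name), and the merged result is written to _output.csv.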