与其说CSDN博客作为一个分享平台,还不如说,它是个代码归档存储仓库。
beautifulsoup的基本用法总结
# BeautifulSoup usage cheat sheet.
soup = BeautifulSoup(html)              # create a BeautifulSoup object
soup.prettify()                         # pretty-printed markup
soup.tag                                # first matching tag
soup.tag.name                           # tag name
soup.tag.attrs                          # tag attributes as a dict
soup.tag["attrname"]                    # one attribute value; soup.tag.get("attrname") also works
soup.tag.string                         # the tag's text content
soup.tag.contents                       # children of the tag as a list
soup.tag.get_text()                     # all text inside the tag
soup.find_all('tag')                    # every <tag> element
soup.find_all(['tag1', 'tag2'])         # every <tag1> and every <tag2>
soup.find_all(re.compile('^b'))         # tag names matched by a regex
soup.find_all(id='idname')              # search by attribute; when the attribute name clashes with
                                        # a keyword, use soup.find(attrs={"name": "sakai_csrf_token"})
soup.find_all(id=re.compile('^a'))      # attribute value matched by a regex
soup.find_all(id='idname', href=re.compile('^hrefb'))  # several constraints at once
soup.find_all(text=re.compile('^abc'))  # search by text content

def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

soup.find_all(has_class_but_no_id)      # search with a predicate function
soup.find('tag')                        # first match only; every find_all has a find analogue

for child in soup.tag.children:
    print(child)                        # iterate over direct children
for node in soup.descendants:
    print(node)                         # iterate over all descendants
for text in soup.strings:               # fixed: original read "for line i soup.strings"
    print(repr(text))                   # iterate over all text nodes

soup.tag.parent                         # parent node
for parent in content.parents:
    print(parent.name)                  # iterate over all ancestors
soup.tag.next_sibling.next_sibling      # the sibling after the next one
soup.tag.previous_sibling.previous_sibling  # the sibling before the previous one
for sib in soup.tag.next_siblings:
    print(sib)                          # iterate over all FOLLOWING siblings
soup.tag.next_element                   # next parsed element — not necessarily a sibling
soup.a.previous_element                 # previous parsed element
一个爬虫的基本框架(urllib)
# -*- coding: utf-8 -*-
"""Minimal crawler skeleton with urllib: fetch one page and dump its metadata."""
import urllib.request

target = "https://www.douban.com/"          # page to fetch
req = urllib.request.Request(target)        # build the request object
resp = urllib.request.urlopen(req)          # perform the HTTP round-trip
page = resp.read().decode('utf-8')          # raw bytes -> text
print(page)                                 # the fetched document

# Assorted information about the response itself.
print(type(resp))
print(resp.geturl())
print(resp.info())
print(resp.getcode())
一个爬虫的基本框架(session,微博)
# -*- coding: utf-8 -*-
"""Crawler skeleton built on a requests Session: POST the Weibo SSO login
form, then reuse the session's cookies to fetch an authenticated page."""
import requests

url = 'https://passport.weibo.cn/sso/login'
# Login form fields (username/password here are placeholders).
dat = {
    'username':'13269500113',
    'password':'mima',
    'savestate':'1',
    'r':'http://m.weibo.cn/',
    'ec':'0',
    'pagerefer':'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F',
    'entry':'mweibo',
    'wentry':'',
    'loginfrom':'',
    'client_id':'',
    'code':'',
    'qq':'',
    'mainpageflag':'1',
    'hff':'',
    'hfp':''
}
# Browser-like request headers.
# NOTE: the original hard-coded 'Content-Length': '281'. requests computes the
# correct Content-Length itself, and a stale hard-coded value corrupts the POST
# as soon as the form data changes, so that header is removed here.
header = {
    'Accept':'*/*',
    'Accept-Encoding':'gzip, deflate, br',
    'Accept-Language':'zh-CN,zh;q=0.8',
    'Connection':'keep-alive',
    'Content-Type':'application/x-www-form-urlencoded',
    #Cookie:SCF=AljbDN-Nw8b030ODeIsZ759eA7Vc_K3VPRnGqEY-2-it2vHSOz20e6iHphdYbH0sXoGX4X_HW_qjMr4RL-PeAEY.; _T_WM=35740326be0e169c0e0012349732b12f; SUHB=0oUoLaPQIcy_Mi
    'Host':'passport.weibo.cn',
    'Origin':'https://passport.weibo.cn',
    'Referer':'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
}
session = requests.session()
response = session.post(url, data=dat, headers=header)  # log in; cookies stick to the session
html = session.get('https://m.weibo.cn')                # authenticated GET via the same session
#html.encoding = 'gb2312'
#content = html.text
session例子:国科大课程监控【初稿】
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 19 09:48:55 2018
@author: LuSong
"""
#国科大自动选课脚本
from __future__ import print_function
import re
import time
import json
import requests
from bs4 import BeautifulSoup
#from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import smtplib
import codecs
from imp import reload
import sys
# Python-2 leftover: reload(sys) once preceded sys.setdefaultencoding;
# under Python 3 it has no useful effect.
reload(sys)
# private.txt layout: line 0 = username, line 1 = password,
# line 2 = space-separated notification mailboxes, lines 3+ = one course code per line.
with open("./private.txt") as f:
    courses = []
    for i, line in enumerate(f):
        if i < 3: continue
        courses.append(line.strip())  # strip() drops the trailing newline
with codecs.open(r'./private.txt', "r", 'utf-8') as f:
    username = password = None
    for i, line in enumerate(f):
        if i == 0:
            line = bytes(line.encode('utf-8'))  # re-encode so the BOM bytes can be inspected
            if line[:3] == codecs.BOM_UTF8:  # tolerate a UTF-8 BOM on the first line
                line = line[3:]
            username = line.decode('utf-8').strip()
        elif i == 1:
            password = line.strip()
        elif i == 2:
            mailto_list = line.strip().split()  # split on whitespace: several target mailboxes
        else:
            break
#mailto_list = ["[email protected]","[email protected]"] # target mailboxes — change to your own
#mail_host = "smtp.163.com"
#mail_user = "[email protected]"
#mail_pass = "pswd" # password generated by the 163 mailbox's SMTP settings
mail_host = "smtp.126.com"  # SMTP server
mail_user = "[email protected]"  # sender account
mail_pass = "pswd"  # password generated by the mailbox's SMTP settings
def send_mail(to_list, sub, content):
    """Send a plain-text UTF-8 e-mail to every address in *to_list*.

    Parameters:
        to_list: list of recipient addresses.
        sub: subject line.
        content: message body text.

    Returns True on success, False on any failure (the error is printed).
    Uses the module-level mail_host / mail_user / mail_pass settings.
    """
    me = "LogServer" + "<" + mail_user + ">"
    msg = MIMEText(content, _subtype='plain', _charset='utf-8')
    msg['Subject'] = sub
    msg['From'] = me
    msg['To'] = ";".join(to_list)
    try:
        # SMTP(host, port) already connects; the original then called
        # server.connect(mail_host) a second time, reconnecting needlessly.
        server = smtplib.SMTP(mail_host, 25)  # port 25 works with this provider
        try:
            server.login(mail_user, mail_pass)
            server.sendmail(me, to_list, msg.as_string())
        finally:
            server.close()  # always release the socket, even when sending fails
        return True
    except Exception as e:
        print(str(e))
        return False
# Variable initialisation.
session = None
headers = None
jwxk_html = None
#course = [['021M2028H', '0'], ['021M2028H', '1']]
#username = '[email protected]'
#password = 'pswd'
#cnt = 0
#__BEAUTIFULSOUPPARSE = 'html5lib'
# Log in to the UCAS "onestop" portal.
session = requests.session()
login_url = 'http://onestop.ucas.ac.cn/Ajax/Login/0'  # credential-POST endpoint; this one needs no captcha
headers = {
    'Host': 'onestop.ucas.ac.cn',
    "Connection": "keep-alive",
    'Referer': 'http://onestop.ucas.ac.cn/home/index',
    'X-Requested-With': 'XMLHttpRequest',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
}
post_data = {
    "username": username,
    "password": password,
    "remember": 'checked',
}
html = session.post(login_url, data=post_data, headers=headers).text
# The login page, the POST endpoint and the follow-up URL are all different;
# the JSON reply carries the follow-up URL in 'msg' — open it to finish login.
res = json.loads(html)
html = session.get(res['msg']).text
# Enter the course-selection system via the Identity token.
# Open the portal page that embeds the jwxk login link.
# Extract the Identity token from it.
url = "http://sep.ucas.ac.cn/portal/site/226/821"
r = session.get(url, headers=headers)
#f = open('r.html','w+',encoding='utf-8')
#f.write(r.text)
#f.close
code = re.findall(r'"http://jwxk.ucas.ac.cn/login\?Identity=(.*)"', r.text)[0]
# Open the course-selection system itself.
url = "http://jwxk.ucas.ac.cn/login?Identity=" + code
#headers['Host'] = "jwxk.ucas.ac.cn"
r = session.get(url, headers=headers)
temp = r.text
#f = open('temp.html','w+',encoding='utf-8')
#f.write(temp)
#f.close
#url = 'http://jwxk.ucas.ac.cn/courseManage/main'
#r = session.get(url, headers=headers)
#jwxk_html = r.text
#f = open('jwxk_html.html','w+',encoding='utf-8')
#f.write(jwxk_html)
#f.close
count = 0
# Poll the term schedule once per second forever; e-mail the configured
# mailboxes whenever a watched course has free slots.
while 1:
    time.sleep(1)
    count = count + 1
    print(count)  # heartbeat: number of polls so far
    url = 'http://jwxk.ucas.ac.cn/course/termSchedule'
    r = session.get(url, headers=headers)
    jwxk_html = r.text
    soup = BeautifulSoup(jwxk_html, 'lxml')
    soup = soup.table  # the schedule page is one big <table>
    for course in courses:
        course = re.compile(course)
        # Course links open in a new tab, so match <a target="_blank"> whose
        # text matches the watched course code.
        course_ind = soup.find_all(target='_blank', string=course)
        # Guard: the original indexed course_ind[0] unconditionally, so a
        # single course missing from the page raised IndexError and killed
        # the whole monitor loop.
        if not course_ind:
            continue
        course_info = course_ind[0].parent.parent  # the enclosing table row
        infomation = course_info.find_all('td')
        lim_num = int(infomation[6].string)  # enrollment cap
        num = int(infomation[7].string)      # currently enrolled
        item = infomation[2].string          # course name
        course_left = lim_num - num
        if course_left > 0:
            flag = send_mail(mailto_list, item + '课程可选',
                             course_info.text + '\n\n' + '余量为:' + str(course_left))
            if flag:
                print('有课余量,发送成功!' + item + '余量为:' + str(course_left))
            else:
                print('发送邮件失败!')
学习语言最好的方式是去看代码,然后动手去尝试体会,而不是看一些杂七杂八的文字总结和所谓的视频教程。善于利用百度,你也就成功了一半。