python+BeautifulSoup+selenium+mysqldb完成数据抓取

# coding=utf-8
'''
Created on 2017年2月20日

@author: chenkai
'''
import MySQLdb
import sys
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.remote import webelement
from selenium.webdriver.remote.webelement import WebElement
'''
连接数据库
'''
def getConn(host='127.0.0.1', user='root', passwd='123456', port=3306):
    """Open a MySQL connection through the MySQLdb driver.

    The defaults reproduce the settings that used to be hard-coded here, so
    a bare ``getConn()`` call connects exactly as before.

    Args:
        host: Host name or IP address of the MySQL server.
        user: Account used to log in.
        passwd: Password for ``user``.
        port: TCP port the server listens on.

    Returns:
        The connection object returned by ``MySQLdb.connect``. No default
        database is selected; the connection charset is ``utf8``.
    """
    # NOTE(review): the default credentials are development values kept for
    # backward compatibility; pass real ones explicitly instead of editing
    # this file.
    # Keyword arguments make sure ``port`` is actually honoured (it used to
    # be assigned to a local and then ignored in favour of a literal).
    return MySQLdb.connect(host=host, user=user, passwd=passwd,
                           port=port, charset="utf8")
def getCursor(mysqlConn):
    """Create and return a new cursor from the given connection object."""
    cursor = mysqlConn.cursor()
    return cursor
def closeDBConnandCur(cur, mysqlConn):
    """Close the cursor, commit pending work, and close the connection.

    Args:
        cur: Cursor to close.
        mysqlConn: Connection the cursor belongs to; it is committed and
            then closed.

    Raises:
        Whatever ``cur.close()`` or ``mysqlConn.commit()`` raises. The
        connection is still closed in that case.
    """
    try:
        cur.close()
        # Commit before closing the connection so pending writes are kept.
        mysqlConn.commit()
    finally:
        # Always release the connection, even if closing the cursor or the
        # commit failed.
        mysqlConn.close()
# Open the database connection used for the rest of the script.
mysqlConn = getConn()
# Obtain a cursor on that connection.
cur = mysqlConn.cursor()
# Switch to the `test` database.
cur.execute("use test")

'''
浏览器
'''
# Scrape the shop list from the Nuomi page with Chrome and store each shop's
# URL and name in the k_bdnm_shopinfo table.
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
shopUrl = ""
shopName = ""
index = 1001  # incremented before use, so the first inserted shop_id is 1002
try:
    # Raw string so the backslashes in the Windows path are never read as
    # escape sequences.
    driver = webdriver.Chrome(
        executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        chrome_options=options)  # launch the Chrome browser
    try:
        driver.get('https://sanya.nuomi.com/326')
        # NOTE(review): a commented-out click on the "next-btn" element used
        # to sit here (presumably pagination); only the initially loaded
        # page is scraped.
        page = driver.page_source
        # page_source is passed without from_encoding: per the Beautiful
        # Soup docs that argument is ignored (with a warning) when the
        # markup is already decoded text.
        soup = BeautifulSoup(page, 'html.parser')
        div_list = soup.find_all("div", class_="contentbox")
        for con in div_list:
            index += 1
            link = con.a
            title = con.h4
            href = link.get("href") if link is not None else None
            if href is None or title is None:
                # Entry lacks the expected <a href> or <h4>; skip it instead
                # of aborting the whole run.
                print("Skipping entry %d: missing link or title" % index)
                continue
            # Original author's note: encode to UTF-8 so the text is not
            # garbled after being inserted into MySQL.
            shopUrl = ("https:" + href).encode('utf-8')
            shopName = title.get_text().encode('utf-8')
            print("%s %s" % (shopUrl, shopName))
            print('insert into  k_bdnm_shopinfo values(%d,%s,%s)' % (index, shopUrl, shopName))
            try:
                # Parameterized query: the driver escapes the scraped text,
                # so quotes in a shop name can neither break nor inject SQL.
                cur.execute("insert into  k_bdnm_shopinfo values(%s,%s,%s)",
                            (index, shopUrl, shopName))
            except MySQLdb.Error as e:
                # Best effort: report the failed row and keep going.
                print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Always shut the browser down, even if scraping failed.
        driver.quit()
finally:
    # Close the cursor and the connection; the helper commits first, so rows
    # inserted before a failure are kept.
    closeDBConnandCur(cur, mysqlConn)

'''
数据表信息
'''
-- Table filled by the scraping script: one row per shop.
CREATE TABLE `k_bdnm_shopinfo` (
    -- Primary key; auto-increments when no explicit value is inserted.
    `shop_id`   INT(11)      NOT NULL AUTO_INCREMENT,
    -- URL of the shop page.
    `shop_url`  VARCHAR(300) NOT NULL,
    -- Display name of the shop.
    `shop_name` VARCHAR(100) NOT NULL,
    PRIMARY KEY (`shop_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

猜你喜欢

转载自st4024589553.iteye.com/blog/2358248