在数据分析过程中往往要处理较大的数据集,这时就需要连接数据库来读写数据。
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from sqlalchemy import create_engine
import MySQLdb as msd


# Helper for writing a table into the database, kept around because some
# intermediate results need to be persisted while working.
def savetosql(DF, tablename):
    """Append the rows of *DF* to table *tablename* in the ``jing`` schema.

    Args:
        DF: The pandas DataFrame to write.
        tablename: Name of the destination table. With ``if_exists='append'``
            pandas inserts into an existing table and creates the table
            when it does not exist yet.

    Returns:
        None.

    Raises:
        Whatever SQLAlchemy / the MySQL driver raises when the connection
        or the insert fails; nothing is swallowed here.
    """
    # Connection settings are hard-coded: local MySQL server, user ``root``
    # with an empty password, database ``jing``, via the MySQLdb driver.
    yconnect = create_engine(
        'mysql+mysqldb://root:@127.0.0.1:3306/jing?charset=utf8'
    )
    try:
        DF.to_sql(tablename, yconnect, schema='jing', if_exists='append')
    finally:
        # A new engine is built on every call, so release its connection
        # pool explicitly instead of leaving connections open.
        yconnect.dispose()
注意:指定 chunksize 后,pd.read_sql 返回的 sql 对象是一次性的迭代器,遍历一次后即被耗尽;每次要重新获取 sql 对象,都需要重新执行 read_sql 访问一次数据库(!!!)
# Read the source table from the database.
engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/jing?charset=utf8')
# The table is too large to load in one go, so chunksize makes read_sql hand
# the data back in pieces of at most 10000 rows.
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

# Select the entries viewed exactly once: rows of counts1_ whose 'realIP'
# value equals 1.
# NOTE(review): counts1_ is defined outside this section; presumably it holds
# per-IP click counts -- confirm against the cell that builds it.
viewed_once = counts1_['realIP'] == 1
f = counts1_[viewed_once]
del f[1]
f.columns = [u'点击次数']
f.index.name = 'realIP'

# Alternative kept from the original (merge chunk by chunk):
# g = [pd.merge(f,i[['fullURLId','fullURL','realIP']],right_on = 'realIP',left_index=True,how ='left') for i in sql]
# Keep only the URL/IP columns of every chunk, stack the chunks, then
# left-join them onto f by matching f's index against the 'realIP' column.
url_columns = ['fullURLId', 'fullURL', 'realIP']
g = pd.concat([chunk[url_columns] for chunk in sql])
h = pd.merge(f, g, how='left', left_index=True, right_on='realIP')
h
Python操作数据库
import MySQLdb as msd  # NOTE: MySQLdb is the Python 2.7 driver; on Python 3 use pymysql

# connect() opens the database connection; user name, password, host and
# similar settings are passed as keyword arguments.
conn = msd.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='',  # database password
    db='jing',  # database name
)
try:
    # A cursor obtained from the connection is what executes statements.
    cur = conn.cursor()
    try:
        # execute() takes plain SQL text.
        # Create the table.
        cur.execute('create table student(id int, name varchar(20), class varchar(20), age varchar(10))')
        # Insert rows one at a time.
        cur.execute("insert into student values('2','gege','class 2 grade 3','20')")
        cur.execute("insert into student values('2','bob','class 2 grade 5','21')")
        # Update the rows matching the condition.
        cur.execute("update student set class = 'class 2 grade 5' where name='bob'")
        # Delete the rows matching the condition.
        cur.execute("delete from student where age='9'")
        # cur.execute("drop table student3,student4,student5,student6")
    finally:
        cur.close()  # always release the cursor, even if a statement failed
    # Commit the transaction; without it the inserted rows are not persisted.
    conn.commit()
except Exception:
    # Undo pending changes before re-raising so that a failure does not
    # leave a half-applied transaction behind.
    conn.rollback()
    raise
finally:
    conn.close()  # always close the database connection