from odps.df import DataFrame
users = DataFrame(o.get_table('odps_tc_257100_f673506e024.zhaopin_round2_user'))
df.dtypes
df.head(5)
df[df.sepalwidth > 3]['name', 'sepalwidth'].head(5)
print('获取方法表格')
print(users.schema)
print(users['experience'][1:10])
迭代
for jdStr in train_jd_data['job_description'].execute():
o.get_table('modified_zhaopin_round2_jd')
for jdStr in o.read_table('modified_zhaopin_round2_jd'):
with table.open_reader(partition=partition, **kw) as reader:
for jdStr in train_jd_data['job_description'][1]:
>>> dual.creation_time
datetime.datetime(2014, 6, 6, 13, 28, 24)
>>> dual.is_virtual_view
False
>>> dual.size
448
>>> dual.schema.columns
%sql select * from pyodps_iris limit 5
优秀
https://yq.aliyun.com/articles/300307?spm=a2c4e.11153940.0.0.2c623566uS5Kcg
要学的那么多,特征工程不会,spark不会
train_jd_data= DataFrame(o.get_table('modified_zhaopin_round2_jd')).to_pandas()
train_jd_data= o.get_table('modified_zhaopin_round2_jd').to_df().to_pandas()
train_jd_data= DataFrame(o.get_table('modified_zhaopin_round2_jd'))  # 注:这样写取不到值(novalue)
users.exclude('zip_code', 'age').head(5)
users.select(users.exclude('zip_code', 'sex'), sex_bool=users.sex == 'M').head(5)
value_counts()
df = users.groupby('occupation').agg(count=users['occupation'].count())
df.sort(df['count'], ascending=False)[:10]
users[users.age.between(20, 25)].count() #年龄在20到25岁之间的人有多少个
users.groupby(users.sex).agg(count=users.count())
保存新表
movies = DataFrame(o.get_table('pyodps_ml_100k_movies'))
ratings = DataFrame(o.get_table('pyodps_ml_100k_ratings'))
o.delete_table('pyodps_ml_100k_lens', if_exists=True)
lens = movies.join(ratings).join(users).persist('pyodps_ml_100k_lens')
现在我们把0到80岁的年龄按每10岁一段,划分成8个年龄段。
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
cut_lens = lens[lens, lens.age.cut(range(0, 81, 10), right=False, labels=labels).rename('年龄分组')]
取“年龄分组”和“age”两列去重后的前10条记录查看。
cut_lens['年龄分组', 'age'].distinct()[:10]
cut_lens.groupby('年龄分组').agg(cut_lens.rating.count().rename('评分总数'), cut_lens.rating.mean().rename('评分均值'))
# 2 创建 DataFrame
PyODPS DataFrame 中有三类对象:Collection(DataFrame)、Sequence、Scalar,分别表示表结构(或者二维结构)、列(一维结构)、标量。
>>> iris = DataFrame(o.get_table('pyodps_iris'))
>>> iris2 = o.get_table('pyodps_iris').to_df() # 使用表的to_df方法
>>> df2 = DataFrame(df, unknown_as_string=True, as_type={'null_col2': 'float'})
>>> df2.dtypes
>>> #2.1,Sequence,Scalar。
>>> iris.sepallength.head(5)  # 获取列,也可以写成 getattr(df, 'column_name') 或 df['column_name']
>>> #运算
>>> (iris.sepallength * 10).log().head(5)
>>> fields = [iris.sepallength,
...           (iris.sepallength / 2).rename('sepallength除以2'),
...           (iris.sepallength ** 2).rename('sepallength的平方')]
pyodps
猜你喜欢
转载自blog.csdn.net/serenysdfg/article/details/103277213
今日推荐
周排行