1、进入spark-shell模式
spark-shell --master yarn --executor-memory 4g --num-executors 3 --executor-cores 4
2、spark sql查询Hive数据库
import spark.sql
sql("use database_name")
sql("show tables").show
3、读取hdfs文件数据
val data = spark.read.format("csv").option("sep", ",").option("header","true").load(file_path + file_name)
4、存储文件(默认hdfs路径)
data.write.format("csv").save("/data/....")
5、读取hive表数据
val res = spark.sql("select * from table_1 where day='20181230'")
6、注册成表
res.createOrReplaceTempView("Res")    // registerTempTable 自 Spark 2.0 起已弃用，改用 createOrReplaceTempView
7、更换属性
val ss = data.selectExpr("_c0 as like","_c1 as session_id","_c2 as uid1")
8、删除某列属性
val s1 = data.drop("_c0")
9、一列转换成多列
val df2 =df1.withColumn("_corrupt_record",split(col("_corrupt_record"),","))
.select(col("_corrupt_record").getItem(0).as("uid"),col("_corrupt_record").getItem(1).as("number"))
10、过滤数字（=== 是三个等号）
val uid = df2.filter($"number"===1)
11、过滤空值
val s_1 = res.filter("like is not null").filter("session_id is not null")