这一节介绍一些基本的 DataFrame 操作,更具体的用法会在以后的教程中介绍。
printSchema() 以树的形式打印 DataFrame 的结构信息
select()查询某一列
也可以和 select()、where()、orderBy() 组合使用
import org.apache.spark.sql.SparkSession
/**
 * Minimal Spark SQL example: loads a JSON file into a DataFrame and runs a few
 * basic operations on it (show, printSchema, select, orderBy, groupBy/count).
 */
object SparkSqltest1 {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultJsonPath = "hdfs://192.168.1.181:9000/json/data.json"

  /**
   * Runs the demo against a local Spark session and prints each result to stdout.
   *
   * @param args optional; when present, `args(0)` replaces the default JSON input path
   */
  def main(args: Array[String]): Unit = {
    val jsonPath = args.headOption.getOrElse(DefaultJsonPath)

    // Create the SparkSession.
    val sparkSession = SparkSession.builder()
      .appName("test1")
      .master("local[*]")
      .getOrCreate()
    // Brings the $"column" string interpolator into scope.
    import sparkSession.implicits._

    try {
      // Read the file into a DataFrame.
      val df = sparkSession.read.json(jsonPath)
      df.show()
      // Print the schema as a tree.
      df.printSchema()
      // Same column selected by plain name would be: df.select("name").show()
      df.select($"name").show()
      // Project name and age * 2, ordered by age ascending.
      df.select($"name", $"age" * 2).orderBy($"age".asc).show()
      // Count the rows for each distinct age.
      df.groupBy($"age").count().show()
    } finally {
      // Stop the session even when reading or querying fails.
      sparkSession.stop()
    }
  }
}
输出的结果为:
+---+--------+
|age|    name|
+---+--------+
|  1|zhangsan|
|  2|    lisi|
+---+--------+
root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
+--------+
|    name|
+--------+
|zhangsan|
|    lisi|
+--------+
+--------+---------+
|    name|(age * 2)|
+--------+---------+
|zhangsan|        2|
|    lisi|        4|
+--------+---------+
+---+-----+
|age|count|
+---+-----+
|  1|    1|
|  2|    1|
+---+-----+