SparkSQL 在最开始的时候是元数据与数据分开的。
package com.sparksql
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{
DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{
IntegerType, StringType, StructField, StructType}
// NOTE(review): object name contains a typo ("Stuct" -> "Struct") but is kept
// unchanged so existing callers / spark-submit class references still resolve.
object DataFrameFromStuctType {

  /**
   * Demonstrates building a DataFrame the "manual" way: pair an RDD[Row]
   * (the data) with an explicitly declared StructType (the metadata/schema),
   * then query it with SQL.
   *
   * @param args optional; args(0) overrides the input text file path,
   *             otherwise the original hard-coded path is used.
   */
  def main(args: Array[String]): Unit = {
    // Entry point to Spark SQL functionality; local single-threaded master
    // keeps the example runnable without a cluster.
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("DataFrameFromStuctType")
      .getOrCreate()

    // Input path is now configurable; defaults to the original location for
    // backward compatibility.
    val inputPath: String =
      args.headOption.getOrElse("C:\\Users\\小象\\Desktop\\6月\\0627\\resources\\people.txt")
    val lineRDD: RDD[String] = spark.sparkContext.textFile(inputPath)

    // Metadata half of the DataFrame: the schema (name: String, age: Int).
    val schema = StructType(List(
      StructField("name", StringType),
      StructField("age", IntegerType)))

    // Data half of the DataFrame: each CSV-ish line becomes a Row matching
    // the schema. Blank lines are skipped so a trailing newline in the file
    // does not blow up fields(1) with an ArrayIndexOutOfBoundsException.
    val rowRDD: RDD[Row] = lineRDD
      .filter(_.trim.nonEmpty)
      .map { line =>
        val fields: Array[String] = line.split(",")
        // trim handles the "name, age" spacing convention of people.txt
        Row(fields(0), fields(1).trim.toInt)
      }

    // A DataFrame is effectively a two-dimensional table: rows + schema.
    val peopleDF: DataFrame = spark.createDataFrame(rowRDD, schema)
    peopleDF.createOrReplaceTempView("people")
    spark.sql("select avg(age) avgage from people").show()

    spark.stop()
  }
}
但是这种方法由于比较麻烦，所以后来被样例类（case class）的方式取代了；不过底层其实还是它，只是用反射封装了而已。