How Spark SQL originally interacted with RDDs

In its earliest form, Spark SQL kept the metadata (the schema) separate from the data itself: you build the two pieces independently and then combine them into a DataFrame.

package com.sparksql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object DataFrameFromStructType {

  def main(args: Array[String]): Unit = {

    // Build the SparkSession entry point
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("DataFrameFromStructType")
      .getOrCreate()

    // Each line of people.txt is expected to look like "name,age"
    val lineRDD: RDD[String] = spark.sparkContext.textFile("C:\\Users\\小象\\Desktop\\6月\\0627\\resources\\people.txt")

    // Metadata (the "table header"): a schema describing the columns (name, age)
    val schema = StructType(List(StructField("name", StringType), StructField("age", IntegerType)))

    // The actual data to be processed, expressed as an RDD of Row objects
    val rowRDD: RDD[Row] = lineRDD.map(line => {
      val lineArray: Array[String] = line.split(",")
      Row(lineArray(0), lineArray(1).trim.toInt)
    })

    // Combine data and schema into a DataFrame; peopleDF can be treated as a two-dimensional table
    val peopleDF: DataFrame = spark.createDataFrame(rowRDD, schema)

    peopleDF.createOrReplaceTempView("people")

    spark.sql("select avg(age) avgage from people").show()

    spark.stop()
  }
}

However, because this approach is fairly cumbersome, it was later superseded by the case-class style. Under the hood the mechanism is still the same: the schema is simply derived through reflection and wrapped up for you, as sketched below.
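For comparison, here is a minimal sketch of the case-class style, assuming the same comma-separated people.txt input; the Person case class and the object/file name are illustrative choices, not part of the original post:

package com.sparksql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

// Hypothetical case class standing in for the explicit StructType;
// Spark derives the column names and types from its fields via reflection.
case class Person(name: String, age: Int)

object DataFrameFromCaseClass {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("DataFrameFromCaseClass")
      .getOrCreate()

    // Needed for the rdd.toDF() conversion
    import spark.implicits._

    // Assumed input file with "name,age" lines
    val lineRDD: RDD[String] = spark.sparkContext.textFile("people.txt")

    // Map each line to a Person; the schema is inferred from the case class
    val peopleDF: DataFrame = lineRDD
      .map(line => {
        val fields = line.split(",")
        Person(fields(0), fields(1).trim.toInt)
      })
      .toDF()

    peopleDF.createOrReplaceTempView("people")
    spark.sql("select avg(age) avgage from people").show()

    spark.stop()
  }
}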

Reposted from blog.csdn.net/dudadudadd/article/details/113868305