1、测试数据people.txt
Michael, 29
Andy, 30
Justin, 19
2、代码测试
package com.cn.sparkSql
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
/**
 * Attaches an explicit schema to an RDD of raw text lines and converts it
 * into a DataFrame, then queries it with Spark SQL.
 *
 * Expected input file `people.txt`, one record per line: "<name>, <age>".
 */
object RddToDataFram {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Spark SQL basic example")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    // Needed for the Encoder used by results.map(...) below.
    import spark.implicits._

    val peopleRdd = spark.sparkContext.textFile("people.txt")

    // Column names for the DataFrame.
    val schemaString = "name,age"
    // Build one StructField per column. Both columns are kept as strings
    // here — the age is NOT parsed into an integer.
    val fields = schemaString.split(",")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    // Wrap the fields into a schema.
    val schema = StructType(fields)

    // Convert each text record into a Row; trim the age because the input
    // has a space after the comma ("Michael, 29").
    val rowRdd = peopleRdd.map(_.split(","))
      .map(attr => Row(attr(0), attr(1).trim))

    // Combine the schema and the row RDD into a DataFrame.
    val peopleDF = spark.createDataFrame(rowRdd, schema)

    // Register a temporary view so the data can be queried with SQL.
    peopleDF.createTempView("people")

    // The results of SQL queries are DataFrames and support all the normal
    // RDD operations; row columns can be accessed by index or by name.
    val results = spark.sql("select name from people")
    results.map(attr => s"name:${attr.get(0)}").show()

    // Release the local Spark resources before exiting.
    spark.stop()
  }
}
3、运行结果
+------------+
| value|
+------------+
|name:Michael|
| name:Andy|
| name:Justin|
+------------+