用 Spark 解析 XML 的方法有很多,这里提供一种比较简单的方法:借助 spark-xml。
//需导入spark-xml_2.10-0.4.0.jar
package com.beagledata.spark
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
/**
 * XML data processing with SQLContext.
 *
 * Loads an XML file through the `com.databricks.spark.xml` data source (one record per
 * `ROW` element), keeps the columns ZDJG_V, BGT, SXZDDL and XXZDDL, drops every row in
 * which any of those columns is missing, and writes each remaining row as one
 * semicolon-separated text line.
 *
 * Created by drguo on 2017/8/18.
 * blog.csdn.net/dr_guo
 */
object PCSDataSQLProcess {
  val conf = new SparkConf().setAppName("pcsdata-sql")
    .set("spark.jars.packages", "io.netty:netty-common:4.1.8.Final")
    .set("spark.jars.exclude", "io.netty:netty-common")
    //.setMaster("local")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)

  /**
   * Runs the job: read the XML input, project and clean the four columns, write text output.
   * The SparkContext is stopped when the job finishes or fails.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      val df = sqlContext.read
        .format("com.databricks.spark.xml")
        .option("rowTag", "ROW")
        .load("/data1/Data/sinopec/sourceFile/xx.xml")
        //.load("src/main/resources/pcsTestData.xml")

      // Drop missing values: a row is removed entirely if any one of its fields is missing.
      val pcsdf = df.select("ZDJG_V", "BGT", "SXZDDL", "XXZDDL").na.drop()

      // Go through .rdd explicitly so the same code is valid whether DataFrame.map yields an
      // RDD (Spark 1.x) or a Dataset (Spark 2.x). The closure must not touch members of this
      // object, otherwise executors would initialise it and try to build a SparkContext.
      val pcsrdd = pcsdf.rdd.map { x =>
        // Keep only the part of BGT before the first '|' or '$'. takeWhile yields "" for
        // values such as "|" or "$", where split(...)(0) would hit an empty array and throw.
        val bgtHead = x(1).toString.takeWhile(c => c != '|' && c != '$')
        Seq(x(0).toString, bgtHead, x(2).toString, x(3).toString).mkString(";")
      }
      //pcsrdd.foreach(println)

      pcsrdd.repartition(40).saveAsTextFile("/data1/Data/sinopec/xxData")
    } finally {
      // Release cluster resources even when the job fails.
      sc.stop()
    }
  }
}
参考资料:https://github.com/databricks/spark-xml
/**
 * Reads the first sheet of an Excel workbook. The workbook is opened through
 * WorkbookFactory, so the format is detected from the file content (the original note
 * states compatibility with Excel 2003/2007/2010).
 *
 * The row at index 0 is skipped (presumably a header row - confirm against the input files).
 * For every other defined row, column 0 is read as the well name (numeric or text), column 1
 * as a date formatted with the pattern "yy-M-d", and column 3 as a string. Rows that are not
 * defined in the sheet (gaps) are skipped.
 *
 * NOTE(review): the cells in columns 0, 1 and 3 are dereferenced without a null check, so a
 * defined row that lacks one of them will still fail - confirm the inputs always provide them.
 *
 * @param path path of the Excel file to read
 * @return one entry per row: (well name + "\t" + formatted date, value of column 3)
 */
def readXlsx(path : String) : ListBuffer[(String,String)] = {
  val fmt = new SimpleDateFormat("yy-M-d")
  val is = new FileInputStream(path)
  try {
    val workbook = WorkbookFactory.create(is)
    val sheet = workbook.getSheetAt(0) // first sheet
    val lb = new ListBuffer[(String, String)]()
    // Iterate up to the last row index rather than the physical row count: with gaps in the
    // sheet the physical count is smaller than the last index, so trailing rows were missed.
    for (i <- 1 to sheet.getLastRowNum) {
      // getRow returns null for rows that are not defined; skip those.
      Option(sheet.getRow(i)).foreach { row =>
        val cellwellname = row.getCell(0)
        val cellDCDATE = row.getCell(1)
        val cellDNAME0 = row.getCell(3)
        // The same column can hold different data types.
        // 0 is the numeric cell-type code of the int-based POI API.
        // NOTE(review): confirm the POI version in use still returns an int here.
        val wellname =
          if (cellwellname.getCellType == 0) cellwellname.getNumericCellValue.toString
          else cellwellname.getRichStringCellValue.getString.trim
        val DCDATE = fmt.format(cellDCDATE.getDateCellValue)
        val DNAME0 = cellDNAME0.getStringCellValue
        lb += ((wellname + "\t" + DCDATE, DNAME0))
      }
    }
    lb
  } finally {
    // The stream was never closed before; release the file handle on success and on failure.
    is.close()
  }
}