Lately I have been using SparkSQL to export data from relational databases to a specified location. "Specified location" is a broad notion here: the local file system, distributed storage, a distributed database, or an in-memory file system such as Tachyon.

import org.apache.spark.{SparkConf, SparkContext}

object SparkSQLSelectTable {

  def main(args: Array[String]): Unit = {
    // configuration and new Spark Context
    val conf = new SparkConf().setAppName("SparkSQL_Select_Table")
      .set("spark.driver.allowMultipleContexts", "true")
      .set("spark.sql.shuffle.partitions", "12")
      // run locally
      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    // new SparkSQL Context
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    val tableName = "test"
    val sql = "select count(*) from test"

    // Connection URL to SQL Server
    val sqlsUrl = "jdbc:sqlserver://ip:port;DatabaseName=dbName;username=user;password=user"
    // Connection URL to PostgreSQL
    //val pgUrl = "jdbc:postgresql://ip:port/dbName?currentSchema=modelName&user=pg&password=pg"

    // load the table over JDBC as a DataFrame and register it
    // as a temporary table so it can be queried with SQL
    val df = sqlContext.load("jdbc", Map("url" -> sqlsUrl, "dbtable" -> tableName))
    df.registerTempTable(tableName)

    val dataResult = sqlContext.sql(sql)
    dataResult.show()

    // release Spark Context
    sc.stop()
  }
}
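To actually land the result in one of those locations, the DataFrame can be written back out. A minimal sketch continuing from dataResult above (the HDFS path, the tachyon:// alternative, and the test_count target table are illustrative assumptions; insertIntoJDBC is the Spark 1.3-era API and expects the target table to already exist):

    // write the result as JSON text; saveAsTextFile resolves the URI scheme
    // (file://, hdfs://, tachyon://), so the same call covers local,
    // distributed, and in-memory file systems
    dataResult.toJSON.saveAsTextFile("hdfs://namenode:8020/tmp/test_count")

    // or append the rows into another relational database over JDBC,
    // assuming pgUrl from the code above has been uncommented
    dataResult.insertIntoJDBC(pgUrl, "test_count", false)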
A note on the code above: when the master is local, it is enough to have the JDBC driver jar on the application's classpath. When the job is submitted to Mesos or YARN, the JDBC driver must be packaged together with the submitted jar using Maven or sbt; otherwise the executors fail with a "not found" exception (typically a ClassNotFoundException for the driver class).
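As a sketch of the packaging side, a build.sbt for use with the sbt-assembly plugin might look like the following (artifact coordinates and versions are illustrative assumptions to adapt to your environment; the Microsoft SQL Server driver was not published to Maven Central at the time, so the PostgreSQL driver is shown instead):

    // build.sbt -- minimal sketch; versions are illustrative assumptions
    name := "spark-sql-jdbc-demo"

    scalaVersion := "2.10.5"

    libraryDependencies ++= Seq(
      // Spark is provided by the cluster, so keep it out of the fat jar
      "org.apache.spark" %% "spark-sql" % "1.3.1" % "provided",
      // the JDBC driver must ship inside the submitted jar
      "org.postgresql" % "postgresql" % "9.4-1201-jdbc41"
    )

Running sbt assembly then produces a single fat jar in which the executors can find the driver class; alternatively, passing the driver jar to spark-submit via --jars distributes it without repackaging.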