Spark Scala ML Word2Vec: keyword extraction from English documents

1. The program is complete and runs as-is; just place a few English documents under the directory that `path` points to.
2. A spark ml program: Spark 2.x, Scala 2.11.x, JDK 1.8.
3. It extracts keywords from English documents.
4. Mind the difference between spark ml and mllib; this code sticks to the newer stack: ml + DataFrame + Spark SQL (see the sketch after this list).
5. Pipeline: read the English documents, tokenize, filter stop words, build Word2Vec, and print the formatted results.
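
To make the ml-vs-mllib distinction in point 4 concrete, here is a minimal, hedged sketch (the docs data is illustrative, and it assumes a SparkContext sc and SQLContext sqlContext like the ones created in the program below). mllib's Word2Vec is RDD-based; ml's is a DataFrame Estimator:

import org.apache.spark.mllib.feature.{Word2Vec => MllibWord2Vec}
import org.apache.spark.ml.feature.{Word2Vec => MlWord2Vec}

val docs = Seq(Seq("spark", "ml", "word2vec"), Seq("spark", "sql", "dataframe"))

// mllib (RDD-based): fit on an RDD[Seq[String]]; synonyms come back as Array[(String, Double)]
val mllibModel = new MllibWord2Vec().setVectorSize(4).setMinCount(1).fit(sc.parallelize(docs))

// ml (DataFrame-based): an Estimator that reads a DataFrame column of word arrays
val df0 = sqlContext.createDataFrame(docs.map(Tuple1(_))).toDF("words")
val mlModel = new MlWord2Vec().setInputCol("words").setOutputCol("result").setVectorSize(4).setMinCount(1).fit(df0)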



import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.ToAnalysis
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, Word2Vec}
import org.apache.spark.sql.SQLContext

object word2vec_test01 {
  def main(args: Array[String]):Unit={

    val conf = new SparkConf().setMaster("local[2]").setAppName("word2vec_test01")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    val path = "D:\\soft\\IDEA\\data\\input\\news_data\\*"
    val rdd = sc.wholeTextFiles(path) // (filePath, fileContent) pairs, one per document
    val filename = rdd.map(_._1) // file paths; kept from the original, not used downstream
    // Build the stop-word list; ansj's insertStopWords expects a Java
    // collection, hence the asJava conversion.
    import scala.collection.JavaConverters._
    val stopWords = sc.textFile("stop_words_eng.txt").collect().toSeq.asJava
    // Filter stop words and stop natures ("w" is punctuation in ansj's tag set)
    val filter = new StopRecognition().insertStopWords(stopWords)
    filter.insertStopNatures("w", null, "null")

    val splitWordRdd = rdd.map(file => {
      // Segment with ansj, drop stop words/natures, then split on spaces
      val str = ToAnalysis.parse(file._2).recognition(filter).toStringWithOutNature(" ")
      (file._1, str.split(" "))
    })
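    // Note: ansj is primarily a Chinese segmenter. For English text, the
    // spark.ml transformers imported above are a natural alternative. A
    // minimal sketch (my addition, not the original pipeline; its output
    // is not used below):
    val rawDf = sqlContext.createDataFrame(rdd).toDF("fileName", "text")
    val englishTokens = new RegexTokenizer().setInputCol("text").setOutputCol("tokens").setPattern("\\W+").transform(rawDf)
    val englishWords = new StopWordsRemover().setInputCol("tokens").setOutputCol("filtered").transform(englishTokens)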

    // DataFrame with schema (fileName: String, words: Array[String])
    val df = sqlContext.createDataFrame(splitWordRdd).toDF("fileName", "words")
    df.rdd.map(_.toString).foreach(println)
    // Build Word2Vec: learn a mapping from words to vectors.
    // vectorSize 4 keeps the demo output small (the spark.ml default is 100);
    // minCount 4 drops words that appear fewer than 4 times.
    val word2Vec = new Word2Vec()
      .setInputCol("words")
      .setOutputCol("result")
      .setVectorSize(4)
      .setMinCount(4)
    val model = word2Vec.fit(df)
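    // transform() appends the "result" column: one vector per document,
    // computed as the average of the Word2Vec vectors of its words.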
    val result = model.transform(df)
    result.rdd.foreach(println)
    // Inspect the learned vocabulary: model.getVectors is a DataFrame with columns (word, vector)
    val vocs = model.getVectors
    vocs.createOrReplaceTempView("vocs")
    // Parse the vector's string form "[v0,v1,v2,v3]" into one column per component
    val result1 = sqlContext.sql(
      """
        |select word,
        |       split(table2.values, ',')[0] as values01,
        |       split(table2.values, ',')[1] as values02,
        |       split(table2.values, ',')[2] as values03,
        |       split(table2.values, ',')[3] as values04
        |from (select word,
        |             substring(temp1, 2, length(temp1) - 2) as values
        |      from (select word,
        |                   string(vector) as temp1
        |            from vocs) table1) table2
      """.stripMargin)
    result1.show()
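    // Alternative sketch (my addition, not in the original): read components
    // straight off the ml Vector with a UDF instead of string parsing.
    import org.apache.spark.ml.linalg.Vector
    import org.apache.spark.sql.functions.{col, lit, udf}
    val component = udf((v: Vector, i: Int) => v(i))
    vocs.select(col("word"), component(col("vector"), lit(0)).as("values01")).show()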
    // Show the topN synonyms of a word. Note that in spark.ml,
    // findSynonyms returns a DataFrame(word, similarity), unlike the
    // Array[(String, Double)] returned by mllib:
//    model.findSynonyms("中国", 40).show(40)

//    Official API docs for the model (note: this link is the RDD-based mllib
//    Word2VecModel; this program uses the DataFrame-based spark.ml one):
//    http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.feature.Word2VecModel
  }
}


Reposted from blog.csdn.net/qq_31032181/article/details/82702025