机器学习预测明星出轨概率------------多易大数据多易后生仔 荣誉出品

机器学习预测明星出轨概率

代码

package MLlibDemo.bayesfunc

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

import scala.util.Random

/**
 * Trains a Naive Bayes classifier on labelled sample data and uses it to predict,
 * for every person in a second unlabelled file, whether they "derailed" (出轨) or not (没出).
 *
 * Input files:
 *  - sample data: CSV with a header row and six columns
 *    (姓名, 职业, 收入, 年龄, 性别, 是否出轨);
 *  - prediction data: plain text, one comma-separated record per line with the
 *    first five of those columns (no label).
 *
 * Both paths may be passed as program arguments (`args(0)` = sample CSV,
 * `args(1)` = prediction file); the original hard-coded paths are used when absent.
 *
 * NOTE(review): categorical strings are turned into features via `String.hashCode`.
 * Spark's default (multinomial) Naive Bayes expects non-negative, count-like feature
 * values, and a hash code is neither guaranteed non-negative nor meaningful as a
 * magnitude. The encoding is kept as-is so results match the documented output;
 * consider StringIndexer/OneHotEncoder for real use.
 */
object BayesPredictionDerailed {

  /** Default location of the labelled sample CSV (used when `args(0)` is absent). */
  private val DefaultSamplePath = "F:\\data\\MllibData\\derailed\\derailed.csv"

  /** Default location of the unlabelled prediction file (used when `args(1)` is absent). */
  private val DefaultPredictionPath = "F:\\data\\MllibData\\derailed\\predictionDerailed.txt"

  /** Names of the numeric columns assembled into the `features` vector. */
  private val FeatureCols = Array("职业", "收入", "年龄", "性别")

  /**
   * Encodes a categorical string as a Double using its hash code.
   *
   * A null value (which the CSV reader produces for an empty cell) is encoded as 0.0,
   * the same value an empty string yields, so both input paths agree.
   */
  private def encode(value: String): Double =
    Option(value).fold(0.0)(_.hashCode.toDouble)

  /**
   * Maps the textual class to its numeric label.
   *
   * Labels start at 0 because the model's predicted class indices start at 0;
   * otherwise predictions could not be mapped back to the text.
   *
   * @throws IllegalArgumentException if the value is neither "出轨" nor "没出"
   */
  private def labelOf(raw: String): Int = raw match {
    case "出轨" => 0
    case "没出" => 1
    case other =>
      throw new IllegalArgumentException(
        s"Unexpected label '$other'; expected '出轨' or '没出'")
  }

  /**
   * Program entry point.
   *
   * @param args optional overrides: `args(0)` sample CSV path, `args(1)` prediction file path
   */
  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.WARN)

    val samplePath = args.lift(0).getOrElse(DefaultSamplePath)
    val predictionPath = args.lift(1).getOrElse(DefaultPredictionPath)

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Always release the session, even when training or prediction fails.
    try run(spark, samplePath, predictionPath)
    finally spark.stop()
  }

  /** Loads both inputs, trains the model, and prints the predictions. */
  private def run(spark: SparkSession, samplePath: String, predictionPath: String): Unit = {

    import spark.implicits._

    // Step 1: read the labelled sample data.
    val sampleSource: DataFrame = spark.read.option("header", true).csv(samplePath)

    // Step 2: read the data to predict.
    val predictionSource: Dataset[String] = spark.read.textFile(predictionPath)

    sampleSource.printSchema()
    sampleSource.show()

    predictionSource.show()

    val random = new Random()

    // 1.1 Encode the sample rows; the random column is only used to shuffle them.
    val sampleData: DataFrame = sampleSource.map { row =>
      (
        row.getString(0),
        encode(row.getString(1)),
        encode(row.getString(2)),
        encode(row.getString(3)),
        encode(row.getString(4)),
        labelOf(row.getString(5)),
        random.nextDouble()
      )
    }.toDF("姓名", "职业", "收入", "年龄", "性别", "是否出轨", "randomNumber")
      .sort("randomNumber") // sorting by a random key shuffles the rows

    // 2.1 Encode the rows to predict.
    val predictionData: DataFrame = predictionSource.map { line =>
      // limit = -1 keeps trailing empty fields, so a record ending in "," still has 5 fields.
      val fields = line.split(",", -1)
      require(
        fields.length >= 5,
        s"Expected at least 5 comma-separated fields but got ${fields.length}: $line")

      (fields(0), encode(fields(1)), encode(fields(2)), encode(fields(3)), encode(fields(4)))
    }.toDF("姓名", "职业", "收入", "年龄", "性别")

    // The same assembler serves both datasets: it packs the four encoded columns
    // into a single vector column named "features".
    val assembler = new VectorAssembler()
      .setInputCols(FeatureCols)
      .setOutputCol("features")

    // 1.2 Assemble the training features.
    val train = assembler.transform(sampleData)

    train.show(100, false)

    // 2.2 Assemble the features of the rows to predict.
    val prediction = assembler.transform(predictionData)

    prediction.show(100, false)

    // Configure and train the Naive Bayes model.
    val bayes = new NaiveBayes()
      .setFeaturesCol("features")
      .setLabelCol("是否出轨")

    val model = bayes.fit(train)

    val resFrame = model.transform(prediction)

    resFrame.show(200, false)

    // createOrReplaceTempView does not fail when a view named "tmp" already exists.
    resFrame.createOrReplaceTempView("tmp")
    spark.sql(
      """
        |select
        |`姓名`,
        |if(prediction = 0.0 ,"出轨","没出")
        |from
        |tmp
        |
        |""".stripMargin).show(false)
  }
}

样本数据

 |-- 姓名: string (nullable = true)
 |-- 职业: string (nullable = true)
 |-- 收入: string (nullable = true)
 |-- 年龄: string (nullable = true)
 |-- 性别: string (nullable = true)
 |-- 是否出轨: string (nullable = true)

+---+---+---+---+---+----+
| 姓名| 职业| 收入| 年龄| 性别|是否出轨|
+---+---+---+---+---+----+
| 张飞| 老师|  中| 青年|  男|  出轨|
| 赵云| 老师|  中| 中年|  男|  出轨|
|陆小凤| 老师|  低| 青年|  男|  没出|
|花满楼| 老师|  高| 老年|  女|  出轨|
| 田汉| 老师|  低| 青年|  女|  没出|
| 唐嫣|程序员|  高| 青年|  男|  没出|
|刘亦菲|程序员|  高| 青年|  女|  出轨|
|令狐冲|程序员|  中| 中年|  男|  没出|
|向问天|程序员|  中| 中年|  男|  没出|
|任我行|程序员|  中| 老年|  男|  出轨|
| 郭靖|公务员|  中| 老年|  女|  没出|
| 黄蓉|公务员|  低| 老年|  女|  没出|
|段正淳|公务员|  高| 中年|  男|  出轨|
| 段誉|公务员|  低| 中年|  女|  没出|
| 虚竹|公务员|  低| 青年|  男|  出轨|
+---+---+---+---+---+----+

加工后样本数据

+---+-----------+-------+---------+-------+----+-------------------+---------------------------------------+
|姓名 |职业         |收入     |年龄       |性别     |是否出轨|randomNumber       |features                               |
+---+-----------+-------+---------+-------+----+-------------------+---------------------------------------+
|赵云 |1039911.0  |20013.0|644583.0 |30007.0|0   |0.11902210706213434|[1039911.0,20013.0,644583.0,30007.0]   |
|唐嫣 |3.0796532E7|39640.0|1225058.0|30007.0|1   |0.22193461036465667|[3.0796532E7,39640.0,1225058.0,30007.0]|
|向问天|3.0796532E7|20013.0|644583.0 |30007.0|1   |0.37215955478033813|[3.0796532E7,20013.0,644583.0,30007.0] |
|田汉 |1039911.0  |20302.0|1225058.0|22899.0|1   |0.3801711879465671 |[1039911.0,20302.0,1225058.0,22899.0]  |
|虚竹 |2.0708419E7|20302.0|1225058.0|30007.0|0   |0.5338661706301351 |[2.0708419E7,20302.0,1225058.0,30007.0]|
|花满楼|1039911.0  |39640.0|1040019.0|22899.0|0   |0.6133461940370486 |[1039911.0,39640.0,1040019.0,22899.0]  |
|刘亦菲|3.0796532E7|39640.0|1225058.0|22899.0|0   |0.6264115387535791 |[3.0796532E7,39640.0,1225058.0,22899.0]|
|张飞 |1039911.0  |20013.0|1225058.0|30007.0|0   |0.7367350951493166 |[1039911.0,20013.0,1225058.0,30007.0]  |
|段誉 |2.0708419E7|20302.0|644583.0 |22899.0|1   |0.75648813271113   |[2.0708419E7,20302.0,644583.0,22899.0] |
|郭靖 |2.0708419E7|20013.0|1040019.0|22899.0|1   |0.7977059522353611 |[2.0708419E7,20013.0,1040019.0,22899.0]|
|陆小凤|1039911.0  |20302.0|1225058.0|30007.0|1   |0.8271853032439851 |[1039911.0,20302.0,1225058.0,30007.0]  |
|任我行|3.0796532E7|20013.0|1040019.0|30007.0|0   |0.8550640888666814 |[3.0796532E7,20013.0,1040019.0,30007.0]|
|令狐冲|3.0796532E7|20013.0|644583.0 |30007.0|1   |0.8560075884423889 |[3.0796532E7,20013.0,644583.0,30007.0] |
|黄蓉 |2.0708419E7|20302.0|1040019.0|22899.0|1   |0.9726944053415683 |[2.0708419E7,20302.0,1040019.0,22899.0]|
|段正淳|2.0708419E7|39640.0|644583.0 |30007.0|0   |0.9816986649324599 |[2.0708419E7,39640.0,644583.0,30007.0] |
+---+-----------+-------+---------+-------+----+-------------------+---------------------------------------+

待测试数据


+-------------+
|        value|
+-------------+
| 曹操,老师,中,青年,女|
|小乔,程序员,高,中年,女|
|吕布,公务员,低,青年,男|
+-------------+

加工后待测试数据

+---+-----------+-------+---------+-------+---------------------------------------+
|姓名 |职业         |收入     |年龄       |性别     |features                               |
+---+-----------+-------+---------+-------+---------------------------------------+
|曹操 |1039911.0  |20013.0|1225058.0|22899.0|[1039911.0,20013.0,1225058.0,22899.0]  |
|小乔 |3.0796532E7|39640.0|644583.0 |22899.0|[3.0796532E7,39640.0,644583.0,22899.0] |
|吕布 |2.0708419E7|20302.0|1225058.0|30007.0|[2.0708419E7,20302.0,1225058.0,30007.0]|
+---+-----------+-------+---------+-------+---------------------------------------+

预测结果

+---+-----------+-------+---------+-------+---------------------------------------+----------------------------------------+-----------+----------+
|姓名 |职业         |收入     |年龄       |性别     |features                               |rawPrediction                           |probability|prediction|
+---+-----------+-------+---------+-------+---------------------------------------+----------------------------------------+-----------+----------+
|曹操 |1039911.0  |20013.0|1225058.0|22899.0|[1039911.0,20013.0,1225058.0,22899.0]  |[-3749036.085448347,-4094948.1909440174]|[1.0,0.0]  |0.0       |
|小乔 |3.0796532E7|39640.0|644583.0 |22899.0|[3.0796532E7,39640.0,644583.0,22899.0] |[-4275772.177709845,-3947349.4869167404]|[0.0,1.0]  |1.0       |
|吕布 |2.0708419E7|20302.0|1225058.0|30007.0|[2.0708419E7,20302.0,1225058.0,30007.0]|[-5128643.399994654,-5133954.864827515] |[1.0,0.0]  |0.0       |
+---+-----------+-------+---------+-------+---------------------------------------+----------------------------------------+-----------+----------+

+---+------------------------------------------------+
|姓名 |(IF((prediction = CAST(0.0 AS DOUBLE)), 出轨, 没出))|
+---+------------------------------------------------+
|曹操 |出轨                                              |
|小乔 |没出                                              |
|吕布 |出轨                                              |
+---+------------------------------------------------+

机器学习预测明星出轨概率------------多易大数据多易后生仔 荣誉出品

发布了33 篇原创文章 · 获赞 12 · 访问量 3290

猜你喜欢

转载自blog.csdn.net/IT_BULL/article/details/104399773