机器学习预测明星出轨概率
代码
package MLlibDemo.bayesfunc
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import scala.util.Random
/**
 * Demo job: trains a Spark ML `NaiveBayes` classifier on a labelled CSV sample and
 * applies it to a small text file of unlabelled records.
 *
 * Input files (hard-coded local paths):
 *  - `derailed.csv`: header row, then name, occupation, income, age group, gender, label.
 *  - `predictionDerailed.txt`: one comma-separated record per line with the same first
 *    five fields and no label.
 *
 * The label strings are mapped to 0 ("出轨") and 1 ("没出"); the final SQL query maps
 * prediction 0.0 back to "出轨" and anything else to "没出".
 */
object BayesPredictionDerailed {

  /** Columns assembled into the `features` vector for both training and prediction data. */
  private val FeatureColumns: Array[String] = Array("职业", "收入", "年龄", "性别")

  /**
   * Encodes a categorical string as a number using `String.hashCode`.
   *
   * NOTE(review): `hashCode` yields arbitrary magnitudes and can be negative for longer
   * strings, while the default (multinomial) NaiveBayes model expects non-negative
   * feature values. It works for the short category names in the sample data; an
   * index-based encoding would be more robust, but would change the model output.
   */
  private def encodeCategory(value: String): Double = value.hashCode.toDouble

  /**
   * Entry point: creates a local SparkSession, runs the pipeline and always stops the
   * session afterwards, even when the pipeline fails.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    try {
      run(spark)
    } finally {
      // Release the local Spark context whether or not the job succeeded.
      spark.stop()
    }
  }

  /** Loads the data, trains the model, and prints the intermediate and final results. */
  private def run(spark: SparkSession): Unit = {
    import spark.implicits._

    val sampleSource: DataFrame = spark.read.option("header", true).csv("F:\\data\\MllibData\\derailed\\derailed.csv")
    val predictionSource: Dataset[String] = spark.read.textFile("F:\\data\\MllibData\\derailed\\predictionDerailed.txt")
    sampleSource.printSchema()
    sampleSource.show()
    predictionSource.show()

    // A random sort key is attached to every row so the training rows can be shuffled.
    val random = new Random()
    val sampleData = sampleSource.map(row => {
      val label: Int = row.getString(5) match {
        case "出轨" => 0
        case "没出" => 1
        case other =>
          // Fail with a descriptive message instead of an opaque scala.MatchError.
          throw new IllegalArgumentException(
            s"Unexpected label '$other' for sample '${row.getString(0)}'; expected '出轨' or '没出'")
      }
      (
        row.getString(0),
        encodeCategory(row.getString(1)),
        encodeCategory(row.getString(2)),
        encodeCategory(row.getString(3)),
        encodeCategory(row.getString(4)),
        label,
        random.nextDouble()
      )
    }).toDF("姓名", "职业", "收入", "年龄", "性别", "是否出轨", "randomNumber")
      .sort("randomNumber")

    val predictionData = predictionSource.map(line => {
      val fields = line.split(",")
      // Guard against short/malformed lines before indexing into the array.
      require(fields.length >= 5, s"Malformed prediction record (expected 5 comma-separated fields): '$line'")
      (
        fields(0),
        encodeCategory(fields(1)),
        encodeCategory(fields(2)),
        encodeCategory(fields(3)),
        encodeCategory(fields(4))
      )
    }).toDF("姓名", "职业", "收入", "年龄", "性别")

    // One assembler serves both datasets: they share the same feature columns.
    val assembler = new VectorAssembler()
      .setInputCols(FeatureColumns)
      .setOutputCol("features")

    val train = assembler.transform(sampleData)
    train.show(100, false)

    val prediction = assembler.transform(predictionData)
    prediction.show(100, false)

    val bayes = new NaiveBayes()
      .setFeaturesCol("features")
      .setLabelCol("是否出轨")
    val model = bayes.fit(train)

    val resFrame = model.transform(prediction)
    resFrame.show(200, false)

    // createOrReplaceTempView does not fail if a view named "tmp" already exists in a
    // reused session (getOrCreate may return an existing one).
    resFrame.createOrReplaceTempView("tmp")
    spark.sql(
      """
        |select
        |`姓名`,
        |if(prediction = 0.0 ,"出轨","没出")
        |from
        |tmp
        |
        |""".stripMargin).show(false)
  }
}
样本数据
|-- 姓名: string (nullable = true)
|-- 职业: string (nullable = true)
|-- 收入: string (nullable = true)
|-- 年龄: string (nullable = true)
|-- 性别: string (nullable = true)
|-- 是否出轨: string (nullable = true)
+---+---+---+---+---+----+
| 姓名| 职业| 收入| 年龄| 性别|是否出轨|
+---+---+---+---+---+----+
| 张飞| 老师| 中| 青年| 男| 出轨|
| 赵云| 老师| 中| 中年| 男| 出轨|
|陆小凤| 老师| 低| 青年| 男| 没出|
|花满楼| 老师| 高| 老年| 女| 出轨|
| 田汉| 老师| 低| 青年| 女| 没出|
| 唐嫣|程序员| 高| 青年| 男| 没出|
|刘亦菲|程序员| 高| 青年| 女| 出轨|
|令狐冲|程序员| 中| 中年| 男| 没出|
|向问天|程序员| 中| 中年| 男| 没出|
|任我行|程序员| 中| 老年| 男| 出轨|
| 郭靖|公务员| 中| 老年| 女| 没出|
| 黄蓉|公务员| 低| 老年| 女| 没出|
|段正淳|公务员| 高| 中年| 男| 出轨|
| 段誉|公务员| 低| 中年| 女| 没出|
| 虚竹|公务员| 低| 青年| 男| 出轨|
+---+---+---+---+---+----+
加工后样本数据
+---+-----------+-------+---------+-------+----+-------------------+---------------------------------------+
|姓名 |职业 |收入 |年龄 |性别 |是否出轨|randomNumber |features |
+---+-----------+-------+---------+-------+----+-------------------+---------------------------------------+
|赵云 |1039911.0 |20013.0|644583.0 |30007.0|0 |0.11902210706213434|[1039911.0,20013.0,644583.0,30007.0] |
|唐嫣 |3.0796532E7|39640.0|1225058.0|30007.0|1 |0.22193461036465667|[3.0796532E7,39640.0,1225058.0,30007.0]|
|向问天|3.0796532E7|20013.0|644583.0 |30007.0|1 |0.37215955478033813|[3.0796532E7,20013.0,644583.0,30007.0] |
|田汉 |1039911.0 |20302.0|1225058.0|22899.0|1 |0.3801711879465671 |[1039911.0,20302.0,1225058.0,22899.0] |
|虚竹 |2.0708419E7|20302.0|1225058.0|30007.0|0 |0.5338661706301351 |[2.0708419E7,20302.0,1225058.0,30007.0]|
|花满楼|1039911.0 |39640.0|1040019.0|22899.0|0 |0.6133461940370486 |[1039911.0,39640.0,1040019.0,22899.0] |
|刘亦菲|3.0796532E7|39640.0|1225058.0|22899.0|0 |0.6264115387535791 |[3.0796532E7,39640.0,1225058.0,22899.0]|
|张飞 |1039911.0 |20013.0|1225058.0|30007.0|0 |0.7367350951493166 |[1039911.0,20013.0,1225058.0,30007.0] |
|段誉 |2.0708419E7|20302.0|644583.0 |22899.0|1 |0.75648813271113 |[2.0708419E7,20302.0,644583.0,22899.0] |
|郭靖 |2.0708419E7|20013.0|1040019.0|22899.0|1 |0.7977059522353611 |[2.0708419E7,20013.0,1040019.0,22899.0]|
|陆小凤|1039911.0 |20302.0|1225058.0|30007.0|1 |0.8271853032439851 |[1039911.0,20302.0,1225058.0,30007.0] |
|任我行|3.0796532E7|20013.0|1040019.0|30007.0|0 |0.8550640888666814 |[3.0796532E7,20013.0,1040019.0,30007.0]|
|令狐冲|3.0796532E7|20013.0|644583.0 |30007.0|1 |0.8560075884423889 |[3.0796532E7,20013.0,644583.0,30007.0] |
|黄蓉 |2.0708419E7|20302.0|1040019.0|22899.0|1 |0.9726944053415683 |[2.0708419E7,20302.0,1040019.0,22899.0]|
|段正淳|2.0708419E7|39640.0|644583.0 |30007.0|0 |0.9816986649324599 |[2.0708419E7,39640.0,644583.0,30007.0] |
+---+-----------+-------+---------+-------+----+-------------------+---------------------------------------+
待测试数据
+-------------+
| value|
+-------------+
| 曹操,老师,中,青年,女|
|小乔,程序员,高,中年,女|
|吕布,公务员,低,青年,男|
+-------------+
加工后待测试数据
+---+-----------+-------+---------+-------+---------------------------------------+
|姓名 |职业 |收入 |年龄 |性别 |features |
+---+-----------+-------+---------+-------+---------------------------------------+
|曹操 |1039911.0 |20013.0|1225058.0|22899.0|[1039911.0,20013.0,1225058.0,22899.0] |
|小乔 |3.0796532E7|39640.0|644583.0 |22899.0|[3.0796532E7,39640.0,644583.0,22899.0] |
|吕布 |2.0708419E7|20302.0|1225058.0|30007.0|[2.0708419E7,20302.0,1225058.0,30007.0]|
+---+-----------+-------+---------+-------+---------------------------------------+
预测结果(模型对待测试数据的输出)
+---+-----------+-------+---------+-------+---------------------------------------+----------------------------------------+-----------+----------+
|姓名 |职业 |收入 |年龄 |性别 |features |rawPrediction |probability|prediction|
+---+-----------+-------+---------+-------+---------------------------------------+----------------------------------------+-----------+----------+
|曹操 |1039911.0 |20013.0|1225058.0|22899.0|[1039911.0,20013.0,1225058.0,22899.0] |[-3749036.085448347,-4094948.1909440174]|[1.0,0.0] |0.0 |
|小乔 |3.0796532E7|39640.0|644583.0 |22899.0|[3.0796532E7,39640.0,644583.0,22899.0] |[-4275772.177709845,-3947349.4869167404]|[0.0,1.0] |1.0 |
|吕布 |2.0708419E7|20302.0|1225058.0|30007.0|[2.0708419E7,20302.0,1225058.0,30007.0]|[-5128643.399994654,-5133954.864827515] |[1.0,0.0] |0.0 |
+---+-----------+-------+---------+-------+---------------------------------------+----------------------------------------+-----------+----------+
+---+------------------------------------------------+
|姓名 |(IF((prediction = CAST(0.0 AS DOUBLE)), 出轨, 没出))|
+---+------------------------------------------------+
|曹操 |出轨 |
|小乔 |没出 |
|吕布 |出轨 |
+---+------------------------------------------------+
机器学习预测明星出轨概率------------多易大数据多易后生仔 荣誉出品