Spark ML 实例1
步骤
- 准备样本数据(特征与标签)
- 创建逻辑回归的评估器
- 使用setter方法设置参数
- 使用存储在lr中的参数来训练一个模型
- 使用paramMap选择指定的参数
- 准备测试数据
- 预测结果
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap
// NOTE: DataFrame-based spark.ml estimators require ml.linalg vectors,
// not the RDD-era mllib.linalg ones (which fail schema validation on fit).
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.Row

// Prepare training samples as (label, features) pairs.
val training = spark.createDataFrame(Seq(
  (1.0, Vectors.dense(2.0, 1.1, 0.1)),
  (0.0, Vectors.dense(0.0, 1.1, -1.0)),
  (0.0, Vectors.dense(0.0, 1.1, 0.1)),
  (1.0, Vectors.dense(2.0, 1.1, -1.0))
)).toDF("label", "features")

// Create the logistic-regression estimator and print its parameter documentation.
val lr = new LogisticRegression()
println(lr.explainParams())

// Set parameters with setter methods.
lr.setMaxIter(10).setRegParam(0.01)

// Train a model using the parameters currently stored in lr.
val model1 = lr.fit(training)
model1.parent.extractParamMap

// Specify parameters with a ParamMap; a later put overrides an earlier
// value for the same param (maxIter ends up 30, not 20).
val paramMap = ParamMap(lr.maxIter -> 20)
  .put(lr.maxIter, 30)
  .put(lr.regParam -> 0.1, lr.threshold -> 0.55)
val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability")
val paramMapCombined = paramMap ++ paramMap2

// Train a second model; paramMapCombined overrides the values stored in lr.
val model2 = lr.fit(training, paramMapCombined)
model2.parent.extractParamMap

// Prepare test data with the same schema as the training set.
val test = spark.createDataFrame(Seq(
  (1.0, Vectors.dense(2.0, 1.1, 0.1)),
  (0.0, Vectors.dense(0.0, 1.1, -1.0)),
  (0.0, Vectors.dense(0.0, 1.1, 0.1)),
  (1.0, Vectors.dense(2.0, 1.1, -1.0))
)).toDF("label", "features")

// Predict on the test data. In a spark.ml output Row the "probability"
// column is a Vector of per-class probabilities and "prediction" is a Double.
model1.transform(test)
  .select("label", "features", "probability", "prediction")
  .collect()
  .foreach { case Row(label: Double, features: Vector, probability: Vector, prediction: Double) =>
    println(s"($features, $label) -> probability=$probability, prediction=$prediction")
  }