Skip to content

Files

Latest commit

 

History

History
151 lines (124 loc) · 9.64 KB

Spark中组件Mllib的学习65之使用PCA进行特征转换.md

File metadata and controls

151 lines (124 loc) · 9.64 KB

更多代码请见:https://github.com/xubo245/SparkLearning

Spark中组件Mllib的学习

1.解释

使用PCA可以将向量投影(project)到低维空间,从而达到特征转换(降维)的效果。

2.代码:

/**
  * @author xubo
  *         ref:http://spark.apache.org/docs/1.5.2/mllib-guide.html
  *         more code:https://github.com/xubo245/SparkLearning
  *         more blog:http://blog.csdn.net/xubo245
  */
package org.apache.spark.mllib.FeatureExtractionAndTransformation

import org.apache.spark.util.SparkLearningFunSuite

/**
  * Compares a linear regression trained on the raw features with the same
  * regression trained on PCA-projected features. The test prints the mean
  * squared error of both models on the held-out split, followed by the first
  * ten rows of every dataset involved.
  *
  * Created by xubo on 2016/6/13.
  */
class PCAFunSuite extends SparkLearningFunSuite {
  test("testFunSuite") {

    import org.apache.spark.mllib.regression.LinearRegressionWithSGD
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.feature.PCA

    // Each input line has the form "<label>,<f1> <f2> ... <fn>".
    val inputPath = "file/data/mllib/input/mllibFromSpark/ridge-data/lpsa.data"
    val data = sc.textFile(inputPath).map { line =>
      val fields = line.split(',')
      val label = fields(0).toDouble
      val featureValues = fields(1).split(' ').map(_.toDouble)
      LabeledPoint(label, Vectors.dense(featureValues))
    }.cache()

    // 60/40 split with a fixed seed so that repeated runs use the same rows.
    val Array(trainingSplit, testSet) = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = trainingSplit.cache()

    // Keep half as many principal components as there are original features.
    // NOTE(review): the PCA model is fitted on the full dataset (training and
    // test rows alike); fitting on `training` only would avoid leaking test
    // data into the projection. Left unchanged so the printed results stay
    // reproducible - confirm whether this is intended.
    val targetDimension = training.first().features.size / 2
    val pca = new PCA(targetDimension).fit(data.map(_.features))

    // Replaces a point's features with their PCA projection; the label is kept.
    val project = (point: LabeledPoint) => point.copy(features = pca.transform(point.features))
    val trainingPca = training.map(project)
    val testPca = testSet.map(project)

    val numIterations = 100
    val rawModel = LinearRegressionWithSGD.train(training, numIterations)
    val pcaModel = LinearRegressionWithSGD.train(trainingPca, numIterations)

    // (prediction, label) pairs for each model on its matching test set.
    val rawPredictions = testSet.map(point => (rawModel.predict(point.features), point.label))
    val pcaPredictions = testPca.map(point => (pcaModel.predict(point.features), point.label))

    // Squared difference between a prediction and its label.
    val squaredError = (pair: (Double, Double)) => math.pow(pair._1 - pair._2, 2)
    val rawMse = rawPredictions.map(squaredError).mean()
    val pcaMse = pcaPredictions.map(squaredError).mean()

    println(s"Mean Squared Error = $rawMse")
    println(s"PCA Mean Squared Error = $pcaMse")

    // Print a header and then the first ten rows of each dataset, in order.
    Seq(
      "data:" -> data,
      "training:" -> training,
      "test:" -> testSet,
      "training_pca:" -> trainingPca,
      "test_pca:" -> testPca
    ).foreach { case (header, rows) =>
      println(header)
      rows.take(10).foreach(println)
    }

  }
}

3.结果:

Mean Squared Error = 14.433875832615556
PCA Mean Squared Error = 9.776863212465685
data:
(-0.4307829,[-1.63735562648104,-2.00621178480549,-1.86242597251066,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(-0.1625189,[-1.98898046126935,-0.722008756122123,-0.787896192088153,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(-0.1625189,[-1.57881887548545,-2.1887840293994,1.36116336875686,-1.02470580167082,-0.522940888712441,-0.863171185425945,0.342627053981254,-0.155348103855541])
(-0.1625189,[-2.16691708463163,-0.807993896938655,-0.787896192088153,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(0.3715636,[-0.507874475300631,-0.458834049396776,-0.250631301876899,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(0.7654678,[-2.03612849966376,-0.933954647105133,-1.86242597251066,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(0.8544153,[-0.557312518810673,-0.208756571683607,-0.787896192088153,0.990146852537193,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(1.2669476,[-0.929360463147704,-0.0578991819441687,0.152317365781542,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(1.2669476,[-2.28833047634983,-0.0706369432557794,-0.116315079324086,0.80409888772376,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(1.2669476,[0.223498042876113,-1.41471935455355,-0.116315079324086,-1.02470580167082,-0.522940888712441,-0.29928234305568,0.342627053981254,0.199211097885341])
training:
(-0.4307829,[-1.63735562648104,-2.00621178480549,-1.86242597251066,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(-0.1625189,[-1.98898046126935,-0.722008756122123,-0.787896192088153,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(-0.1625189,[-1.57881887548545,-2.1887840293994,1.36116336875686,-1.02470580167082,-0.522940888712441,-0.863171185425945,0.342627053981254,-0.155348103855541])
(-0.1625189,[-2.16691708463163,-0.807993896938655,-0.787896192088153,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(0.3715636,[-0.507874475300631,-0.458834049396776,-0.250631301876899,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(0.7654678,[-2.03612849966376,-0.933954647105133,-1.86242597251066,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(0.8544153,[-0.557312518810673,-0.208756571683607,-0.787896192088153,0.990146852537193,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(1.2669476,[-0.929360463147704,-0.0578991819441687,0.152317365781542,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(1.2669476,[-2.28833047634983,-0.0706369432557794,-0.116315079324086,0.80409888772376,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(1.446919,[0.162180092313795,-1.32557369901905,0.286633588334355,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
test:
(1.2669476,[0.223498042876113,-1.41471935455355,-0.116315079324086,-1.02470580167082,-0.522940888712441,-0.29928234305568,0.342627053981254,0.199211097885341])
(1.3480731,[0.107785900236813,-1.47221551299731,0.420949810887169,-1.02470580167082,-0.522940888712441,-0.863171185425945,0.342627053981254,-0.687186906466865])
(1.5581446,[-1.62233848461465,-0.843294091975396,-3.07127197548598,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(1.6389967,[-0.171901281967138,-0.489197399065355,-0.65357996953534,-1.02470580167082,-0.522940888712441,-0.863171185425945,-1.04215728919298,-0.864466507337306])
(1.8000583,[-0.710307384579833,0.211731938156277,0.152317365781542,-1.02470580167082,-0.522940888712441,-0.442797990776478,0.342627053981254,1.61744790484887])
(1.8946169,[0.899043117369237,-0.590700340358265,0.152317365781542,-1.02470580167082,-0.522940888712441,1.28643254437683,-1.04215728919298,-0.864466507337306])
(1.9242487,[-0.903451690500615,1.07659722048274,0.152317365781542,1.28380453408541,-0.522940888712441,-0.442797990776478,-1.04215728919298,-0.864466507337306])
(2.0476928,[-1.15393789990757,-0.961853075398404,-0.116315079324086,-1.02470580167082,-0.522940888712441,-0.442797990776478,-1.04215728919298,-0.864466507337306])
(2.1916535,[-0.75731027755674,-2.92717970468456,0.018001143228728,-1.02470580167082,-0.522940888712441,-0.863171185425945,0.342627053981254,-0.332627704725983])
(2.2975726,[-0.618884859896728,-1.1366360750781,-0.519263746982526,-1.02470580167082,-0.522940888712441,-0.863171185425945,3.11219574032972,1.97200710658975])
training_pca:
(-0.4307829,[3.0012517955170352,-2.142550275291373,-0.21983330445319627,0.4753105715735904])
(-0.1625189,[2.6455485680598025,-0.9387907623575651,-0.13629522253059867,-0.2723535248719323])
(-0.1625189,[1.4079948251594074,-1.3563656005227005,2.154382489087549,-1.508169644660517])
(-0.1625189,[2.743811626523896,-0.9988692785066657,-0.05685033575302001,-0.2614682467305025])
(0.3715636,[1.7898749594315586,-0.5368044430308028,-0.40414927356521846,-0.7820446536890064])
(0.7654678,[2.967118346507534,-1.4629146901651557,-0.5030692359920445,0.5366986066009944])
(0.8544153,[1.8114775681616115,0.6431327185772484,-0.3888690871371431,0.6949751752194686])
(1.2669476,[1.8045016189744407,-0.14500110200603092,-0.26514069679827845,-1.0429660904935558])
(1.2669476,[2.4178322961920884,0.815516923847327,0.2844077846027382,0.2241689278317942])
(1.446919,[1.5304832823692565,-0.8837170978629127,-0.04077227417256124,-1.2637017426660309])
test_pca:
(1.2669476,[0.37441511582814346,-1.4969544155711367,0.7612990351311218,-0.34675589751309094])
(1.3480731,[0.9599357506994797,-1.0980394467345067,0.8650793824313101,-1.0565972216247426])
(1.5581446,[3.050269744395389,-1.828301460642064,-1.1618664110224481,1.421350935437784])
(1.6389967,[1.7396502978859232,-0.6923362419730479,-0.6551795447220509,-0.503846318147684])
(1.8000583,[-0.16296237611581177,-0.5949556974786936,1.0228361683283693,-0.03926588155085248])
(1.8946169,[0.11516537350312123,-0.8021543848049524,-0.9865728807822236,-0.9801762542088186])
(1.9242487,[1.2850675285460493,1.9053942436935138,-0.3999167209395966,0.24515775928535155])
(2.0476928,[1.9642363566050085,-0.9004119731752642,-0.0714538222617227,-0.8073116514633711])
(2.1916535,[1.5839071437088503,-2.262777725048644,1.573127462029202,-0.627873413590966])
(2.2975726,[-0.769107393625969,-1.964648985796285,2.8479155939388012,0.9969948395610135])

结果分析:使用PCA进行特征转换后,测试集上的MSE(约9.78)低于直接使用原始特征时的MSE(约14.43),即在本例中PCA降维后的模型效果更好。需要注意的是,代码中PCA是在全部数据(包含测试集)上拟合的。

参考

	【1】http://spark.apache.org/docs/1.5.2/mllib-guide.html 
	【2】http://spark.apache.org/docs/1.5.2/programming-guide.html
	【3】https://github.com/xubo245/SparkLearning
	【4】book:Machine Learning with Spark, Nick Pentreath
    【5】book:Spark MLlib机器学习实战