
Example 1 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project deeplearning4j by deeplearning4j.

the class SparkDl4jMultiLayer method fitLabeledPoint.

/**
 * Fit a MultiLayerNetwork using Spark MLlib LabeledPoint instances.
 * This converts the labeled points to the internal DL4J data format and trains the model on that.
 *
 * @param rdd the RDD of labeled points to fit
 * @return the fitted MultiLayerNetwork
 */
public MultiLayerNetwork fitLabeledPoint(JavaRDD<LabeledPoint> rdd) {
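    // Read nOut from the final (output) layer to determine the number of label classes.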
    int nLayers = network.getLayerWiseConfigurations().getConfs().size();
    FeedForwardLayer ffl = (FeedForwardLayer) network.getLayerWiseConfigurations().getConf(nLayers - 1).getLayer();
    JavaRDD<DataSet> ds = MLLibUtil.fromLabeledPoint(sc, rdd, ffl.getNOut());
    return fit(ds);
}
Also used : DataSet(org.nd4j.linalg.dataset.DataSet) FeedForwardLayer(org.deeplearning4j.nn.conf.layers.FeedForwardLayer) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint)
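
For context, a minimal usage sketch of fitLabeledPoint; the sc, conf, and trainingMaster objects and the libsvm path are assumptions, not part of the example above:

// Hypothetical caller: sc (JavaSparkContext), conf (MultiLayerConfiguration) and
// trainingMaster are assumed to be configured elsewhere; the path is illustrative.
JavaRDD<LabeledPoint> points = MLUtils.loadLibSVMFile(sc.sc(), "data/train.libsvm").toJavaRDD();
SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf, trainingMaster);
MultiLayerNetwork trained = sparkNet.fitLabeledPoint(points);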

Example 2 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project deeplearning4j by deeplearning4j.

the class MLLibUtil method fromLabeledPoint.

/**
 * Convert an RDD of LabeledPoint instances into an RDD of DataSet instances,
 * batching the data according to the specified batch size.
 *
 * @param data the data to convert
 * @param numPossibleLabels the number of possible labels
 * @param batchSize the batch size
 * @return the new RDD
 */
public static JavaRDD<DataSet> fromLabeledPoint(JavaRDD<LabeledPoint> data, final int numPossibleLabels, int batchSize) {
    // Pair each LabeledPoint with its zipWithIndex index as the key
    JavaPairRDD<Long, LabeledPoint> dataWithIndex = data.zipWithIndex().mapToPair(new PairFunction<Tuple2<LabeledPoint, Long>, Long, LabeledPoint>() {

        @Override
        public Tuple2<Long, LabeledPoint> call(Tuple2<LabeledPoint, Long> labeledPointLongTuple2) throws Exception {
            return new Tuple2<>(labeledPointLongTuple2._2(), labeledPointLongTuple2._1());
        }
    });
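    // Convert each indexed LabeledPoint into a single-example DataSet
    // (the label index is expanded over numPossibleLabels output columns).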
    JavaPairRDD<Long, DataSet> mappedData = dataWithIndex.mapToPair(new PairFunction<Tuple2<Long, LabeledPoint>, Long, DataSet>() {

        @Override
        public Tuple2<Long, DataSet> call(Tuple2<Long, LabeledPoint> longLabeledPointTuple2) throws Exception {
            return new Tuple2<>(longLabeledPointTuple2._1(), MLLibUtil.fromLabeledPoint(longLabeledPointTuple2._2(), numPossibleLabels));
        }
    });
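    // Merge DataSets that share a key by vertically stacking features and labels;
    // the second argument sets the partition count to roughly count/batchSize.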
    JavaPairRDD<Long, DataSet> aggregated = mappedData.reduceByKey(new Function2<DataSet, DataSet, DataSet>() {

        @Override
        public DataSet call(DataSet v1, DataSet v2) throws Exception {
            return new DataSet(Nd4j.vstack(v1.getFeatureMatrix(), v2.getFeatureMatrix()), Nd4j.vstack(v1.getLabels(), v2.getLabels()));
        }
    }, (int) (mappedData.count() / batchSize));
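    // DataSet is iterable over its individual examples, so the flatMap emits
    // each merged DataSet's examples as separate records.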
    JavaRDD<DataSet> data2 = aggregated.flatMap(new BaseFlatMapFunctionAdaptee<Tuple2<Long, DataSet>, DataSet>(new FlatMapFunctionAdapter<Tuple2<Long, DataSet>, DataSet>() {

        @Override
        public Iterable<DataSet> call(Tuple2<Long, DataSet> longDataSetTuple2) throws Exception {
            return longDataSetTuple2._2();
        }
    }));
    return data2;
}
Also used : DataSet(org.nd4j.linalg.dataset.DataSet) FlatMapFunctionAdapter(org.datavec.spark.functions.FlatMapFunctionAdapter) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) Tuple2(scala.Tuple2)
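
A minimal usage sketch of this batched overload; points is an assumed JavaRDD&lt;LabeledPoint&gt;, and the label count and batch size are illustrative:

// Convert LabeledPoints into DL4J DataSets: 3 possible labels, target batch size 32.
JavaRDD<DataSet> dataSets = MLLibUtil.fromLabeledPoint(points, 3, 32);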

Example 3 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project deeplearning4j by deeplearning4j.

the class TestSparkMultiLayerParameterAveraging method testFromSvmLightBackprop.

@Test
public void testFromSvmLightBackprop() throws Exception {
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(),
        new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive().getAbsolutePath())
        .toJavaRDD().map(new Function<LabeledPoint, LabeledPoint>() {

        @Override
        public LabeledPoint call(LabeledPoint v1) throws Exception {
            return new LabeledPoint(v1.label(), Vectors.dense(v1.features().toArray()));
        }
    });
    Nd4j.ENFORCE_NUMERICAL_STABILITY = true;
    DataSet d = new IrisDataSetIterator(150, 150).next();
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
        .seed(123)
        .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
        .iterations(10)
        .list()
        .layer(0, new DenseLayer.Builder().nIn(4).nOut(100)
                .weightInit(WeightInit.XAVIER).activation(Activation.RELU).build())
        .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                .nIn(100).nOut(3).activation(Activation.SOFTMAX).weightInit(WeightInit.XAVIER).build())
        .backprop(true)
        .build();
    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");
    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, conf, new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));
    MultiLayerNetwork network2 = master.fitLabeledPoint(data);
    Evaluation evaluation = new Evaluation();
    evaluation.eval(d.getLabels(), network2.output(d.getFeatureMatrix()));
    System.out.println(evaluation.stats());
}
Also used : Evaluation(org.deeplearning4j.eval.Evaluation) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) DataSet(org.nd4j.linalg.dataset.DataSet) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) ClassPathResource(org.nd4j.linalg.io.ClassPathResource) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) DenseLayer(org.deeplearning4j.nn.conf.layers.DenseLayer) SparkDl4jMultiLayer(org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer) MultiLayerNetwork(org.deeplearning4j.nn.multilayer.MultiLayerNetwork) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)
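
The positional ParameterAveragingTrainingMaster constructor in this test is terse; the sketch below restates the same settings through the builder API. The mapping from the positional arguments to these builder calls is an assumption based on the builder's method names:

// Builder-style sketch of new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0).
TrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(1) // 1 example per DataSet object in the RDD
        .saveUpdater(true)            // keep updater state (momentum etc.) across averaging steps
        .batchSizePerWorker(5)        // minibatch size used on each worker
        .averagingFrequency(1)        // average parameters after every minibatch
        .workerPrefetchNumBatches(0)  // no asynchronous prefetch
        .build();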

Example 4 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project java_study by aloyschen.

the class GbdtAndLr method train_gbdt.

/*
 * Train the GBDT model, configured as a binary (2-class) classifier; the default loss function
 * is log loss. maxBin is set to the largest category count among the categorical features.
 * @param data: RDD holding the training-sample labels and features
 * @return model: the trained GBDT model
 */
private GradientBoostedTreesModel train_gbdt(JavaSparkContext jsc, JavaRDD<LabeledPoint> data) {
    Date now = new Date();
    DateFormat d1 = DateFormat.getDateInstance();
    String date = d1.format(now);
    JavaRDD<LabeledPoint>[] splits;
    JavaRDD<LabeledPoint> trainingData;
    JavaRDD<LabeledPoint> testData;
    splits = data.randomSplit(new double[] { 0.7, 0.3 });
    trainingData = splits[0];
    testData = splits[1];
    GradientBoostedTreesModel model;
    BoostingStrategy boostingStrategy;
    boostingStrategy = BoostingStrategy.defaultParams("Classification");
    boostingStrategy.setNumIterations(this.maxIter);
    boostingStrategy.getTreeStrategy().setNumClasses(2);
    boostingStrategy.getTreeStrategy().setMaxDepth(this.maxDepth);
    // boostingStrategy.getTreeStrategy().setMaxBins(maxBin);
    // boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);
    System.out.println("Start train GBDT");
    model = GradientBoostedTrees.train(trainingData, boostingStrategy);
    // model.save(jsc.sc(), "./GBDT_Model");
    System.out.println("model: " + model.toDebugString());
    GradientBoostedTreesModelUtil modelUtil = new GradientBoostedTreesModelUtil(model.algo(), model.trees(), model.treeWeights());
    modelUtil.saveGradientBoostedTreesModelToFile(model, this.modelPath + "gbdt_model" + date + ".json");
    // predict_gbdt(jsc, testData);
    return model;
}
Also used : BoostingStrategy(org.apache.spark.mllib.tree.configuration.BoostingStrategy) DateFormat(java.text.DateFormat) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) JavaRDD(org.apache.spark.api.java.JavaRDD)
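
For reference, a minimal hypothetical input for train_gbdt: binary 0.0/1.0 labels paired with dense feature vectors (variable names and values are illustrative):

// Tiny illustrative training RDD; real data comes from load_gbdt_data.
JavaRDD<LabeledPoint> toy = jsc.parallelize(Arrays.asList(
        new LabeledPoint(1.0, Vectors.dense(0.2, 3.1, 0.0)),
        new LabeledPoint(0.0, Vectors.dense(1.5, 0.4, 2.2))));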

Example 5 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project java_study by aloyschen.

the class GbdtAndLr method train.

/*
 * Take the combined features produced by the GBDT model as input to the LR model, then train the LR model.
 * @param Path: path to the training data
 */
public void train(String Path) {
    JavaSparkContext jsc = getSc();
    ArrayList<ArrayList<Integer>> treeLeafArray = new ArrayList<>();
    Dataset<Row> all_data = Preprocessing(jsc, Path);
    JavaRDD<LabeledPoint> gbdt_data_labelpoint = load_gbdt_data(all_data);
    GradientBoostedTreesModel gbdt = train_gbdt(jsc, gbdt_data_labelpoint);
    DecisionTreeModel[] decisionTreeModels = gbdt.trees();
    // Collect the leaf-node indices of every tree in the GBDT
    for (int i = 0; i < this.maxIter; i++) {
        treeLeafArray.add(getLeafNodes(decisionTreeModels[i].topNode()));
    // System.out.println("叶子索引");
    // System.out.println(treeLeafArray.get(i));
    }
    JavaRDD<LabeledPoint> CombineFeatures = all_data.toJavaRDD().map(line -> {
        double[] newvaluesDouble;
        double[] features = new double[24];
        // Put each feature column of the Dataset row into the DenseVector (via the features array)
        for (int i = 6; i < 18; i++) {
            org.apache.spark.mllib.linalg.DenseVector den = null;
            if (line.get(i) instanceof org.apache.spark.ml.linalg.Vector) {
                den = (DenseVector) Vectors.fromML((org.apache.spark.ml.linalg.DenseVector) line.get(i));
                features[i - 6] = den.toArray()[0];
            } else {
                features[i - 6] = Double.parseDouble(line.get(i).toString());
            }
        }
        DenseVector numerical_vector = new DenseVector(features);
        ArrayList<Double> newvaluesArray = new ArrayList<>();
        for (int i = 0; i < this.maxIter; i++) {
            int treePredict = predictModify(decisionTreeModels[i].topNode(), numerical_vector);
            int len = treeLeafArray.get(i).size();
            ArrayList<Double> treeArray = new ArrayList<>(len);
            // Initialize every entry to 0, then set the entry for the leaf this sample lands in to 1
            for (int j = 0; j < len; j++) treeArray.add(j, 0d);
            treeArray.set(treeLeafArray.get(i).indexOf(treePredict), 1d);
            newvaluesArray.addAll(treeArray);
        }
        for (int i = 18; i < 29; i++) {
            SparseVector onehot_data = (SparseVector) Vectors.fromML((org.apache.spark.ml.linalg.SparseVector) line.get(i));
            DenseVector cat_data = onehot_data.toDense();
            for (int j = 0; j < cat_data.size(); j++) {
                newvaluesArray.add(cat_data.apply(j));
            }
        }
        newvaluesDouble = newvaluesArray.stream().mapToDouble(Double::doubleValue).toArray();
        DenseVector newdenseVector = new DenseVector(newvaluesDouble);
        return (new LabeledPoint(Double.valueOf(line.get(1).toString()), newdenseVector));
    });
    JavaRDD<LabeledPoint>[] splitsLR = CombineFeatures.randomSplit(new double[] { 0.7, 0.3 });
    JavaRDD<LabeledPoint> trainingDataLR = splitsLR[0];
    JavaRDD<LabeledPoint> testDataLR = splitsLR[1];
    System.out.println("Start train LR");
    LogisticRegressionModel LR = new LogisticRegressionWithLBFGS().setNumClasses(2).run(trainingDataLR.rdd()).clearThreshold();
    System.out.println("modelLR.weights().size():" + LR.weights().size());
    JavaPairRDD<Object, Object> test_LR = testDataLR.mapToPair((PairFunction<LabeledPoint, Object, Object>) labeledPoint -> {
        Tuple2<Object, Object> tuple2 = new Tuple2<>(LR.predict(labeledPoint.features()), labeledPoint.label());
        return tuple2;
    });
    BinaryClassificationMetrics test_metrics = new BinaryClassificationMetrics(test_LR.rdd());
    double test_auc = test_metrics.areaUnderROC();
    System.out.println("test data auc_score:" + test_auc);
    JavaPairRDD<Object, Object> train_LR = trainingDataLR.mapToPair((PairFunction<LabeledPoint, Object, Object>) labeledPoint -> {
        Tuple2<Object, Object> tuple2 = new Tuple2<>(LR.predict(labeledPoint.features()), labeledPoint.label());
        return tuple2;
    });
    BinaryClassificationMetrics train_metrics = new BinaryClassificationMetrics(train_LR.rdd());
    double train_auc = train_metrics.areaUnderROC();
    System.out.println("train data auc_score:" + train_auc);
    // Sort precision at the different thresholds in descending order and print the top ten
    JavaRDD<Tuple2<Object, Object>> precision = train_metrics.precisionByThreshold().toJavaRDD();
    JavaPairRDD<Object, Object> temp = JavaPairRDD.fromJavaRDD(precision);
    JavaPairRDD<Object, Object> swap = temp.mapToPair(Tuple2::swap);
    JavaPairRDD<Object, Object> precision_sort = swap.sortByKey(false);
    System.out.println("Precision by threshold: (Precision, Threshold)");
    for (Tuple2<Object, Object> precisionThreshold : precision_sort.take(10)) {
        System.out.println(precisionThreshold);
    }
}
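
The per-tree leaf encoding inside the map above can be read as a small helper; this is a clarifying sketch, not code from the project:

// One-hot encode which leaf of a single tree the sample landed in.
static List<Double> oneHotLeaf(List<Integer> leafIds, int predictedLeaf) {
    List<Double> v = new ArrayList<>(Collections.nCopies(leafIds.size(), 0d)); // all zeros
    v.set(leafIds.indexOf(predictedLeaf), 1d); // mark the landed-on leaf
    return v;
}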
Also used : Vectors(org.apache.spark.mllib.linalg.Vectors) java.util(java.util) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) Serializable(scala.Serializable) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BinaryClassificationMetrics(org.apache.spark.mllib.evaluation.BinaryClassificationMetrics) CmdlineParser(de.tototec.cmdoption.CmdlineParser) SparseVector(org.apache.spark.mllib.linalg.SparseVector) LogisticRegressionModel(org.apache.spark.mllib.classification.LogisticRegressionModel) utils(utils) JavaRDD(org.apache.spark.api.java.JavaRDD) FeatureType(org.apache.spark.mllib.tree.configuration.FeatureType) DateFormat(java.text.DateFormat) DataTypes(org.apache.spark.sql.types.DataTypes) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) DenseVector(org.apache.spark.mllib.linalg.DenseVector) GradientBoostedTrees(org.apache.spark.mllib.tree.GradientBoostedTrees) SparkConf(org.apache.spark.SparkConf) LogisticRegressionWithLBFGS(org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) Option(scala.Option) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) org.apache.spark.ml.feature(org.apache.spark.ml.feature) org.apache.spark.mllib.tree.model(org.apache.spark.mllib.tree.model) org.apache.spark.sql(org.apache.spark.sql) JavaConverters(scala.collection.JavaConverters) BoostingStrategy(org.apache.spark.mllib.tree.configuration.BoostingStrategy) PairFunction(org.apache.spark.api.java.function.PairFunction)

Aggregations

LabeledPoint (org.apache.spark.mllib.regression.LabeledPoint) : 15
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) : 6
DataSet (org.nd4j.linalg.dataset.DataSet) : 6
SparkConf (org.apache.spark.SparkConf) : 4
LogisticRegressionModel (org.apache.spark.mllib.classification.LogisticRegressionModel) : 4
Vector (org.apache.spark.mllib.linalg.Vector) : 4
DateFormat (java.text.DateFormat) : 3
JavaRDD (org.apache.spark.api.java.JavaRDD) : 3
HashingTF (org.apache.spark.mllib.feature.HashingTF) : 3
BoostingStrategy (org.apache.spark.mllib.tree.configuration.BoostingStrategy) : 3
IrisDataSetIterator (org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) : 3
BaseSparkTest (org.deeplearning4j.spark.BaseSparkTest) : 3
Tuple2 (scala.Tuple2) : 3
CmdlineParser (de.tototec.cmdoption.CmdlineParser) : 2
java.util (java.util) : 2
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD) : 2
PairFunction (org.apache.spark.api.java.function.PairFunction) : 2
org.apache.spark.ml.feature (org.apache.spark.ml.feature) : 2
LogisticRegressionWithLBFGS (org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) : 2
LogisticRegressionWithSGD (org.apache.spark.mllib.classification.LogisticRegressionWithSGD) : 2