Search in sources :

Example 11 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project deeplearning4j by deeplearning4j.

the class MLLibUtil method fromContinuousLabeledPoint.

/**
     * Convert an RDD of labeled points to an RDD of DataSets with continuous
     * (regression) labels. Each LabeledPoint becomes one single-example DataSet.
     *
     * <p>NOTE: this implementation collects the entire RDD to the driver before
     * re-parallelizing, so it only suits data that fits in driver memory —
     * one reason this method is deprecated.
     *
     * @param sc   the Spark context used to re-parallelize the converted data
     * @param data the JavaRDD of labeled points ready to convert
     * @return a {@code JavaRDD<DataSet>} with a continuous label
     */
@Deprecated
public static JavaRDD<DataSet> fromContinuousLabeledPoint(JavaSparkContext sc, JavaRDD<LabeledPoint> data) {
    List<LabeledPoint> labeledPoints = data.collect();
    // Presize: the result has exactly one DataSet per labeled point.
    List<DataSet> dataSets = new ArrayList<>(labeledPoints.size());
    for (LabeledPoint labeledPoint : labeledPoints) {
        dataSets.add(convertToDataset(labeledPoint));
    }
    return sc.parallelize(dataSets);
}
Also used : DataSet(org.nd4j.linalg.dataset.DataSet) ArrayList(java.util.ArrayList) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint)

Example 12 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project deeplearning4j by deeplearning4j.

the class TestSparkMultiLayerParameterAveraging method testFromSvmLight.

@Test
public void testFromSvmLight() throws Exception {
    // Load the iris dataset (svmLight format) from the classpath and copy
    // each point into a fresh dense-vector LabeledPoint.
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive().getAbsolutePath()).toJavaRDD().map(new Function<LabeledPoint, LabeledPoint>() {

        @Override
        public LabeledPoint call(LabeledPoint v1) throws Exception {
            return new LabeledPoint(v1.label(), Vectors.dense(v1.features().toArray()));
        }
    });
    // Full iris dataset in a single batch, used below only for evaluation.
    DataSet d = new IrisDataSetIterator(150, 150).next();
    // NOTE(review): `conf` and `network` are built/initialized but never used
    // for training — `master` is constructed from getBasicConf() instead.
    // Looks like dead setup code; confirm before removing.
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123).optimizationAlgo(OptimizationAlgorithm.LINE_GRADIENT_DESCENT).iterations(100).miniBatch(true).maxNumLineSearchIterations(10).list().layer(0, new RBM.Builder(RBM.HiddenUnit.RECTIFIED, RBM.VisibleUnit.GAUSSIAN).nIn(4).nOut(100).weightInit(WeightInit.XAVIER).activation(Activation.RELU).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build()).layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT).nIn(100).nOut(3).activation(Activation.SOFTMAX).weightInit(WeightInit.XAVIER).build()).backprop(false).build();
    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");
    // Train via parameter averaging directly from the LabeledPoint RDD.
    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, getBasicConf(), new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));
    MultiLayerNetwork network2 = master.fitLabeledPoint(data);
    // Evaluate the trained network on the same iris data and print stats;
    // the test passes as long as nothing throws.
    Evaluation evaluation = new Evaluation();
    evaluation.eval(d.getLabels(), network2.output(d.getFeatureMatrix()));
    System.out.println(evaluation.stats());
}
Also used : Evaluation(org.deeplearning4j.eval.Evaluation) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) DataSet(org.nd4j.linalg.dataset.DataSet) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) ClassPathResource(org.nd4j.linalg.io.ClassPathResource) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) SparkDl4jMultiLayer(org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer) MultiLayerNetwork(org.deeplearning4j.nn.multilayer.MultiLayerNetwork) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)

Example 13 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project deeplearning4j by deeplearning4j.

the class MLLIbUtilTest method testMlLibTest.

@Test
public void testMlLibTest() {
    // Fetch the full 150-sample iris dataset in a single batch, then split
    // it into individual single-example DataSets.
    DataSet iris = new IrisDataSetIterator(150, 150).next();
    List<DataSet> examples = iris.asList();
    JavaRDD<DataSet> exampleRdd = sc.parallelize(examples);
    // Smoke test: converting DataSets to MLlib LabeledPoints must not throw.
    JavaRDD<LabeledPoint> labeledPoints = MLLibUtil.fromDataSet(sc, exampleRdd);
}
Also used : IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) DataSet(org.nd4j.linalg.dataset.DataSet) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) Test(org.junit.Test) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest)

Example 14 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project java_study by aloyschen.

the class GbdtAndLr method load_gbdt_data.

/*
    * Converts a preprocessed Dataset<Row> of samples into an RDD of MLlib
    * LabeledPoints for GBDT training.
    *
    * Each row contributes 12 continuous features read from columns 6..17;
    * the label comes from column 1. A column holding an ml.linalg vector is
    * converted to an mllib DenseVector first and its first element is used;
    * any other column value is parsed as a double from its string form.
    *
    * @param data preprocessed sample data (label in column 1, continuous
    *             features in columns 6-17)
    * @return RDD of LabeledPoints, each with a 12-element dense feature vector
     */
private JavaRDD<LabeledPoint> load_gbdt_data(Dataset<Row> data) {
    // Columns [featureStart, featureEnd) hold the continuous features.
    final int featureStart = 6;
    final int featureEnd = 18;
    final int numFeatures = featureEnd - featureStart;
    if (data.rdd().isEmpty()) {
        // NOTE(review): hard JVM exit on empty input preserved from the
        // original; consider throwing IllegalArgumentException instead.
        System.exit(0);
    }
    JavaRDD<Row> rows = data.toJavaRDD();
    JavaRDD<LabeledPoint> labeled = rows.map(row -> {
        double[] features = new double[numFeatures];
        // Primitive int loop index: the original used Integer, which boxed
        // on every iteration for no benefit.
        for (int i = featureStart; i < featureEnd; i++) {
            Object value = row.get(i);
            if (value instanceof org.apache.spark.ml.linalg.Vector) {
                // ml.linalg vectors must be bridged to mllib vectors;
                // only the first element is used as the feature value.
                DenseVector den = (DenseVector) Vectors.fromML((org.apache.spark.ml.linalg.DenseVector) value);
                features[i - featureStart] = den.toArray()[0];
            } else {
                features[i - featureStart] = Double.parseDouble(value.toString());
            }
        }
        DenseVector denseVector = new DenseVector(features);
        // parseDouble avoids the Double box that valueOf(...) created
        // before immediately unboxing into the LabeledPoint label.
        return new LabeledPoint(Double.parseDouble(row.get(1).toString()), denseVector);
    });
    // (Commented-out debug dump of preprocessed data removed.)
    // count() is an action that forces evaluation here; kept for the log line.
    System.out.println("Samples count:" + labeled.count());
    return labeled;
}
Also used : Vectors(org.apache.spark.mllib.linalg.Vectors) java.util(java.util) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) Serializable(scala.Serializable) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BinaryClassificationMetrics(org.apache.spark.mllib.evaluation.BinaryClassificationMetrics) CmdlineParser(de.tototec.cmdoption.CmdlineParser) SparseVector(org.apache.spark.mllib.linalg.SparseVector) LogisticRegressionModel(org.apache.spark.mllib.classification.LogisticRegressionModel) utils(utils) JavaRDD(org.apache.spark.api.java.JavaRDD) FeatureType(org.apache.spark.mllib.tree.configuration.FeatureType) DateFormat(java.text.DateFormat) DataTypes(org.apache.spark.sql.types.DataTypes) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) DenseVector(org.apache.spark.mllib.linalg.DenseVector) GradientBoostedTrees(org.apache.spark.mllib.tree.GradientBoostedTrees) SparkConf(org.apache.spark.SparkConf) LogisticRegressionWithLBFGS(org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) Option(scala.Option) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) org.apache.spark.ml.feature(org.apache.spark.ml.feature) org.apache.spark.mllib.tree.model(org.apache.spark.mllib.tree.model) org.apache.spark.sql(org.apache.spark.sql) JavaConverters(scala.collection.JavaConverters) BoostingStrategy(org.apache.spark.mllib.tree.configuration.BoostingStrategy) PairFunction(org.apache.spark.api.java.function.PairFunction) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) DenseVector(org.apache.spark.mllib.linalg.DenseVector)

Example 15 with LabeledPoint

use of org.apache.spark.mllib.regression.LabeledPoint in project incubator-sdap-mudrod by apache.

the class SparkSVM method main.

public static void main(String[] args) {
    // Spin up the Mudrod engine and grab its Spark context.
    MudrodEngine engine = new MudrodEngine();
    JavaSparkContext jsc = engine.startSparkDriver().sc;
    // Load libSVM-formatted training data from the classpath.
    String inputPath = SparkSVM.class.getClassLoader().getResource("inputDataForSVM_spark.txt").toString();
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(jsc.sc(), inputPath).toJavaRDD();
    // Train a linear SVM with stochastic gradient descent.
    final int numIterations = 100;
    final SVMModel model = SVMWithSGD.train(trainingData.rdd(), numIterations);
    // Persist the trained model alongside the other classpath resources.
    model.save(jsc.sc(), SparkSVM.class.getClassLoader().getResource("javaSVMWithSGDModel").toString());
    jsc.sc().stop();
}
Also used : SVMModel(org.apache.spark.mllib.classification.SVMModel) MudrodEngine(org.apache.sdap.mudrod.main.MudrodEngine) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint)

Aggregations

LabeledPoint (org.apache.spark.mllib.regression.LabeledPoint)15 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)6 DataSet (org.nd4j.linalg.dataset.DataSet)6 SparkConf (org.apache.spark.SparkConf)4 LogisticRegressionModel (org.apache.spark.mllib.classification.LogisticRegressionModel)4 Vector (org.apache.spark.mllib.linalg.Vector)4 DateFormat (java.text.DateFormat)3 JavaRDD (org.apache.spark.api.java.JavaRDD)3 HashingTF (org.apache.spark.mllib.feature.HashingTF)3 BoostingStrategy (org.apache.spark.mllib.tree.configuration.BoostingStrategy)3 IrisDataSetIterator (org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator)3 BaseSparkTest (org.deeplearning4j.spark.BaseSparkTest)3 Tuple2 (scala.Tuple2)3 CmdlineParser (de.tototec.cmdoption.CmdlineParser)2 java.util (java.util)2 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)2 PairFunction (org.apache.spark.api.java.function.PairFunction)2 org.apache.spark.ml.feature (org.apache.spark.ml.feature)2 LogisticRegressionWithLBFGS (org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS)2 LogisticRegressionWithSGD (org.apache.spark.mllib.classification.LogisticRegressionWithSGD)2