Search in sources :

Example 21 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class LassoRegressionTest method regressionPipelineTest.

/**
 * Fits a {@link Pipeline} of three {@link LassoRegression} stages on the in-memory batch source
 * (one on the feature columns with "owlqn", one on the "vec" column with "newton", one on the
 * "svec" column with the default optimizer) and checks the predictions produced by the fitted
 * model in both batch and stream mode.
 */
@Test
public void regressionPipelineTest() throws Exception {
    BatchOperator<?> vecdata = new MemSourceBatchOp(Arrays.asList(vecRows), veccolNames);
    StreamOperator<?> svecdata = new MemSourceStreamOp(Arrays.asList(vecRows), veccolNames);
    String[] xVars = new String[] { "f0", "f1", "f2" };
    String yVar = "label";
    String vec = "vec";
    String svec = "svec";
    LassoRegression lasso = new LassoRegression().setLabelCol(yVar).setFeatureCols(xVars).setLambda(0.01).setMaxIter(20).setOptimMethod("owlqn").setPredictionCol("linpred");
    LassoRegression vlasso = new LassoRegression().setLabelCol(yVar).setVectorCol(vec).setMaxIter(20).setLambda(0.01).setOptimMethod("newton").setPredictionCol("vlinpred").enableLazyPrintModelInfo();
    LassoRegression svlasso = new LassoRegression().setLabelCol(yVar).setVectorCol(svec).setMaxIter(20).setLambda(0.01).setPredictionCol("svlinpred");
    Pipeline pl = new Pipeline().add(lasso).add(vlasso).add(svlasso);
    PipelineModel model = pl.fit(vecdata);

    // Batch prediction: label first, then the three prediction columns in stage order.
    BatchOperator<?> result = model.transform(vecdata).select(new String[] { "label", "linpred", "vlinpred", "svlinpred" });
    assertLassoPredictions(result.collect());

    // Stream prediction: the same fitted model must yield the same values.
    CollectSinkStreamOp sop = model.transform(svecdata).select(new String[] { "label", "linpred", "vlinpred", "svlinpred" }).link(new CollectSinkStreamOp());
    StreamOperator.execute();
    assertLassoPredictions(sop.getAndRemoveValues());
}

/**
 * Checks the three lasso prediction columns of the rows whose label is 16.8 or 6.7.
 * Rows with any other label are not checked.
 *
 * @param rows rows laid out as (label, linpred, vlinpred, svlinpred)
 */
private static void assertLassoPredictions(List<Row> rows) {
    for (Row row : rows) {
        // JUnit's contract is assertEquals(expected, actual, delta); keeping that order
        // makes a failure message report the right value as "expected".
        if ((double) row.getField(0) == 16.8000) {
            Assert.assertEquals(16.784611802507232, (double) row.getField(1), 0.01);
            Assert.assertEquals(16.784611802507232, (double) row.getField(2), 0.01);
            Assert.assertEquals(16.78209421260283, (double) row.getField(3), 0.01);
        } else if ((double) row.getField(0) == 6.7000) {
            Assert.assertEquals(6.7713287283076, (double) row.getField(1), 0.01);
            Assert.assertEquals(6.7713287283076, (double) row.getField(2), 0.01);
            Assert.assertEquals(6.826846826823054, (double) row.getField(3), 0.01);
        }
    }
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) MemSourceStreamOp(com.alibaba.alink.operator.stream.source.MemSourceStreamOp) CollectSinkStreamOp(com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp) Row(org.apache.flink.types.Row) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) Test(org.junit.Test)

Example 22 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class RidgeRegressionTest method regressionPipelineTest.

/**
 * Fits a {@link Pipeline} of three {@link RidgeRegression} stages on the in-memory batch source
 * (one on the feature columns, one on the "vec" column with "newton", one on the "svec" column)
 * and checks the predictions produced by the fitted model in both batch and stream mode.
 */
@Test
public void regressionPipelineTest() throws Exception {
    BatchOperator<?> vecdata = new MemSourceBatchOp(Arrays.asList(vecrows), veccolNames);
    StreamOperator<?> svecdata = new MemSourceStreamOp(Arrays.asList(vecrows), veccolNames);
    String[] xVars = new String[] { "f0", "f1", "f2" };
    String yVar = "label";
    String vec = "vec";
    String svec = "svec";
    RidgeRegression ridge = new RidgeRegression().setLabelCol(yVar).setFeatureCols(xVars).setLambda(0.01).setMaxIter(10).setPredictionCol("linpred");
    RidgeRegression vridge = new RidgeRegression().setLabelCol(yVar).setVectorCol(vec).setLambda(0.01).setMaxIter(10).setOptimMethod("newton").setPredictionCol("vlinpred");
    RidgeRegression svridge = new RidgeRegression().setLabelCol(yVar).setVectorCol(svec).setLambda(0.01).setMaxIter(10).setPredictionCol("svlinpred");
    Pipeline pl = new Pipeline().add(ridge).add(vridge).add(svridge);
    PipelineModel model = pl.fit(vecdata);

    // Batch prediction: label first, then the three prediction columns in stage order.
    BatchOperator<?> result = model.transform(vecdata).select(new String[] { "label", "linpred", "vlinpred", "svlinpred" });
    assertRidgePredictions(result.collect());

    // Stream prediction: the same fitted model must yield the same values.
    CollectSinkStreamOp sop = model.transform(svecdata).select(new String[] { "label", "linpred", "vlinpred", "svlinpred" }).link(new CollectSinkStreamOp());
    StreamOperator.execute();
    assertRidgePredictions(sop.getAndRemoveValues());
}

/**
 * Checks the three ridge prediction columns of the rows whose label is 16.8 or 6.7.
 * Rows with any other label are not checked.
 *
 * @param rows rows laid out as (label, linpred, vlinpred, svlinpred)
 */
private static void assertRidgePredictions(List<Row> rows) {
    for (Row row : rows) {
        // JUnit's contract is assertEquals(expected, actual, delta); keeping that order
        // makes a failure message report the right value as "expected".
        if ((double) row.getField(0) == 16.8000) {
            Assert.assertEquals(16.77322547668301, (double) row.getField(1), 0.01);
            Assert.assertEquals(16.620448399254673, (double) row.getField(2), 0.01);
            Assert.assertEquals(16.384437074591887, (double) row.getField(3), 0.01);
        } else if ((double) row.getField(0) == 6.7000) {
            Assert.assertEquals(6.932628087721653, (double) row.getField(1), 0.01);
            Assert.assertEquals(6.775060404865803, (double) row.getField(2), 0.01);
            Assert.assertEquals(7.425378715755974, (double) row.getField(3), 0.01);
        }
    }
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) MemSourceStreamOp(com.alibaba.alink.operator.stream.source.MemSourceStreamOp) CollectSinkStreamOp(com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp) Row(org.apache.flink.types.Row) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) Test(org.junit.Test)

Example 23 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class GridSearchCVTest method testSplit.

/**
 * Runs a train/validation-split grid search over a single-stage random forest pipeline
 * on an 11-row in-memory table and prints the transformed result.
 */
@Test
public void testSplit() throws Exception {
    // Schema: four feature columns followed by the label column.
    String[] schema = new String[] { "f0", "f1", "f2", "f3", "label" };
    List<Row> samples = Arrays.asList(
        Row.of(1.0, "A", 0, 0, 0),
        Row.of(2.0, "B", 1, 1, 0),
        Row.of(3.0, "C", 2, 2, 1),
        Row.of(4.0, "D", 3, 3, 1),
        Row.of(1.0, "A", 0, 0, 0),
        Row.of(2.0, "B", 1, 1, 0),
        Row.of(3.0, "C", 2, 2, 1),
        Row.of(4.0, "D", 3, 3, 1),
        Row.of(1.0, "A", 0, 0, 0),
        Row.of(2.0, "B", 1, 1, 0),
        Row.of(3.0, "C", 2, 2, 1));
    MemSourceBatchOp source = new MemSourceBatchOp(samples, schema);

    String labelCol = schema[4];
    String[] categoricalCols = new String[] { schema[1] };
    String[] featureCols = new String[] { schema[0], schema[1], schema[2], schema[3] };

    RandomForestClassifier forest = new RandomForestClassifier()
        .setFeatureCols(featureCols)
        .setCategoricalCols(categoricalCols)
        .setLabelCol(labelCol)
        .setPredictionCol("pred_result")
        .setPredictionDetailCol("pred_detail")
        .setSubsamplingRatio(1.0);
    Pipeline stages = new Pipeline(forest);

    // A single-point grid: one subsampling ratio and one tree count.
    ParamGrid searchSpace = new ParamGrid()
        .addGrid(forest, "SUBSAMPLING_RATIO", new Double[] { 1.0 })
        .addGrid(forest, "NUM_TREES", new Integer[] { 3 });

    BinaryClassificationTuningEvaluator evaluator = new BinaryClassificationTuningEvaluator()
        .setLabelCol(labelCol)
        .setPredictionDetailCol("pred_detail")
        .setTuningBinaryClassMetric("Accuracy");

    GridSearchTVSplit search = new GridSearchTVSplit()
        .setEstimator(stages)
        .setParamGrid(searchSpace)
        .setTuningEvaluator(evaluator)
        .setTrainRatio(0.8);

    ModelBase fitted = search.fit(source);
    fitted.transform(source).print();
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) Row(org.apache.flink.types.Row) ModelBase(com.alibaba.alink.pipeline.ModelBase) RandomForestClassifier(com.alibaba.alink.pipeline.classification.RandomForestClassifier) Pipeline(com.alibaba.alink.pipeline.Pipeline) Test(org.junit.Test)

Example 24 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class GridSearchCVTest method findBestCluster.

/**
 * Grid-searches the KMeans distance type (EUCLIDEAN vs. COSINE) with 2-fold cross validation,
 * then checks that transforming the source yields one output row per entry of {@code testArray}.
 */
@Test
public void findBestCluster() {
    // Stage 1 packs the first two columns into a vector; stage 2 clusters that vector.
    ColumnsToVector toVector = new ColumnsToVector()
        .setSelectedCols(colNames[0], colNames[1])
        .setVectorCol("vector");
    KMeans clusterer = new KMeans()
        .setVectorCol("vector")
        .setPredictionCol("pred");

    ParamGrid searchSpace = new ParamGrid()
        .addGrid(clusterer, KMeans.DISTANCE_TYPE, new HasKMeansDistanceType.DistanceType[] { EUCLIDEAN, COSINE });
    Pipeline stages = new Pipeline()
        .add(toVector)
        .add(clusterer);

    GridSearchCV search = new GridSearchCV()
        .setEstimator(stages)
        .setParamGrid(searchSpace)
        .setNumFolds(2)
        .setTuningEvaluator(
            new ClusterTuningEvaluator()
                .setTuningClusterMetric(TuningClusterMetric.RI)
                .setPredictionCol("pred")
                .setVectorCol("vector")
                .setLabelCol("label"));

    GridSearchCVModel fitted = search.fit(memSourceBatchOp);
    Assert.assertEquals(
        testArray.length,
        fitted.transform(memSourceBatchOp).collect().size());
}
Also used : KMeans(com.alibaba.alink.pipeline.clustering.KMeans) ColumnsToVector(com.alibaba.alink.pipeline.dataproc.format.ColumnsToVector) HasKMeansDistanceType(com.alibaba.alink.params.shared.clustering.HasKMeansDistanceType) Pipeline(com.alibaba.alink.pipeline.Pipeline) Test(org.junit.Test)

Example 25 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class GridSearchTVSplitTest method findBestCluster.

/**
 * Grid-searches the KMeans distance type (EUCLIDEAN vs. COSINE) using a 50/50
 * train/validation split, then checks that transforming the source yields one output
 * row per entry of {@code testArray}.
 */
@Test
public void findBestCluster() throws Exception {
    // Stage 1 packs the first two columns into a vector; stage 2 clusters that vector.
    ColumnsToVector toVector = new ColumnsToVector()
        .setSelectedCols(colNames[0], colNames[1])
        .setVectorCol("vector");
    KMeans clusterer = new KMeans()
        .setVectorCol("vector")
        .setPredictionCol("pred");

    ParamGrid searchSpace = new ParamGrid()
        .addGrid(clusterer, "distanceType", new HasKMeansDistanceType.DistanceType[] { EUCLIDEAN, COSINE });
    Pipeline stages = new Pipeline()
        .add(toVector)
        .add(clusterer);

    GridSearchTVSplit search = new GridSearchTVSplit()
        .setEstimator(stages)
        .setParamGrid(searchSpace)
        .setTrainRatio(0.5)
        .setTuningEvaluator(
            new ClusterTuningEvaluator()
                .setTuningClusterMetric(TuningClusterMetric.RI)
                .setPredictionCol("pred")
                .setVectorCol("vector")
                .setLabelCol("label"));

    GridSearchTVSplitModel fitted = search.fit(memSourceBatchOp);
    Assert.assertEquals(
        testArray.length,
        fitted.transform(memSourceBatchOp).collect().size());
}
Also used : KMeans(com.alibaba.alink.pipeline.clustering.KMeans) ColumnsToVector(com.alibaba.alink.pipeline.dataproc.format.ColumnsToVector) HasKMeansDistanceType(com.alibaba.alink.params.shared.clustering.HasKMeansDistanceType) Pipeline(com.alibaba.alink.pipeline.Pipeline) Test(org.junit.Test)

Aggregations

Pipeline (com.alibaba.alink.pipeline.Pipeline) 63, Test (org.junit.Test) 38, PipelineModel (com.alibaba.alink.pipeline.PipelineModel) 34, LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression) 20, Row (org.apache.flink.types.Row) 18, BatchOperator (com.alibaba.alink.operator.batch.BatchOperator) 16, MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp) 16, VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) 11, AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp) 10, CollectSinkStreamOp (com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp) 9, EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) 8, MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp) 7, File (java.io.File) 5, ArrayList (java.util.ArrayList) 5, EvalMultiClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp) 4, StandardScaler (com.alibaba.alink.pipeline.dataproc.StandardScaler) 4, Stopwatch (com.alibaba.alink.common.utils.Stopwatch) 3, CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) 3, KMeans (com.alibaba.alink.pipeline.clustering.KMeans) 3, VectorToTensor (com.alibaba.alink.pipeline.dataproc.VectorToTensor) 3