Search in sources :

Example 46 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class AFTRegTest method testPipeline.

/**
 * Fits an {@link AftSurvivalRegression} wrapped in a {@link Pipeline} on the in-memory rows,
 * transforms the same data and compares each prediction with the expected value that is looked
 * up through the row's {@code id} column.
 */
@Test
public void testPipeline() {
    MemSourceBatchOp data = new MemSourceBatchOp(Arrays.asList(rows), new String[] { "id", "label", "censor", "features" });
    AftSurvivalRegression reg = new AftSurvivalRegression()
        .setVectorCol("features")
        .setLabelCol("label")
        .setCensorCol("censor")
        .setPredictionCol("result")
        .enableLazyPrintModelInfo()
        .enableLazyPrintTrainInfo();
    PipelineModel model = new Pipeline().add(reg).fit(data);
    BatchOperator<?> res = model.transform(data);
    List<Row> list = res.select(new String[] { "id", "result" }).collect();
    // Expected predictions, indexed by the value of the "id" column.
    double[] expected = new double[] { 5.70, 18.10, 7.36, 13.62, 9.03 };
    for (int i = 0; i < expected.length; i++) {
        Row row = list.get(i);
        int id = (int) row.getField(0);
        double predicted = (Double) row.getField(1);
        // JUnit order is (expected, actual, delta); keeping it makes failure messages truthful.
        Assert.assertEquals(expected[id], predicted, 0.1);
    }
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) Row(org.apache.flink.types.Row) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) Pipeline(com.alibaba.alink.pipeline.Pipeline) Test(org.junit.Test)

Example 47 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class LinearRegressionTest method testLabelNull.

/**
 * Checks that training a {@link LinearRegression} pipeline on data whose label column contains
 * a {@code null} is rejected with the "label col has null values" error.
 *
 * <p>The full stack trace is searched rather than only the top-level message, so the expected
 * text is found wherever it appears in the printed trace.
 *
 * @throws Exception declared by the signature; exceptions raised by the pipeline calls are
 *     caught and inspected inside the method
 */
@Test
public void testLabelNull() throws Exception {
    try {
        Row[] vecrows = new Row[] { Row.of("$3$0:1.0 1:7.0 2:9.0", "1.0 7.0 9.0", 1.0, 7.0, 9.0, 16.8), Row.of("$3$0:1.0 1:3.0 2:3.0", "1.0 3.0 3.0", 2.0, 3.0, 3.0, 6.7), Row.of("$3$0:1.0 1:2.0 2:4.0", "1.0 2.0 4.0", 1.0, 2.0, 4.0, null), Row.of("$3$0:1.0 1:3.0 2:4.0", "1.0 3.0 4.0", 1.0, 3.0, 4.0, 8.0) };
        String[] veccolNames = new String[] { "svec", "vec", "f0", "f1", "f2", "label" };
        BatchOperator<?> vecdata = new MemSourceBatchOp(Arrays.asList(vecrows), veccolNames);
        String[] xVars = new String[] { "f0", "f1", "f2" };
        String yVar = "label";
        LinearRegression linear = new LinearRegression().setLabelCol(yVar).setFeatureCols(xVars).setMaxIter(20).setOptimMethod("newton").setPredictionCol("linpred");
        Pipeline pl = new Pipeline().add(linear);
        PipelineModel model = pl.fit(vecdata);
        BatchOperator<?> result = model.transform(vecdata);
        result.collect();
        // Reaching this line means the null label was accepted. Assert.fail throws an
        // AssertionError, which is not an Exception, so the catch below does not swallow it.
        Assert.fail("expected an exception because the label column contains a null value");
    } catch (Exception ex) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PrintStream ps = new PrintStream(baos);
        ex.printStackTrace(ps);
        // Make sure everything written through the PrintStream has reached the buffer.
        ps.flush();
        Assert.assertTrue("label col has null value", baos.toString().contains("label col has null values, please check it!"));
    }
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) PrintStream(java.io.PrintStream) Row(org.apache.flink.types.Row) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) Test(org.junit.Test)

Example 48 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class LinearRegressionTest method regressionPipelineTest.

/**
 * Trains three {@link LinearRegression} stages (feature columns, dense vector column, sparse
 * vector column) in one {@link Pipeline} and checks the predictions of the fitted model on both
 * a batch source and a stream source built from the same rows.
 *
 * @throws Exception if fitting, transforming or executing the stream job fails
 */
@Test
public void regressionPipelineTest() throws Exception {
    BatchOperator<?> vecdata = new MemSourceBatchOp(Arrays.asList(vecrows), veccolNames);
    StreamOperator<?> svecdata = new MemSourceStreamOp(Arrays.asList(vecrows), veccolNames);
    String[] xVars = new String[] { "f0", "f1", "f2" };
    String yVar = "label";
    String vec = "vec";
    String svec = "svec";
    LinearRegression linear = new LinearRegression().setLabelCol(yVar).setFeatureCols(xVars).setMaxIter(20).setOptimMethod("newton").setPredictionCol("linpred");
    LinearRegression vlinear = new LinearRegression().setLabelCol(yVar).setVectorCol(vec).setMaxIter(20).setPredictionCol("vlinpred");
    LinearRegression svlinear = new LinearRegression().setLabelCol(yVar).setVectorCol(svec).setMaxIter(20).setPredictionCol("svlinpred");
    svlinear.enableLazyPrintModelInfo();
    svlinear.enableLazyPrintTrainInfo();
    Pipeline pl = new Pipeline().add(linear).add(vlinear).add(svlinear);
    PipelineModel model = pl.fit(vecdata);
    String[] outputCols = new String[] { "label", "linpred", "vlinpred", "svlinpred" };

    // Batch prediction.
    BatchOperator<?> result = model.transform(vecdata).select(outputCols);
    assertLinearPredictions(result.collect());

    // Stream prediction with the same fitted model.
    CollectSinkStreamOp sop = model.transform(svecdata).select(outputCols).link(new CollectSinkStreamOp());
    StreamOperator.execute();
    assertLinearPredictions(sop.getAndRemoveValues());
}

/**
 * Asserts the three prediction columns of every row whose label is 16.8 or 6.7; rows with any
 * other label are not checked.
 *
 * @param predictions rows laid out as (label, linpred, vlinpred, svlinpred)
 */
private static void assertLinearPredictions(List<Row> predictions) {
    for (Row row : predictions) {
        double label = (double) row.getField(0);
        // JUnit order is (expected, actual, delta).
        if (label == 16.8000) {
            Assert.assertEquals(16.814789059973744, (double) row.getField(1), 0.01);
            Assert.assertEquals(16.814789059973744, (double) row.getField(2), 0.01);
            Assert.assertEquals(16.814788687904162, (double) row.getField(3), 0.01);
        } else if (label == 6.7000) {
            Assert.assertEquals(6.773942836224718, (double) row.getField(1), 0.01);
            Assert.assertEquals(6.773942836224718, (double) row.getField(2), 0.01);
            Assert.assertEquals(6.773943529327923, (double) row.getField(3), 0.01);
        }
    }
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) MemSourceStreamOp(com.alibaba.alink.operator.stream.source.MemSourceStreamOp) CollectSinkStreamOp(com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp) Row(org.apache.flink.types.Row) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) Test(org.junit.Test)

Example 49 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class OneHotTest method pipelineTest.

/**
 * Fits a {@link OneHotEncoder} followed by a {@link VectorAssembler} as one pipeline and
 * verifies that batch and stream prediction both produce the expected assembled sparse vectors.
 *
 * @throws Exception if fitting, transforming or executing the stream job fails
 */
@Test
public void pipelineTest() throws Exception {
    OneHotEncoder encoder = new OneHotEncoder()
        .setSelectedCols(binaryNames)
        .setOutputCols("results")
        .setDropLast(false)
        .enableLazyPrintModelInfo();
    VectorAssembler assembler = new VectorAssembler()
        .setSelectedCols(new String[] { "cnt", "results" })
        .enableLazyPrintTransformStat("xxxxxx")
        .setOutputCol("outN");
    Pipeline stages = new Pipeline().add(encoder).add(assembler);
    PipelineModel fitted = stages.fit((BatchOperator<?>) getData(true));

    // Rows to predict on; the second one carries a null in its third field.
    Row[] toPredict = new Row[] {
        Row.of("0", "doc0", "天", 4L),
        Row.of("1", "doc2", null, 3L)
    };
    List<Row> expected = Arrays.asList(
        Row.of("0", new SparseVector(19, new int[] { 0, 3, 10, 16 }, new double[] { 4.0, 1.0, 1.0, 1.0 })),
        Row.of("1", new SparseVector(19, new int[] { 0, 1, 12, 15 }, new double[] { 3.0, 1.0, 1.0, 1.0 })));

    // Batch path.
    MemSourceBatchOp batchInput = new MemSourceBatchOp(Arrays.asList(toPredict), schema);
    List<Row> batchOutput = fitted.transform(batchInput).select("id, outN").collect();
    assertListRowEqual(expected, batchOutput, 0);

    // Stream path.
    MemSourceStreamOp streamInput = new MemSourceStreamOp(Arrays.asList(toPredict), schema);
    CollectSinkStreamOp collected = fitted.transform(streamInput).select("id, outN").link(new CollectSinkStreamOp());
    StreamOperator.execute();
    assertListRowEqual(expected, collected.getAndRemoveValues(), 0);
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) MemSourceStreamOp(com.alibaba.alink.operator.stream.source.MemSourceStreamOp) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) CollectSinkStreamOp(com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp) Row(org.apache.flink.types.Row) SparseVector(com.alibaba.alink.common.linalg.SparseVector) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) Test(org.junit.Test)

Example 50 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class Chap23 method c_1.

/**
 * Reads the LibSVM train/test files, relabels them as 'pos'/'neg', and evaluates four binary
 * classifiers on the test set: multinomial naive Bayes, Binarizer + Bernoulli naive Bayes,
 * logistic regression, and logistic regression tuned over MAX_ITER with a grid search.
 *
 * @throws Exception if reading the data or executing any of the batch jobs fails
 */
static void c_1() throws Exception {
    BatchOperator<?> trainSet = new LibSvmSourceBatchOp()
        .setFilePath(ORIGIN_DATA_DIR + "train" + File.separator + "labeledBow.feat")
        .setStartIndex(0);
    trainSet.lazyPrint(1, "train_set");
    trainSet
        .groupBy("label", "label, COUNT(label) AS cnt")
        .orderBy("label", 100)
        .lazyPrint(-1, "labels of train_set");

    BatchOperator<?> testSet = new LibSvmSourceBatchOp()
        .setFilePath(ORIGIN_DATA_DIR + "test" + File.separator + "labeledBow.feat")
        .setStartIndex(0);

    // Both sets are relabeled with the same clause: label > 5 becomes 'pos', otherwise 'neg'.
    String relabelClause = "CASE WHEN label>5 THEN 'pos' ELSE 'neg' END AS label, " + "features AS " + VECTOR_COL_NAME;
    trainSet = trainSet.select(relabelClause);
    testSet = testSet.select(relabelClause);
    trainSet.lazyPrint(1, "train_set");

    // 1) Multinomial naive Bayes.
    new NaiveBayesTextClassifier()
        .setModelType("Multinomial")
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .enableLazyPrintModelInfo()
        .fit(trainSet)
        .transform(testSet)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("pos")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("NaiveBayesTextClassifier + Multinomial"));
    BatchOperator.execute();

    // 2) Binarizer followed by Bernoulli naive Bayes, combined in a pipeline.
    new Pipeline()
        .add(
            new Binarizer()
                .setSelectedCol(VECTOR_COL_NAME)
                .enableLazyPrintTransformData(1, "After Binarizer"))
        .add(
            new NaiveBayesTextClassifier()
                .setModelType("Bernoulli")
                .setVectorCol(VECTOR_COL_NAME)
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .enableLazyPrintModelInfo())
        .fit(trainSet)
        .transform(testSet)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("pos")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("Binarizer + NaiveBayesTextClassifier + Bernoulli"));
    BatchOperator.execute();

    // 3) Logistic regression with its default settings.
    new LogisticRegression()
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .enableLazyPrintTrainInfo("< LR train info >")
        .enableLazyPrintModelInfo("< LR model info >")
        .fit(trainSet)
        .transform(testSet)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("pos")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("LogisticRegression"));
    BatchOperator.execute();

    // 4) Logistic regression with MAX_ITER chosen by 6-fold grid search on AUC.
    AlinkGlobalConfiguration.setPrintProcessInfo(true);
    LogisticRegression tunedLr = new LogisticRegression()
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME);
    GridSearchCV search = new GridSearchCV()
        .setEstimator(new Pipeline().add(tunedLr))
        .setParamGrid(
            new ParamGrid()
                .addGrid(tunedLr, LogisticRegression.MAX_ITER, new Integer[] { 10, 20, 30, 40, 50, 60, 80, 100 }))
        .setTuningEvaluator(
            new BinaryClassificationTuningEvaluator()
                .setLabelCol(LABEL_COL_NAME)
                .setPositiveLabelValueString("pos")
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .setTuningBinaryClassMetric(TuningBinaryClassMetric.AUC))
        .setNumFolds(6)
        .enableLazyPrintTrainInfo();
    GridSearchCVModel tunedModel = search.fit(trainSet);
    tunedModel
        .transform(testSet)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("pos")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("LogisticRegression"));
    BatchOperator.execute();
}
Also used : ParamGrid(com.alibaba.alink.pipeline.tuning.ParamGrid) LibSvmSourceBatchOp(com.alibaba.alink.operator.batch.source.LibSvmSourceBatchOp) GridSearchCV(com.alibaba.alink.pipeline.tuning.GridSearchCV) NaiveBayesTextClassifier(com.alibaba.alink.pipeline.classification.NaiveBayesTextClassifier) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) GridSearchCVModel(com.alibaba.alink.pipeline.tuning.GridSearchCVModel) Binarizer(com.alibaba.alink.pipeline.feature.Binarizer) BinaryClassificationTuningEvaluator(com.alibaba.alink.pipeline.tuning.BinaryClassificationTuningEvaluator) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline)

Aggregations

Pipeline (com.alibaba.alink.pipeline.Pipeline) 63, Test (org.junit.Test) 38, PipelineModel (com.alibaba.alink.pipeline.PipelineModel) 34, LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression) 20, Row (org.apache.flink.types.Row) 18, BatchOperator (com.alibaba.alink.operator.batch.BatchOperator) 16, MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp) 16, VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) 11, AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp) 10, CollectSinkStreamOp (com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp) 9, EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) 8, MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp) 7, File (java.io.File) 5, ArrayList (java.util.ArrayList) 5, EvalMultiClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp) 4, StandardScaler (com.alibaba.alink.pipeline.dataproc.StandardScaler) 4, Stopwatch (com.alibaba.alink.common.utils.Stopwatch) 3, CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) 3, KMeans (com.alibaba.alink.pipeline.clustering.KMeans) 3, VectorToTensor (com.alibaba.alink.pipeline.dataproc.VectorToTensor) 3