
Example 26 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class EqualWidthDiscretizerTest method test.

@Test
public void test() throws Exception {
    try {
        // Generates the integers 0..10 as a single numeric column named "col0".
        NumSeqSourceBatchOp numSeqSourceBatchOp = new NumSeqSourceBatchOp(0, 10, "col0");
        // Equal-width bucketing of "col0" into 3 bins; model info is printed lazily when the job runs.
        Pipeline pipeline = new Pipeline()
            .add(new EqualWidthDiscretizer()
                .setNumBuckets(3)
                .enableLazyPrintModelInfo()
                .setSelectedCols("col0"));
        pipeline.fit(numSeqSourceBatchOp).transform(numSeqSourceBatchOp).collect();
    } catch (Exception ex) {
        ex.printStackTrace();
        Assert.fail("Should not throw exception here.");
    }
}
Also used : NumSeqSourceBatchOp(com.alibaba.alink.operator.batch.source.NumSeqSourceBatchOp) ExpectedException(org.junit.rules.ExpectedException) Pipeline(com.alibaba.alink.pipeline.Pipeline) Test(org.junit.Test)
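
As a hedged follow-up (not part of the original test), the fitted model could also be inspected directly: with the values 0..10 and three equal-width buckets, each row should come back with a bucket index in {0, 1, 2}. The sketch below assumes the classes imported above plus PipelineModel and org.apache.flink.types.Row, and it assumes the discretizer writes the bucket index back into the selected column by default.

    // Sketch only; assumes the APIs shown in this example plus PipelineModel and Row.
    NumSeqSourceBatchOp source = new NumSeqSourceBatchOp(0, 10, "col0");
    PipelineModel discretizerModel = new Pipeline()
        .add(new EqualWidthDiscretizer().setNumBuckets(3).setSelectedCols("col0"))
        .fit(source);
    for (Row row : discretizerModel.transform(source).collect()) {
        // Expected (assumption): each value of "col0" has been replaced by a bucket index 0, 1 or 2.
        System.out.println(row);
    }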

Example 27 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class KMeansTest method testKmeans.

@Test
public void testKmeans() throws Exception {
    KMeans kMeans = new KMeans()
        .setVectorCol("vector")
        .setPredictionCol("pred")
        .setPredictionDistanceCol("distance")
        .setK(2);
    PipelineModel model = new Pipeline().add(kMeans).fit(inputBatchOp);
    // Batch prediction: apply the fitted model and keep only id and distance.
    BatchOperator<?> batchPredOp = model.transform(inputBatchOp).select(new String[] { "id", "distance" });
    verifyPredResult(batchPredOp.collect());
    // Stream prediction: the same fitted model transforms a stream source; results are
    // gathered through a collect sink once the streaming job has executed.
    StreamOperator<?> streamPredOp = model.transform(inputStreamOp).select(new String[] { "id", "distance" });
    CollectSinkStreamOp sinkOp = streamPredOp.link(new CollectSinkStreamOp());
    StreamOperator.execute();
    verifyPredResult(sinkOp.getAndRemoveValues());
}
Also used : CollectSinkStreamOp(com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) Pipeline(com.alibaba.alink.pipeline.Pipeline) Test(org.junit.Test)
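
A quick way to eyeball the batch predictions, sketched here as a hypothetical addition rather than part of the original test, is to reuse the select(...) call from above together with the firstN(...).print() pattern that appears in the FTRLExample below.

    // Hypothetical inspection step; "pred" and "distance" are the columns configured above.
    model.transform(inputBatchOp)
        .select(new String[] { "id", "pred", "distance" })
        .firstN(5)
        .print();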

Example 28 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class QuantileDiscretizerTest method train.

@Test
public void train() {
    try {
        // Generates the integers 0..1000, i.e. 1001 rows, in column "col0".
        NumSeqSourceBatchOp numSeqSourceBatchOp = new NumSeqSourceBatchOp(0, 1000, "col0");
        Pipeline pipeline = new Pipeline()
            .add(new QuantileDiscretizer()
                .setNumBuckets(2)
                .setSelectedCols(new String[] { "col0" })
                .enableLazyPrintModelInfo());
        // The discretizer maps rows one-to-one, so the transformed result keeps all 1001 rows.
        Assert.assertEquals(1001, pipeline.fit(numSeqSourceBatchOp).transform(numSeqSourceBatchOp).collect().size());
    } catch (Exception ex) {
        ex.printStackTrace();
        Assert.fail("Should not throw exception here.");
    }
}
Also used : NumSeqSourceBatchOp(com.alibaba.alink.operator.batch.source.NumSeqSourceBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline) Test(org.junit.Test)
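
As a rough sanity check (not in the original test), the two quantile buckets over the evenly spaced values 0..1000 should each receive roughly half of the 1001 rows. The sketch below assumes the bucket index replaces the raw value in "col0", and uses java.util.List and org.apache.flink.types.Row in addition to the imports above.

    // Sketch only; assumes bucket indices are written back into "col0".
    List<Row> rows = pipeline.fit(numSeqSourceBatchOp).transform(numSeqSourceBatchOp).collect();
    long inBucketZero = rows.stream()
        .filter(r -> ((Number) r.getField(0)).longValue() == 0L)
        .count();
    // With two quantile buckets, roughly 500 of the 1001 rows should land in bucket 0.
    System.out.println("rows in bucket 0: " + inBucketZero);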

Example 29 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class FTRLExample method main.

public static void main(String[] args) throws Exception {
    String schemaStr = "id string, click string, dt string, C1 string, banner_pos int, site_id string, site_domain string, " + "site_category string, app_id string, app_domain string, app_category string, device_id string, " + "device_ip string, device_model string, device_type string, device_conn_type string, C14 int, C15 int, " + "C16 int, C17 int, C18 int, C19 int, C20 int, C21 int";
    CsvSourceBatchOp trainBatchData = new CsvSourceBatchOp()
        .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-small.csv")
        .setSchemaStr(schemaStr);
    trainBatchData.firstN(10).print();
    String labelColName = "click";
    String[] selectedColNames = new String[] { "C1", "banner_pos", "site_category", "app_domain", "app_category", "device_type", "device_conn_type", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "site_id", "site_domain", "device_id", "device_model" };
    String[] categoryColNames = new String[] { "C1", "banner_pos", "site_category", "app_domain", "app_category", "device_type", "device_conn_type", "site_id", "site_domain", "device_id", "device_model" };
    String[] numericalColNames = new String[] { "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21" };
    // result column name of feature engineering
    String vecColName = "vec";
    int numHashFeatures = 30000;
    // setup feature engineering pipeline: standardize numerical columns, then hash all selected columns into one sparse vector
    Pipeline featurePipeline = new Pipeline()
        .add(new StandardScaler().setSelectedCols(numericalColNames))
        .add(new FeatureHasher()
            .setSelectedCols(selectedColNames).setCategoricalCols(categoryColNames)
            .setOutputCol(vecColName).setNumFeatures(numHashFeatures));
    // fit feature pipeline model
    PipelineModel featurePipelineModel = featurePipeline.fit(trainBatchData);
    // prepare stream train data
    CsvSourceStreamOp data = new CsvSourceStreamOp()
        .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-ctr-train-8M.csv")
        .setSchemaStr(schemaStr).setIgnoreFirstLine(true);
    // split stream to train and eval data
    SplitStreamOp splitter = new SplitStreamOp().setFraction(0.5).linkFrom(data);
    // train initial batch model
    LogisticRegressionTrainBatchOp lr = new LogisticRegressionTrainBatchOp()
        .setVectorCol(vecColName).setLabelCol(labelColName)
        .setWithIntercept(true).setMaxIter(10);
    BatchOperator<?> initModel = featurePipelineModel.transform(trainBatchData).link(lr);
    // ftrl train: update the model online, starting from the batch model
    FtrlTrainStreamOp model = new FtrlTrainStreamOp(initModel)
        .setVectorCol(vecColName).setLabelCol(labelColName).setWithIntercept(true)
        .setAlpha(0.1).setBeta(0.1).setL1(0.01).setL2(0.01)
        .setTimeInterval(10).setVectorSize(numHashFeatures)
        .linkFrom(featurePipelineModel.transform(splitter));
    // ftrl predict on the held-out half of the stream
    FtrlPredictStreamOp predictResult = new FtrlPredictStreamOp(initModel)
        .setVectorCol(vecColName).setPredictionCol("pred")
        .setReservedCols(new String[] { labelColName }).setPredictionDetailCol("details")
        .linkFrom(model, featurePipelineModel.transform(splitter.getSideOutput(0)));
    // ftrl eval: rolling binary-classification metrics, extracted from the JSON summary
    predictResult
        .link(new EvalBinaryClassStreamOp()
            .setLabelCol(labelColName).setPredictionCol("pred")
            .setPredictionDetailCol("details").setTimeInterval(10))
        .link(new JsonValueStreamOp()
            .setSelectedCol("Data").setReservedCols(new String[] { "Statistics" })
            .setOutputCols(new String[] { "Accuracy", "AUC", "ConfusionMatrix" })
            .setJsonPath(new String[] { "$.Accuracy", "$.AUC", "$.ConfusionMatrix" }))
        .print();
    // trigger the streaming job so that training, prediction and the print sink actually run
    StreamOperator.execute();
}
Also used : JsonValueStreamOp(com.alibaba.alink.operator.stream.dataproc.JsonValueStreamOp) LogisticRegressionTrainBatchOp(com.alibaba.alink.operator.batch.classification.LogisticRegressionTrainBatchOp) FtrlPredictStreamOp(com.alibaba.alink.operator.stream.onlinelearning.FtrlPredictStreamOp) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) SplitStreamOp(com.alibaba.alink.operator.stream.dataproc.SplitStreamOp) FeatureHasher(com.alibaba.alink.pipeline.feature.FeatureHasher) FtrlTrainStreamOp(com.alibaba.alink.operator.stream.onlinelearning.FtrlTrainStreamOp) StandardScaler(com.alibaba.alink.pipeline.dataproc.StandardScaler) EvalBinaryClassStreamOp(com.alibaba.alink.operator.stream.evaluation.EvalBinaryClassStreamOp) CsvSourceStreamOp(com.alibaba.alink.operator.stream.source.CsvSourceStreamOp)
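
If the rolling evaluation metrics are needed programmatically instead of on stdout, one option (a sketch, not part of the original example) is to replace the final print() with the CollectSinkStreamOp pattern from the KMeansTest example above, keeping the evaluation chain in a variable, say evalResult:

    // Hypothetical variant; evalResult stands for the EvalBinaryClassStreamOp -> JsonValueStreamOp chain above.
    CollectSinkStreamOp metricsSink = evalResult.link(new CollectSinkStreamOp());
    StreamOperator.execute();
    for (Row metrics : metricsSink.getAndRemoveValues()) {
        // Each row carries Accuracy, AUC and ConfusionMatrix for one time window.
        System.out.println(metrics);
    }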

Example 30 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class KMeansExample method main.

public static void main(String[] args) throws Exception {
    String URL = "https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/iris.csv";
    String SCHEMA_STR = "sepal_length double, sepal_width double, petal_length double, petal_width double, category string";
    BatchOperator<?> data = new CsvSourceBatchOp().setFilePath(URL).setSchemaStr(SCHEMA_STR);
    VectorAssembler va = new VectorAssembler()
        .setSelectedCols(new String[] { "sepal_length", "sepal_width", "petal_length", "petal_width" })
        .setOutputCol("features");
    KMeans kMeans = new KMeans()
        .setVectorCol("features").setK(3)
        .setPredictionCol("prediction_result").setPredictionDetailCol("prediction_detail")
        .setReservedCols("category").setMaxIter(100);
    Pipeline pipeline = new Pipeline().add(va).add(kMeans);
    pipeline.fit(data).transform(data).print();
}
Also used : KMeans(com.alibaba.alink.pipeline.clustering.KMeans) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline)
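
A small, hypothetical follow-up: to keep the output focused on the original label and the assigned cluster, the transform result can be narrowed with the select(...) call shown in the KMeansTest example above before printing.

    // Hypothetical variant of the last line above.
    pipeline.fit(data)
        .transform(data)
        .select(new String[] { "category", "prediction_result" })
        .print();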

Aggregations

Pipeline (com.alibaba.alink.pipeline.Pipeline): 63
Test (org.junit.Test): 38
PipelineModel (com.alibaba.alink.pipeline.PipelineModel): 34
LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression): 20
Row (org.apache.flink.types.Row): 18
BatchOperator (com.alibaba.alink.operator.batch.BatchOperator): 16
MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp): 16
VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler): 11
AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp): 10
CollectSinkStreamOp (com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp): 9
EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp): 8
MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp): 7
File (java.io.File): 5
ArrayList (java.util.ArrayList): 5
EvalMultiClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp): 4
StandardScaler (com.alibaba.alink.pipeline.dataproc.StandardScaler): 4
Stopwatch (com.alibaba.alink.common.utils.Stopwatch): 3
CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp): 3
KMeans (com.alibaba.alink.pipeline.clustering.KMeans): 3
VectorToTensor (com.alibaba.alink.pipeline.dataproc.VectorToTensor): 3