
Example 21 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in project Alink by alibaba.

From class PipelineSaveAndLoadTest, method testLocalPredictorMultiFile.

@Test
public void testLocalPredictorMultiFile() throws Exception {
    VectorAssembler va = new VectorAssembler().setSelectedCols(Iris.getFeatureColNames()).setOutputCol("features");
    MultilayerPerceptronClassifier classifier = new MultilayerPerceptronClassifier()
        .setVectorCol("features")
        .setLabelCol(Iris.getLabelColName())
        .setLayers(new int[] { 4, 5, 3 })
        .setMaxIter(30)
        .setPredictionCol("pred_label")
        .setPredictionDetailCol("pred_detail")
        .setReservedCols(Iris.getLabelColName());
    Pipeline pipeline = new Pipeline().add(va).add(classifier);
    PipelineModel model = pipeline.fit(data);
    FilePath filePath = new FilePath(folder.newFile().getAbsolutePath());
    model.save(filePath, true, 2);
    BatchOperator.execute();
    LocalPredictor localPredictor = new LocalPredictor(filePath, new TableSchema(
        ArrayUtils.add(data.getColNames(), "features"),
        ArrayUtils.add(data.getColTypes(), VectorTypes.DENSE_VECTOR)));
    Row result = localPredictor.map(Row.of(5.1, 3.5, 1.4, 0.2, "Iris-setosanew", new DenseVector(new double[] { 5.1, 3.5, 1.4, 0.2 })));
    System.out.println(JsonConverter.toJson(result));
}
Also used: FilePath (com.alibaba.alink.common.io.filesystem.FilePath), MultilayerPerceptronClassifier (com.alibaba.alink.pipeline.classification.MultilayerPerceptronClassifier), TableSchema (org.apache.flink.table.api.TableSchema), VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler), Row (org.apache.flink.types.Row), DenseVector (com.alibaba.alink.common.linalg.DenseVector), Test (org.junit.Test)
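
Once saved this way, the model directory can also be reloaded as a PipelineModel and applied in batch, the way Example 23 below does with a string path. A minimal sketch (not part of the original test), where modelPath is assumed to hold the same path string that was wrapped in FilePath above and data is the test class's batch source field:

// Minimal sketch (not in the original test): reload the model saved above and run
// batch prediction with it. `modelPath` is assumed to be the same path string that
// was passed to new FilePath(...); `data` is the test class's batch source field.
PipelineModel loaded = PipelineModel.load(modelPath);
loaded.transform(data).lazyPrint(5);
BatchOperator.execute();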

Example 22 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in project Alink by alibaba.

From class OneHotTest, method pipelineTest.

@Test
public void pipelineTest() throws Exception {
    OneHotEncoder oneHot = new OneHotEncoder().setSelectedCols(binaryNames).setOutputCols("results").setDropLast(false).enableLazyPrintModelInfo();
    VectorAssembler va = new VectorAssembler().setSelectedCols(new String[] { "cnt", "results" }).enableLazyPrintTransformStat("xxxxxx").setOutputCol("outN");
    Pipeline pl = new Pipeline().add(oneHot).add(va);
    PipelineModel model = pl.fit((BatchOperator<?>) getData(true));
    Row[] parray = new Row[] { Row.of("0", "doc0", "天", 4L), Row.of("1", "doc2", null, 3L) };
    List<Row> expectedRow = Arrays.asList(
        Row.of("0", new SparseVector(19, new int[] { 0, 3, 10, 16 }, new double[] { 4.0, 1.0, 1.0, 1.0 })),
        Row.of("1", new SparseVector(19, new int[] { 0, 1, 12, 15 }, new double[] { 3.0, 1.0, 1.0, 1.0 })));
    // batch predict
    MemSourceBatchOp predData = new MemSourceBatchOp(Arrays.asList(parray), schema);
    List<Row> rows = model.transform(predData).select("id, outN").collect();
    assertListRowEqual(expectedRow, rows, 0);
    // stream predict
    MemSourceStreamOp predSData = new MemSourceStreamOp(Arrays.asList(parray), schema);
    CollectSinkStreamOp sink = model.transform(predSData).select("id, outN").link(new CollectSinkStreamOp());
    StreamOperator.execute();
    assertListRowEqual(expectedRow, sink.getAndRemoveValues(), 0);
}
Also used: MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp), MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp), VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler), CollectSinkStreamOp (com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp), Row (org.apache.flink.types.Row), SparseVector (com.alibaba.alink.common.linalg.SparseVector), Pipeline (com.alibaba.alink.pipeline.Pipeline), PipelineModel (com.alibaba.alink.pipeline.PipelineModel), Test (org.junit.Test)
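
The fitted model here is used directly for batch and stream prediction; like the models in Examples 21 and 23, it could also be persisted and reloaded. A minimal sketch (not part of the original test), where modelPath is an assumed writable path string:

// Minimal sketch (not in the original test): persist the fitted OneHot + VectorAssembler
// model, reload it, and repeat the batch check. `modelPath` is an assumed path string.
model.save(modelPath);
BatchOperator.execute();
PipelineModel reloaded = PipelineModel.load(modelPath);
List<Row> rowsAgain = reloaded.transform(predData).select("id, outN").collect();
assertListRowEqual(expectedRow, rowsAgain, 0);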

Example 23 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in project Alink by alibaba.

From class Chap23, method c_4.

static void c_4() throws Exception {
    AkSourceBatchOp train_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    if (!new File(DATA_DIR + PIPELINE_MODEL).exists()) {
        new Pipeline()
            .add(new RegexTokenizer().setPattern("\\W+").setSelectedCol(TXT_COL_NAME))
            .add(new DocCountVectorizer().setFeatureType("WORD_COUNT")
                .setSelectedCol(TXT_COL_NAME).setOutputCol(VECTOR_COL_NAME))
            .add(new NGram().setN(2).setSelectedCol(TXT_COL_NAME).setOutputCol("v_2"))
            .add(new DocCountVectorizer().setFeatureType("WORD_COUNT").setVocabSize(50000)
                .setSelectedCol("v_2").setOutputCol("v_2"))
            .add(new NGram().setN(3).setSelectedCol(TXT_COL_NAME).setOutputCol("v_3"))
            .add(new DocCountVectorizer().setFeatureType("WORD_COUNT").setVocabSize(10000)
                .setSelectedCol("v_3").setOutputCol("v_3"))
            .add(new VectorAssembler()
                .setSelectedCols(VECTOR_COL_NAME, "v_2", "v_3")
                .setOutputCol(VECTOR_COL_NAME))
            .add(new LogisticRegression().setMaxIter(30)
                .setVectorCol(VECTOR_COL_NAME)
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME))
            .fit(train_set)
            .save(DATA_DIR + PIPELINE_MODEL);
        BatchOperator.execute();
    }
    PipelineModel pipeline_model = PipelineModel.load(DATA_DIR + PIPELINE_MODEL);
    AkSourceBatchOp test_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);
    pipeline_model.transform(test_set)
        .link(new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("NGram 2 and 3"));
    BatchOperator.execute();
    AkSourceStreamOp test_stream = new AkSourceStreamOp().setFilePath(DATA_DIR + TEST_FILE);
    pipeline_model.transform(test_stream).sample(0.001).select(PREDICTION_COL_NAME + ", " + LABEL_COL_NAME + ", " + TXT_COL_NAME).print();
    StreamOperator.execute();
    String str = "Oh dear. good cast, but to write and direct is an art and to write wit and direct wit is a bit of a "
        + "task. Even doing good comedy you have to get the timing and moment right. Im not putting it all down "
        + "there were parts where i laughed loud but that was at very few times. The main focus to me was on the "
        + "fast free flowing dialogue, that made some people in the film annoying. It may sound great while "
        + "reading the script in your head but getting that out and to the camera is a different task. And the "
        + "hand held camera work does give energy to few parts of the film. Overall direction was good but the "
        + "script was not all that to me, but I'm sure you was reading the script in your head it would sound good"
        + ". Sorry.";
    Row pred_row;
    LocalPredictor local_predictor = pipeline_model.collectLocalPredictor("review string");
    System.out.println(local_predictor.getOutputSchema());
    pred_row = local_predictor.map(Row.of(str));
    System.out.println(pred_row.getField(4));
    LocalPredictor local_predictor_2 = new LocalPredictor(DATA_DIR + PIPELINE_MODEL, "review string");
    System.out.println(local_predictor_2.getOutputSchema());
    pred_row = local_predictor_2.map(Row.of(str));
    System.out.println(pred_row.getField(4));
}
Also used: LocalPredictor (com.alibaba.alink.pipeline.LocalPredictor), VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler), NGram (com.alibaba.alink.pipeline.nlp.NGram), DocCountVectorizer (com.alibaba.alink.pipeline.nlp.DocCountVectorizer), Pipeline (com.alibaba.alink.pipeline.Pipeline), PipelineModel (com.alibaba.alink.pipeline.PipelineModel), EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp), AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp), RegexTokenizer (com.alibaba.alink.pipeline.nlp.RegexTokenizer), AkSourceStreamOp (com.alibaba.alink.operator.stream.source.AkSourceStreamOp), Row (org.apache.flink.types.Row), LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression), File (java.io.File)
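
The two local predictors above each score a single hard-coded review; the same map call works for any raw string. A minimal sketch (not part of the original chapter code) that reuses local_predictor and prints the same output field index as the original:

// Minimal sketch (not part of Chap23): score a few more raw reviews with the
// LocalPredictor built above, printing the same field index (4) as the original code.
String[] reviews = new String[] {
    "A wonderful film with a great cast and sharp writing.",
    "Dull plot, flat acting, not worth the time."
};
for (String review : reviews) {
    Row row = local_predictor.map(Row.of(review));
    System.out.println(review + " -> " + row.getField(4));
}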

Example 24 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in project Alink by alibaba.

From class Chap25, method dnnReg.

public static void dnnReg(BatchOperator<?> train_set, BatchOperator<?> test_set) throws Exception {
    BatchOperator.setParallelism(1);
    new Pipeline()
        .add(new StandardScaler().setSelectedCols(Chap16.FEATURE_COL_NAMES))
        .add(new VectorAssembler().setSelectedCols(Chap16.FEATURE_COL_NAMES).setOutputCol("vec"))
        .add(new VectorToTensor().setSelectedCol("vec").setOutputCol("tensor").setReservedCols("quality"))
        .add(new KerasSequentialRegressor()
            .setTensorCol("tensor")
            .setLabelCol("quality")
            .setPredictionCol("pred")
            .setLayers(
                "Dense(64, activation='relu')",
                "Dense(64, activation='relu')",
                "Dense(64, activation='relu')",
                "Dense(64, activation='relu')",
                "Dense(64, activation='relu')")
            .setNumEpochs(20))
        .fit(train_set)
        .transform(test_set)
        .lazyPrintStatistics()
        .link(new EvalRegressionBatchOp()
            .setLabelCol("quality")
            .setPredictionCol("pred")
            .lazyPrintMetrics());
    BatchOperator.execute();
}
Also used: StandardScaler (com.alibaba.alink.pipeline.dataproc.StandardScaler), VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler), EvalRegressionBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalRegressionBatchOp), KerasSequentialRegressor (com.alibaba.alink.pipeline.regression.KerasSequentialRegressor), VectorToTensor (com.alibaba.alink.pipeline.dataproc.VectorToTensor), Pipeline (com.alibaba.alink.pipeline.Pipeline)
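
Before training the Keras regressor, it can help to check what the StandardScaler, VectorAssembler, and VectorToTensor stages actually produce. A minimal sketch (not part of Chap25) that fits only those preprocessing stages, using the same operators and column names as above, and prints a few transformed rows:

// Minimal sketch (not part of Chap25): fit only the preprocessing stages used above
// and inspect the assembled vector / tensor columns before any training.
PipelineModel prep = new Pipeline()
    .add(new StandardScaler().setSelectedCols(Chap16.FEATURE_COL_NAMES))
    .add(new VectorAssembler().setSelectedCols(Chap16.FEATURE_COL_NAMES).setOutputCol("vec"))
    .add(new VectorToTensor().setSelectedCol("vec").setOutputCol("tensor").setReservedCols("quality"))
    .fit(train_set);
prep.transform(train_set).lazyPrint(3);
BatchOperator.execute();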

Example 25 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in project Alink by alibaba.

From class Chap08, method c_8.

static void c_8() throws Exception {
    BatchOperator<?> train_data = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    BatchOperator<?> test_data = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);
    PipelineModel featureExpand = new Pipeline()
        .add(new VectorAssembler()
            .setSelectedCols(FEATURE_COL_NAMES)
            .setOutputCol(VEC_COL_NAME + "_0"))
        .add(new VectorPolynomialExpand()
            .setSelectedCol(VEC_COL_NAME + "_0")
            .setOutputCol(VEC_COL_NAME)
            .setDegree(2))
        .fit(train_data);
    train_data = featureExpand.transform(train_data);
    test_data = featureExpand.transform(test_data);
    train_data.lazyPrint(1);
    new LinearSvm().setVectorCol(VEC_COL_NAME).setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .fit(train_data).transform(test_data)
        .link(new EvalBinaryClassBatchOp().setPositiveLabelValueString("1")
            .setLabelCol(LABEL_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("LinearSVM"));
    new LogisticRegression().setVectorCol(VEC_COL_NAME).setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .fit(train_data).transform(test_data)
        .link(new EvalBinaryClassBatchOp().setPositiveLabelValueString("1")
            .setLabelCol(LABEL_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("LogisticRegression"));
    new LogisticRegression().setOptimMethod(OptimMethod.Newton)
        .setVectorCol(VEC_COL_NAME).setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .fit(train_data).transform(test_data)
        .link(new EvalBinaryClassBatchOp().setPositiveLabelValueString("1")
            .setLabelCol(LABEL_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("LogisticRegression + OptimMethod.Newton"));
    BatchOperator.execute();
}
Also used: VectorPolynomialExpand (com.alibaba.alink.pipeline.dataproc.vector.VectorPolynomialExpand), AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp), VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler), LinearSvm (com.alibaba.alink.pipeline.classification.LinearSvm), LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression), PipelineModel (com.alibaba.alink.pipeline.PipelineModel), Pipeline (com.alibaba.alink.pipeline.Pipeline), EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)
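
The feature-expansion model here is refit on every run; as in Examples 21 and 23, it could instead be saved once and reloaded. A minimal sketch (not part of Chap08), where the file name under DATA_DIR is an assumption:

// Minimal sketch (not part of Chap08): persist the fitted VectorAssembler +
// VectorPolynomialExpand model and reload it later without refitting.
// The file name "feature_expand.ak" under DATA_DIR is an assumption.
featureExpand.save(DATA_DIR + "feature_expand.ak");
BatchOperator.execute();
PipelineModel reloaded = PipelineModel.load(DATA_DIR + "feature_expand.ak");
reloaded.transform(new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)).lazyPrint(1);
BatchOperator.execute();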

Aggregations

VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler): 26
Test (org.junit.Test): 16
Pipeline (com.alibaba.alink.pipeline.Pipeline): 11
MultilayerPerceptronClassifier (com.alibaba.alink.pipeline.classification.MultilayerPerceptronClassifier): 9
LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression): 8
BatchOperator (com.alibaba.alink.operator.batch.BatchOperator): 7
PipelineModel (com.alibaba.alink.pipeline.PipelineModel): 7
Row (org.apache.flink.types.Row): 7
FilePath (com.alibaba.alink.common.io.filesystem.FilePath): 4
EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp): 4
AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp): 4
OneHotEncoder (com.alibaba.alink.pipeline.feature.OneHotEncoder): 3
TableSchema (org.apache.flink.table.api.TableSchema): 3
DenseVector (com.alibaba.alink.common.linalg.DenseVector): 2
MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp): 2
MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp): 2
Lda (com.alibaba.alink.pipeline.clustering.Lda): 2
Binarizer (com.alibaba.alink.pipeline.feature.Binarizer): 2
QuantileDiscretizer (com.alibaba.alink.pipeline.feature.QuantileDiscretizer): 2
DocCountVectorizer (com.alibaba.alink.pipeline.nlp.DocCountVectorizer): 2