Search in sources :

Example 1 with VectorAssemblerBatchOp

use of com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp in project Alink by alibaba.

the class Chap07 method c_4_2.

/**
 * Loads the CSV file at {@code DATA_DIR + ORIGIN_FILE}, assembles the feature columns into a
 * single vector column (keeping the label column), normalizes that vector with {@code p = 1.0}
 * and prints the first 5 resulting rows.
 *
 * @throws Exception propagated from the underlying batch job
 */
static void c_4_2() throws Exception {
    CsvSourceBatchOp csvReader = new CsvSourceBatchOp()
        .setFilePath(DATA_DIR + ORIGIN_FILE)
        .setSchemaStr(SCHEMA_STRING);

    VectorAssemblerBatchOp assembler = new VectorAssemblerBatchOp()
        .setSelectedCols(FEATURE_COL_NAMES)
        .setOutputCol(VECTOR_COL_NAME)
        .setReservedCols(LABEL_COL_NAME);

    BatchOperator<?> source = csvReader.link(assembler);

    VectorNormalizeBatchOp normalizer = new VectorNormalizeBatchOp()
        .setSelectedCol(VECTOR_COL_NAME)
        .setP(1.0);

    source.link(normalizer).firstN(5).print();
}
Also used : VectorAssemblerBatchOp(com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp) VectorNormalizeBatchOp(com.alibaba.alink.operator.batch.dataproc.vector.VectorNormalizeBatchOp) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)

Example 2 with VectorAssemblerBatchOp

use of com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp in project Alink by alibaba.

the class Chap07 method c_4_1.

/**
 * Assembles the feature columns of the CSV at {@code DATA_DIR + ORIGIN_FILE} into a vector
 * column and registers a vector summary printout for the raw data and for the output of the
 * standard, min-max and max-abs vector scalers, then runs everything with
 * {@link BatchOperator#execute()}.
 *
 * @throws Exception propagated from the underlying batch job
 */
static void c_4_1() throws Exception {
    BatchOperator<?> source = new CsvSourceBatchOp()
        .setFilePath(DATA_DIR + ORIGIN_FILE)
        .setSchemaStr(SCHEMA_STRING)
        .link(
            new VectorAssemblerBatchOp()
                .setSelectedCols(FEATURE_COL_NAMES)
                .setOutputCol(VECTOR_COL_NAME)
                .setReservedCols(LABEL_COL_NAME)
        );

    // Summary of the unscaled vectors.
    source.link(
        new VectorSummarizerBatchOp()
            .setSelectedCol(VECTOR_COL_NAME)
            .lazyPrintVectorSummary("< Origin data >")
    );

    // Each scaler is fitted on, and then applied to, the same assembled data.
    BatchOperator<?> standardScaled = new VectorStandardScaler()
        .setSelectedCol(VECTOR_COL_NAME)
        .fit(source)
        .transform(source);
    standardScaled.link(
        new VectorSummarizerBatchOp()
            .setSelectedCol(VECTOR_COL_NAME)
            .lazyPrintVectorSummary("< after Vector Standard Scale >")
    );

    BatchOperator<?> minMaxScaled = new VectorMinMaxScaler()
        .setSelectedCol(VECTOR_COL_NAME)
        .fit(source)
        .transform(source);
    minMaxScaled.link(
        new VectorSummarizerBatchOp()
            .setSelectedCol(VECTOR_COL_NAME)
            .lazyPrintVectorSummary("< after Vector MinMax Scale >")
    );

    BatchOperator<?> maxAbsScaled = new VectorMaxAbsScaler()
        .setSelectedCol(VECTOR_COL_NAME)
        .fit(source)
        .transform(source);
    maxAbsScaled.link(
        new VectorSummarizerBatchOp()
            .setSelectedCol(VECTOR_COL_NAME)
            .lazyPrintVectorSummary("< after Vector MaxAbs Scale >")
    );

    BatchOperator.execute();
}
Also used : VectorMinMaxScaler(com.alibaba.alink.pipeline.dataproc.vector.VectorMinMaxScaler) VectorAssemblerBatchOp(com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp) VectorStandardScaler(com.alibaba.alink.pipeline.dataproc.vector.VectorStandardScaler) VectorSummarizerBatchOp(com.alibaba.alink.operator.batch.statistics.VectorSummarizerBatchOp) VectorMaxAbsScaler(com.alibaba.alink.pipeline.dataproc.vector.VectorMaxAbsScaler) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)

Example 3 with VectorAssemblerBatchOp

use of com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp in project Alink by alibaba.

the class GbdtBatchOpTest method linkFromVectorEps.

/**
 * Trains a single-tree GBDT model on an assembled vector column with
 * {@code USE_EPSILON_APPRO_QUANTILE} enabled and checks that prediction yields exactly one
 * output row per input row.
 */
@Test
public void linkFromVectorEps() throws Exception {
    Row[] testArray = {
        Row.of(1, 2, 0),
        Row.of(1, 2, 0),
        Row.of(0, 3, 1),
        Row.of(0, 2, 0),
        Row.of(1, 3, 1),
        Row.of(4, 3, 1),
        Row.of(4, 4, 1),
        Row.of(5, 3, 0),
        Row.of(5, 4, 0),
        Row.of(5, 2, 1)
    };
    String[] colNames = { "col0", "col1", "label" };
    String labelCol = colNames[2];

    MemSourceBatchOp source = new MemSourceBatchOp(Arrays.asList(testArray), colNames);

    // Packs col0/col1 into "vector" and keeps only the label next to it.
    VectorAssemblerBatchOp assembler = new VectorAssemblerBatchOp()
        .setSelectedCols(colNames[0], colNames[1])
        .setReservedCols(labelCol)
        .setOutputCol("vector");

    Params trainParams = new Params().set(BaseGbdtTrainBatchOp.USE_EPSILON_APPRO_QUANTILE, true);
    GbdtTrainBatchOp trainer = new GbdtTrainBatchOp(trainParams)
        .setVectorCol("vector")
        .setLabelCol(labelCol)
        .setMaxLeaves(3)
        .setMinSamplesPerLeaf(1)
        .setLearningRate(1.0)
        .setNumTrees(1)
        .setCriteria(GbdtTrainParams.CriteriaType.XGBOOST);

    BatchOperator<?> model = trainer.linkFrom(assembler.linkFrom(source));

    GbdtPredictBatchOp predictor = new GbdtPredictBatchOp()
        .setVectorCol("vector")
        .setPredictionCol("pred")
        .setPredictionDetailCol("detail");

    // The same assembler instance is linked to the source a second time for prediction.
    int predictedRowCount = predictor
        .linkFrom(model, assembler.linkFrom(source))
        .collect()
        .size();

    Assert.assertEquals(testArray.length, predictedRowCount);
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) VectorAssemblerBatchOp(com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp) GbdtTrainParams(com.alibaba.alink.params.classification.GbdtTrainParams) Params(org.apache.flink.ml.api.misc.param.Params) Row(org.apache.flink.types.Row) BaseGbdtTrainBatchOp(com.alibaba.alink.operator.common.tree.parallelcart.BaseGbdtTrainBatchOp) Test(org.junit.Test)

Example 4 with VectorAssemblerBatchOp

use of com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp in project Alink by alibaba.

the class MultiHotTest method testMultiHot.

/**
 * Trains a multi-hot encoder on the "svec" and "vec" columns, encodes the prediction data as
 * vectors, merges the two encoded columns into one vector and compares the first merged row
 * with the expected sparse vector.
 */
@Test
public void testMultiHot() {
    BatchOperator<?> vecdata = new MemSourceBatchOp(Arrays.asList(array), veccolNames);
    BatchOperator<?> predData = new MemSourceBatchOp(Arrays.asList(predArray), veccolNames);

    MultiHotTrainBatchOp mh = new MultiHotTrainBatchOp()
        .setSelectedCols(new String[] { "svec", "vec" })
        .setDiscreteThresholdsArray(3, 6)
        .setDelimiter(" ")
        .linkFrom(vecdata);
    mh.lazyPrintModelInfo();

    // setReservedCols() is called with no columns; presumably this leaves "kv" as the only
    // output column, which is why field 0 is inspected below - confirm against the operator.
    Row result = new MultiHotPredictBatchOp()
        .setSelectedCols(new String[] { "svec", "vec" })
        .setHandleInvalid(HandleInvalid.KEEP)
        .setEncode(Encode.VECTOR)
        .setOutputCols("kv1", "kv2")
        .linkFrom(mh, predData)
        .link(
            new VectorAssemblerBatchOp()
                .setSelectedCols("kv1", "kv2")
                .setOutputCol("kv")
                .setReservedCols()
        )
        .collect()
        .get(0);

    // JUnit's contract is assertEquals(expected, actual); passing them in that order keeps the
    // "expected ... but was ..." failure message truthful.
    Assert.assertEquals(
        VectorUtil.getVector("$14$1:1.0 2:1.0 4:1.0 5:1.0 11:1.0"),
        result.getField(0)
    );
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) VectorAssemblerBatchOp(com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp) Row(org.apache.flink.types.Row) Test(org.junit.Test)

Example 5 with VectorAssemblerBatchOp

use of com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp in project Alink by alibaba.

the class KnnTrainBatchOp method linkFrom.

/**
 * Trains the KNN model by delegating to {@link VectorNearestNeighborTrainBatchOp}.
 *
 * <p>Exactly one of {@code featureCols} and {@code vectorCol} must be set. When feature columns
 * are given they are first assembled into the {@code VECTOR_COL} column (the label column is
 * kept alongside it) and this operator's vector column is set to {@code VECTOR_COL}.
 *
 * @param inputs the input operators; the first one supplies the training data
 * @return this operator, with its output set to the trained model data and schema
 * @throws IllegalArgumentException if both or neither of featureCols / vectorCol are set
 */
@Override
public KnnTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    Preconditions.checkArgument(getFeatureCols() == null ^ getVectorCol() == null, "Must either set featureCols or vectorCol!");

    // Wildcard type instead of the raw BatchOperator keeps generic type checking intact.
    BatchOperator<?> in = checkAndGetFirst(inputs);

    if (null != getFeatureCols()) {
        in = new VectorAssemblerBatchOp()
            .setSelectedCols(getFeatureCols())
            .setOutputCol(VECTOR_COL)
            .setReservedCols(getLabelCol())
            .linkFrom(in);
        // NOTE(review): this stores the vector column on the operator itself; presumably a
        // second linkFrom call would then find both featureCols and vectorCol set and fail the
        // precondition above - confirm before reusing an instance.
        this.setVectorCol(VECTOR_COL);
    }

    VectorNearestNeighborTrainBatchOp train = new VectorNearestNeighborTrainBatchOp(getParams())
        .setMetric(getDistanceType().name())
        .setSelectedCol(getVectorCol())
        .setIdCol(getLabelCol())
        .linkFrom(in);

    this.setOutput(train.getDataSet(), train.getSchema());
    return this;
}
Also used : VectorNearestNeighborTrainBatchOp(com.alibaba.alink.operator.batch.similarity.VectorNearestNeighborTrainBatchOp) VectorAssemblerBatchOp(com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator)

Aggregations

VectorAssemblerBatchOp (com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp) 8, Test (org.junit.Test) 4, CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) 3, MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp) 3, Row (org.apache.flink.types.Row) 3, EvalClusterBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp) 2, BaseGbdtTrainBatchOp (com.alibaba.alink.operator.common.tree.parallelcart.BaseGbdtTrainBatchOp) 2, BatchOperator (com.alibaba.alink.operator.batch.BatchOperator) 1, GmmPredictBatchOp (com.alibaba.alink.operator.batch.clustering.GmmPredictBatchOp) 1, GmmTrainBatchOp (com.alibaba.alink.operator.batch.clustering.GmmTrainBatchOp) 1, KMeansPredictBatchOp (com.alibaba.alink.operator.batch.clustering.KMeansPredictBatchOp) 1, KMeansTrainBatchOp (com.alibaba.alink.operator.batch.clustering.KMeansTrainBatchOp) 1, VectorNormalizeBatchOp (com.alibaba.alink.operator.batch.dataproc.vector.VectorNormalizeBatchOp) 1, VectorNearestNeighborTrainBatchOp (com.alibaba.alink.operator.batch.similarity.VectorNearestNeighborTrainBatchOp) 1, AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) 1, AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp) 1, VectorSummarizerBatchOp (com.alibaba.alink.operator.batch.statistics.VectorSummarizerBatchOp) 1, ClusterMetrics (com.alibaba.alink.operator.common.evaluation.ClusterMetrics) 1, GbdtTrainParams (com.alibaba.alink.params.classification.GbdtTrainParams) 1, BisectingKMeans (com.alibaba.alink.pipeline.clustering.BisectingKMeans) 1