Search in sources :

Example 11 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.

From the class KMeansExample, method main.

/**
 * Clusters the Iris data set into three groups with KMeans and prints the result.
 *
 * <p>The CSV file is read from a remote URL, the four measurement columns (declared as
 * {@code double} in the schema) are assembled into a single "features" vector column, and a
 * pipeline of the assembler followed by KMeans is fitted on, and then applied to, the same data.
 *
 * @param args command-line arguments; not used
 * @throws Exception propagated from the underlying source, pipeline, or print calls
 */
public static void main(String[] args) throws Exception {
    String url = "https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/iris.csv";
    String schemaStr = "sepal_length double, sepal_width double, petal_length double, petal_width double, category string";

    // Wildcard-parameterized rather than raw, so the generic type checks are not switched off.
    BatchOperator<?> data = new CsvSourceBatchOp().setFilePath(url).setSchemaStr(schemaStr);

    VectorAssembler va = new VectorAssembler()
        .setSelectedCols(new String[] { "sepal_length", "sepal_width", "petal_length", "petal_width" })
        .setOutputCol("features");

    KMeans kMeans = new KMeans()
        .setVectorCol("features")
        .setK(3)
        .setPredictionCol("prediction_result")
        .setPredictionDetailCol("prediction_detail")
        .setReservedCols("category")
        .setMaxIter(100);

    Pipeline pipeline = new Pipeline().add(va).add(kMeans);
    pipeline.fit(data).transform(data).print();
}
Also used : KMeans(com.alibaba.alink.pipeline.clustering.KMeans) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline)

Example 12 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.

From the class Chap23, method c_3.

/**
 * Trains and evaluates two text-classification pipelines on the same train/test split.
 *
 * <p>Both pipelines tokenize the text column with the regex {@code \W+}, turn the tokens into
 * word-count vectors, add n-gram count vectors, assemble all vectors into one column, and train
 * a logistic regression with {@code setMaxIter(30)}. The first pipeline adds 2-grams only; the
 * second adds 2-grams and 3-grams (the 3-gram vocabulary is limited to 10000). Each run links
 * the test-set predictions to a binary-classification evaluation with "pos" as the positive
 * label, and then triggers execution.
 *
 * @throws Exception propagated from the pipeline or execution calls
 */
static void c_3() throws Exception {
    AkSourceBatchOp trainSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    AkSourceBatchOp testSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);

    // Run 1: word counts + 2-gram counts.
    Pipeline bigramPipeline = new Pipeline()
        .add(new RegexTokenizer()
            .setPattern("\\W+")
            .setSelectedCol(TXT_COL_NAME))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol(VECTOR_COL_NAME))
        .add(new NGram()
            .setN(2)
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol("v_2")
            .enableLazyPrintTransformData(1, "2-gram"))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol("v_2")
            .setOutputCol("v_2"))
        .add(new VectorAssembler()
            .setSelectedCols(VECTOR_COL_NAME, "v_2")
            .setOutputCol(VECTOR_COL_NAME))
        .add(new LogisticRegression()
            .setMaxIter(30)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME));
    BatchOperator<?> bigramPredictions = bigramPipeline.fit(trainSet).transform(testSet);
    // The evaluator is created only after the transform, as in the chained form.
    bigramPredictions.link(new EvalBinaryClassBatchOp()
        .setPositiveLabelValueString("pos")
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .lazyPrintMetrics("NGram 2"));
    BatchOperator.execute();

    // Run 2: word counts + 2-gram counts + 3-gram counts.
    Pipeline trigramPipeline = new Pipeline()
        .add(new RegexTokenizer()
            .setPattern("\\W+")
            .setSelectedCol(TXT_COL_NAME))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol(VECTOR_COL_NAME))
        .add(new NGram()
            .setN(2)
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol("v_2"))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol("v_2")
            .setOutputCol("v_2"))
        .add(new NGram()
            .setN(3)
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol("v_3"))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setVocabSize(10000)
            .setSelectedCol("v_3")
            .setOutputCol("v_3"))
        .add(new VectorAssembler()
            .setSelectedCols(VECTOR_COL_NAME, "v_2", "v_3")
            .setOutputCol(VECTOR_COL_NAME))
        .add(new LogisticRegression()
            .setMaxIter(30)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME));
    BatchOperator<?> trigramPredictions = trigramPipeline.fit(trainSet).transform(testSet);
    trigramPredictions.link(new EvalBinaryClassBatchOp()
        .setPositiveLabelValueString("pos")
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .lazyPrintMetrics("NGram 2 and 3"));
    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) RegexTokenizer(com.alibaba.alink.pipeline.nlp.RegexTokenizer) NGram(com.alibaba.alink.pipeline.nlp.NGram) DocCountVectorizer(com.alibaba.alink.pipeline.nlp.DocCountVectorizer) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) Pipeline(com.alibaba.alink.pipeline.Pipeline) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Example 13 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.

From the class FmRecommTrainBatchOp, method linkFrom.

/**
 * Trains the FM recommender from up to three input tables: 1) the user-item-label table,
 * 2) the user features table, 3) the item features table.
 *
 * <p>When the user or item features table is not supplied, the distinct IDs taken from the
 * samples table are used as that side's features (and as its categorical features). When a
 * features table is supplied, its ID column must have the same type as the ID column of the
 * samples table.
 *
 * @param inputs the samples table, optionally followed by the user features table and the
 *               item features table
 * @return this operator; its output table is the pack of the FM model, the user feature
 *         vectors, the item feature vectors, and the (user, item) history
 */
@Override
public FmRecommTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> samples = inputs[0];
    final Long mlEnvId = samples.getMLEnvironmentId();

    BatchOperator<?> userFeatures = null;
    if (inputs.length >= 2) {
        userFeatures = inputs[1];
    }
    BatchOperator<?> itemFeatures = null;
    if (inputs.length >= 3) {
        itemFeatures = inputs[2];
    }

    Params params = getParams();
    String userCol = params.get(USER_COL);
    String itemCol = params.get(ITEM_COL);
    String labelCol = params.get(RATE_COL);
    String[] userFeatureCols = params.get(USER_FEATURE_COLS);
    String[] itemFeatureCols = params.get(ITEM_FEATURE_COLS);
    String[] userCateFeatureCols = params.get(USER_CATEGORICAL_FEATURE_COLS);
    String[] itemCateFeatureCols = params.get(ITEM_CATEGORICAL_FEATURE_COLS);

    if (userFeatures != null) {
        Preconditions.checkArgument(
            TableUtil.findColTypeWithAssert(userFeatures.getSchema(), userCol)
                .equals(TableUtil.findColTypeWithAssert(samples.getSchema(), userCol)),
            "user column type mismatch");
    } else {
        // No user features given: fall back to the distinct user IDs.
        userFeatures = samples.select("`" + userCol + "`").distinct();
        userFeatureCols = new String[] { userCol };
        userCateFeatureCols = new String[] { userCol };
    }

    if (itemFeatures != null) {
        Preconditions.checkArgument(
            TableUtil.findColTypeWithAssert(itemFeatures.getSchema(), itemCol)
                .equals(TableUtil.findColTypeWithAssert(samples.getSchema(), itemCol)),
            "item column type mismatch");
    } else {
        // No item features given: fall back to the distinct item IDs.
        itemFeatures = samples.select("`" + itemCol + "`").distinct();
        itemFeatureCols = new String[] { itemCol };
        itemCateFeatureCols = new String[] { itemCol };
    }

    // Taken from the samples before any feature columns are joined in.
    BatchOperator<?> history = samples.select(new String[] { userCol, itemCol });

    userFeatures = createFeatureVectors(userFeatures, userCol, userFeatureCols, userCateFeatureCols);
    itemFeatures = createFeatureVectors(itemFeatures, itemCol, itemFeatureCols, itemCateFeatureCols);

    // Attach the user and item feature vectors to every sample row.
    LeftOuterJoinBatchOp userJoin = new LeftOuterJoinBatchOp()
        .setMLEnvironmentId(mlEnvId)
        .setJoinPredicate("a.`" + userCol + "`=b.`" + userCol + "`")
        .setSelectClause("a.*, b.__fm_features__ as __user_features__");
    LeftOuterJoinBatchOp itemJoin = new LeftOuterJoinBatchOp()
        .setMLEnvironmentId(mlEnvId)
        .setJoinPredicate("a.`" + itemCol + "`=b.`" + itemCol + "`")
        .setSelectClause("a.*, b.__fm_features__ as __item_features__");
    samples = userJoin.linkFrom(samples, userFeatures);
    samples = itemJoin.linkFrom(samples, itemFeatures);

    // Run both joined feature columns through CheckNotNull.
    samples = samples.udf("__user_features__", "__user_features__", new CheckNotNull());
    samples = samples.udf("__item_features__", "__item_features__", new CheckNotNull());

    VectorAssembler assembler = new VectorAssembler()
        .setMLEnvironmentId(mlEnvId)
        .setSelectedCols("__user_features__", "__item_features__")
        .setOutputCol("__alink_features__")
        .setReservedCols(labelCol);
    samples = assembler.transform(samples);

    // Classifier for implicit feedback, regressor otherwise.
    BatchOperator<?> fmModel;
    if (implicitFeedback) {
        fmModel = new FmClassifierTrainBatchOp(params)
            .setLabelCol(params.get(RATE_COL))
            .setVectorCol("__alink_features__")
            .setMLEnvironmentId(mlEnvId);
    } else {
        fmModel = new FmRegressorTrainBatchOp(params)
            .setLabelCol(params.get(RATE_COL))
            .setVectorCol("__alink_features__")
            .setMLEnvironmentId(mlEnvId);
    }
    fmModel.linkFrom(samples);

    BatchOperator<?>[] modelParts = new BatchOperator<?>[] { fmModel, userFeatures, itemFeatures, history };
    BatchOperator<?> packedModel = PackBatchOperatorUtil.packBatchOps(modelParts);
    setOutputTable(packedModel.getOutputTable());
    return this;
}
Also used : LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) FmRegressorTrainBatchOp(com.alibaba.alink.operator.batch.regression.FmRegressorTrainBatchOp) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) FmRecommTrainParams(com.alibaba.alink.params.recommendation.FmRecommTrainParams) Params(org.apache.flink.ml.api.misc.param.Params) FmClassifierTrainBatchOp(com.alibaba.alink.operator.batch.classification.FmClassifierTrainBatchOp)

Example 14 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.

From the class LogisticRegressionMixVecTest, method batchMixVecTest14.

/**
 * Fits a pipeline containing only a VectorAssembler over the columns "svec", "vec" and
 * "f0".."f3", applies it to the same data, and collects the result. The test has no
 * assertions; it passes as long as nothing throws.
 */
@Test
public void batchMixVecTest14() {
    BatchOperator<?> trainData = (BatchOperator<?>) getData();

    String[] inputCols = new String[] { "svec", "vec", "f0", "f1", "f2", "f3" };
    Pipeline pipeline = new Pipeline()
        .add(new VectorAssembler()
            .setSelectedCols(inputCols)
            .setOutputCol("allvec"));

    PipelineModel fitted = pipeline.fit(trainData);
    fitted.transform(trainData).collect();
}
Also used : VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) Test(org.junit.Test)

Example 15 with VectorAssembler

Use of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.

From the class LocalPredictorTest, method getPipeline.

/**
 * Builds the pipeline used by this test: a binarizer, a vector assembler, a quantile
 * discretizer, and a generalized linear regression, added in that order.
 *
 * @return the unfitted pipeline
 */
protected Pipeline getPipeline() {
    // model mapper
    QuantileDiscretizer quantileDiscretizer = new QuantileDiscretizer()
        .setNumBuckets(2)
        .setSelectedCols("sepal_length");

    // SISO mapper
    Binarizer binarizer = new Binarizer()
        .setSelectedCol("petal_width")
        .setOutputCol("bina")
        .setReservedCols("sepal_length", "petal_width", "petal_length", "category")
        .setThreshold(1.);

    // MISO mapper
    VectorAssembler assembler = new VectorAssembler()
        .setSelectedCols("sepal_length", "petal_width")
        .setOutputCol("assem")
        .setReservedCols("sepal_length", "petal_width", "petal_length", "category");

    // Glm
    GeneralizedLinearRegression glm = new GeneralizedLinearRegression()
        .setFeatureCols("sepal_length", "petal_width")
        .setLabelCol("petal_length")
        .setPredictionCol("glm_pred");

    return new Pipeline()
        .add(binarizer)
        .add(assembler)
        .add(quantileDiscretizer)
        .add(glm);
}
Also used : VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) GeneralizedLinearRegression(com.alibaba.alink.pipeline.regression.GeneralizedLinearRegression) Lda(com.alibaba.alink.pipeline.clustering.Lda) Select(com.alibaba.alink.pipeline.sql.Select) Binarizer(com.alibaba.alink.pipeline.feature.Binarizer) QuantileDiscretizer(com.alibaba.alink.pipeline.feature.QuantileDiscretizer)

Aggregations

- VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler): 26
- Test (org.junit.Test): 16
- Pipeline (com.alibaba.alink.pipeline.Pipeline): 11
- MultilayerPerceptronClassifier (com.alibaba.alink.pipeline.classification.MultilayerPerceptronClassifier): 9
- LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression): 8
- BatchOperator (com.alibaba.alink.operator.batch.BatchOperator): 7
- PipelineModel (com.alibaba.alink.pipeline.PipelineModel): 7
- Row (org.apache.flink.types.Row): 7
- FilePath (com.alibaba.alink.common.io.filesystem.FilePath): 4
- EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp): 4
- AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp): 4
- OneHotEncoder (com.alibaba.alink.pipeline.feature.OneHotEncoder): 3
- TableSchema (org.apache.flink.table.api.TableSchema): 3
- DenseVector (com.alibaba.alink.common.linalg.DenseVector): 2
- MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp): 2
- MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp): 2
- Lda (com.alibaba.alink.pipeline.clustering.Lda): 2
- Binarizer (com.alibaba.alink.pipeline.feature.Binarizer): 2
- QuantileDiscretizer (com.alibaba.alink.pipeline.feature.QuantileDiscretizer): 2
- DocCountVectorizer (com.alibaba.alink.pipeline.nlp.DocCountVectorizer): 2