Example usage of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.
From the class KMeansExample, method main:
/**
 * End-to-end KMeans example on the Iris dataset: read CSV, assemble the four
 * numeric columns into a feature vector, cluster into 3 groups, and print
 * the predictions alongside the original category column.
 */
public static void main(String[] args) throws Exception {
    // Iris dataset hosted on Alink's public release bucket.
    final String url = "https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/iris.csv";
    final String schemaStr =
        "sepal_length double, sepal_width double, petal_length double, petal_width double, category string";
    // Parameterized BatchOperator<?> instead of the raw type to avoid unchecked warnings.
    BatchOperator<?> data = new CsvSourceBatchOp().setFilePath(url).setSchemaStr(schemaStr);
    // Combine the four numeric columns into a single vector column "features".
    VectorAssembler va = new VectorAssembler()
        .setSelectedCols(new String[] { "sepal_length", "sepal_width", "petal_length", "petal_width" })
        .setOutputCol("features");
    // K=3 matches the three iris species; keep "category" for visual comparison.
    KMeans kMeans = new KMeans()
        .setVectorCol("features")
        .setK(3)
        .setPredictionCol("prediction_result")
        .setPredictionDetailCol("prediction_detail")
        .setReservedCols("category")
        .setMaxIter(100);
    Pipeline pipeline = new Pipeline().add(va).add(kMeans);
    // fit + transform on the same data; print() triggers job execution.
    pipeline.fit(data).transform(data).print();
}
Example usage of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.
From the class Chap23, method c_3:
/**
 * Compares binary text classification with bigram features alone versus
 * bigram + trigram features, evaluating each model on the held-out test set.
 */
static void c_3() throws Exception {
    // Load the prepared train/test splits from Ak-format files.
    AkSourceBatchOp trainSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    AkSourceBatchOp testSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);

    // Experiment 1: unigram word counts combined with 2-gram counts.
    Pipeline bigramPipeline = new Pipeline()
        .add(new RegexTokenizer()
            .setPattern("\\W+")
            .setSelectedCol(TXT_COL_NAME))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol(VECTOR_COL_NAME))
        .add(new NGram()
            .setN(2)
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol("v_2")
            .enableLazyPrintTransformData(1, "2-gram"))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol("v_2")
            .setOutputCol("v_2"))
        .add(new VectorAssembler()
            .setSelectedCols(VECTOR_COL_NAME, "v_2")
            .setOutputCol(VECTOR_COL_NAME))
        .add(new LogisticRegression()
            .setMaxIter(30)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME));
    bigramPipeline
        .fit(trainSet)
        .transform(testSet)
        .link(new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("NGram 2"));
    BatchOperator.execute();

    // Experiment 2: add 3-gram counts (vocabulary capped at 10000) on top of
    // the unigram + bigram features.
    Pipeline trigramPipeline = new Pipeline()
        .add(new RegexTokenizer()
            .setPattern("\\W+")
            .setSelectedCol(TXT_COL_NAME))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol(VECTOR_COL_NAME))
        .add(new NGram()
            .setN(2)
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol("v_2"))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol("v_2")
            .setOutputCol("v_2"))
        .add(new NGram()
            .setN(3)
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol("v_3"))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setVocabSize(10000)
            .setSelectedCol("v_3")
            .setOutputCol("v_3"))
        .add(new VectorAssembler()
            .setSelectedCols(VECTOR_COL_NAME, "v_2", "v_3")
            .setOutputCol(VECTOR_COL_NAME))
        .add(new LogisticRegression()
            .setMaxIter(30)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME));
    trigramPipeline
        .fit(trainSet)
        .transform(testSet)
        .link(new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("NGram 2 and 3"));
    BatchOperator.execute();
}
Example usage of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.
From the class FmRecommTrainBatchOp, method linkFrom:
/**
 * Trains an FM-based recommendation model from up to three input tables:
 * 1) the required user-item-label samples table, 2) an optional user features
 * table, and 3) an optional item features table. When a features table is
 * missing, the corresponding ID column is used as the sole (categorical) feature.
 * The output is a packed model that also carries the per-side feature tables
 * and the user-item interaction history.
 */
@Override
public FmRecommTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
// inputs[0] is mandatory; inputs[1] (user features) and inputs[2] (item features) are optional.
BatchOperator<?> samplesOp = inputs[0];
// Propagate the samples' ML environment to every operator created below.
final Long envId = samplesOp.getMLEnvironmentId();
BatchOperator<?> userFeaturesOp = inputs.length >= 2 ? inputs[1] : null;
BatchOperator<?> itemFeaturesOp = inputs.length >= 3 ? inputs[2] : null;
Params params = getParams();
String userCol = params.get(USER_COL);
String itemCol = params.get(ITEM_COL);
String labelCol = params.get(RATE_COL);
String[] userFeatureCols = params.get(USER_FEATURE_COLS);
String[] itemFeatureCols = params.get(ITEM_FEATURE_COLS);
String[] userCateFeatureCols = params.get(USER_CATEGORICAL_FEATURE_COLS);
String[] itemCateFeatureCols = params.get(ITEM_CATEGORICAL_FEATURE_COLS);
if (userFeaturesOp == null) {
// No user features supplied: fall back to the distinct user IDs as a
// single categorical feature. Backticks guard against reserved-word column names.
userFeaturesOp = samplesOp.select("`" + userCol + "`").distinct();
userFeatureCols = new String[] { userCol };
userCateFeatureCols = new String[] { userCol };
} else {
// The user ID column must have the same type in both tables or the join below would misbehave.
Preconditions.checkArgument(TableUtil.findColTypeWithAssert(userFeaturesOp.getSchema(), userCol).equals(TableUtil.findColTypeWithAssert(samplesOp.getSchema(), userCol)), "user column type mismatch");
}
if (itemFeaturesOp == null) {
// Same fallback for the item side: distinct item IDs as the only categorical feature.
itemFeaturesOp = samplesOp.select("`" + itemCol + "`").distinct();
itemFeatureCols = new String[] { itemCol };
itemCateFeatureCols = new String[] { itemCol };
} else {
Preconditions.checkArgument(TableUtil.findColTypeWithAssert(itemFeaturesOp.getSchema(), itemCol).equals(TableUtil.findColTypeWithAssert(samplesOp.getSchema(), itemCol)), "item column type mismatch");
}
// Keep the raw user-item pairs; they are packed into the model as interaction history.
BatchOperator<?> history = samplesOp.select(new String[] { userCol, itemCol });
// Encode each side's (categorical) feature columns into a single vector column
// (presumably "__fm_features__", judging by the join select clauses below).
userFeaturesOp = createFeatureVectors(userFeaturesOp, userCol, userFeatureCols, userCateFeatureCols);
itemFeaturesOp = createFeatureVectors(itemFeaturesOp, itemCol, itemFeatureCols, itemCateFeatureCols);
// Left-outer joins attach the per-user and per-item feature vectors to every sample row.
LeftOuterJoinBatchOp joinOp1 = new LeftOuterJoinBatchOp().setMLEnvironmentId(envId).setJoinPredicate("a.`" + userCol + "`=" + "b.`" + userCol + "`").setSelectClause("a.*, b.__fm_features__ as __user_features__");
LeftOuterJoinBatchOp joinOp2 = new LeftOuterJoinBatchOp().setMLEnvironmentId(envId).setJoinPredicate("a.`" + itemCol + "`=" + "b.`" + itemCol + "`").setSelectClause("a.*, b.__fm_features__ as __item_features__");
samplesOp = joinOp1.linkFrom(samplesOp, userFeaturesOp);
samplesOp = joinOp2.linkFrom(samplesOp, itemFeaturesOp);
// Left-outer joins can produce nulls for unmatched keys; CheckNotNull
// presumably rejects (or handles) such rows — see its definition.
samplesOp = samplesOp.udf("__user_features__", "__user_features__", new CheckNotNull());
samplesOp = samplesOp.udf("__item_features__", "__item_features__", new CheckNotNull());
// Concatenate the user and item vectors into the final training vector,
// keeping only the label column alongside it.
VectorAssembler va = new VectorAssembler().setMLEnvironmentId(envId).setSelectedCols("__user_features__", "__item_features__").setOutputCol("__alink_features__").setReservedCols(labelCol);
samplesOp = va.transform(samplesOp);
BatchOperator<?> fmModel;
// implicitFeedback selects classification (implicit signals) vs. regression (explicit ratings).
if (!implicitFeedback) {
fmModel = new FmRegressorTrainBatchOp(params).setLabelCol(params.get(RATE_COL)).setVectorCol("__alink_features__").setMLEnvironmentId(envId);
} else {
fmModel = new FmClassifierTrainBatchOp(params).setLabelCol(params.get(RATE_COL)).setVectorCol("__alink_features__").setMLEnvironmentId(envId);
}
fmModel.linkFrom(samplesOp);
// Pack the trained FM model together with both feature tables and the history
// so downstream predict operators have everything they need in one model table.
BatchOperator<?> model = PackBatchOperatorUtil.packBatchOps(new BatchOperator<?>[] { fmModel, userFeaturesOp, itemFeaturesOp, history });
setOutputTable(model.getOutputTable());
return this;
}
Example usage of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.
From the class LogisticRegressionMixVecTest, method batchMixVecTest14:
@Test
public void batchMixVecTest14() {
    // Assemble the sparse vector, dense vector and four scalar columns into one vector.
    BatchOperator<?> trainData = (BatchOperator<?>) getData();
    VectorAssembler assembler = new VectorAssembler()
        .setSelectedCols(new String[] { "svec", "vec", "f0", "f1", "f2", "f3" })
        .setOutputCol("allvec");
    Pipeline pipeline = new Pipeline().add(assembler);
    // Fit on the data, apply the fitted model, and collect() to force execution.
    BatchOperator<?> result = pipeline.fit(trainData).transform(trainData);
    result.collect();
}
Example usage of com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler in the Alink project by Alibaba.
From the class LocalPredictorTest, method getPipeline:
/**
 * Builds the test pipeline exercised by the local-predictor tests: a SISO
 * mapper (Binarizer), a MISO mapper (VectorAssembler), a model mapper
 * (QuantileDiscretizer), and a GLM — in that order.
 */
protected Pipeline getPipeline() {
    // Model mapper: bucketize sepal_length into 2 quantile buckets.
    QuantileDiscretizer quantileDiscretizer = new QuantileDiscretizer()
        .setNumBuckets(2)
        .setSelectedCols("sepal_length");
    // SISO mapper: threshold petal_width at 1.0 into new column "bina".
    Binarizer binarizer = new Binarizer()
        .setSelectedCol("petal_width")
        .setOutputCol("bina")
        .setReservedCols("sepal_length", "petal_width", "petal_length", "category")
        .setThreshold(1.);
    // MISO mapper: assemble two numeric columns into vector column "assem".
    VectorAssembler assembler = new VectorAssembler()
        .setSelectedCols("sepal_length", "petal_width")
        .setOutputCol("assem")
        .setReservedCols("sepal_length", "petal_width", "petal_length", "category");
    // GLM: regress petal_length on sepal_length and petal_width.
    GeneralizedLinearRegression glm = new GeneralizedLinearRegression()
        .setFeatureCols("sepal_length", "petal_width")
        .setLabelCol("petal_length")
        .setPredictionCol("glm_pred");
    // NOTE(review): the original also constructed Lda and Select stages that were
    // never added to the returned pipeline; removed here as dead code.
    return new Pipeline().add(binarizer).add(assembler).add(quantileDiscretizer).add(glm);
}
Aggregations