Use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in the Alink project by Alibaba.
From the class Chap19, method c_3.
/**
 * Runs KMeans (k = 10) twice -- once on the sparse training vectors as loaded, once on the
 * output of a PCA prediction step -- and prints cluster metrics and the elapsed time of each run.
 *
 * <p>The PCA model (K = 39, COV calculation type) is trained first and written to
 * {@code DATA_DIR + PCA_MODEL_FILE}, overwriting any existing file, then read back for prediction.
 *
 * @throws Exception if building or executing any of the batch jobs fails
 */
static void c_3() throws Exception {
    AkSourceBatchOp source = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TRAIN_FILE);

    // Step 1: train the PCA model, print its summary lazily, and persist it.
    PcaTrainBatchOp pcaTrainer = new PcaTrainBatchOp()
        .setK(39)
        .setCalculationType(CalculationType.COV)
        .setVectorCol(VECTOR_COL_NAME)
        .lazyPrintModelInfo();
    source
        .link(pcaTrainer)
        .link(
            new AkSinkBatchOp()
                .setFilePath(DATA_DIR + PCA_MODEL_FILE)
                .setOverwriteSink(true)
        );
    BatchOperator.execute();

    // Step 2: apply the stored PCA model. The prediction column reuses the vector column name,
    // so the KMeans stage below reads the PCA output through VECTOR_COL_NAME.
    PcaPredictBatchOp pcaPredictor = new PcaPredictBatchOp()
        .setVectorCol(VECTOR_COL_NAME)
        .setPredictionCol(VECTOR_COL_NAME);
    AkSourceBatchOp pcaModel = new AkSourceBatchOp().setFilePath(DATA_DIR + PCA_MODEL_FILE);
    BatchOperator<?> pca_result = pcaPredictor.linkFrom(pcaModel, source);

    Stopwatch sw = new Stopwatch();
    KMeans kmeans = new KMeans()
        .setK(10)
        .setVectorCol(VECTOR_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME);

    // Step 3: timed run on the vectors as loaded from the source file.
    sw.reset();
    sw.start();
    BatchOperator<?> rawClusters = kmeans.fit(source).transform(source);
    rawClusters.link(
        new EvalClusterBatchOp()
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .lazyPrintMetrics("KMeans")
    );
    BatchOperator.execute();
    sw.stop();
    System.out.println(sw.getElapsedTimeSpan());

    // Step 4: timed run on the PCA prediction output.
    sw.reset();
    sw.start();
    BatchOperator<?> pcaClusters = kmeans.fit(pca_result).transform(pca_result);
    pcaClusters.link(
        new EvalClusterBatchOp()
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .lazyPrintMetrics("KMeans + PCA")
    );
    BatchOperator.execute();
    sw.stop();
    System.out.println(sw.getElapsedTimeSpan());
}
Use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in the Alink project by Alibaba.
From the class GaussianMixtureTest, method testLazyPrintClusterSummaries.
/**
 * Fits a two-component Gaussian mixture on the assembled Iris feature vector, predicts a
 * cluster for every row, and checks two of the collected cluster metrics against reference
 * values with a tolerance of 0.01.
 *
 * @throws Exception if the batch job fails while collecting the metrics
 */
@Test
public void testLazyPrintClusterSummaries() throws Exception {
    // Pack the Iris feature columns into a single vector column named "x".
    VectorAssemblerBatchOp features = new VectorAssemblerBatchOp()
        .setSelectedCols(Iris.getFeatureColNames())
        .setOutputCol("x")
        .linkFrom(Iris.getBatchData());

    GmmTrainBatchOp gmm = new GmmTrainBatchOp()
        .setVectorCol("x")
        .setK(2)
        .setEpsilon(0.)
        .linkFrom(features);

    GmmPredictBatchOp predict = new GmmPredictBatchOp()
        .setVectorCol("x")
        .setPredictionCol("pred")
        .linkFrom(gmm, features);

    ClusterMetrics eval = new EvalClusterBatchOp()
        .setVectorCol("x")
        .setLabelCol(Iris.getLabelColName())
        .setPredictionCol("pred")
        .linkFrom(predict)
        .collectMetrics();

    // JUnit's signature is assertEquals(expected, actual, delta): the reference constant goes
    // first so that a failure message reports expected and actual the right way round.
    Assert.assertEquals(1.15, eval.getDb(), 0.01);
    Assert.assertEquals(0.35, eval.getAri(), 0.01);
}
Use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in the Alink project by Alibaba.
From the class Chap21, method c_7.
/**
 * LDA walk-through: segments the text column, removes stop words, trains an LDA model if no
 * model file exists yet, then predicts with the stored model, evaluates the prediction as a
 * clustering against the label column, and prints per-topic word listings.
 *
 * @throws Exception if building or executing any of the batch jobs fails
 */
private static void c_7() throws Exception {
    // Keep only label and text, then segment the text and strip stop words.
    String keptCols = LABEL_COL_NAME + ", " + TXT_COL_NAME;
    BatchOperator<?> docs = getSource()
        .select(keptCols)
        .link(new SegmentBatchOp().setSelectedCol(TXT_COL_NAME))
        .link(new StopWordsRemoverBatchOp().setSelectedCol(TXT_COL_NAME));
    docs.lazyPrint(10);

    // Training is skipped when a model file is already present.
    // NOTE(review): only the model file is checked here; the PWZ file read further down is
    // assumed to exist whenever the model file does -- confirm.
    File modelFile = new File(DATA_DIR + LDA_MODEL_FILE);
    if (!modelFile.exists()) {
        LdaTrainBatchOp lda = new LdaTrainBatchOp()
            .setTopicNum(10)
            .setNumIter(200)
            .setVocabSize(20000)
            .setSelectedCol(TXT_COL_NAME)
            .setRandomSeed(123);
        docs.link(lda);
        lda.lazyPrintModelInfo();

        // Main output -> model file; side output 0 -> PWZ file.
        lda.link(new AkSinkBatchOp().setFilePath(DATA_DIR + LDA_MODEL_FILE));
        lda.getSideOutput(0).link(new AkSinkBatchOp().setFilePath(DATA_DIR + LDA_PWZ_FILE));
        BatchOperator.execute();
    }

    // Predict with the stored model and evaluate the predictions against the labels.
    LdaPredictBatchOp predictor = new LdaPredictBatchOp()
        .setSelectedCol(TXT_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol("predinfo");
    AkSourceBatchOp storedModel = new AkSourceBatchOp().setFilePath(DATA_DIR + LDA_MODEL_FILE);
    predictor
        .linkFrom(storedModel, docs)
        .lazyPrint(5)
        .link(
            new EvalClusterBatchOp()
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .lazyPrintMetrics()
        );

    // Inspect the PWZ table: a small sample first, then one listing per topic column.
    AkSourceBatchOp pwz = new AkSourceBatchOp().setFilePath(DATA_DIR + LDA_PWZ_FILE);
    pwz.sample(0.001).lazyPrint(10);
    for (int topic = 0; topic < 10; topic++) {
        String topicCol = "topic_" + topic;
        // Presumably the 20 highest-weighted words for this topic -- confirm orderBy semantics.
        pwz.select("word, " + topicCol)
            .orderBy(topicCol, 20, false)
            .lazyPrint(-1, "topic" + topic);
    }
    BatchOperator.execute();
}
Use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in the Alink project by Alibaba.
From the class Chap17, method c_2_2.
/**
 * Clusters the vectorised data set into two groups with KMeans, first through the explicit
 * train/predict batch operators ("KMeans EUCLIDEAN" metrics title) and then through the
 * {@code KMeans} estimator configured with the COSINE distance type, printing cluster metrics
 * for both.
 *
 * <p>If the vector file does not exist yet it is produced from the original CSV file first.
 *
 * @throws Exception if building or executing any of the batch jobs fails
 */
static void c_2_2() throws Exception {
    // One-off preparation: assemble the feature columns into a vector column, keep the label.
    File vectorFile = new File(DATA_DIR + VECTOR_FILE);
    if (!vectorFile.exists()) {
        CsvSourceBatchOp rawCsv = new CsvSourceBatchOp()
            .setFilePath(DATA_DIR + ORIGIN_FILE)
            .setSchemaStr(SCHEMA_STRING);
        VectorAssemblerBatchOp assembler = new VectorAssemblerBatchOp()
            .setSelectedCols(FEATURE_COL_NAMES)
            .setOutputCol(VECTOR_COL_NAME)
            .setReservedCols(LABEL_COL_NAME);
        rawCsv
            .link(assembler)
            .link(new AkSinkBatchOp().setFilePath(DATA_DIR + VECTOR_FILE));
        BatchOperator.execute();
    }

    AkSourceBatchOp source = new AkSourceBatchOp().setFilePath(DATA_DIR + VECTOR_FILE);
    source.lazyPrint(5);

    // First pass: explicit train and predict operators.
    KMeansTrainBatchOp trainer = new KMeansTrainBatchOp()
        .setK(2)
        .setVectorCol(VECTOR_COL_NAME);
    KMeansPredictBatchOp predictor = new KMeansPredictBatchOp()
        .setPredictionCol(PREDICTION_COL_NAME);

    source.link(trainer);
    predictor.linkFrom(trainer, source);

    trainer.lazyPrintModelInfo();
    predictor.lazyPrint(5);

    predictor.link(
        new EvalClusterBatchOp()
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .lazyPrintMetrics("KMeans EUCLIDEAN")
    );

    // Print rows ordered by (prediction, label) with a limit of 200.
    String ordering = PREDICTION_COL_NAME + ", " + LABEL_COL_NAME;
    predictor.orderBy(ordering, 200, false).lazyPrint(-1, "all data");
    BatchOperator.execute();

    // Second pass: the KMeans estimator with the COSINE distance type.
    KMeans cosineKMeans = new KMeans()
        .setK(2)
        .setDistanceType(DistanceType.COSINE)
        .setVectorCol(VECTOR_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME);
    cosineKMeans
        .enableLazyPrintModelInfo()
        .fit(source)
        .transform(source)
        .link(
            new EvalClusterBatchOp()
                .setVectorCol(VECTOR_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .setLabelCol(LABEL_COL_NAME)
                .lazyPrintMetrics("KMeans COSINE")
        );
    BatchOperator.execute();
}
Use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in the Alink project by Alibaba.
From the class Chap18, method c_1.
/**
 * Benchmarks three clustering pipelines (KMeans with the default distance, KMeans with the
 * COSINE distance type, and BisectingKMeans, all with k = 10) on the dense and the sparse
 * training files, printing cluster metrics and the elapsed time for every pipeline/input pair.
 *
 * @throws Exception if building or executing any of the batch jobs fails
 */
static void c_1() throws Exception {
    AkSourceBatchOp denseSource = new AkSourceBatchOp().setFilePath(DATA_DIR + DENSE_TRAIN_FILE);
    AkSourceBatchOp sparseSource = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TRAIN_FILE);
    Stopwatch sw = new Stopwatch();

    // Each entry pairs a display name (used as the metrics title prefix) with its pipeline.
    ArrayList<Tuple2<String, Pipeline>> candidates = new ArrayList<>();

    Pipeline euclideanKMeans = new Pipeline().add(
        new KMeans()
            .setK(10)
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
    );
    candidates.add(new Tuple2<>("KMeans EUCLIDEAN", euclideanKMeans));

    Pipeline cosineKMeans = new Pipeline().add(
        new KMeans()
            .setDistanceType(DistanceType.COSINE)
            .setK(10)
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
    );
    candidates.add(new Tuple2<>("KMeans COSINE", cosineKMeans));

    Pipeline bisecting = new Pipeline().add(
        new BisectingKMeans()
            .setK(10)
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
    );
    candidates.add(new Tuple2<>("BisectingKMeans", bisecting));

    // For every pipeline the dense input is timed first, then the sparse one.
    AkSourceBatchOp[] inputs = {denseSource, sparseSource};
    String[] titleSuffixes = {" DENSE", " SPARSE"};

    for (Tuple2<String, Pipeline> candidate : candidates) {
        String displayName = candidate.f0;
        Pipeline pipeline = candidate.f1;

        for (int i = 0; i < inputs.length; i++) {
            AkSourceBatchOp input = inputs[i];

            sw.reset();
            sw.start();
            pipeline
                .fit(input)
                .transform(input)
                .link(
                    new EvalClusterBatchOp()
                        .setVectorCol(VECTOR_COL_NAME)
                        .setPredictionCol(PREDICTION_COL_NAME)
                        .setLabelCol(LABEL_COL_NAME)
                        .lazyPrintMetrics(displayName + titleSuffixes[i])
                );
            BatchOperator.execute();
            sw.stop();
            System.out.println(sw.getElapsedTimeSpan());
        }
    }
}
Aggregations