Search in sources :

Example 6 with EvalClusterBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in project Alink by alibaba.

the class Chap19 method c_3.

/**
 * Runs KMeans (k = 10) twice -- once on the sparse training vectors as loaded,
 * once on the same vectors after a PCA transform -- and prints the cluster
 * metrics and the elapsed time of each run.
 *
 * @throws Exception propagated from the Alink operators and job execution
 */
static void c_3() throws Exception {
    AkSourceBatchOp source = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TRAIN_FILE);

    // Train a PCA model (k = 39, covariance-based) and write it to PCA_MODEL_FILE,
    // overwriting any file already there.
    source
        .link(
            new PcaTrainBatchOp()
                .setK(39)
                .setCalculationType(CalculationType.COV)
                .setVectorCol(VECTOR_COL_NAME)
                .lazyPrintModelInfo()
        )
        .link(
            new AkSinkBatchOp()
                .setFilePath(DATA_DIR + PCA_MODEL_FILE)
                .setOverwriteSink(true)
        );
    BatchOperator.execute();

    // Apply the saved PCA model; the prediction is written to the same column
    // name as the input vector, so the downstream KMeans setup is unchanged.
    BatchOperator<?> pcaResult = new PcaPredictBatchOp()
        .setVectorCol(VECTOR_COL_NAME)
        .setPredictionCol(VECTOR_COL_NAME)
        .linkFrom(
            new AkSourceBatchOp().setFilePath(DATA_DIR + PCA_MODEL_FILE),
            source
        );

    Stopwatch timer = new Stopwatch();

    KMeans kmeans = new KMeans()
        .setK(10)
        .setVectorCol(VECTOR_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME);

    // Run 1: KMeans on the vectors as loaded from the source file.
    timer.reset();
    timer.start();
    BatchOperator<?> rawClustered = kmeans.fit(source).transform(source);
    rawClustered.link(
        new EvalClusterBatchOp()
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .lazyPrintMetrics("KMeans")
    );
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Run 2: the same estimator on the PCA-transformed vectors.
    timer.reset();
    timer.start();
    BatchOperator<?> pcaClustered = kmeans.fit(pcaResult).transform(pcaResult);
    pcaClustered.link(
        new EvalClusterBatchOp()
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .lazyPrintMetrics("KMeans + PCA")
    );
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) KMeans(com.alibaba.alink.pipeline.clustering.KMeans) PcaPredictBatchOp(com.alibaba.alink.operator.batch.feature.PcaPredictBatchOp) PcaTrainBatchOp(com.alibaba.alink.operator.batch.feature.PcaTrainBatchOp) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)

Example 7 with EvalClusterBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in project Alink by alibaba.

the class GaussianMixtureTest method testLazyPrintClusterSummaries.

/**
 * Trains a two-component Gaussian mixture model on the assembled Iris feature
 * vector, predicts a cluster for every row, evaluates the prediction against
 * the Iris label column and checks two of the collected cluster metrics.
 *
 * @throws Exception propagated from the Alink operators
 */
@Test
public void testLazyPrintClusterSummaries() throws Exception {
    // Assemble the Iris feature columns into a single vector column "x".
    VectorAssemblerBatchOp op = new VectorAssemblerBatchOp()
        .setSelectedCols(Iris.getFeatureColNames())
        .setOutputCol("x")
        .linkFrom(Iris.getBatchData());

    GmmTrainBatchOp gmm = new GmmTrainBatchOp()
        .setVectorCol("x")
        .setK(2)
        .setEpsilon(0.)
        .linkFrom(op);

    GmmPredictBatchOp predict = new GmmPredictBatchOp()
        .setVectorCol("x")
        .setPredictionCol("pred")
        .linkFrom(gmm, op);

    ClusterMetrics eval = new EvalClusterBatchOp()
        .setVectorCol("x")
        .setLabelCol(Iris.getLabelColName())
        .setPredictionCol("pred")
        .linkFrom(predict)
        .collectMetrics();

    // JUnit's signature is assertEquals(expected, actual, delta). Keeping the
    // expected constant first makes a failure message report the values in
    // the right roles.
    Assert.assertEquals(1.15, eval.getDb(), 0.01);
    Assert.assertEquals(0.35, eval.getAri(), 0.01);
}
Also used : GmmPredictBatchOp(com.alibaba.alink.operator.batch.clustering.GmmPredictBatchOp) GmmTrainBatchOp(com.alibaba.alink.operator.batch.clustering.GmmTrainBatchOp) ClusterMetrics(com.alibaba.alink.operator.common.evaluation.ClusterMetrics) VectorAssemblerBatchOp(com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp) Test(org.junit.Test)

Example 8 with EvalClusterBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in project Alink by alibaba.

the class Chap21 method c_7.

/**
 * Segments the text column and removes stop words, trains an LDA model with
 * 10 topics if no model file exists yet, evaluates the LDA predictions as a
 * clustering against the label column, and prints per-topic word listings
 * read from the LDA_PWZ_FILE side output.
 *
 * @throws Exception propagated from the Alink operators and job execution
 */
private static void c_7() throws Exception {
    // One value drives both the number of trained topics and the number of
    // topic_<i> columns printed at the end.
    final int topicCount = 10;

    BatchOperator<?> docs = getSource()
        .select(LABEL_COL_NAME + ", " + TXT_COL_NAME)
        .link(new SegmentBatchOp().setSelectedCol(TXT_COL_NAME))
        .link(new StopWordsRemoverBatchOp().setSelectedCol(TXT_COL_NAME));
    docs.lazyPrint(10);

    // Training is skipped when a model file is already present.
    if (!new File(DATA_DIR + LDA_MODEL_FILE).exists()) {
        LdaTrainBatchOp ldaTrainer = new LdaTrainBatchOp()
            .setTopicNum(topicCount)
            .setNumIter(200)
            .setVocabSize(20000)
            .setSelectedCol(TXT_COL_NAME)
            .setRandomSeed(123);
        docs.link(ldaTrainer);
        ldaTrainer.lazyPrintModelInfo();
        // Main output goes to the model file, side output 0 to the PWZ file.
        ldaTrainer.link(new AkSinkBatchOp().setFilePath(DATA_DIR + LDA_MODEL_FILE));
        ldaTrainer.getSideOutput(0).link(new AkSinkBatchOp().setFilePath(DATA_DIR + LDA_PWZ_FILE));
        BatchOperator.execute();
    }

    // Predict with the stored model, print a few rows, then evaluate the
    // prediction column against the label column.
    new LdaPredictBatchOp()
        .setSelectedCol(TXT_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol("predinfo")
        .linkFrom(
            new AkSourceBatchOp().setFilePath(DATA_DIR + LDA_MODEL_FILE),
            docs
        )
        .lazyPrint(5)
        .link(
            new EvalClusterBatchOp()
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .lazyPrintMetrics()
        );

    AkSourceBatchOp wordTopicTable = new AkSourceBatchOp().setFilePath(DATA_DIR + LDA_PWZ_FILE);
    wordTopicTable.sample(0.001).lazyPrint(10);

    // For each topic column, print the word column next to it, ordered by
    // that topic column.
    // NOTE(review): the orderBy arguments (20, false) look like "limit 20,
    // descending" -- confirm against the BatchOperator.orderBy signature.
    for (int topicIdx = 0; topicIdx < topicCount; topicIdx++) {
        String topicCol = "topic_" + topicIdx;
        wordTopicTable
            .select("word, " + topicCol)
            .orderBy(topicCol, 20, false)
            .lazyPrint(-1, "topic" + topicIdx);
    }
    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) LdaTrainBatchOp(com.alibaba.alink.operator.batch.clustering.LdaTrainBatchOp) LdaPredictBatchOp(com.alibaba.alink.operator.batch.clustering.LdaPredictBatchOp) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) File(java.io.File) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)

Example 9 with EvalClusterBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in project Alink by alibaba.

the class Chap17 method c_2_2.

/**
 * Builds the vector data file on first use, then clusters it into two groups
 * with KMeans twice: once through the train/predict batch operators with the
 * default distance setting, and once through the pipeline {@code KMeans}
 * with cosine distance. Cluster metrics are printed for both runs.
 *
 * @throws Exception propagated from the Alink operators and job execution
 */
static void c_2_2() throws Exception {
    String vectorPath = DATA_DIR + VECTOR_FILE;

    // Create the vector file only when it does not exist yet: assemble the
    // feature columns into one vector column and keep the label column.
    if (!new File(vectorPath).exists()) {
        new CsvSourceBatchOp()
            .setFilePath(DATA_DIR + ORIGIN_FILE)
            .setSchemaStr(SCHEMA_STRING)
            .link(
                new VectorAssemblerBatchOp()
                    .setSelectedCols(FEATURE_COL_NAMES)
                    .setOutputCol(VECTOR_COL_NAME)
                    .setReservedCols(LABEL_COL_NAME)
            )
            .link(new AkSinkBatchOp().setFilePath(vectorPath));
        BatchOperator.execute();
    }

    AkSourceBatchOp source = new AkSourceBatchOp().setFilePath(vectorPath);
    source.lazyPrint(5);

    // First run: explicit train and predict operators.
    KMeansTrainBatchOp kmeansTrainer = new KMeansTrainBatchOp()
        .setK(2)
        .setVectorCol(VECTOR_COL_NAME);
    KMeansPredictBatchOp kmeansPredictor = new KMeansPredictBatchOp()
        .setPredictionCol(PREDICTION_COL_NAME);

    source.link(kmeansTrainer);
    kmeansPredictor.linkFrom(kmeansTrainer, source);

    kmeansTrainer.lazyPrintModelInfo();
    kmeansPredictor.lazyPrint(5);
    kmeansPredictor.link(
        new EvalClusterBatchOp()
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .lazyPrintMetrics("KMeans EUCLIDEAN")
    );
    // NOTE(review): the orderBy arguments (200, false) look like "limit 200,
    // descending" -- confirm against the BatchOperator.orderBy signature.
    kmeansPredictor
        .orderBy(PREDICTION_COL_NAME + ", " + LABEL_COL_NAME, 200, false)
        .lazyPrint(-1, "all data");
    BatchOperator.execute();

    // Second run: pipeline KMeans with cosine distance on the same data.
    new KMeans()
        .setK(2)
        .setDistanceType(DistanceType.COSINE)
        .setVectorCol(VECTOR_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .enableLazyPrintModelInfo()
        .fit(source)
        .transform(source)
        .link(
            new EvalClusterBatchOp()
                .setVectorCol(VECTOR_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .setLabelCol(LABEL_COL_NAME)
                .lazyPrintMetrics("KMeans COSINE")
        );
    BatchOperator.execute();
}
Also used : KMeansPredictBatchOp(com.alibaba.alink.operator.batch.clustering.KMeansPredictBatchOp) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) BisectingKMeans(com.alibaba.alink.pipeline.clustering.BisectingKMeans) KMeans(com.alibaba.alink.pipeline.clustering.KMeans) GeoKMeans(com.alibaba.alink.pipeline.clustering.GeoKMeans) VectorAssemblerBatchOp(com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) File(java.io.File) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) KMeansTrainBatchOp(com.alibaba.alink.operator.batch.clustering.KMeansTrainBatchOp) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)

Example 10 with EvalClusterBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp in project Alink by alibaba.

the class Chap18 method c_1.

/**
 * Benchmarks three clustering pipelines (KMeans with the default distance,
 * KMeans with cosine distance, and BisectingKMeans, all with k = 10) on the
 * dense and then the sparse training file. For every pipeline/data pair the
 * cluster metrics and the elapsed time are printed.
 *
 * @throws Exception propagated from the Alink operators and job execution
 */
static void c_1() throws Exception {
    AkSourceBatchOp denseSource = new AkSourceBatchOp().setFilePath(DATA_DIR + DENSE_TRAIN_FILE);
    AkSourceBatchOp sparseSource = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TRAIN_FILE);
    Stopwatch timer = new Stopwatch();

    // (title, pipeline) pairs to benchmark, in execution order.
    ArrayList<Tuple2<String, Pipeline>> pipelines = new ArrayList<>();
    pipelines.add(new Tuple2<>(
        "KMeans EUCLIDEAN",
        new Pipeline().add(
            new KMeans()
                .setK(10)
                .setVectorCol(VECTOR_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
        )
    ));
    pipelines.add(new Tuple2<>(
        "KMeans COSINE",
        new Pipeline().add(
            new KMeans()
                .setDistanceType(DistanceType.COSINE)
                .setK(10)
                .setVectorCol(VECTOR_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
        )
    ));
    pipelines.add(new Tuple2<>(
        "BisectingKMeans",
        new Pipeline().add(
            new BisectingKMeans()
                .setK(10)
                .setVectorCol(VECTOR_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
        )
    ));

    // (title suffix, data) pairs; each pipeline runs on dense first, then sparse.
    ArrayList<Tuple2<String, AkSourceBatchOp>> datasets = new ArrayList<>();
    datasets.add(new Tuple2<>(" DENSE", denseSource));
    datasets.add(new Tuple2<>(" SPARSE", sparseSource));

    for (Tuple2<String, Pipeline> candidate : pipelines) {
        for (Tuple2<String, AkSourceBatchOp> dataset : datasets) {
            timer.reset();
            timer.start();
            candidate.f1
                .fit(dataset.f1)
                .transform(dataset.f1)
                .link(
                    new EvalClusterBatchOp()
                        .setVectorCol(VECTOR_COL_NAME)
                        .setPredictionCol(PREDICTION_COL_NAME)
                        .setLabelCol(LABEL_COL_NAME)
                        .lazyPrintMetrics(candidate.f0 + dataset.f0)
                );
            BatchOperator.execute();
            timer.stop();
            System.out.println(timer.getElapsedTimeSpan());
        }
    }
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) BisectingKMeans(com.alibaba.alink.pipeline.clustering.BisectingKMeans) KMeans(com.alibaba.alink.pipeline.clustering.KMeans) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) ArrayList(java.util.ArrayList) BisectingKMeans(com.alibaba.alink.pipeline.clustering.BisectingKMeans) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline)

Aggregations

EvalClusterBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)11 AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp)9 AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp)4 KMeans (com.alibaba.alink.pipeline.clustering.KMeans)4 Stopwatch (com.alibaba.alink.common.utils.Stopwatch)3 BisectingKMeans (com.alibaba.alink.pipeline.clustering.BisectingKMeans)3 File (java.io.File)3 KMeansPredictBatchOp (com.alibaba.alink.operator.batch.clustering.KMeansPredictBatchOp)2 KMeansTrainBatchOp (com.alibaba.alink.operator.batch.clustering.KMeansTrainBatchOp)2 VectorAssemblerBatchOp (com.alibaba.alink.operator.batch.dataproc.vector.VectorAssemblerBatchOp)2 AkSinkStreamOp (com.alibaba.alink.operator.stream.sink.AkSinkStreamOp)2 AkSourceStreamOp (com.alibaba.alink.operator.stream.source.AkSourceStreamOp)2 GeoKMeans (com.alibaba.alink.pipeline.clustering.GeoKMeans)2 GmmPredictBatchOp (com.alibaba.alink.operator.batch.clustering.GmmPredictBatchOp)1 GmmTrainBatchOp (com.alibaba.alink.operator.batch.clustering.GmmTrainBatchOp)1 LdaPredictBatchOp (com.alibaba.alink.operator.batch.clustering.LdaPredictBatchOp)1 LdaTrainBatchOp (com.alibaba.alink.operator.batch.clustering.LdaTrainBatchOp)1 PcaPredictBatchOp (com.alibaba.alink.operator.batch.feature.PcaPredictBatchOp)1 PcaTrainBatchOp (com.alibaba.alink.operator.batch.feature.PcaTrainBatchOp)1 SegmentBatchOp (com.alibaba.alink.operator.batch.nlp.SegmentBatchOp)1