use of com.alibaba.alink.pipeline.clustering.KMeans in project Alink by alibaba.
the class Chap19 method c_3.
/**
 * Runs KMeans (k = 10) twice — once on the vectors read from the sparse training file and once
 * on the same data after a PCA transformation — and prints cluster-evaluation metrics plus the
 * elapsed time of each run.
 *
 * <p>The PCA model is trained first, written to {@code DATA_DIR + PCA_MODEL_FILE} (overwriting
 * any existing file), and then read back to produce the transformed data set.
 *
 * @throws Exception if any of the triggered batch jobs fails
 */
static void c_3() throws Exception {
    AkSourceBatchOp trainData = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TRAIN_FILE);

    // Stage 1: fit PCA with 39 components and persist the resulting model.
    trainData
        .link(
            new PcaTrainBatchOp()
                .setK(39)
                .setCalculationType(CalculationType.COV)
                .setVectorCol(VECTOR_COL_NAME)
                .lazyPrintModelInfo())
        .link(
            new AkSinkBatchOp()
                .setFilePath(DATA_DIR + PCA_MODEL_FILE)
                .setOverwriteSink(true));
    BatchOperator.execute();

    // Stage 2: apply the saved PCA model. The prediction column is the vector column itself,
    // so the transformed vector replaces the original one under the same column name.
    PcaPredictBatchOp pcaPredictor = new PcaPredictBatchOp()
        .setVectorCol(VECTOR_COL_NAME)
        .setPredictionCol(VECTOR_COL_NAME);
    AkSourceBatchOp pcaModel = new AkSourceBatchOp().setFilePath(DATA_DIR + PCA_MODEL_FILE);
    BatchOperator<?> reducedData = pcaPredictor.linkFrom(pcaModel, trainData);

    Stopwatch timer = new Stopwatch();
    KMeans clusterer = new KMeans()
        .setK(10)
        .setVectorCol(VECTOR_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME);

    // Run A: KMeans on the untransformed vectors.
    timer.reset();
    timer.start();
    clusterer
        .fit(trainData)
        .transform(trainData)
        .link(
            new EvalClusterBatchOp()
                .setVectorCol(VECTOR_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .setLabelCol(LABEL_COL_NAME)
                .lazyPrintMetrics("KMeans"));
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Run B: the same estimator on the PCA-transformed vectors.
    timer.reset();
    timer.start();
    clusterer
        .fit(reducedData)
        .transform(reducedData)
        .link(
            new EvalClusterBatchOp()
                .setVectorCol(VECTOR_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .setLabelCol(LABEL_COL_NAME)
                .lazyPrintMetrics("KMeans + PCA"));
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());
}
use of com.alibaba.alink.pipeline.clustering.KMeans in project Alink by alibaba.
the class EvalClusterBatchOpTest method testNoVector.
/**
 * Checks that {@link EvalClusterBatchOp} produces metrics when only the prediction column is
 * configured on the evaluator (no vector or label column is set on it).
 *
 * <p>A two-cluster KMeans pipeline stage is fitted on, and applied to, the in-memory rows; the
 * evaluator is then expected to report 6 samples and the cluster ids {@code "0"} and
 * {@code "1"}.
 *
 * @throws Exception if fitting, transforming or metric collection fails
 */
@Test
public void testNoVector() throws Exception {
    MemSourceBatchOp inOp = new MemSourceBatchOp(Arrays.asList(rows), new String[] { "label", "Y" });
    KMeans train = new KMeans().setVectorCol("Y").setPredictionCol("pred").setK(2);
    ClusterMetrics metrics = new EvalClusterBatchOp()
        .setPredictionCol("pred")
        .linkFrom(train.fit(inOp).transform(inOp))
        .collectMetrics();

    // JUnit's contract is (expected, actual); keeping that order makes failure messages
    // report the right value as "expected".
    Assert.assertEquals(6, metrics.getCount().intValue());
    Assert.assertArrayEquals(new String[] { "0", "1" }, metrics.getClusterArray());
}
use of com.alibaba.alink.pipeline.clustering.KMeans in project Alink by alibaba.
the class GridSearchCVTest method findBestCluster.
/**
 * Grid-searches the KMeans distance type (EUCLIDEAN vs. COSINE) with 2-fold cross validation,
 * scoring candidates by the RI cluster metric, and checks that the selected model returns as
 * many rows as {@code testArray} has entries when applied to the same source.
 */
@Test
public void findBestCluster() {
    // Pipeline under tuning: assemble the first two columns into "vector", then cluster.
    ColumnsToVector toVector = new ColumnsToVector()
        .setSelectedCols(colNames[0], colNames[1])
        .setVectorCol("vector");
    KMeans clusterer = new KMeans()
        .setVectorCol("vector")
        .setPredictionCol("pred");
    Pipeline estimator = new Pipeline().add(toVector).add(clusterer);

    // Candidate values for the tuned parameter, addressed through its ParamInfo constant.
    HasKMeansDistanceType.DistanceType[] distanceCandidates = { EUCLIDEAN, COSINE };
    ParamGrid searchSpace = new ParamGrid()
        .addGrid(clusterer, KMeans.DISTANCE_TYPE, distanceCandidates);

    ClusterTuningEvaluator scorer = new ClusterTuningEvaluator()
        .setTuningClusterMetric(TuningClusterMetric.RI)
        .setPredictionCol("pred")
        .setVectorCol("vector")
        .setLabelCol("label");

    GridSearchCV search = new GridSearchCV()
        .setEstimator(estimator)
        .setParamGrid(searchSpace)
        .setNumFolds(2)
        .setTuningEvaluator(scorer);

    GridSearchCVModel best = search.fit(memSourceBatchOp);
    int predictedRows = best.transform(memSourceBatchOp).collect().size();
    Assert.assertEquals(testArray.length, predictedRows);
}
use of com.alibaba.alink.pipeline.clustering.KMeans in project Alink by alibaba.
the class GridSearchTVSplitTest method findBestCluster.
/**
 * Grid-searches the KMeans distance type (EUCLIDEAN vs. COSINE) using a train/validation split
 * with a train ratio of 0.5, scoring candidates by the RI cluster metric, and checks that the
 * selected model returns as many rows as {@code testArray} has entries when applied to the
 * same source.
 *
 * @throws Exception if fitting or transforming fails
 */
@Test
public void findBestCluster() throws Exception {
    // Pipeline under tuning: assemble the first two columns into "vector", then cluster.
    ColumnsToVector toVector = new ColumnsToVector()
        .setSelectedCols(colNames[0], colNames[1])
        .setVectorCol("vector");
    KMeans clusterer = new KMeans()
        .setVectorCol("vector")
        .setPredictionCol("pred");
    Pipeline estimator = new Pipeline().add(toVector).add(clusterer);

    // The tuned parameter is addressed by its string name here rather than by a ParamInfo.
    HasKMeansDistanceType.DistanceType[] distanceCandidates = { EUCLIDEAN, COSINE };
    ParamGrid searchSpace = new ParamGrid()
        .addGrid(clusterer, "distanceType", distanceCandidates);

    ClusterTuningEvaluator scorer = new ClusterTuningEvaluator()
        .setTuningClusterMetric(TuningClusterMetric.RI)
        .setPredictionCol("pred")
        .setVectorCol("vector")
        .setLabelCol("label");

    GridSearchTVSplit search = new GridSearchTVSplit()
        .setEstimator(estimator)
        .setParamGrid(searchSpace)
        .setTrainRatio(0.5)
        .setTuningEvaluator(scorer);

    GridSearchTVSplitModel best = search.fit(memSourceBatchOp);
    int predictedRows = best.transform(memSourceBatchOp).collect().size();
    Assert.assertEquals(testArray.length, predictedRows);
}
use of com.alibaba.alink.pipeline.clustering.KMeans in project Alink by alibaba.
the class KMeansExample method main.
/**
 * Clusters the iris data set into three groups with a two-stage pipeline (vector assembly
 * followed by KMeans) and prints the transformed rows.
 *
 * <p>The CSV is read from a remote URL, so running this example requires network access.
 *
 * @param args command-line arguments; not used
 * @throws Exception if reading the data or running the pipeline fails
 */
public static void main(String[] args) throws Exception {
    String url = "https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/iris.csv";
    String schemaStr = "sepal_length double, sepal_width double, petal_length double, petal_width double, category string";

    // Wildcard-parameterized instead of a raw BatchOperator, to avoid unchecked warnings.
    BatchOperator<?> data = new CsvSourceBatchOp().setFilePath(url).setSchemaStr(schemaStr);

    // Combine the four numeric measurements into a single "features" vector column.
    VectorAssembler va = new VectorAssembler()
        .setSelectedCols(new String[] { "sepal_length", "sepal_width", "petal_length", "petal_width" })
        .setOutputCol("features");

    // Only "category" is reserved from the input columns, next to the prediction outputs.
    KMeans kMeans = new KMeans()
        .setVectorCol("features")
        .setK(3)
        .setPredictionCol("prediction_result")
        .setPredictionDetailCol("prediction_detail")
        .setReservedCols("category")
        .setMaxIter(100);

    Pipeline pipeline = new Pipeline().add(va).add(kMeans);
    pipeline.fit(data).transform(data).print();
}
Aggregations