use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.
the class ClusterEvaluationUtil method calMeanAndSum.
/**
 * Sums the vectors of the given rows and derives their mean.
 *
 * <p>When {@code distance} is an {@link EuclideanDistance} the vectors are added as they are.
 * For any other distance each vector is first scaled by {@code 1.0 / normL2()} before being
 * added. When {@code distance} is a {@link CosineDistance} the resulting mean is additionally
 * scaled by the inverse of its own L2 norm.
 *
 * <p>NOTE(review): in the non-Euclidean case {@code scaleEqual} is invoked directly on the
 * vector held by the row, with no copy made here — presumably this modifies the caller's
 * vector in place; confirm before reusing {@code rows} afterwards.
 *
 * <p>NOTE(review): there is no guard for an empty {@code rows}; the sum is then scaled by
 * {@code 1.0 / 0} and the returned cluster id is {@code null}.
 *
 * @param rows       pairs of (vector, cluster id); the first non-null id encountered is returned
 * @param vectorSize size used to allocate the zero-initialised sum vector
 * @param distance   distance whose concrete type selects the normalisation applied
 * @return a tuple of (cluster id, mean vector, sum vector)
 */
public static Tuple3<String, DenseVector, DenseVector> calMeanAndSum(Iterable<Tuple2<Vector, String>> rows, int vectorSize, FastDistance distance) {
    // The distance type does not change while iterating, so decide once up front.
    final boolean euclidean = distance instanceof EuclideanDistance;

    DenseVector sumVector = DenseVector.zeros(vectorSize);
    String clusterId = null;
    int count = 0;

    for (Tuple2<Vector, String> row : rows) {
        clusterId = (clusterId == null) ? row.f1 : clusterId;

        Vector current = row.f0;
        if (!euclidean) {
            // Scale by the inverse L2 norm before accumulating.
            current.scaleEqual(1.0 / current.normL2());
        }
        sumVector.plusEqual(current);
        count++;
    }

    DenseVector meanVector = sumVector.scale(1.0 / count);
    if (distance instanceof CosineDistance) {
        meanVector.scaleEqual(1.0 / meanVector.normL2());
    }
    return Tuple3.of(clusterId, meanVector, sumVector);
}
use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.
the class LocalKmeansFuncTest method kmeansSparseTest.
/**
 * Runs {@code kmeans} with {@code k = 2} over ten sparse vectors of size 20 and checks the
 * number of returned centroid columns and the L2 norm of each centroid.
 */
@Test
public void kmeansSparseTest() {
    int len = 10;
    int k = 2;
    int size = 20;
    EuclideanDistance distance = new EuclideanDistance();
    long[] sampleWeights = new long[len];
    FastDistanceVectorData[] samples = new FastDistanceVectorData[len];
    // Bound the loop by len so it cannot drift from the array sizes declared above.
    for (int i = 0; i < len; i++) {
        sampleWeights[i] = i;
        // Sample i has two non-zero entries, at indices i and i + 1, both with value i.
        SparseVector vector = new SparseVector(size, new int[] { i, i + 1 }, new double[] { i, i });
        samples[i] = distance.prepareVectorData(Tuple2.of(vector, null));
    }
    FastDistanceMatrixData initCentroid = kmeans(k, sampleWeights, samples, distance, size, 0);
    DenseMatrix initCentroidData = initCentroid.getVectors();
    // JUnit's assertEquals takes (expected, actual[, delta]); keep that order so failure
    // messages report the right side as "expected".
    Assert.assertEquals(k, initCentroidData.numCols());
    Assert.assertEquals(8.615, new DenseVector(initCentroidData.getColumn(0)).normL2(), 0.001);
    Assert.assertEquals(4.128, new DenseVector(initCentroidData.getColumn(1)).normL2(), 0.001);
}
use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.
the class KDTreeModelDataConverter method buildIndex.
/**
 * Builds KD-tree index rows from the input operator's data set.
 *
 * <p>Steps visible in this method:
 * <ol>
 * <li>Rejects any metric other than EUCLIDEAN.</li>
 * <li>Computes a vector summary for the selected column and broadcasts it under the name
 * "vectorSize" to both map-partition stages.</li>
 * <li>First stage: each partition collects its rows as prepared vectors, builds one KDTree
 * over them and emits one row per vector followed by one row carrying the tree root as
 * JSON. Partitions without rows emit nothing.</li>
 * <li>Second stage: passes the rows to {@code KDTreeModelDataConverter.save}; only subtask 0
 * supplies the params (with VECTOR_SIZE set), all other subtasks pass {@code null}.</li>
 * </ol>
 *
 * @param in     operator providing the rows to index
 * @param params training parameters; METRIC and SELECTED_COL are read here
 * @return the data set produced by the second map-partition stage
 */
@Override
public DataSet<Row> buildIndex(BatchOperator in, Params params) {
// Fails with the message below when the configured metric is not EUCLIDEAN.
Preconditions.checkArgument(params.get(VectorApproxNearestNeighborTrainParams.METRIC).equals(VectorApproxNearestNeighborTrainParams.Metric.EUCLIDEAN), "KDTree solver only supports Euclidean distance!");
EuclideanDistance distance = new EuclideanDistance();
// Only statistics.f1 (the summary data set) is used below, as a broadcast set.
Tuple2<DataSet<Vector>, DataSet<BaseVectorSummary>> statistics = StatisticsHelper.summaryHelper(in, null, params.get(VectorApproxNearestNeighborTrainParams.SELECTED_COL));
return in.getDataSet().rebalance().mapPartition(new RichMapPartitionFunction<Row, Row>() {
private static final long serialVersionUID = 6654757741959479783L;
// Stage 1: build one KD-tree per partition and emit its vectors plus the serialized root.
@Override
public void mapPartition(Iterable<Row> values, Collector<Row> out) throws Exception {
BaseVectorSummary summary = (BaseVectorSummary) getRuntimeContext().getBroadcastVariable("vectorSize").get(0);
// Initial value from the broadcast summary; overwritten inside the loop below.
int vectorSize = summary.vectorSize();
List<FastDistanceVectorData> list = new ArrayList<>();
for (Row row : values) {
// NOTE(review): the meaning of the (1, 0) arguments is not visible here — presumably
// column indices within the row; confirm against prepareVectorData.
FastDistanceVectorData vector = distance.prepareVectorData(row, 1, 0);
list.add(vector);
// Each row overwrites vectorSize, so the value handed to KDTree is the size of the
// last vector seen in this partition, not the broadcast summary value.
vectorSize = vector.getVector().size();
}
// Partitions that received no rows emit nothing.
if (list.size() > 0) {
FastDistanceVectorData[] vectorArray = list.toArray(new FastDistanceVectorData[0]);
KDTree tree = new KDTree(vectorArray, vectorSize, distance);
tree.buildTree();
int taskId = getRuntimeContext().getIndexOfThisSubtask();
// A single Row instance is reused for every collect() call in this partition.
Row row = new Row(ROW_SIZE);
row.setField(TASKID_INDEX, (long) taskId);
// One output row per vector: task id, position in vectorArray, vector as string.
for (int i = 0; i < vectorArray.length; i++) {
row.setField(DATA_ID_INDEX, (long) i);
row.setField(DATA_IDNEX, vectorArray[i].toString());
out.collect(row);
}
// Final row for this partition: data fields cleared, root field set to the tree root as JSON.
row.setField(DATA_ID_INDEX, null);
row.setField(DATA_IDNEX, null);
row.setField(ROOT_IDDEX, JsonConverter.toJson(tree.getRoot()));
out.collect(row);
}
}
}).withBroadcastSet(statistics.f1, "vectorSize").mapPartition(new RichMapPartitionFunction<Row, Row>() {
private static final long serialVersionUID = 6849403933586157611L;
// Stage 2: hand the stage-1 rows to the model converter's save().
@Override
public void mapPartition(Iterable<Row> values, Collector<Row> out) throws Exception {
// meta stays null on every subtask except subtask 0.
Params meta = null;
if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
// meta aliases the captured params object, so set() below modifies that object.
meta = params;
BaseVectorSummary summary = (BaseVectorSummary) getRuntimeContext().getBroadcastVariable("vectorSize").get(0);
int vectorSize = summary.vectorSize();
meta.set(VECTOR_SIZE, vectorSize);
}
new KDTreeModelDataConverter().save(Tuple2.of(meta, values), out);
}
}).withBroadcastSet(statistics.f1, "vectorSize");
}
use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.
the class ClusterEvaluationUtilTest method getClusterStatisticsEuclideanTest.
/**
 * Feeds three dense vectors of cluster "0" through {@code calMeanAndSum} and
 * {@code getClusterStatistics} with a Euclidean distance and checks the resulting summary.
 */
@Test
public void getClusterStatisticsEuclideanTest() {
    Tuple2[] rows0 = new Tuple2[] {
        Tuple2.of(new DenseVector(new double[] { 0, 0, 0 }), "0"),
        Tuple2.of(new DenseVector(new double[] { 0.1, 0.1, 0.1 }), "0"),
        Tuple2.of(new DenseVector(new double[] { 0.2, 0.2, 0.2 }), "0")
    };
    Tuple3<String, DenseVector, DenseVector> meanAndSum = ClusterEvaluationUtil.calMeanAndSum(Arrays.asList(rows0), 3, new EuclideanDistance());
    ClusterMetricsSummary clusterMetricsSummary = ClusterEvaluationUtil.getClusterStatistics(Arrays.asList(rows0), new EuclideanDistance(), Tuple3.of("0", meanAndSum.f1, meanAndSum.f2));

    // JUnit's assertEquals takes (expected, actual[, delta]); keep that order so failure
    // messages report the right side as "expected".
    Assert.assertEquals(1, clusterMetricsSummary.k);
    Assert.assertEquals(3, clusterMetricsSummary.total);
    Assert.assertEquals("0", clusterMetricsSummary.clusterId.get(0));
    Assert.assertEquals(3, clusterMetricsSummary.clusterCnt.get(0).intValue());
    Assert.assertEquals(0.115, clusterMetricsSummary.compactness.get(0), 0.001);
    Assert.assertEquals(0.06, clusterMetricsSummary.distanceSquareSum.get(0), 0.01);
    Assert.assertEquals(0.15, clusterMetricsSummary.vectorNormL2Sum.get(0), 0.01);
    Assert.assertEquals(new DenseVector(new double[] { 0.1, 0.1, 0.1 }), clusterMetricsSummary.meanVector.get(0));
}
use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.
the class ClusterEvaluationUtilTest method calMeanAndSumEuclideanTest.
/**
 * Checks that {@code calMeanAndSum} with a Euclidean distance returns the cluster id of the
 * rows, their component-wise sum and their component-wise mean.
 */
@Test
public void calMeanAndSumEuclideanTest() {
    Tuple2[] rows0 = new Tuple2[] {
        Tuple2.of(new DenseVector(new double[] { 0, 0, 0 }), "0"),
        Tuple2.of(new DenseVector(new double[] { 0.1, 0.1, 0.1 }), "0"),
        Tuple2.of(new DenseVector(new double[] { 0.2, 0.2, 0.2 }), "0")
    };
    Tuple3<String, DenseVector, DenseVector> t = ClusterEvaluationUtil.calMeanAndSum(Arrays.asList(rows0), 3, new EuclideanDistance());

    // Previously the result was only printed, so the test could never fail on a wrong value.
    Assert.assertEquals("0", t.f0);
    Assert.assertEquals(3, t.f1.size());
    Assert.assertEquals(3, t.f2.size());
    for (int i = 0; i < 3; i++) {
        // Compare with a tolerance: 0 + 0.1 + 0.2 is not exactly 0.3 in floating point.
        Assert.assertEquals(0.1, t.f1.get(i), 1e-9);
        Assert.assertEquals(0.3, t.f2.get(i), 1e-9);
    }
}
Aggregations