Search in sources :

Example 1 with EuclideanDistance

use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.

the class ClusterEvaluationUtil method calMeanAndSum.

/**
 * Sums the vectors of the given rows and derives their mean.
 *
 * <p>For any distance that is not a {@code EuclideanDistance}, each vector is first scaled by the
 * reciprocal of its L2 norm via {@code scaleEqual} before it is added to the sum. When the distance
 * is a {@code CosineDistance}, the mean is additionally scaled by the reciprocal of its own L2 norm.
 *
 * <p>NOTE(review): {@code scaleEqual} is called on the vector taken from the input tuple, so the
 * caller's vectors are presumably modified in place for non-Euclidean distances — confirm.
 * NOTE(review): no guard exists for an empty {@code rows} (division by a zero count) or for a
 * zero-norm vector.
 *
 * @param rows       tuples of (vector, cluster id)
 * @param vectorSize dimension used to create the zero-initialised sum vector
 * @param distance   distance measure that selects the normalisation behaviour
 * @return tuple of (first non-null cluster id seen, mean vector, sum vector)
 */
public static Tuple3<String, DenseVector, DenseVector> calMeanAndSum(Iterable<Tuple2<Vector, String>> rows, int vectorSize, FastDistance distance) {
    final boolean euclidean = distance instanceof EuclideanDistance;
    DenseVector accumulated = DenseVector.zeros(vectorSize);
    String id = null;
    int count = 0;
    for (Tuple2<Vector, String> entry : rows) {
        // Keep the first non-null cluster id encountered.
        if (id == null) {
            id = entry.f1;
        }
        Vector current = entry.f0;
        // Non-Euclidean distances accumulate unit-length vectors.
        if (!euclidean) {
            current.scaleEqual(1.0 / current.normL2());
        }
        accumulated.plusEqual(current);
        count++;
    }
    DenseVector mean = accumulated.scale(1.0 / count);
    if (distance instanceof CosineDistance) {
        mean.scaleEqual(1.0 / mean.normL2());
    }
    return Tuple3.of(id, mean, accumulated);
}
Also used : EuclideanDistance(com.alibaba.alink.operator.common.distance.EuclideanDistance) Vector(com.alibaba.alink.common.linalg.Vector) DenseVector(com.alibaba.alink.common.linalg.DenseVector) DenseVector(com.alibaba.alink.common.linalg.DenseVector) CosineDistance(com.alibaba.alink.operator.common.distance.CosineDistance)

Example 2 with EuclideanDistance

use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.

the class LocalKmeansFuncTest method kmeansSparseTest.

/**
 * Runs {@code kmeans} on ten two-entry sparse vectors with Euclidean distance and checks the number
 * of returned centroid columns and the L2 norm of each centroid.
 */
@Test
public void kmeansSparseTest() {
    int len = 10;
    int k = 2;
    int size = 20;
    EuclideanDistance distance = new EuclideanDistance();
    long[] sampleWeights = new long[len];
    FastDistanceVectorData[] samples = new FastDistanceVectorData[len];
    // Bound the loop by len (which also sizes both arrays) instead of repeating the literal.
    for (int i = 0; i < len; i++) {
        sampleWeights[i] = i;
        samples[i] = distance.prepareVectorData(Tuple2.of(new SparseVector(size, new int[] { i, i + 1 }, new double[] { i, i }), null));
    }
    FastDistanceMatrixData initCentroid = kmeans(k, sampleWeights, samples, distance, size, 0);
    DenseMatrix initCentroidData = initCentroid.getVectors();
    // JUnit expects (expected, actual); this order keeps failure messages accurate.
    Assert.assertEquals(k, initCentroidData.numCols());
    Assert.assertEquals(8.615, new DenseVector(initCentroidData.getColumn(0)).normL2(), 0.001);
    Assert.assertEquals(4.128, new DenseVector(initCentroidData.getColumn(1)).normL2(), 0.001);
}
Also used : FastDistanceMatrixData(com.alibaba.alink.operator.common.distance.FastDistanceMatrixData) EuclideanDistance(com.alibaba.alink.operator.common.distance.EuclideanDistance) FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) SparseVector(com.alibaba.alink.common.linalg.SparseVector) DenseVector(com.alibaba.alink.common.linalg.DenseVector) DenseMatrix(com.alibaba.alink.common.linalg.DenseMatrix) Test(org.junit.Test)

Example 3 with EuclideanDistance

use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.

the class KDTreeModelDataConverter method buildIndex.

/**
 * Builds the KD-tree index rows for the given input operator.
 *
 * <p>The METRIC in {@code params} must equal EUCLIDEAN, otherwise the argument is rejected by
 * {@code Preconditions.checkArgument}. The input data set is rebalanced, one {@code KDTree} is built
 * per non-empty partition in a first mapPartition stage, and the emitted rows are handed to
 * {@code KDTreeModelDataConverter.save} in a second mapPartition stage.
 *
 * @param in     operator supplying the rows to index
 * @param params training parameters; METRIC and SELECTED_COL are read here
 * @return the data set produced by the second mapPartition stage
 */
@Override
public DataSet<Row> buildIndex(BatchOperator in, Params params) {
    Preconditions.checkArgument(params.get(VectorApproxNearestNeighborTrainParams.METRIC).equals(VectorApproxNearestNeighborTrainParams.Metric.EUCLIDEAN), "KDTree solver only supports Euclidean distance!");
    EuclideanDistance distance = new EuclideanDistance();
    // statistics.f1 (the vector summary) is broadcast to both stages below under the name "vectorSize".
    Tuple2<DataSet<Vector>, DataSet<BaseVectorSummary>> statistics = StatisticsHelper.summaryHelper(in, null, params.get(VectorApproxNearestNeighborTrainParams.SELECTED_COL));
    return in.getDataSet().rebalance().mapPartition(new RichMapPartitionFunction<Row, Row>() {

        private static final long serialVersionUID = 6654757741959479783L;

        // Stage 1: build one KDTree from the rows of this partition and emit its data rows plus a root row.
        @Override
        public void mapPartition(Iterable<Row> values, Collector<Row> out) throws Exception {
            BaseVectorSummary summary = (BaseVectorSummary) getRuntimeContext().getBroadcastVariable("vectorSize").get(0);
            int vectorSize = summary.vectorSize();
            List<FastDistanceVectorData> list = new ArrayList<>();
            for (Row row : values) {
                // NOTE(review): the meaning of the (1, 0) arguments is defined by prepareVectorData — presumably column indices; confirm.
                FastDistanceVectorData vector = distance.prepareVectorData(row, 1, 0);
                list.add(vector);
                // NOTE(review): this overwrites the broadcast value on every row, so for a non-empty
                // partition the size of the last vector is what reaches KDTree — confirm this is intended.
                vectorSize = vector.getVector().size();
            }
            // Empty partitions emit nothing.
            if (list.size() > 0) {
                FastDistanceVectorData[] vectorArray = list.toArray(new FastDistanceVectorData[0]);
                KDTree tree = new KDTree(vectorArray, vectorSize, distance);
                tree.buildTree();
                int taskId = getRuntimeContext().getIndexOfThisSubtask();
                // A single Row instance is reused for every collect call; the task id is set once.
                Row row = new Row(ROW_SIZE);
                row.setField(TASKID_INDEX, (long) taskId);
                // One row per vector: its position in vectorArray and its string form.
                for (int i = 0; i < vectorArray.length; i++) {
                    row.setField(DATA_ID_INDEX, (long) i);
                    row.setField(DATA_IDNEX, vectorArray[i].toString());
                    out.collect(row);
                }
                // Final row of the partition: data fields cleared, JSON of the tree root attached.
                row.setField(DATA_ID_INDEX, null);
                row.setField(DATA_IDNEX, null);
                row.setField(ROOT_IDDEX, JsonConverter.toJson(tree.getRoot()));
                out.collect(row);
            }
        }
    }).withBroadcastSet(statistics.f1, "vectorSize").mapPartition(new RichMapPartitionFunction<Row, Row>() {

        private static final long serialVersionUID = 6849403933586157611L;

        // Stage 2: pass the stage-1 rows to the converter's save; only subtask 0 supplies a non-null meta.
        @Override
        public void mapPartition(Iterable<Row> values, Collector<Row> out) throws Exception {
            Params meta = null;
            if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
                // meta aliases the captured params object, so the set call below is applied to it directly.
                meta = params;
                BaseVectorSummary summary = (BaseVectorSummary) getRuntimeContext().getBroadcastVariable("vectorSize").get(0);
                int vectorSize = summary.vectorSize();
                meta.set(VECTOR_SIZE, vectorSize);
            }
            new KDTreeModelDataConverter().save(Tuple2.of(meta, values), out);
        }
    }).withBroadcastSet(statistics.f1, "vectorSize");
}
Also used : DataSet(org.apache.flink.api.java.DataSet) RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) ArrayList(java.util.ArrayList) VectorApproxNearestNeighborTrainParams(com.alibaba.alink.params.similarity.VectorApproxNearestNeighborTrainParams) Params(org.apache.flink.ml.api.misc.param.Params) FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) EuclideanDistance(com.alibaba.alink.operator.common.distance.EuclideanDistance) KDTree(com.alibaba.alink.operator.common.similarity.KDTree) BaseVectorSummary(com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary) Collector(org.apache.flink.util.Collector) Row(org.apache.flink.types.Row)

Example 4 with EuclideanDistance

use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.

the class ClusterEvaluationUtilTest method getClusterStatisticsEuclideanTest.

/**
 * Computes cluster statistics for three 3-dimensional points in a single cluster "0" under
 * Euclidean distance and checks every field of the resulting summary.
 */
@Test
public void getClusterStatisticsEuclideanTest() {
    Tuple2[] rows0 = new Tuple2[] { Tuple2.of(new DenseVector(new double[] { 0, 0, 0 }), "0"), Tuple2.of(new DenseVector(new double[] { 0.1, 0.1, 0.1 }), "0"), Tuple2.of(new DenseVector(new double[] { 0.2, 0.2, 0.2 }), "0") };
    Tuple3<String, DenseVector, DenseVector> meanAndSum = ClusterEvaluationUtil.calMeanAndSum(Arrays.asList(rows0), 3, new EuclideanDistance());
    ClusterMetricsSummary clusterMetricsSummary = ClusterEvaluationUtil.getClusterStatistics(Arrays.asList(rows0), new EuclideanDistance(), Tuple3.of("0", meanAndSum.f1, meanAndSum.f2));
    // JUnit expects (expected, actual); this order keeps failure messages accurate.
    Assert.assertEquals(1, clusterMetricsSummary.k);
    Assert.assertEquals(3, clusterMetricsSummary.total);
    Assert.assertEquals("0", clusterMetricsSummary.clusterId.get(0));
    Assert.assertEquals(3, clusterMetricsSummary.clusterCnt.get(0).intValue());
    Assert.assertEquals(0.115, clusterMetricsSummary.compactness.get(0), 0.001);
    Assert.assertEquals(0.06, clusterMetricsSummary.distanceSquareSum.get(0), 0.01);
    Assert.assertEquals(0.15, clusterMetricsSummary.vectorNormL2Sum.get(0), 0.01);
    Assert.assertEquals(new DenseVector(new double[] { 0.1, 0.1, 0.1 }), clusterMetricsSummary.meanVector.get(0));
}
Also used : EuclideanDistance(com.alibaba.alink.operator.common.distance.EuclideanDistance) Tuple2(org.apache.flink.api.java.tuple.Tuple2) DenseVector(com.alibaba.alink.common.linalg.DenseVector) Test(org.junit.Test)

Example 5 with EuclideanDistance

use of com.alibaba.alink.operator.common.distance.EuclideanDistance in project Alink by alibaba.

the class ClusterEvaluationUtilTest method calMeanAndSumEuclideanTest.

/**
 * Checks the cluster id, mean vector and sum vector returned by
 * {@code ClusterEvaluationUtil.calMeanAndSum} for three points under Euclidean distance.
 */
@Test
public void calMeanAndSumEuclideanTest() {
    Tuple2[] rows0 = new Tuple2[] { Tuple2.of(new DenseVector(new double[] { 0, 0, 0 }), "0"), Tuple2.of(new DenseVector(new double[] { 0.1, 0.1, 0.1 }), "0"), Tuple2.of(new DenseVector(new double[] { 0.2, 0.2, 0.2 }), "0") };
    Tuple3<String, DenseVector, DenseVector> t = ClusterEvaluationUtil.calMeanAndSum(Arrays.asList(rows0), 3, new EuclideanDistance());
    // Assert on the result instead of printing it, so a wrong value fails the test.
    Assert.assertEquals("0", t.f0);
    // Compare with a tolerance: summing 0.1 and 0.2 is not exact in floating point.
    Assert.assertArrayEquals(new double[] { 0.1, 0.1, 0.1 }, t.f1.getData(), 1e-9);
    Assert.assertArrayEquals(new double[] { 0.3, 0.3, 0.3 }, t.f2.getData(), 1e-9);
}
Also used : EuclideanDistance(com.alibaba.alink.operator.common.distance.EuclideanDistance) Tuple2(org.apache.flink.api.java.tuple.Tuple2) DenseVector(com.alibaba.alink.common.linalg.DenseVector) Test(org.junit.Test)

Aggregations

EuclideanDistance (com.alibaba.alink.operator.common.distance.EuclideanDistance)15 DenseVector (com.alibaba.alink.common.linalg.DenseVector)12 Test (org.junit.Test)11 FastDistanceVectorData (com.alibaba.alink.operator.common.distance.FastDistanceVectorData)9 Vector (com.alibaba.alink.common.linalg.Vector)6 ArrayList (java.util.ArrayList)4 Row (org.apache.flink.types.Row)4 FastDistanceMatrixData (com.alibaba.alink.operator.common.distance.FastDistanceMatrixData)3 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)3 DenseMatrix (com.alibaba.alink.common.linalg.DenseMatrix)2 CosineDistance (com.alibaba.alink.operator.common.distance.CosineDistance)2 KDTree (com.alibaba.alink.operator.common.similarity.KDTree)2 SparseVector (com.alibaba.alink.common.linalg.SparseVector)1 KDTreeModelData (com.alibaba.alink.operator.common.similarity.modeldata.KDTreeModelData)1 BaseVectorSummary (com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary)1 VectorApproxNearestNeighborTrainParams (com.alibaba.alink.params.similarity.VectorApproxNearestNeighborTrainParams)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 TreeMap (java.util.TreeMap)1 RichMapPartitionFunction (org.apache.flink.api.common.functions.RichMapPartitionFunction)1