Search in sources :

Example 1 with FastDistanceVectorData

use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.

the class KMeansAssignCluster method calc.

/**
 * Runs one assignment pass of the iteration: every local training sample is handed to
 * {@code KMeansUtil.updateSumMatrix} together with the current centroids, and the results are
 * collected in the {@code CENTROID_ALL_REDUCE} buffer stored in the context.
 *
 * <p>The centroids are read from {@code CENTROID1} on even step numbers and from
 * {@code CENTROID2} on odd ones. The buffer holds {@code k * (vectorSize + 1)} doubles; it is
 * created and registered on first use and zeroed before each pass. When this task has no
 * training data the method returns without touching the buffer contents and without writing
 * the closing log line.
 *
 * @param context the iteration context providing the step number, task id and shared objects.
 */
@Override
public void calc(ComContext context) {
    LOG.info("StepNo {}, TaskId {} Assign cluster begins!", context.getStepNo(), context.getTaskId());
    Integer vectorSize = context.getObj(KMeansTrainBatchOp.VECTOR_SIZE);
    Integer k = context.getObj(KMeansTrainBatchOp.K);
    // The scratch matrix (k x 1) lives outside this method and is allocated lazily on first use.
    if (distanceMatrix == null) {
        distanceMatrix = new DenseMatrix(k, 1);
    }
    // Odd steps read the second centroid slot, even steps the first one.
    Tuple2<Integer, FastDistanceMatrixData> stepNumCentroids;
    if (context.getStepNo() % 2 != 0) {
        stepNumCentroids = context.getObj(KMeansTrainBatchOp.CENTROID2);
    } else {
        stepNumCentroids = context.getObj(KMeansTrainBatchOp.CENTROID1);
    }
    // Fetch the shared accumulation buffer, registering a fresh one when none exists yet.
    double[] sumMatrixData = context.getObj(KMeansTrainBatchOp.CENTROID_ALL_REDUCE);
    if (null == sumMatrixData) {
        sumMatrixData = new double[k * (vectorSize + 1)];
        context.putObj(KMeansTrainBatchOp.CENTROID_ALL_REDUCE, sumMatrixData);
    }
    Iterable<FastDistanceVectorData> trainData = context.getObj(KMeansTrainBatchOp.TRAIN_DATA);
    if (null != trainData) {
        // Start every pass from an all-zero buffer.
        Arrays.fill(sumMatrixData, 0.0);
        FastDistanceMatrixData centroids = stepNumCentroids.f1;
        for (FastDistanceVectorData sample : trainData) {
            // Each sample is passed with weight 1.
            KMeansUtil.updateSumMatrix(sample, 1, centroids, vectorSize, sumMatrixData, k, fastDistance, distanceMatrix);
        }
        LOG.info("StepNo {}, TaskId {} Assign cluster ends!", context.getStepNo(), context.getTaskId());
    }
}
Also used : FastDistanceMatrixData(com.alibaba.alink.operator.common.distance.FastDistanceMatrixData) FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) DenseMatrix(com.alibaba.alink.common.linalg.DenseMatrix)

Example 2 with FastDistanceVectorData

use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.

the class KMeansInitCentroids method kMeansPlusPlusInit.

/**
 * Builds the DataSet plan that picks the initial centroids.
 *
 * <p>Every sample gets an id (a murmur3_128 hash of its string form), one record is taken as
 * the first center, and a bulk iteration of {@code initSteps - 1} rounds picks further centers
 * (operator name "kmeans_||_pick"). After the loop the number of samples attached to each
 * center is counted, joined with the centers and handed to {@code LocalKmeans}, which runs
 * with parallelism 1.
 *
 * @param data       the samples to choose centroids from.
 * @param k          cluster number, passed to {@code FilterNewCenter} and {@code LocalKmeans}.
 * @param initSteps  number of initialization steps; the loop runs {@code initSteps - 1} times.
 * @param distance   distance measure used by {@code CalWeight} and {@code LocalKmeans}.
 * @param vectorSize single-element data set broadcast to {@code LocalKmeans} as VECTOR_SIZE.
 * @param seed       seed for the id hash, {@code FilterNewCenter} and {@code LocalKmeans}.
 * @return the data set produced by {@code LocalKmeans}.
 */
private static DataSet<FastDistanceMatrixData> kMeansPlusPlusInit(DataSet<FastDistanceVectorData> data, final int k, final int initSteps, final FastDistance distance, DataSet<Integer> vectorSize, int seed) {
    final HashFunction hashFunc = murmur3_128(seed);
    // Tag every sample with an id derived from the hash of its string representation.
    DataSet<Tuple2<Long, FastDistanceVectorData>> dataWithId = data.map(new MapFunction<FastDistanceVectorData, Tuple2<Long, FastDistanceVectorData>>() {

        private static final long serialVersionUID = 1539229008777267709L;

        @Override
        public Tuple2<Long, FastDistanceVectorData> map(FastDistanceVectorData value) throws Exception {
            Long hashValue = hashFunc.hashUnencodedChars(value.toString()).asLong();
            return Tuple2.of(hashValue, value);
        }
    });
    // id, vectorData, nearestCenterId, nearestCenterDist, mark(center/data)
    // Initially: no nearest center (-1), maximal distance, marked as data (false).
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> dataNeighborMark = dataWithId.map(new MapFunction<Tuple2<Long, FastDistanceVectorData>, Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>>() {

        private static final long serialVersionUID = -8289894247468770813L;

        @Override
        public Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean> map(Tuple2<Long, FastDistanceVectorData> value) {
            return Tuple5.of(value.f0, value.f1, -1L, Double.MAX_VALUE, false);
        }
    }).withForwardedFields("f0;f1");
    // The record with the largest id becomes the first center.
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> centers = dataNeighborMark.maxBy(0).map(new TransformToCenter()).withForwardedFields("f0;f1");
    dataNeighborMark = dataNeighborMark.map(new CalWeight(distance)).withBroadcastSet(centers, CENTER).withForwardedFields("f0;f1;f4");
    IterativeDataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> loop = dataNeighborMark.iterate(initSteps - 1);
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> dataOnly = loop.filter(new FilterData());
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> oldCenter = loop.filter(new FilterCenter());
    // Sum of field 3 (nearestCenterDist) over all non-center records.
    DataSet<Tuple1<Double>> sumCosts = dataOnly.<Tuple1<Double>>project(3).aggregate(SUM, 0);
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> newCenter = dataOnly.partitionCustom(new Partitioner<Long>() {

        private static final long serialVersionUID = 8742959167492464159L;

        @Override
        public int partition(Long key, int numPartitions) {
            // Take the remainder first and the absolute value second: Math.abs(Long.MIN_VALUE)
            // is still negative, so abs-then-mod could return a negative partition index.
            // For every other key this yields the same partition as abs-then-mod.
            return (int) Math.abs(key % numPartitions);
        }
    }, 0).sortPartition(0, Order.DESCENDING).filter(new FilterNewCenter(k, seed)).withBroadcastSet(sumCosts, SUM_COSTS).name("kmeans_||_pick").map(new TransformToCenter()).withForwardedFields("f0;f1");
    // Re-run CalWeight on the data records against the centers picked in this round.
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> updateData = dataOnly.map(new CalWeight(distance)).withBroadcastSet(newCenter, CENTER).withForwardedFields("f0;f1;f4");
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> finalDataAndCenter = loop.closeWith(updateData.union(oldCenter));
    DataSet<Tuple2<Long, FastDistanceVectorData>> finalCenters = finalDataAndCenter.filter(new FilterCenter()).project(0, 1);
    // Count data records per nearestCenterId, then join with the centers on the id:
    // the result pairs each count (first field) with the center's vector (second field).
    DataSet<Tuple2<Long, FastDistanceVectorData>> weight = finalDataAndCenter.filter(new FilterData()).map(new MapFunction<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>, Tuple2<Long, Long>>() {

        private static final long serialVersionUID = -7230628651729304469L;

        @Override
        public Tuple2<Long, Long> map(Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean> t) {
            return Tuple2.of(t.f2, 1L);
        }
    }).withForwardedFields("f2->f0").groupBy(0).aggregate(SUM, 1).join(finalCenters).where(0).equalTo(0).projectFirst(1).projectSecond(1);
    return weight.mapPartition(new LocalKmeans(k, distance, seed)).withBroadcastSet(vectorSize, VECTOR_SIZE).setParallelism(1);
}
Also used : FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) MapFunction(org.apache.flink.api.common.functions.MapFunction) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) Tuple5(org.apache.flink.api.java.tuple.Tuple5) HashFunction(org.apache.flink.shaded.guava18.com.google.common.hash.HashFunction) Tuple1(org.apache.flink.api.java.tuple.Tuple1) Tuple2(org.apache.flink.api.java.tuple.Tuple2)

Example 3 with FastDistanceVectorData

use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.

the class KMeansModelMapper method map.

/**
 * Predicts the cluster of one input record and writes the prediction into {@code result}.
 *
 * <p>Output layout: slot 0 holds the index returned by {@code KMeansUtil.getMinPointIndex}
 * (as a long); when {@code isPredDetail} is set, slot 1 holds the string form of a vector of
 * per-cluster values derived from the distances; when {@code isPredDistance} is set, the
 * distance to the chosen cluster goes into the next free slot (2 with detail, 1 without).
 * If no input vector can be obtained, every enabled slot is set to {@code null}.
 *
 * @param selection the selected input columns of the current record.
 * @param result    the output slots to fill.
 * @throws Exception declared by the overridden method.
 */
@Override
protected void map(SlicedSelectedSample selection, SlicedResult result) throws Exception {
    Vector record;
    if (colIdx.length <= 1) {
        // Single selected column: parse it as a vector.
        record = VectorUtil.getVector(selection.get(colIdx[0]));
    } else {
        // Several selected columns: the first two numeric values form a 2-d vector.
        record = new DenseVector(2);
        record.set(0, ((Number) selection.get(colIdx[0])).doubleValue());
        record.set(1, ((Number) selection.get(colIdx[1])).doubleValue());
    }
    // The distance column comes right after the detail column when the latter is emitted.
    final int distanceSlot = isPredDetail ? 2 : 1;
    if (record == null) {
        result.set(0, null);
        if (isPredDetail) {
            result.set(1, null);
        }
        if (isPredDistance) {
            result.set(distanceSlot, null);
        }
        return;
    }
    final int clusterCount = this.modelData.params.k;
    DenseMatrix distanceMatrix = new DenseMatrix(clusterCount, 1);
    FastDistanceVectorData vectorData = distance.prepareVectorData(Tuple2.of(record, null));
    double[] clusterDistances = KMeansUtil.getClusterDistances(vectorData, this.modelData.centroids, distance, distanceMatrix);
    int index = KMeansUtil.getMinPointIndex(clusterDistances, clusterCount);
    result.set(0, (long) index);
    if (isPredDetail) {
        double[] probs = KMeansUtil.getProbArrayFromDistanceArray(clusterDistances);
        DenseVector vec = new DenseVector(probs.length);
        // Place each value at the position given by the cluster id of centroid i.
        for (int i = 0; i < clusterCount; i++) {
            vec.set((int) this.modelData.getClusterId(i), probs[i]);
        }
        result.set(1, vec.toString());
    }
    if (isPredDistance) {
        result.set(distanceSlot, clusterDistances[index]);
    }
}
Also used : FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) Vector(com.alibaba.alink.common.linalg.Vector) DenseVector(com.alibaba.alink.common.linalg.DenseVector) DenseVector(com.alibaba.alink.common.linalg.DenseVector) DenseMatrix(com.alibaba.alink.common.linalg.DenseMatrix)

Example 4 with FastDistanceVectorData

use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.

the class LocalKmeansFunc method kmeans.

/**
 * Run K-means++ on the weighted samples. First do the K-means++ initialization and then runs Lloyd's algorithm.
 *
 * <p>The loop stops when a full pass changes no sample's cluster assignment or after
 * {@code LOCAL_MAX_ITER} passes. A cluster whose accumulated weight is not positive is
 * re-seeded with a randomly chosen sample.
 *
 * @param k             cluster number.
 * @param sampleWeights weight of each sample; {@code sampleWeights[i]} belongs to {@code samples[i]}.
 * @param samples       initial weighted samples.
 * @param distance      distance measure.
 * @param vectorSize    the size of vectors.
 * @param seed          seed of the random generator used for the initial centroid sampling and
 *                      for re-seeding empty clusters.
 * @return the result of kmeans.
 */
static FastDistanceMatrixData kmeans(int k, long[] sampleWeights, FastDistanceVectorData[] samples, FastDistance distance, int vectorSize, int seed) {
    Random random = new Random(seed);
    List<FastDistanceVectorData> initCentroidsList = sampleInitialCentroids(k, sampleWeights, samples, distance, random);
    FastDistanceMatrixData initCentroids = KMeansUtil.buildCentroidsMatrix(initCentroidsList, distance, vectorSize);
    boolean converge = false;
    int iteration = 0;
    // One column per cluster: vectorSize coordinate sums followed by the accumulated weight.
    DenseMatrix sumMatrix = new DenseMatrix(vectorSize + 1, k);
    DenseMatrix distanceMatrix = new DenseMatrix(k, 1);
    double[] sumMatrixData = sumMatrix.getData();
    double[] initCentroidsData = initCentroids.getVectors().getData();
    int[] indices = new int[samples.length];
    while (!converge && iteration < LOCAL_MAX_ITER) {
        iteration++;
        converge = true;
        // Start each pass from an empty accumulator so that the new centroids are computed from
        // this pass's assignments only (presumably updateSumMatrix adds into the buffer; the
        // distributed assignment step zeroes its buffer the same way before calling it).
        Arrays.fill(sumMatrixData, 0.0);
        for (int i = 0; i < samples.length; i++) {
            int clusterId = KMeansUtil.updateSumMatrix(samples[i], sampleWeights[i], initCentroids, vectorSize, sumMatrixData, k, distance, distanceMatrix);
            if (clusterId != indices[i]) {
                indices[i] = clusterId;
                converge = false;
            }
        }
        Arrays.fill(initCentroidsData, 0.0);
        for (int i = 0; i < k; i++) {
            int initCentroidsStartIndex = i * vectorSize;
            // Columns of the sum matrix are one element longer (the trailing weight), hence "+ i".
            int sumMatrixStartIndex = initCentroidsStartIndex + i;
            double weight = sumMatrixData[sumMatrixStartIndex + vectorSize];
            if (weight > 0) {
                // New centroid = coordinate sums scaled by 1 / weight.
                BLAS.axpy(vectorSize, 1.0 / weight, sumMatrixData, sumMatrixStartIndex, initCentroidsData, initCentroidsStartIndex);
            } else {
                // Nothing was assigned to this cluster: replace it with a random sample.
                int index = random.nextInt(samples.length);
                MatVecOp.appendVectorToMatrix(initCentroids.getVectors(), false, i, samples[index].getVector());
            }
        }
        // Refresh the labels once, after all centroids of this pass have been rewritten
        // (assumes updateLabel only derives its result from the centroid matrix).
        distance.updateLabel(initCentroids);
    }
    if (iteration != LOCAL_MAX_ITER) {
        LOG.info("Local kmeans converge with {} steps.", iteration);
    } else {
        LOG.info("Local kmeans reach max iteration number!");
    }
    return initCentroids;
}
Also used : FastDistanceMatrixData(com.alibaba.alink.operator.common.distance.FastDistanceMatrixData) Random(java.util.Random) FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) DenseMatrix(com.alibaba.alink.common.linalg.DenseMatrix)

Example 5 with FastDistanceVectorData

use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.

the class GeoKMeansTrainBatchOp method linkFrom.

/**
 * Trains a k-means model on latitude/longitude columns using {@link HaversineDistance}.
 *
 * <p>The two configured columns are read as numbers, turned into 2-dimensional vectors,
 * initial centroids are computed, the iteration is run, and the resulting rows are set as
 * this operator's output with the schema of {@link KMeansModelDataConverter}.
 *
 * @param inputs the upstream operators; the first one supplies the training data.
 * @return this operator.
 */
@Override
public GeoKMeansTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> source = checkAndGetFirst(inputs);
    final String latCol = this.getLatitudeCol();
    final String lonCol = this.getLongitudeCol();
    FastDistance distance = new HaversineDistance();
    final int iterationLimit = this.getMaxIter();
    final double tolerance = this.getEpsilon();
    // Converts a (latitude, longitude) row into the vector data used by the distance measure.
    MapFunction<Row, FastDistanceVectorData> toVectorData = new MapFunction<Row, FastDistanceVectorData>() {

        private static final long serialVersionUID = -5236022856006527961L;

        @Override
        public FastDistanceVectorData map(Row row) {
            double latitude = ((Number) row.getField(0)).doubleValue();
            double longitude = ((Number) row.getField(1)).doubleValue();
            Vector vec = new DenseVector(new double[] { latitude, longitude });
            return distance.prepareVectorData(Row.of(vec), 0);
        }
    };
    DataSet<FastDistanceVectorData> data = source.select(new String[] { latCol, lonCol }).getDataSet().rebalance().map(toVectorData);
    // The vectors always have two components.
    DataSet<Integer> vectorSize = MLEnvironmentFactory.get(getMLEnvironmentId()).getExecutionEnvironment().fromElements(2);
    DataSet<FastDistanceMatrixData> initCentroid = initKmeansCentroids(data, distance, this.getParams(), vectorSize, getRandomSeed());
    DataSet<Row> finalCentroid = iterateICQ(initCentroid, data, vectorSize, iterationLimit, tolerance, distance, HasKMeansWithHaversineDistanceType.DistanceType.HAVERSINE, null, this.getLatitudeCol(), this.getLongitudeCol());
    // Publish the clustering model as this operator's output table.
    this.setOutput(finalCentroid, new KMeansModelDataConverter().getModelSchema());
    return this;
}
Also used : FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) KMeansModelDataConverter(com.alibaba.alink.operator.common.clustering.kmeans.KMeansModelDataConverter) HaversineDistance(com.alibaba.alink.operator.common.distance.HaversineDistance) FastDistanceMatrixData(com.alibaba.alink.operator.common.distance.FastDistanceMatrixData) FastDistance(com.alibaba.alink.operator.common.distance.FastDistance) Row(org.apache.flink.types.Row) Vector(com.alibaba.alink.common.linalg.Vector) DenseVector(com.alibaba.alink.common.linalg.DenseVector) DenseVector(com.alibaba.alink.common.linalg.DenseVector)

Aggregations

FastDistanceVectorData (com.alibaba.alink.operator.common.distance.FastDistanceVectorData)23 DenseVector (com.alibaba.alink.common.linalg.DenseVector)12 Test (org.junit.Test)12 DenseMatrix (com.alibaba.alink.common.linalg.DenseMatrix)9 Vector (com.alibaba.alink.common.linalg.Vector)9 EuclideanDistance (com.alibaba.alink.operator.common.distance.EuclideanDistance)9 FastDistanceMatrixData (com.alibaba.alink.operator.common.distance.FastDistanceMatrixData)8 ArrayList (java.util.ArrayList)8 Row (org.apache.flink.types.Row)6 SparseVector (com.alibaba.alink.common.linalg.SparseVector)5 FastDistance (com.alibaba.alink.operator.common.distance.FastDistance)3 KDTree (com.alibaba.alink.operator.common.similarity.KDTree)3 KMeansModelDataConverter (com.alibaba.alink.operator.common.clustering.kmeans.KMeansModelDataConverter)2 BaseVectorSummary (com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary)2 RichMapFunction (org.apache.flink.api.common.functions.RichMapFunction)2 RichMapPartitionFunction (org.apache.flink.api.common.functions.RichMapPartitionFunction)2 DataSet (org.apache.flink.api.java.DataSet)2 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)2 Params (org.apache.flink.ml.api.misc.param.Params)2 Collector (org.apache.flink.util.Collector)2