Search in sources :

Example 1 with FastDistanceMatrixData

use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.

the class KMeansAssignCluster method calc.

/**
 * Assigns every local training sample to a centroid and accumulates the result into the
 * buffer stored under {@code KMeansTrainBatchOp.CENTROID_ALL_REDUCE}.
 *
 * <p>The centroids are read from {@code CENTROID1} on even steps and from {@code CENTROID2}
 * on odd steps. The accumulation buffer has {@code k * (vectorSize + 1)} entries and is
 * allocated on first use.
 *
 * @param context the iteration context holding the shared objects of this task.
 */
@Override
public void calc(ComContext context) {
    LOG.info("StepNo {}, TaskId {} Assign cluster begins!", context.getStepNo(), context.getTaskId());
    Integer vectorSize = context.getObj(KMeansTrainBatchOp.VECTOR_SIZE);
    Integer k = context.getObj(KMeansTrainBatchOp.K);
    // get iterative coefficient from static memory.
    Tuple2<Integer, FastDistanceMatrixData> stepNumCentroids;
    if (context.getStepNo() % 2 == 0) {
        stepNumCentroids = context.getObj(KMeansTrainBatchOp.CENTROID1);
    } else {
        stepNumCentroids = context.getObj(KMeansTrainBatchOp.CENTROID2);
    }
    // The distance buffer is a field that is created once and reused by later calls.
    if (null == distanceMatrix) {
        distanceMatrix = new DenseMatrix(k, 1);
    }
    double[] sumMatrixData = context.getObj(KMeansTrainBatchOp.CENTROID_ALL_REDUCE);
    if (sumMatrixData == null) {
        sumMatrixData = new double[k * (vectorSize + 1)];
        context.putObj(KMeansTrainBatchOp.CENTROID_ALL_REDUCE, sumMatrixData);
    }
    // Reset before the early return below: the buffer comes from the context and may still
    // hold values from an earlier step, so a task without samples must contribute zeros
    // instead of those stale values.
    Arrays.fill(sumMatrixData, 0.0);
    Iterable<FastDistanceVectorData> trainData = context.getObj(KMeansTrainBatchOp.TRAIN_DATA);
    if (trainData == null) {
        return;
    }
    for (FastDistanceVectorData sample : trainData) {
        // Every sample is passed with weight 1.
        KMeansUtil.updateSumMatrix(sample, 1, stepNumCentroids.f1, vectorSize, sumMatrixData, k, fastDistance, distanceMatrix);
    }
    LOG.info("StepNo {}, TaskId {} Assign cluster ends!", context.getStepNo(), context.getTaskId());
}
Also used : FastDistanceMatrixData(com.alibaba.alink.operator.common.distance.FastDistanceMatrixData) FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) DenseMatrix(com.alibaba.alink.common.linalg.DenseMatrix)

Example 2 with FastDistanceMatrixData

use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.

the class KMeansPreallocateCentroid method calc.

/**
 * Publishes the initial centroids and the derived sizes into the context.
 *
 * <p>Only step 1 does any work. It stores the vector size, two centroid tuples
 * ({@code CENTROID1} holding the initial centroid and {@code CENTROID2} holding a copy of it)
 * and the number of clusters.
 *
 * @param context the iteration context holding the shared objects of this task.
 */
@Override
public void calc(ComContext context) {
    // Nothing to prepare after the first step.
    if (context.getStepNo() != 1) {
        return;
    }

    List<FastDistanceMatrixData> seeds = context.getObj(KMeansTrainBatchOp.INIT_CENTROID);
    List<Integer> statistics = context.getObj(KMeansTrainBatchOp.KMEANS_STATISTICS);

    // The first statistics entry is used as the vector size.
    Integer dimension = statistics.get(0);
    context.putObj(KMeansTrainBatchOp.VECTOR_SIZE, dimension);

    FastDistanceMatrixData firstCentroid = seeds.get(0);
    boolean sizeMatches = firstCentroid.getVectors().numRows() == dimension;
    Preconditions.checkArgument(sizeMatches, "Init centroid error, size not equal!");

    // One column per centroid, so the column count is the number of clusters.
    int clusterCount = firstCentroid.getVectors().numCols();
    LOG.info("Init centroids, initial centroid size {}", clusterCount);

    context.putObj(KMeansTrainBatchOp.CENTROID1, Tuple2.of(context.getStepNo() - 1, firstCentroid));
    FastDistanceMatrixData centroidCopy = new FastDistanceMatrixData(firstCentroid);
    context.putObj(KMeansTrainBatchOp.CENTROID2, Tuple2.of(context.getStepNo() - 1, centroidCopy));
    context.putObj(KMeansTrainBatchOp.K, clusterCount);
}
Also used : FastDistanceMatrixData(com.alibaba.alink.operator.common.distance.FastDistanceMatrixData)

Example 3 with FastDistanceMatrixData

use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.

the class KMeansUtil method transformTrainDataToPredictData.

/**
 * Converts the model produced by training into the layout used for prediction.
 *
 * <p>Each cluster vector is written into one column of a {@code vectorSize x k} matrix, and
 * the cluster id and weight of that cluster are kept in the row with the same index.
 *
 * @param trainModelData the trained model.
 * @return the prediction model sharing the params of {@code trainModelData}.
 */
public static KMeansPredictModelData transformTrainDataToPredictData(KMeansTrainModelData trainModelData) {
    KMeansPredictModelData predictData = new KMeansPredictModelData();
    predictData.params = trainModelData.params;

    DenseMatrix centroidMatrix = new DenseMatrix(trainModelData.params.vectorSize, trainModelData.params.k);
    Row[] clusterInfo = new Row[trainModelData.params.k];

    final int centroidCount = trainModelData.centroids.size();
    for (int col = 0; col < centroidCount; col++) {
        // Column "col" of the matrix and entry "col" of clusterInfo describe the same cluster.
        MatVecOp.appendVectorToMatrix(centroidMatrix, false, col, trainModelData.getClusterVector(col));
        clusterInfo[col] = Row.of(trainModelData.getClusterId(col), trainModelData.getClusterWeight(col));
    }

    predictData.centroids = new FastDistanceMatrixData(centroidMatrix, clusterInfo);
    // Let the configured distance refresh the label of the freshly built centroids.
    predictData.params.distanceType.getFastDistance().updateLabel(predictData.centroids);
    return predictData;
}
Also used : FastDistanceMatrixData(com.alibaba.alink.operator.common.distance.FastDistanceMatrixData) Row(org.apache.flink.types.Row) DenseMatrix(com.alibaba.alink.common.linalg.DenseMatrix)

Example 4 with FastDistanceMatrixData

use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.

the class LocalKmeansFunc method kmeans.

/**
 * Run K-means++ on the weighted samples. First do the K-means++ initialization and then runs Lloyd's algorithm
 * until no sample changes its cluster or {@code LOCAL_MAX_ITER} passes have been made.
 *
 * @param k             cluster number.
 * @param sampleWeights weights of the samples; {@code sampleWeights[i]} belongs to {@code samples[i]}.
 * @param samples       the samples to cluster.
 * @param distance      distance measure.
 * @param vectorSize    the size of vectors.
 * @param seed          seed of the random generator used for the initialization and for re-seeding empty
 *                      clusters.
 * @return the result of kmeans.
 */
static FastDistanceMatrixData kmeans(int k, long[] sampleWeights, FastDistanceVectorData[] samples, FastDistance distance, int vectorSize, int seed) {
    Random random = new Random(seed);
    List<FastDistanceVectorData> initCentroidsList = sampleInitialCentroids(k, sampleWeights, samples, distance, random);
    FastDistanceMatrixData initCentroids = KMeansUtil.buildCentroidsMatrix(initCentroidsList, distance, vectorSize);
    boolean converge = false;
    int iteration = 0;
    // Per cluster: vectorSize sum entries followed by one weight entry.
    DenseMatrix sumMatrix = new DenseMatrix(vectorSize + 1, k);
    DenseMatrix distanceMatrix = new DenseMatrix(k, 1);
    double[] sumMatrixData = sumMatrix.getData();
    double[] initCentroidsData = initCentroids.getVectors().getData();
    int[] indices = new int[samples.length];
    while (!converge && iteration < LOCAL_MAX_ITER) {
        iteration++;
        converge = true;
        // Start every pass from zero so that sums and weights of earlier passes do not leak
        // into the new centroids.
        // NOTE(review): assumes updateSumMatrix adds into sumMatrixData without clearing it.
        Arrays.fill(sumMatrixData, 0.0);
        for (int i = 0; i < samples.length; i++) {
            int clusterId = KMeansUtil.updateSumMatrix(samples[i], sampleWeights[i], initCentroids, vectorSize, sumMatrixData, k, distance, distanceMatrix);
            if (clusterId != indices[i]) {
                indices[i] = clusterId;
                converge = false;
            }
        }
        Arrays.fill(initCentroidsData, 0.0);
        for (int i = 0; i < k; i++) {
            int initCentroidsStartIndex = i * vectorSize;
            // Equals i * (vectorSize + 1): the start of cluster i in the sum buffer.
            int sumMatrixStartIndex = initCentroidsStartIndex + i;
            double weight = sumMatrixData[sumMatrixStartIndex + vectorSize];
            if (weight > 0) {
                // New centroid = weighted sum / total weight.
                BLAS.axpy(vectorSize, 1.0 / weight, sumMatrixData, sumMatrixStartIndex, initCentroidsData, initCentroidsStartIndex);
            } else {
                // Empty cluster: replace its centroid with a randomly picked sample.
                int index = random.nextInt(samples.length);
                MatVecOp.appendVectorToMatrix(initCentroids.getVectors(), false, i, samples[index].getVector());
            }
        }
        // Refresh the labels once, after all centroids have been rewritten; nothing inside
        // the loop above reads them.
        distance.updateLabel(initCentroids);
    }
    // Decide on the flag itself so that converging exactly on the last allowed pass is still
    // reported as convergence.
    if (converge) {
        LOG.info("Local kmeans converge with {} steps.", iteration);
    } else {
        LOG.info("Local kmeans reach max iteration number!");
    }
    return initCentroids;
}
Also used : FastDistanceMatrixData(com.alibaba.alink.operator.common.distance.FastDistanceMatrixData) Random(java.util.Random) FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) DenseMatrix(com.alibaba.alink.common.linalg.DenseMatrix)

Example 5 with FastDistanceMatrixData

use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.

the class GeoKMeansTrainBatchOp method linkFrom.

/**
 * Wires the geographic K-means training job onto the first input operator.
 *
 * <p>The latitude and longitude columns are turned into two-dimensional vectors prepared for
 * the haversine distance, initial centroids are derived from them, and the iteration result is
 * set as the output of this operator using the K-means model schema.
 *
 * @param inputs the input operators; only the first one is used.
 * @return this operator.
 */
@Override
public GeoKMeansTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> source = checkAndGetFirst(inputs);
    final String latitudeColName = getLatitudeCol();
    final String longitudeColName = getLongitudeCol();
    final FastDistance distance = new HaversineDistance();
    final int maxIter = getMaxIter();
    final double tol = getEpsilon();

    // Keep only the two coordinate columns, latitude first.
    String[] coordinateCols = { latitudeColName, longitudeColName };
    DataSet<Row> coordinates = source.select(coordinateCols).getDataSet();

    DataSet<FastDistanceVectorData> data = coordinates.rebalance().map(new MapFunction<Row, FastDistanceVectorData>() {

        private static final long serialVersionUID = -5236022856006527961L;

        @Override
        public FastDistanceVectorData map(Row row) {
            double latitude = ((Number) row.getField(0)).doubleValue();
            double longitude = ((Number) row.getField(1)).doubleValue();
            Vector point = new DenseVector(new double[] { latitude, longitude });
            return distance.prepareVectorData(Row.of(point), 0);
        }
    });

    // Every sample is a (latitude, longitude) pair, so the vector size is the constant 2.
    DataSet<Integer> vectorSize = MLEnvironmentFactory.get(getMLEnvironmentId()).getExecutionEnvironment().fromElements(2);

    DataSet<FastDistanceMatrixData> initCentroid = initKmeansCentroids(data, distance, getParams(), vectorSize, getRandomSeed());
    DataSet<Row> finalCentroid = iterateICQ(initCentroid, data, vectorSize, maxIter, tol, distance, HasKMeansWithHaversineDistanceType.DistanceType.HAVERSINE, null, getLatitudeCol(), getLongitudeCol());

    // store the clustering model to the table
    setOutput(finalCentroid, new KMeansModelDataConverter().getModelSchema());
    return this;
}
Also used : FastDistanceVectorData(com.alibaba.alink.operator.common.distance.FastDistanceVectorData) KMeansModelDataConverter(com.alibaba.alink.operator.common.clustering.kmeans.KMeansModelDataConverter) HaversineDistance(com.alibaba.alink.operator.common.distance.HaversineDistance) FastDistanceMatrixData(com.alibaba.alink.operator.common.distance.FastDistanceMatrixData) FastDistance(com.alibaba.alink.operator.common.distance.FastDistance) Row(org.apache.flink.types.Row) Vector(com.alibaba.alink.common.linalg.Vector) DenseVector(com.alibaba.alink.common.linalg.DenseVector) DenseVector(com.alibaba.alink.common.linalg.DenseVector)

Aggregations

FastDistanceMatrixData (com.alibaba.alink.operator.common.distance.FastDistanceMatrixData)13 DenseMatrix (com.alibaba.alink.common.linalg.DenseMatrix)8 FastDistanceVectorData (com.alibaba.alink.operator.common.distance.FastDistanceVectorData)8 DenseVector (com.alibaba.alink.common.linalg.DenseVector)6 Row (org.apache.flink.types.Row)5 Vector (com.alibaba.alink.common.linalg.Vector)4 Test (org.junit.Test)4 SparseVector (com.alibaba.alink.common.linalg.SparseVector)3 EuclideanDistance (com.alibaba.alink.operator.common.distance.EuclideanDistance)3 FastDistance (com.alibaba.alink.operator.common.distance.FastDistance)3 ArrayList (java.util.ArrayList)3 KMeansModelDataConverter (com.alibaba.alink.operator.common.clustering.kmeans.KMeansModelDataConverter)2 RowCollector (com.alibaba.alink.common.utils.RowCollector)1 FastDistanceData (com.alibaba.alink.operator.common.distance.FastDistanceData)1 FastDistanceSparseData (com.alibaba.alink.operator.common.distance.FastDistanceSparseData)1 HaversineDistance (com.alibaba.alink.operator.common.distance.HaversineDistance)1 BaseVectorSummary (com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary)1 HasKMeansWithHaversineDistanceType (com.alibaba.alink.params.shared.clustering.HasKMeansWithHaversineDistanceType)1 List (java.util.List)1 Random (java.util.Random)1