use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.
the class KMeansAssignCluster method calc.
/**
 * Feeds every local training sample to {@code KMeansUtil.updateSumMatrix}, accumulating into the
 * buffer stored in the context under {@code KMeansTrainBatchOp.CENTROID_ALL_REDUCE}.
 *
 * <p>The centroids are read from {@code CENTROID1} on even steps and from {@code CENTROID2} on odd
 * steps. The accumulation buffer is cleared on every call, including calls where this task has no
 * training data.
 *
 * @param context the context that holds the step number, task id and the shared objects.
 */
@Override
public void calc(ComContext context) {
    LOG.info("StepNo {}, TaskId {} Assign cluster begins!", context.getStepNo(), context.getTaskId());

    Integer vectorSize = context.getObj(KMeansTrainBatchOp.VECTOR_SIZE);
    Integer k = context.getObj(KMeansTrainBatchOp.K);

    // The centroids alternate between two slots depending on the parity of the step number.
    Tuple2<Integer, FastDistanceMatrixData> stepNumCentroids;
    if (context.getStepNo() % 2 == 0) {
        stepNumCentroids = context.getObj(KMeansTrainBatchOp.CENTROID1);
    } else {
        stepNumCentroids = context.getObj(KMeansTrainBatchOp.CENTROID2);
    }

    // Lazily allocate the k x 1 scratch matrix that is handed to updateSumMatrix.
    if (null == distanceMatrix) {
        distanceMatrix = new DenseMatrix(k, 1);
    }

    // The buffer (vectorSize + 1 slots per cluster) is allocated once, registered in the context
    // and then reused by later calls.
    double[] sumMatrixData = context.getObj(KMeansTrainBatchOp.CENTROID_ALL_REDUCE);
    if (sumMatrixData == null) {
        sumMatrixData = new double[k * (vectorSize + 1)];
        context.putObj(KMeansTrainBatchOp.CENTROID_ALL_REDUCE, sumMatrixData);
    }

    // Clear the buffer before the no-data return below: it is reused across calls, so a task
    // without training data would otherwise leave whatever the buffer held from an earlier step.
    // NOTE(review): confirm nothing downstream relies on the buffer staying untouched in that case.
    Arrays.fill(sumMatrixData, 0.0);

    Iterable<FastDistanceVectorData> trainData = context.getObj(KMeansTrainBatchOp.TRAIN_DATA);
    if (trainData == null) {
        return;
    }

    for (FastDistanceVectorData sample : trainData) {
        // Every sample is passed with a weight of 1.
        KMeansUtil.updateSumMatrix(sample, 1, stepNumCentroids.f1, vectorSize, sumMatrixData, k, fastDistance, distanceMatrix);
    }
    LOG.info("StepNo {}, TaskId {} Assign cluster ends!", context.getStepNo(), context.getTaskId());
}
use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.
the class KMeansPreallocateCentroid method calc.
/**
 * On step 1, publishes the vector size, both centroid slots and the cluster count into the context.
 * On every other step this method does nothing.
 *
 * @param context the context that holds the step number and the shared objects.
 */
@Override
public void calc(ComContext context) {
    // Everything below is one-off setup for the first step.
    if (context.getStepNo() != 1) {
        return;
    }

    List<FastDistanceMatrixData> candidates = context.getObj(KMeansTrainBatchOp.INIT_CENTROID);
    List<Integer> statistics = context.getObj(KMeansTrainBatchOp.KMEANS_STATISTICS);

    // The first statistics entry is published as the vector size.
    Integer dimension = statistics.get(0);
    context.putObj(KMeansTrainBatchOp.VECTOR_SIZE, dimension);

    FastDistanceMatrixData seedCentroid = candidates.get(0);
    boolean sizeMatches = seedCentroid.getVectors().numRows() == dimension;
    Preconditions.checkArgument(sizeMatches, "Init centroid error, size not equal!");

    int clusterCount = seedCentroid.getVectors().numCols();
    LOG.info("Init centroids, initial centroid size {}", clusterCount);

    // Both slots are tagged with (stepNo - 1); the second slot holds a copy built by the
    // FastDistanceMatrixData copy constructor.
    int seedStep = context.getStepNo() - 1;
    context.putObj(KMeansTrainBatchOp.CENTROID1, Tuple2.of(seedStep, seedCentroid));
    context.putObj(KMeansTrainBatchOp.CENTROID2, Tuple2.of(seedStep, new FastDistanceMatrixData(seedCentroid)));
    context.putObj(KMeansTrainBatchOp.K, clusterCount);
}
use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.
the class KMeansUtil method transformTrainDataToPredictData.
/**
 * Converts a {@code KMeansTrainModelData} into a {@code KMeansPredictModelData}.
 *
 * <p>Each cluster vector is written into one slot of a {@code vectorSize x k} matrix, and each
 * cluster's id and weight are stored as a {@code Row} at the same position. The labels of the
 * resulting centroids are then refreshed with the fast distance of the model's distance type.
 *
 * @param trainModelData KMeansTrainModelData.
 * @return KMeansPredictModelData.
 */
public static KMeansPredictModelData transformTrainDataToPredictData(KMeansTrainModelData trainModelData) {
    final int clusterCount = trainModelData.params.k;
    DenseMatrix centroidMatrix = new DenseMatrix(trainModelData.params.vectorSize, clusterCount);
    Row[] clusterInfo = new Row[clusterCount];

    // Slot position and centroid position are the same number, so one counter drives both.
    final int available = trainModelData.centroids.size();
    for (int pos = 0; pos < available; pos++) {
        MatVecOp.appendVectorToMatrix(centroidMatrix, false, pos, trainModelData.getClusterVector(pos));
        clusterInfo[pos] = Row.of(trainModelData.getClusterId(pos), trainModelData.getClusterWeight(pos));
    }

    KMeansPredictModelData modelData = new KMeansPredictModelData();
    modelData.params = trainModelData.params;
    modelData.centroids = new FastDistanceMatrixData(centroidMatrix, clusterInfo);
    modelData.params.distanceType.getFastDistance().updateLabel(modelData.centroids);
    return modelData;
}
use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.
the class LocalKmeansFunc method kmeans.
/**
 * Run K-means++ on the weighted samples. First do the K-means++ initialization and then runs Lloyd's algorithm
 * until no sample changes its cluster or {@code LOCAL_MAX_ITER} iterations have been executed.
 *
 * @param k             cluster number.
 * @param sampleWeights weight of each sample; {@code sampleWeights[i]} belongs to {@code samples[i]}.
 * @param samples       the samples to cluster.
 * @param distance      distance measure.
 * @param vectorSize    the size of vectors.
 * @param seed          seed of the random generator used for the initial sampling and for re-seeding
 *                      clusters that received no weight.
 * @return the result of kmeans.
 */
static FastDistanceMatrixData kmeans(int k, long[] sampleWeights, FastDistanceVectorData[] samples, FastDistance distance, int vectorSize, int seed) {
    Random random = new Random(seed);
    List<FastDistanceVectorData> initCentroidsList = sampleInitialCentroids(k, sampleWeights, samples, distance, random);
    FastDistanceMatrixData initCentroids = KMeansUtil.buildCentroidsMatrix(initCentroidsList, distance, vectorSize);
    boolean converge = false;
    int iteration = 0;
    // Per cluster: vectorSize slots for the weighted sum followed by one slot for the total weight.
    DenseMatrix sumMatrix = new DenseMatrix(vectorSize + 1, k);
    DenseMatrix distanceMatrix = new DenseMatrix(k, 1);
    double[] sumMatrixData = sumMatrix.getData();
    double[] initCentroidsData = initCentroids.getVectors().getData();
    int[] indices = new int[samples.length];
    while (!converge && iteration < LOCAL_MAX_ITER) {
        iteration++;
        converge = true;
        // Start every iteration from an empty accumulator; otherwise the sums and weights of
        // earlier iterations would leak into this iteration's centroid update.
        Arrays.fill(sumMatrixData, 0.0);
        for (int i = 0; i < samples.length; i++) {
            int clusterId = KMeansUtil.updateSumMatrix(samples[i], sampleWeights[i], initCentroids, vectorSize, sumMatrixData, k, distance, distanceMatrix);
            if (clusterId != indices[i]) {
                indices[i] = clusterId;
                converge = false;
            }
        }
        Arrays.fill(initCentroidsData, 0.0);
        for (int i = 0; i < k; i++) {
            int initCentroidsStartIndex = i * vectorSize;
            // The sum matrix has one extra slot per cluster, hence the additional offset of i.
            int sumMatrixStartIndex = initCentroidsStartIndex + i;
            double weight = sumMatrixData[sumMatrixStartIndex + vectorSize];
            if (weight > 0) {
                // New centroid = accumulated sum / accumulated weight.
                BLAS.axpy(vectorSize, 1.0 / weight, sumMatrixData, sumMatrixStartIndex, initCentroidsData, initCentroidsStartIndex);
            } else {
                // No weight was assigned to this cluster: restart it from a randomly picked sample.
                int index = random.nextInt(samples.length);
                MatVecOp.appendVectorToMatrix(initCentroids.getVectors(), false, i, samples[index].getVector());
            }
        }
        // Refresh the labels once, after every centroid has been rewritten.
        distance.updateLabel(initCentroids);
    }
    // Report by the convergence flag, so that converging on the very last allowed iteration is
    // not logged as hitting the iteration limit.
    if (converge) {
        LOG.info("Local kmeans converge with {} steps.", iteration);
    } else {
        LOG.info("Local kmeans reach max iteration number!");
    }
    return initCentroids;
}
use of com.alibaba.alink.operator.common.distance.FastDistanceMatrixData in project Alink by alibaba.
the class GeoKMeansTrainBatchOp method linkFrom.
/**
 * Builds the training pipeline on the first input: selects the latitude and longitude columns,
 * turns each row into a two-element vector prepared for {@code HaversineDistance}, computes the
 * initial centroids, runs the iteration and sets the result as this operator's output.
 *
 * @param inputs the input operators; the first one supplies the training data.
 * @return this operator.
 */
@Override
public GeoKMeansTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    final BatchOperator<?> source = checkAndGetFirst(inputs);
    final String latitudeColName = this.getLatitudeCol();
    final String longitudeColName = this.getLongitudeCol();
    final FastDistance distance = new HaversineDistance();
    final int maxIter = this.getMaxIter();
    final double tol = this.getEpsilon();

    // Field 0 is the latitude column and field 1 the longitude column, as selected here.
    final String[] coordinateCols = new String[] {latitudeColName, longitudeColName};
    DataSet<Row> coordinates = source.select(coordinateCols).getDataSet();
    DataSet<FastDistanceVectorData> data = coordinates.rebalance().map(new MapFunction<Row, FastDistanceVectorData>() {
        private static final long serialVersionUID = -5236022856006527961L;

        @Override
        public FastDistanceVectorData map(Row row) {
            double latitude = ((Number) row.getField(0)).doubleValue();
            double longitude = ((Number) row.getField(1)).doubleValue();
            Vector point = new DenseVector(new double[] {latitude, longitude});
            return distance.prepareVectorData(Row.of(point), 0);
        }
    });

    // Every vector has exactly two components.
    DataSet<Integer> vectorSize = MLEnvironmentFactory.get(getMLEnvironmentId()).getExecutionEnvironment().fromElements(2);

    // Initial centroids, carried as FastDistanceMatrixData.
    DataSet<FastDistanceMatrixData> initCentroid = initKmeansCentroids(data, distance, this.getParams(), vectorSize, getRandomSeed());

    DataSet<Row> finalCentroid = iterateICQ(
        initCentroid,
        data,
        vectorSize,
        maxIter,
        tol,
        distance,
        HasKMeansWithHaversineDistanceType.DistanceType.HAVERSINE,
        null,
        latitudeColName,
        longitudeColName);

    // Store the clustering model in the output table.
    this.setOutput(finalCentroid, new KMeansModelDataConverter().getModelSchema());
    return this;
}
Aggregations