use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.
the class KMeansAssignCluster method calc.
/**
 * Assigns every local training sample to a centroid and accumulates the per-cluster sums into the
 * buffer stored in the context under {@code KMeansTrainBatchOp.CENTROID_ALL_REDUCE}.
 *
 * <p>The centroids are read from {@code CENTROID1} on even steps and from {@code CENTROID2} on odd
 * steps. The buffer has {@code k * (vectorSize + 1)} entries and is created on first use.
 *
 * @param context the communication context providing the step number, task id and shared objects.
 */
@Override
public void calc(ComContext context) {
    LOG.info("StepNo {}, TaskId {} Assign cluster begins!", context.getStepNo(), context.getTaskId());
    Integer vectorSize = context.getObj(KMeansTrainBatchOp.VECTOR_SIZE);
    Integer k = context.getObj(KMeansTrainBatchOp.K);

    // Get the iterative coefficient from static memory; the slot alternates with the step parity.
    Tuple2<Integer, FastDistanceMatrixData> stepNumCentroids;
    if (context.getStepNo() % 2 == 0) {
        stepNumCentroids = context.getObj(KMeansTrainBatchOp.CENTROID1);
    } else {
        stepNumCentroids = context.getObj(KMeansTrainBatchOp.CENTROID2);
    }

    // Scratch matrix reused across calls.
    if (null == distanceMatrix) {
        distanceMatrix = new DenseMatrix(k, 1);
    }

    double[] sumMatrixData = context.getObj(KMeansTrainBatchOp.CENTROID_ALL_REDUCE);
    if (sumMatrixData == null) {
        sumMatrixData = new double[k * (vectorSize + 1)];
        context.putObj(KMeansTrainBatchOp.CENTROID_ALL_REDUCE, sumMatrixData);
    }

    // Reset the buffer BEFORE the no-data early return. The buffer lives in the context across
    // steps, so a task without training data would otherwise keep the values left there by the
    // previous step instead of contributing zeros.
    // NOTE(review): presumably the buffer is overwritten with the reduced result between steps;
    // confirm against the all-reduce stage. The reset is harmless either way.
    Arrays.fill(sumMatrixData, 0.0);

    Iterable<FastDistanceVectorData> trainData = context.getObj(KMeansTrainBatchOp.TRAIN_DATA);
    if (trainData == null) {
        return;
    }

    for (FastDistanceVectorData sample : trainData) {
        // Every sample is accumulated with weight 1.
        KMeansUtil.updateSumMatrix(sample, 1, stepNumCentroids.f1, vectorSize, sumMatrixData, k, fastDistance, distanceMatrix);
    }
    LOG.info("StepNo {}, TaskId {} Assign cluster ends!", context.getStepNo(), context.getTaskId());
}
use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.
the class KMeansInitCentroids method kMeansPlusPlusInit.
/**
 * Builds the initial centroids from {@code data} with an iterative, distributed seeding followed by
 * a local k-means on the selected candidates.
 *
 * <p>Visible steps:
 * <ol>
 *   <li>Each sample gets an id: the murmur3-128 hash (seeded with {@code seed}) of its
 *       {@code toString()}.</li>
 *   <li>Each sample is wrapped as (id, vectorData, nearestCenterId, nearestCenterDist, isCenter),
 *       initialised to (-1, {@code Double.MAX_VALUE}, false).</li>
 *   <li>The record with the largest id becomes the first center; {@code CalWeight} is then applied
 *       with the centers as broadcast set.</li>
 *   <li>For {@code initSteps - 1} bulk iterations: new centers are picked from the non-center
 *       records by {@code FilterNewCenter}, which receives the summed field 3 as broadcast set,
 *       and {@code CalWeight} is re-applied against the new centers.</li>
 *   <li>The non-center records are counted per nearest-center id, joined with the centers, and
 *       handed to {@code LocalKmeans} running with parallelism 1.</li>
 * </ol>
 *
 * @param data       the samples to cluster.
 * @param k          cluster number.
 * @param initSteps  number of seeding rounds; the bulk iteration runs {@code initSteps - 1} times.
 * @param distance   distance measure.
 * @param vectorSize single-element data set holding the vector size, broadcast to the local k-means.
 * @param seed       random seed used for hashing, center picking and the local k-means.
 * @return the initial centroids.
 */
private static DataSet<FastDistanceMatrixData> kMeansPlusPlusInit(DataSet<FastDistanceVectorData> data, final int k, final int initSteps, final FastDistance distance, DataSet<Integer> vectorSize, int seed) {
    final HashFunction hashFunc = murmur3_128(seed);
    DataSet<Tuple2<Long, FastDistanceVectorData>> dataWithId = data.map(new MapFunction<FastDistanceVectorData, Tuple2<Long, FastDistanceVectorData>>() {
        private static final long serialVersionUID = 1539229008777267709L;

        @Override
        public Tuple2<Long, FastDistanceVectorData> map(FastDistanceVectorData value) throws Exception {
            Long hashValue = hashFunc.hashUnencodedChars(value.toString()).asLong();
            return Tuple2.of(hashValue, value);
        }
    });

    // id, vectorData, nearestCenterId, nearestCenterDist, mark(center/data)
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> dataNeighborMark = dataWithId.map(new MapFunction<Tuple2<Long, FastDistanceVectorData>, Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>>() {
        private static final long serialVersionUID = -8289894247468770813L;

        @Override
        public Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean> map(Tuple2<Long, FastDistanceVectorData> value) {
            return Tuple5.of(value.f0, value.f1, -1L, Double.MAX_VALUE, false);
        }
    }).withForwardedFields("f0;f1");

    // The record with the largest id is taken as the first center.
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> centers = dataNeighborMark.maxBy(0).map(new TransformToCenter()).withForwardedFields("f0;f1");
    dataNeighborMark = dataNeighborMark.map(new CalWeight(distance)).withBroadcastSet(centers, CENTER).withForwardedFields("f0;f1;f4");

    IterativeDataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> loop = dataNeighborMark.iterate(initSteps - 1);
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> dataOnly = loop.filter(new FilterData());
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> oldCenter = loop.filter(new FilterCenter());

    // Sum of field 3 (nearestCenterDist) over all non-center records.
    DataSet<Tuple1<Double>> sumCosts = dataOnly.<Tuple1<Double>>project(3).aggregate(SUM, 0);

    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> newCenter = dataOnly.partitionCustom(new Partitioner<Long>() {
        private static final long serialVersionUID = 8742959167492464159L;

        @Override
        public int partition(Long key, int numPartitions) {
            // Take the remainder first, then the absolute value. Math.abs(Long.MIN_VALUE) is still
            // negative, so abs-then-mod could produce a negative partition index for that key.
            // For every other key the result equals Math.abs(key) % numPartitions.
            return (int) Math.abs(key % numPartitions);
        }
    }, 0).sortPartition(0, Order.DESCENDING).filter(new FilterNewCenter(k, seed)).withBroadcastSet(sumCosts, SUM_COSTS).name("kmeans_||_pick").map(new TransformToCenter()).withForwardedFields("f0;f1");

    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> updateData = dataOnly.map(new CalWeight(distance)).withBroadcastSet(newCenter, CENTER).withForwardedFields("f0;f1;f4");
    DataSet<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>> finalDataAndCenter = loop.closeWith(updateData.union(oldCenter));
    DataSet<Tuple2<Long, FastDistanceVectorData>> finalCenters = finalDataAndCenter.filter(new FilterCenter()).project(0, 1);

    // Count the non-center records per nearest-center id and attach the center's vector:
    // the result is (count, centerVector).
    DataSet<Tuple2<Long, FastDistanceVectorData>> weight = finalDataAndCenter.filter(new FilterData()).map(new MapFunction<Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean>, Tuple2<Long, Long>>() {
        private static final long serialVersionUID = -7230628651729304469L;

        @Override
        public Tuple2<Long, Long> map(Tuple5<Long, FastDistanceVectorData, Long, Double, Boolean> t) {
            return Tuple2.of(t.f2, 1L);
        }
    }).withForwardedFields("f2->f0").groupBy(0).aggregate(SUM, 1).join(finalCenters).where(0).equalTo(0).projectFirst(1).projectSecond(1);

    return weight.mapPartition(new LocalKmeans(k, distance, seed)).withBroadcastSet(vectorSize, VECTOR_SIZE).setParallelism(1);
}
use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.
the class KMeansModelMapper method map.
/**
 * Predicts the cluster of one sample and writes the outputs into {@code result}.
 *
 * <p>Output layout: slot 0 holds the cluster index as {@code long}; if {@code isPredDetail} is set,
 * the next slot holds the per-cluster probability vector as a string; if {@code isPredDistance} is
 * set, the following slot holds the distance to the chosen cluster. When the input is missing,
 * every enabled slot is set to {@code null}.
 *
 * @param selection the selected input columns of the current row.
 * @param result    the output slots to fill.
 * @throws Exception declared by the overridden method.
 */
@Override
protected void map(SlicedSelectedSample selection, SlicedResult result) throws Exception {
    Vector record;
    if (colIdx.length > 1) {
        // Two scalar columns form a 2-dimensional point. A null cell is treated as a missing
        // sample, consistent with a null vector in the single-column branch, instead of failing
        // with a NullPointerException on the cast below.
        Object first = selection.get(colIdx[0]);
        Object second = selection.get(colIdx[1]);
        if (first == null || second == null) {
            record = null;
        } else {
            record = new DenseVector(2);
            record.set(0, ((Number) first).doubleValue());
            record.set(1, ((Number) second).doubleValue());
        }
    } else {
        record = VectorUtil.getVector(selection.get(colIdx[0]));
    }

    // Slot 0 is always the prediction; detail and distance occupy the following slots in that
    // order, each only when enabled.
    int nextSlot = 1;

    if (null == record) {
        result.set(0, null);
        if (isPredDetail) {
            result.set(nextSlot++, null);
        }
        if (isPredDistance) {
            result.set(nextSlot, null);
        }
        return;
    }

    DenseMatrix distanceMatrix = new DenseMatrix(this.modelData.params.k, 1);
    FastDistanceVectorData vectorData = distance.prepareVectorData(Tuple2.of(record, null));
    double[] clusterDistances = KMeansUtil.getClusterDistances(vectorData, this.modelData.centroids, distance, distanceMatrix);
    int index = KMeansUtil.getMinPointIndex(clusterDistances, this.modelData.params.k);
    result.set(0, (long) index);

    if (isPredDetail) {
        double[] probs = KMeansUtil.getProbArrayFromDistanceArray(clusterDistances);
        DenseVector vec = new DenseVector(probs.length);
        // Place each probability at the position given by the model's cluster id for centroid i.
        for (int i = 0; i < this.modelData.params.k; i++) {
            vec.set((int) this.modelData.getClusterId(i), probs[i]);
        }
        result.set(nextSlot++, vec.toString());
    }
    if (isPredDistance) {
        result.set(nextSlot, clusterDistances[index]);
    }
}
use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.
the class LocalKmeansFunc method kmeans.
/**
 * Run K-means++ on the weighted samples. First do the K-means++ initialization and then runs Lloyd's algorithm
 * for at most {@code LOCAL_MAX_ITER} iterations or until no sample changes its cluster.
 *
 * @param k             cluster number.
 * @param sampleWeights weight of each sample; {@code sampleWeights[i]} belongs to {@code samples[i]}.
 * @param samples       initial weighted samples.
 * @param distance      distance measure.
 * @param vectorSize    the size of vectors.
 * @param seed          seed of the random generator used for initialization and for re-seeding empty clusters.
 * @return the result of kmeans.
 */
static FastDistanceMatrixData kmeans(int k, long[] sampleWeights, FastDistanceVectorData[] samples, FastDistance distance, int vectorSize, int seed) {
    Random random = new Random(seed);
    List<FastDistanceVectorData> initCentroidsList = sampleInitialCentroids(k, sampleWeights, samples, distance, random);
    FastDistanceMatrixData initCentroids = KMeansUtil.buildCentroidsMatrix(initCentroidsList, distance, vectorSize);
    boolean converge = false;
    int iteration = 0;
    // One column per cluster: vectorSize coordinate sums followed by the accumulated weight.
    DenseMatrix sumMatrix = new DenseMatrix(vectorSize + 1, k);
    DenseMatrix distanceMatrix = new DenseMatrix(k, 1);
    double[] sumMatrixData = sumMatrix.getData();
    double[] initCentroidsData = initCentroids.getVectors().getData();
    int[] indices = new int[samples.length];
    while (!converge && iteration < LOCAL_MAX_ITER) {
        iteration++;
        converge = true;
        // Start every iteration from empty sums; otherwise the sums and weights of earlier
        // iterations leak into the new centroids.
        Arrays.fill(sumMatrixData, 0.0);
        for (int i = 0; i < samples.length; i++) {
            int clusterId = KMeansUtil.updateSumMatrix(samples[i], sampleWeights[i], initCentroids, vectorSize, sumMatrixData, k, distance, distanceMatrix);
            if (clusterId != indices[i]) {
                indices[i] = clusterId;
                converge = false;
            }
        }
        Arrays.fill(initCentroidsData, 0.0);
        for (int i = 0; i < k; i++) {
            int initCentroidsStartIndex = i * vectorSize;
            // Each sum column has one extra slot (the weight), hence the additional offset of i.
            int sumMatrixStartIndex = initCentroidsStartIndex + i;
            double weight = sumMatrixData[sumMatrixStartIndex + vectorSize];
            if (weight > 0) {
                // New centroid = weighted sum / total weight.
                BLAS.axpy(vectorSize, 1.0 / weight, sumMatrixData, sumMatrixStartIndex, initCentroidsData, initCentroidsStartIndex);
            } else {
                // Empty cluster: re-seed it with a randomly chosen sample.
                int index = random.nextInt(samples.length);
                MatVecOp.appendVectorToMatrix(initCentroids.getVectors(), false, i, samples[index].getVector());
            }
        }
        // Refresh the label once, after all centroids of this iteration are in place.
        distance.updateLabel(initCentroids);
    }
    // Decide on the convergence flag, not on the iteration count, so that converging exactly on
    // the last allowed iteration is reported as convergence.
    if (converge) {
        LOG.info("Local kmeans converge with {} steps.", iteration);
    } else {
        LOG.info("Local kmeans reach max iteration number!");
    }
    return initCentroids;
}
use of com.alibaba.alink.operator.common.distance.FastDistanceVectorData in project Alink by alibaba.
the class GeoKMeansTrainBatchOp method linkFrom.
/**
 * Trains a KMeans model on the latitude/longitude columns of the first input using the
 * Haversine distance, and stores the resulting model rows as this operator's output.
 *
 * @param inputs the upstream operators; only the first one is used.
 * @return this operator.
 */
@Override
public GeoKMeansTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> source = checkAndGetFirst(inputs);
    final String latitudeColName = this.getLatitudeCol();
    final String longitudeColName = this.getLongitudeCol();
    final FastDistance distance = new HaversineDistance();
    final int maxIter = this.getMaxIter();
    final double tol = this.getEpsilon();

    // Turn every (latitude, longitude) row into a 2-dimensional vector prepared for the distance.
    String[] geoCols = new String[] { latitudeColName, longitudeColName };
    DataSet<FastDistanceVectorData> data = source
        .select(geoCols)
        .getDataSet()
        .rebalance()
        .map(new MapFunction<Row, FastDistanceVectorData>() {
            private static final long serialVersionUID = -5236022856006527961L;

            @Override
            public FastDistanceVectorData map(Row row) {
                double latitude = ((Number) row.getField(0)).doubleValue();
                double longitude = ((Number) row.getField(1)).doubleValue();
                Vector point = new DenseVector(new double[] { latitude, longitude });
                return distance.prepareVectorData(Row.of(point), 0);
            }
        });

    // The vectors always have exactly two components.
    DataSet<Integer> vectorSize = MLEnvironmentFactory
        .get(getMLEnvironmentId())
        .getExecutionEnvironment()
        .fromElements(2);

    // Initial centroids, then the iterative refinement.
    DataSet<FastDistanceMatrixData> initCentroid = initKmeansCentroids(data, distance, this.getParams(), vectorSize, getRandomSeed());
    DataSet<Row> finalCentroid = iterateICQ(
        initCentroid,
        data,
        vectorSize,
        maxIter,
        tol,
        distance,
        HasKMeansWithHaversineDistanceType.DistanceType.HAVERSINE,
        null,
        latitudeColName,
        longitudeColName);

    // Store the clustering model to the table.
    this.setOutput(finalCentroid, new KMeansModelDataConverter().getModelSchema());
    return this;
}
Aggregations