Example usage of com.alibaba.alink.common.linalg.SparseVector in the Alink project (Alibaba).
From class VectorAssemblerMapper, method assembler:
/**
 * Assembles the given column values (numbers, vector strings, and vectors) into one vector.
 *
 * @param input         column values to combine; individual entries may be null.
 * @param handleInvalid policy for null entries: ERROR throws, SKIP drops the whole row
 *                      (returns null), any other value silently ignores the null column.
 * @return the assembled vector — sparse, or densified when the fill ratio is high enough —
 *         or null when {@code input} is null or a null entry is skipped.
 */
private static Object assembler(Object[] input, HandleInvalidMethod handleInvalid) {
    if (input == null) {
        return null;
    }
    // Rough upper bound on the number of entries, used only to presize the map.
    int capacity = input.length;
    for (Object col : input) {
        if (col instanceof DenseVector) {
            capacity += ((DenseVector) col).size();
        } else if (col instanceof SparseVector) {
            capacity += ((SparseVector) col).getIndices().length;
        }
    }
    Map<Integer, Double> map = new HashMap<>(capacity);
    int cursor = 0;
    // Collect every column value into the (index, value) map, advancing the cursor.
    for (Object col : input) {
        if (col == null) {
            switch (handleInvalid) {
                case ERROR:
                    throw new NullPointerException("null value is found in vector assembler inputs.");
                case SKIP:
                    return null;
                default:
                    // KEEP: drop the null column without advancing the cursor.
            }
            continue;
        }
        if (col instanceof Number) {
            map.put(cursor++, ((Number) col).doubleValue());
        } else if (col instanceof String) {
            cursor = appendVector(VectorUtil.getVector(col), map, cursor);
        } else if (col instanceof Vector) {
            cursor = appendVector((Vector) col, map, cursor);
        } else {
            throw new UnsupportedOperationException("not support type of object.");
        }
    }
    // Build sparse first; switch to dense representation when the map is dense enough.
    Vector result = new SparseVector(cursor, map);
    if (map.size() * RATIO > cursor) {
        result = ((SparseVector) result).toDenseVector();
    }
    return result;
}
Example usage of com.alibaba.alink.common.linalg.SparseVector in the Alink project (Alibaba).
From class FlattenMTableStreamTest, method linkFrom:
/**
 * Streams a one-row table whose "mTable" column holds a serialized MTable, flattens that
 * column back into individual rows, and checks the reserved "id" column survives on every
 * output row.
 */
@Test
public void linkFrom() throws Exception {
    List<Row> content = new ArrayList<>();
    content.add(Row.of(1, "2", 0, null,
        new SparseVector(3, new int[] { 1 }, new double[] { 2.0 }),
        new FloatTensor(new float[] { 3.0f })));
    content.add(Row.of(null, "2", 0,
        new DenseVector(new double[] { 0.0, 1.0 }),
        new SparseVector(4, new int[] { 2 }, new double[] { 3.0 }),
        new FloatTensor(new float[] { 3.0f })));
    content.add(Row.of(null, "2", 0,
        new DenseVector(new double[] { 0.1, 1.0 }),
        new SparseVector(4, new int[] { 2 }, new double[] { 3.0 }),
        new FloatTensor(new float[] { 3.0f })));
    String schemaStr =
        "col0 int, col1 string, label int, d_vec DENSE_VECTOR, s_vec SPARSE_VECTOR, tensor FLOAT_TENSOR";
    MTable mTable = new MTable(content, schemaStr);
    List<Row> sourceRows = new ArrayList<>();
    sourceRows.add(Row.of("id", mTable.toString()));
    StreamOperator<?> source = new MemSourceStreamOp(sourceRows, new String[] { "id", "mTable" });
    StreamOperator<?> flattened = source.link(
        new FlattenMTableStreamOp()
            .setSchemaStr(schemaStr)
            .setSelectedCol("mTable")
            .setReservedCols("id"));
    CollectSinkStreamOp sink = flattened.link(new CollectSinkStreamOp());
    StreamOperator.execute();
    for (Row row : sink.getAndRemoveValues()) {
        Assert.assertEquals(row.getField(0), "id");
    }
}
Example usage of com.alibaba.alink.common.linalg.SparseVector in the Alink project (Alibaba).
From class VectorSummarizerUtilTest, method testMulti:
/**
 * Checks that VectorSummarizerUtil.merge accumulates counts across a sparse and a dense
 * summarizer. NOTE(review): the expected values (0, 1, then 3) imply merge folds the right
 * summarizer's count into the left one in place — confirm against VectorSummarizerUtil.
 */
@Test
public void testMulti() {
    SparseVectorSummarizer sparse = new SparseVectorSummarizer(true);
    DenseVectorSummarizer dense = new DenseVectorSummarizer(true);
    // Both summarizers start empty.
    Assert.assertEquals(0, VectorSummarizerUtil.merge(sparse, dense).count);
    // One dense sample on the right side.
    dense = (DenseVectorSummarizer) dense.visit(new DenseVector(new double[] { 1.0 }));
    Assert.assertEquals(1, VectorSummarizerUtil.merge(sparse, dense).count);
    // One sparse sample on the left side.
    sparse = (SparseVectorSummarizer) sparse.visit(new SparseVector(2, new int[] { 0 }, new double[] { 1.0 }));
    Assert.assertEquals(3, VectorSummarizerUtil.merge(sparse, dense).count);
}
Example usage of com.alibaba.alink.common.linalg.SparseVector in the Alink project (Alibaba).
From class KMeansUtil, method updateSumMatrix:
/**
 * Find the closest centroid from {@code centroids} for {@code sample}, and add the weighted
 * sample into that cluster's running sums in {@code sumMatrix}.
 *
 * <p>{@code sumMatrix} is a flat buffer of k segments, each of length (vectorSize + 1): the
 * first vectorSize slots of a segment accumulate the weighted feature sums, and the final
 * slot accumulates the total sample weight of the cluster.
 *
 * @param sample query sample.
 * @param sampleWeight sample weight.
 * @param centroids centroids.
 * @param vectorSize dimension of the sample/centroid vectors.
 * @param sumMatrix the per-cluster sum buffer, updated in place; must have length at least
 *                  k * (vectorSize + 1).
 * @param k centroid number.
 * @param fastDistance distance measure.
 * @param distanceMatrix preallocated distance result matrix (numCentroids rows x 1 column).
 * @return the closest cluster index.
 */
public static int updateSumMatrix(FastDistanceVectorData sample, long sampleWeight, FastDistanceMatrixData centroids, int vectorSize, double[] sumMatrix, int k, FastDistance fastDistance, DenseMatrix distanceMatrix) {
Preconditions.checkNotNull(sumMatrix);
Preconditions.checkNotNull(distanceMatrix);
Preconditions.checkArgument(distanceMatrix.numRows() == centroids.getVectors().numCols() && distanceMatrix.numCols() == 1, "Memory not preallocated!");
// Fill distanceMatrix with sample-to-centroid distances.
// NOTE(review): getClosestClusterIndex below also receives fastDistance and may recompute
// the distances itself — confirm whether this explicit calc is redundant.
fastDistance.calc(sample, centroids, distanceMatrix);
int clusterIndex = getClosestClusterIndex(sample, centroids, k, fastDistance, distanceMatrix).f0;
// Offset of the chosen cluster's segment inside the flat sum buffer.
int startIndex = clusterIndex * (vectorSize + 1);
Vector vec = sample.getVector();
if (vec instanceof DenseVector) {
// sumMatrix[startIndex .. startIndex + vectorSize) += sampleWeight * vec
BLAS.axpy(vectorSize, sampleWeight, ((DenseVector) vec).getData(), 0, sumMatrix, startIndex);
} else {
// Sparse case: only touch the slots of the non-zero indices.
SparseVector sparseVector = (SparseVector) vec;
sparseVector.forEach((index, value) -> sumMatrix[startIndex + index] += sampleWeight * value);
}
// The segment's last slot accumulates the cluster's total sample weight.
sumMatrix[startIndex + vectorSize] += sampleWeight;
return clusterIndex;
}
Example usage of com.alibaba.alink.common.linalg.SparseVector in the Alink project (Alibaba).
From class EmCorpusStep, method calc:
/**
 * One superstep of collapsed Gibbs sampling for LDA training.
 *
 * <p>On step 1 the local corpus is built from the sparse term-frequency vectors: every word
 * occurrence is assigned a uniformly random topic and the doc-topic / word-topic count
 * matrices are seeded accordingly. On later steps each word occurrence's topic is resampled
 * from its conditional distribution and the word-topic counts are rebuilt from the
 * resampled assignments before being published back into the context.
 */
@Override
public void calc(ComContext context) {
// Seed the RNG exactly once so runs are reproducible when a seed is supplied.
if (!addedIndex && seed != null) {
rand.reSeed(seed);
addedIndex = true;
}
int vocabularySize = ((List<Integer>) context.getObj(LdaVariable.vocabularySize)).get(0);
// initialize the params.
if (context.getStepNo() == 1) {
// Row vocabularySize (the extra +1 row) holds the per-topic total counts; see the
// sampling denominator and the rebuild loop below.
DenseMatrix nWordTopics = new DenseMatrix(vocabularySize + 1, numTopic);
context.putObj(LdaVariable.nWordTopics, nWordTopics.getData());
List<SparseVector> data = context.getObj(LdaVariable.data);
if (data == null) {
return;
}
// the size of docs.
int localDocSize = data.size();
Document[] docs = new Document[localDocSize];
DenseMatrix nDocTopics = new DenseMatrix(localDocSize, numTopic);
int docId = 0;
int topic, word;
for (SparseVector sparseVector : data) {
// Total number of word occurrences in this doc = sum of term frequencies.
int wordNum = 0;
for (double value : sparseVector.getValues()) {
wordNum += value;
}
Document doc = new Document(wordNum);
int idx = 0;
VectorIterator iter = sparseVector.iterator();
while (iter.hasNext()) {
word = iter.getIndex();
// Expand the term frequency into individual occurrences, each with a random topic.
for (int j = 0; j < (int) iter.getValue(); j++) {
// NOTE(review): presumably nextInt(0, numTopic - 1) has inclusive bounds, i.e.
// yields a topic in [0, numTopic) — confirm against the RNG's API.
topic = rand.nextInt(0, numTopic - 1);
doc.setWordIdxs(idx, word);
doc.setTopicIdxs(idx, topic);
// Seed the count matrices with this occurrence's initial assignment.
updateDocWordTopics(nDocTopics, nWordTopics, docId, word, vocabularySize, topic, 1);
idx++;
}
iter.next();
}
docs[docId] = doc;
docId++;
}
context.putObj(LdaVariable.corpus, docs);
context.putObj(LdaVariable.nDocTopics, nDocTopics);
// Raw input is no longer needed once the corpus has been materialized.
context.removeObj(LdaVariable.data);
} else {
Document[] docs = context.getObj(LdaVariable.corpus);
if (docs == null) {
return;
}
DenseMatrix nDocTopics = context.getObj(LdaVariable.nDocTopics);
// Wrap the shared count array without copying (copy flag = false).
DenseMatrix nWordTopics = new DenseMatrix(vocabularySize + 1, numTopic, context.getObj(LdaVariable.nWordTopics), false);
int docId = 0;
// p holds the cumulative (unnormalized) topic probabilities for one word occurrence.
double[] p = new double[numTopic];
double pSum;
int newTopic;
// update params with each doc.
for (Document doc : docs) {
int wordCount = doc.getLength();
for (int i = 0; i < wordCount; ++i) {
int word = doc.getWordIdxs(i);
int topic = doc.getTopicIdxs(i);
// choose the word and minus its topic (exclude this occurrence from the counts).
updateDocWordTopics(nDocTopics, nWordTopics, docId, word, vocabularySize, topic, -1);
pSum = 0;
for (int k = 0; k < numTopic; k++) {
// calculate the probability that word belongs to each topic, and then generate the topic.
pSum += (nWordTopics.get(word, k) + beta) * (nDocTopics.get(docId, k) + alpha) / (nWordTopics.get(vocabularySize, k) + vocabularySize * beta);
p[k] = pSum;
}
// Draw from the cumulative distribution by inverse-transform sampling.
double u = rand.nextUniform(0, 1) * pSum;
newTopic = findProbIdx(p, u);
doc.setTopicIdxs(i, newTopic);
// update the word and its new topic.
updateDocWordTopics(nDocTopics, nWordTopics, docId, word, vocabularySize, newTopic, 1);
}
docId++;
}
// Rebuild the word-topic counts from scratch out of the resampled assignments, so the
// published matrix reflects only this worker's current corpus state.
nWordTopics = new DenseMatrix(nWordTopics.numRows(), nWordTopics.numCols());
for (Document doc : docs) {
int length = doc.getLength();
for (int i = 0; i < length; i++) {
nWordTopics.add(doc.getWordIdxs(i), doc.getTopicIdxs(i), 1);
// Row vocabularySize accumulates the per-topic totals.
nWordTopics.add(vocabularySize, doc.getTopicIdxs(i), 1);
}
}
context.putObj(LdaVariable.nWordTopics, nWordTopics.getData());
}
}
Aggregations