
Example 6 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class VectorAssemblerMapper method assembler.

private static Object assembler(Object[] input, HandleInvalidMethod handleInvalid) {
    if (null == input) {
        return null;
    }
    int pos = 0;
    int size = input.length;
    for (Object col : input) {
        if (col instanceof DenseVector) {
            size += ((DenseVector) col).size();
        } else if (col instanceof SparseVector) {
            size += ((SparseVector) col).getIndices().length;
        }
    }
    Map<Integer, Double> map = new HashMap<>(size);
    // Read each column, convert it to numeric entries, and write them into the map.
    for (Object col : input) {
        if (null != col) {
            if (col instanceof Number) {
                map.put(pos++, ((Number) col).doubleValue());
            } else if (col instanceof String) {
                Vector vec = VectorUtil.getVector(col);
                pos = appendVector(vec, map, pos);
            } else if (col instanceof Vector) {
                pos = appendVector((Vector) col, map, pos);
            } else {
                throw new UnsupportedOperationException("not support type of object.");
            }
        } else {
            switch(handleInvalid) {
                case ERROR:
                    throw new NullPointerException("null value is found in vector assembler inputs.");
                case SKIP:
                    return null;
                default:
            }
        }
    }
    /* Form the sparse vector; densify it if the occupied entries dominate its logical size. */
    Vector vec = new SparseVector(pos, map);
    if (map.size() * RATIO > pos) {
        vec = ((SparseVector) vec).toDenseVector();
    }
    return vec;
}
Also used : HashMap(java.util.HashMap) SparseVector(com.alibaba.alink.common.linalg.SparseVector) Vector(com.alibaba.alink.common.linalg.Vector) DenseVector(com.alibaba.alink.common.linalg.DenseVector)
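
To make the densify step at the end of assembler easier to follow, here is a minimal standalone sketch of it. The class name DensifySketch and the RATIO value are hypothetical (the real RATIO constant is defined in VectorAssemblerMapper and is not shown above); only the map-backed SparseVector constructor and the toDenseVector call demonstrated in the example are used.

import com.alibaba.alink.common.linalg.SparseVector;
import com.alibaba.alink.common.linalg.Vector;

import java.util.HashMap;
import java.util.Map;

public class DensifySketch {

    // Hypothetical threshold; VectorAssemblerMapper defines its own RATIO constant.
    private static final double RATIO = 1.5;

    public static Vector assemble(double[] values) {
        Map<Integer, Double> map = new HashMap<>(values.length);
        int pos = 0;
        for (double v : values) {
            // Unlike the assembler above, this sketch stores only non-zero entries.
            if (v != 0.0) {
                map.put(pos, v);
            }
            pos++;
        }
        // Start sparse, then densify when the occupied slots dominate the logical size.
        Vector vec = new SparseVector(pos, map);
        if (map.size() * RATIO > pos) {
            vec = ((SparseVector) vec).toDenseVector();
        }
        return vec;
    }
}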

Example 7 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class FlattenMTableStreamTest method linkFrom.

@Test
public void linkFrom() throws Exception {
    List<Row> rows = new ArrayList<>();
    rows.add(Row.of(1, "2", 0, null, new SparseVector(3, new int[] { 1 }, new double[] { 2.0 }), new FloatTensor(new float[] { 3.0f })));
    rows.add(Row.of(null, "2", 0, new DenseVector(new double[] { 0.0, 1.0 }), new SparseVector(4, new int[] { 2 }, new double[] { 3.0 }), new FloatTensor(new float[] { 3.0f })));
    rows.add(Row.of(null, "2", 0, new DenseVector(new double[] { 0.1, 1.0 }), new SparseVector(4, new int[] { 2 }, new double[] { 3.0 }), new FloatTensor(new float[] { 3.0f })));
    String schemaStr = "col0 int, col1 string, label int" + ", d_vec DENSE_VECTOR" + ", s_vec SPARSE_VECTOR" + ", tensor FLOAT_TENSOR";
    MTable mTable = new MTable(rows, schemaStr);
    List<Row> table = new ArrayList<>();
    table.add(Row.of("id", mTable.toString()));
    StreamOperator<?> op = new MemSourceStreamOp(table, new String[] { "id", "mTable" });
    StreamOperator<?> res = op.link(new FlattenMTableStreamOp().setSchemaStr(schemaStr).setSelectedCol("mTable").setReservedCols("id"));
    CollectSinkStreamOp sop = res.link(new CollectSinkStreamOp());
    StreamOperator.execute();
    List<Row> list = sop.getAndRemoveValues();
    for (Row row : list) {
        Assert.assertEquals(row.getField(0), "id");
    }
}
Also used : MemSourceStreamOp(com.alibaba.alink.operator.stream.source.MemSourceStreamOp) MTable(com.alibaba.alink.common.MTable) CollectSinkStreamOp(com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp) ArrayList(java.util.ArrayList) FloatTensor(com.alibaba.alink.common.linalg.tensor.FloatTensor) Row(org.apache.flink.types.Row) SparseVector(com.alibaba.alink.common.linalg.SparseVector) DenseVector(com.alibaba.alink.common.linalg.DenseVector) Test(org.junit.Test)
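
The test above packs the whole MTable into a single string column and relies on FlattenMTableStreamOp to expand it back into typed rows. Below is a minimal sketch of that serialization step, using only the constructors shown in the example; the class name MTableToStringSketch and the column names id and s_vec are merely illustrative.

import com.alibaba.alink.common.MTable;
import com.alibaba.alink.common.linalg.SparseVector;
import org.apache.flink.types.Row;

import java.util.ArrayList;
import java.util.List;

public class MTableToStringSketch {

    public static void main(String[] args) {
        List<Row> rows = new ArrayList<>();
        // A length-3 sparse vector with a single non-zero value 2.0 at index 1.
        rows.add(Row.of(1, new SparseVector(3, new int[] { 1 }, new double[] { 2.0 })));

        // The schema string names and types the columns, as in the test above.
        MTable mTable = new MTable(rows, "id int, s_vec SPARSE_VECTOR");

        // toString() yields the serialized form that the test places into the string
        // column and that FlattenMTableStreamOp later parses back into rows.
        System.out.println(mTable.toString());
    }
}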

Example 8 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class VectorSummarizerUtilTest method testMulti.

@Test
public void testMulti() {
    SparseVectorSummarizer left = new SparseVectorSummarizer(true);
    DenseVectorSummarizer right = new DenseVectorSummarizer(true);
    Assert.assertEquals(0, VectorSummarizerUtil.merge(left, right).count);
    right = (DenseVectorSummarizer) right.visit(new DenseVector(new double[] { 1.0 }));
    Assert.assertEquals(1, VectorSummarizerUtil.merge(left, right).count);
    left = (SparseVectorSummarizer) left.visit(new SparseVector(2, new int[] { 0 }, new double[] { 1.0 }));
    Assert.assertEquals(3, VectorSummarizerUtil.merge(left, right).count);
}
Also used : SparseVector(com.alibaba.alink.common.linalg.SparseVector) DenseVector(com.alibaba.alink.common.linalg.DenseVector) Test(org.junit.Test)
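
As a rough sketch of how the two summarizers are typically fed before merging, the snippet below visits one vector with each summarizer and merges the results. The import packages for SparseVectorSummarizer, DenseVectorSummarizer and VectorSummarizerUtil are assumed to match the basicstatistic package of BaseVectorSummary listed under Aggregations; only calls that appear in the test are used, and no particular count value is asserted.

import com.alibaba.alink.common.linalg.DenseVector;
import com.alibaba.alink.common.linalg.SparseVector;
// Packages assumed from the Aggregations list below; adjust if they differ.
import com.alibaba.alink.operator.common.statistics.basicstatistic.DenseVectorSummarizer;
import com.alibaba.alink.operator.common.statistics.basicstatistic.SparseVectorSummarizer;
import com.alibaba.alink.operator.common.statistics.basicstatistic.VectorSummarizerUtil;

public class SummarizerMergeSketch {

    public static void main(String[] args) {
        // The boolean flag mirrors the test above; its exact meaning is not shown there.
        SparseVectorSummarizer sparse = new SparseVectorSummarizer(true);
        DenseVectorSummarizer dense = new DenseVectorSummarizer(true);

        // visit(...) returns the updated summarizer, so the result must be reassigned.
        sparse = (SparseVectorSummarizer) sparse.visit(
            new SparseVector(2, new int[] { 0 }, new double[] { 1.0 }));
        dense = (DenseVectorSummarizer) dense.visit(new DenseVector(new double[] { 1.0 }));

        // Combine the two partial summaries; the merged summarizer exposes a count field.
        System.out.println(VectorSummarizerUtil.merge(sparse, dense).count);
    }
}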

Example 9 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class KMeansUtil method updateSumMatrix.

/**
 * Find the closest centroid from centroids for sample, and add the sample to sumMatrix.
 *
 * @param sample         query sample.
 * @param sampleWeight   sample weight.
 * @param centroids      centroids.
 * @param vectorSize     vector size.
 * @param sumMatrix      the sum matrix to be updated.
 * @param k              centroid number.
 * @param fastDistance   distance measure.
 * @param distanceMatrix preallocated distance result matrix.
 * @return the closest cluster index.
 */
public static int updateSumMatrix(FastDistanceVectorData sample, long sampleWeight, FastDistanceMatrixData centroids, int vectorSize, double[] sumMatrix, int k, FastDistance fastDistance, DenseMatrix distanceMatrix) {
    Preconditions.checkNotNull(sumMatrix);
    Preconditions.checkNotNull(distanceMatrix);
    Preconditions.checkArgument(distanceMatrix.numRows() == centroids.getVectors().numCols() && distanceMatrix.numCols() == 1, "Memory not preallocated!");
    fastDistance.calc(sample, centroids, distanceMatrix);
    int clusterIndex = getClosestClusterIndex(sample, centroids, k, fastDistance, distanceMatrix).f0;
    int startIndex = clusterIndex * (vectorSize + 1);
    Vector vec = sample.getVector();
    if (vec instanceof DenseVector) {
        BLAS.axpy(vectorSize, sampleWeight, ((DenseVector) vec).getData(), 0, sumMatrix, startIndex);
    } else {
        SparseVector sparseVector = (SparseVector) vec;
        sparseVector.forEach((index, value) -> sumMatrix[startIndex + index] += sampleWeight * value);
    }
    sumMatrix[startIndex + vectorSize] += sampleWeight;
    return clusterIndex;
}
Also used : SparseVector(com.alibaba.alink.common.linalg.SparseVector) Vector(com.alibaba.alink.common.linalg.Vector) DenseVector(com.alibaba.alink.common.linalg.DenseVector)
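
The layout of sumMatrix is the easiest part of updateSumMatrix to misread: each cluster owns a contiguous block of vectorSize + 1 doubles, and the final slot accumulates the sample weight. Below is a minimal sketch of just that accumulation step, with the distance computation and cluster selection omitted; the class and method names are illustrative, and only vector calls shown in the example are used.

import com.alibaba.alink.common.linalg.DenseVector;
import com.alibaba.alink.common.linalg.SparseVector;
import com.alibaba.alink.common.linalg.Vector;

public class SumMatrixSketch {

    // Accumulate one weighted sample into the flat per-cluster block of sumMatrix.
    // Each cluster owns (vectorSize + 1) slots: vectorSize value sums plus one weight counter.
    static void accumulate(Vector sample, long weight, int clusterIndex,
                           int vectorSize, double[] sumMatrix) {
        int startIndex = clusterIndex * (vectorSize + 1);
        if (sample instanceof DenseVector) {
            double[] data = ((DenseVector) sample).getData();
            for (int i = 0; i < vectorSize; i++) {
                sumMatrix[startIndex + i] += weight * data[i];
            }
        } else {
            // Sparse samples touch only the slots of their non-zero entries.
            ((SparseVector) sample).forEach(
                (index, value) -> sumMatrix[startIndex + index] += weight * value);
        }
        // The trailing slot of the block accumulates the total weight of the cluster.
        sumMatrix[startIndex + vectorSize] += weight;
    }
}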

Example 10 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class EmCorpusStep method calc.

@Override
public void calc(ComContext context) {
    if (!addedIndex && seed != null) {
        rand.reSeed(seed);
        addedIndex = true;
    }
    int vocabularySize = ((List<Integer>) context.getObj(LdaVariable.vocabularySize)).get(0);
    // initialize the params.
    if (context.getStepNo() == 1) {
        DenseMatrix nWordTopics = new DenseMatrix(vocabularySize + 1, numTopic);
        context.putObj(LdaVariable.nWordTopics, nWordTopics.getData());
        List<SparseVector> data = context.getObj(LdaVariable.data);
        if (data == null) {
            return;
        }
        // the size of docs.
        int localDocSize = data.size();
        Document[] docs = new Document[localDocSize];
        DenseMatrix nDocTopics = new DenseMatrix(localDocSize, numTopic);
        int docId = 0;
        int topic, word;
        for (SparseVector sparseVector : data) {
            int wordNum = 0;
            for (double value : sparseVector.getValues()) {
                wordNum += value;
            }
            Document doc = new Document(wordNum);
            int idx = 0;
            VectorIterator iter = sparseVector.iterator();
            while (iter.hasNext()) {
                word = iter.getIndex();
                for (int j = 0; j < (int) iter.getValue(); j++) {
                    topic = rand.nextInt(0, numTopic - 1);
                    doc.setWordIdxs(idx, word);
                    doc.setTopicIdxs(idx, topic);
                    updateDocWordTopics(nDocTopics, nWordTopics, docId, word, vocabularySize, topic, 1);
                    idx++;
                }
                iter.next();
            }
            docs[docId] = doc;
            docId++;
        }
        context.putObj(LdaVariable.corpus, docs);
        context.putObj(LdaVariable.nDocTopics, nDocTopics);
        context.removeObj(LdaVariable.data);
    } else {
        Document[] docs = context.getObj(LdaVariable.corpus);
        if (docs == null) {
            return;
        }
        DenseMatrix nDocTopics = context.getObj(LdaVariable.nDocTopics);
        DenseMatrix nWordTopics = new DenseMatrix(vocabularySize + 1, numTopic, context.getObj(LdaVariable.nWordTopics), false);
        int docId = 0;
        double[] p = new double[numTopic];
        double pSum;
        int newTopic;
        // update params with each doc.
        for (Document doc : docs) {
            int wordCount = doc.getLength();
            for (int i = 0; i < wordCount; ++i) {
                int word = doc.getWordIdxs(i);
                int topic = doc.getTopicIdxs(i);
                // Remove this word's current topic assignment from the count matrices.
                updateDocWordTopics(nDocTopics, nWordTopics, docId, word, vocabularySize, topic, -1);
                pSum = 0;
                for (int k = 0; k < numTopic; k++) {
                    // Accumulate the unnormalized probability that the word belongs to topic k; p[k] stores the running sum.
                    pSum += (nWordTopics.get(word, k) + beta) * (nDocTopics.get(docId, k) + alpha) / (nWordTopics.get(vocabularySize, k) + vocabularySize * beta);
                    p[k] = pSum;
                }
                double u = rand.nextUniform(0, 1) * pSum;
                newTopic = findProbIdx(p, u);
                doc.setTopicIdxs(i, newTopic);
                // update the word and its new topic.
                updateDocWordTopics(nDocTopics, nWordTopics, docId, word, vocabularySize, newTopic, 1);
            }
            docId++;
        }
        nWordTopics = new DenseMatrix(nWordTopics.numRows(), nWordTopics.numCols());
        for (Document doc : docs) {
            int length = doc.getLength();
            for (int i = 0; i < length; i++) {
                nWordTopics.add(doc.getWordIdxs(i), doc.getTopicIdxs(i), 1);
                nWordTopics.add(vocabularySize, doc.getTopicIdxs(i), 1);
            }
        }
        context.putObj(LdaVariable.nWordTopics, nWordTopics.getData());
    }
}
Also used : List(java.util.List) SparseVector(com.alibaba.alink.common.linalg.SparseVector) VectorIterator(com.alibaba.alink.common.linalg.VectorIterator) DenseMatrix(com.alibaba.alink.common.linalg.DenseMatrix)
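
The helper findProbIdx is not shown on this page. Judging from how p and u are built in calc, it is a cumulative-probability lookup; the sketch below is an assumed reconstruction for illustration, not the actual Alink implementation.

public class FindProbIdxSketch {

    // Hypothetical reconstruction of findProbIdx: p holds running (unnormalized)
    // cumulative sums and u is a uniform draw scaled by the total mass; return the
    // first topic whose cumulative sum reaches u.
    static int findProbIdx(double[] p, double u) {
        for (int k = 0; k < p.length; k++) {
            if (u <= p[k]) {
                return k;
            }
        }
        // Guard against floating-point round-off pushing u past the last entry.
        return p.length - 1;
    }
}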

Aggregations

SparseVector (com.alibaba.alink.common.linalg.SparseVector): 125
Test (org.junit.Test): 63
DenseVector (com.alibaba.alink.common.linalg.DenseVector): 60
Params (org.apache.flink.ml.api.misc.param.Params): 45
Row (org.apache.flink.types.Row): 45
Vector (com.alibaba.alink.common.linalg.Vector): 40
TableSchema (org.apache.flink.table.api.TableSchema): 27
ArrayList (java.util.ArrayList): 21
TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 15
HashMap (java.util.HashMap): 12
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 12
List (java.util.List): 11
DenseMatrix (com.alibaba.alink.common.linalg.DenseMatrix): 10
MTable (com.alibaba.alink.common.MTable): 7
BaseVectorSummary (com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary): 6
CollectSinkStreamOp (com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp): 6
Map (java.util.Map): 6
MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp): 5
VectorAssemblerParams (com.alibaba.alink.params.dataproc.vector.VectorAssemblerParams): 5
OneHotPredictParams (com.alibaba.alink.params.feature.OneHotPredictParams): 5