Search in sources:

Example 1 with LdaModelDataConverter

Use of com.alibaba.alink.operator.common.clustering.LdaModelDataConverter in project Alink by Alibaba.

The method online of the class LdaTrainBatchOp.

/**
 * Runs the online training path and publishes the resulting model as this operator's output.
 *
 * <p>In order: a (count, vectorSize) pair is derived from the summary, an initial model is built
 * with {@code OnlineInit}, an {@code IterativeComQueue} is executed for at most {@code numIter}
 * iterations, its result is converted by {@code BuildResModel}, and finally the model is set as
 * output and handed to {@code saveWordTopicModelAndPerplexity} with the flag {@code true}.
 *
 * @param dataAndStat      {@code f0} is the vector data set, {@code f1} its summary data set
 * @param numTopic         number of topics
 * @param numIter          maximum number of iterations of the queue
 * @param alpha            forwarded to {@code OnlineInit}; {@code -1} is replaced by {@code 1.0 / numTopic}
 * @param beta             forwarded to the update, log-likelihood and model-building steps;
 *                         {@code -1} is replaced by {@code 1.0 / numTopic}
 * @param resDocCountModel broadcast to {@code BuildResModel} under the name "DocCountModel"
 * @param gammaShape       forwarded to {@code OnlineInit}, {@code OnlineCorpusStep} and {@code OnlineLogLikelihood}
 * @param seed             forwarded to the init, corpus, log-likelihood and result-building steps
 */
private void online(Tuple2<DataSet<Vector>, DataSet<BaseVectorSummary>> dataAndStat, int numTopic, int numIter, double alpha, double beta, DataSet<DocCountVectorizerModelData> resDocCountModel, int gammaShape, Integer seed) {
    // Resolve the -1 placeholders once instead of reassigning the parameters.
    final double effectiveBeta = (beta == -1) ? 1.0 / numTopic : beta;
    final double effectiveAlpha = (alpha == -1) ? 1.0 / numTopic : alpha;

    // Settings that are read from the operator parameters rather than passed in.
    final double offset = getParams().get(ONLINE_LEARNING_OFFSET);
    final double decay = getParams().get(LEARNING_DECAY);
    final double sampleRate = getParams().get(SUBSAMPLING_RATE);
    final boolean optimizeAlpha = getParams().get(OPTIMIZE_DOC_CONCENTRATION);

    DataSet<Vector> docs = dataAndStat.f0;

    // Reduce the summary to the (count, vectorSize) pair that is broadcast below.
    DataSet<Tuple2<Long, Integer>> corpusShape = dataAndStat.f1.map(
        new MapFunction<BaseVectorSummary, Tuple2<Long, Integer>>() {

            private static final long serialVersionUID = 1305270477796787466L;

            @Override
            public Tuple2<Long, Integer> map(BaseVectorSummary summary) {
                return new Tuple2<>(summary.count(), summary.vectorSize());
            }
        });

    DataSet<Tuple2<DenseMatrix, DenseMatrix>> startModel = docs
        .mapPartition(new OnlineInit(numTopic, gammaShape, effectiveAlpha, seed))
        .name("init lambda")
        .withBroadcastSet(corpusShape, LdaVariable.shape);

    // The order of the queue items is significant and is kept exactly as before.
    DataSet<Row> trained = new IterativeComQueue()
        .initWithPartitionedData(LdaVariable.data, docs)
        .initWithBroadcastData(LdaVariable.shape, corpusShape)
        .initWithBroadcastData(LdaVariable.initModel, startModel)
        .add(new OnlineCorpusStep(numTopic, sampleRate, gammaShape, seed))
        .add(new AllReduce(LdaVariable.wordTopicStat))
        .add(new AllReduce(LdaVariable.logPhatPart))
        .add(new AllReduce(LdaVariable.nonEmptyWordCount))
        .add(new AllReduce(LdaVariable.nonEmptyDocCount))
        .add(new UpdateLambdaAndAlpha(numTopic, offset, decay, sampleRate, optimizeAlpha, effectiveBeta))
        .add(new OnlineLogLikelihood(effectiveBeta, numTopic, numIter, gammaShape, seed))
        .add(new AllReduce(LdaVariable.logLikelihood))
        .closeWith(new BuildOnlineLdaModel(numTopic, effectiveBeta))
        .setMaxIter(numIter)
        .exec();

    DataSet<Row> resultModel = trained
        .flatMap(new BuildResModel(seed))
        .withBroadcastSet(resDocCountModel, "DocCountModel");

    setOutput(resultModel, new LdaModelDataConverter().getModelSchema());
    saveWordTopicModelAndPerplexity(resultModel, numTopic, true);
}
Also used : IterativeComQueue(com.alibaba.alink.common.comqueue.IterativeComQueue) AllReduce(com.alibaba.alink.common.comqueue.communication.AllReduce) UpdateLambdaAndAlpha(com.alibaba.alink.operator.common.clustering.lda.UpdateLambdaAndAlpha) OnlineLogLikelihood(com.alibaba.alink.operator.common.clustering.lda.OnlineLogLikelihood) BuildOnlineLdaModel(com.alibaba.alink.operator.common.clustering.lda.BuildOnlineLdaModel) Tuple2(org.apache.flink.api.java.tuple.Tuple2) OnlineCorpusStep(com.alibaba.alink.operator.common.clustering.lda.OnlineCorpusStep) LdaModelDataConverter(com.alibaba.alink.operator.common.clustering.LdaModelDataConverter) BaseVectorSummary(com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary) Row(org.apache.flink.types.Row) Vector(com.alibaba.alink.common.linalg.Vector) SparseVector(com.alibaba.alink.common.linalg.SparseVector)

Example 2 with LdaModelDataConverter

Use of com.alibaba.alink.operator.common.clustering.LdaModelDataConverter in project Alink by Alibaba.

The method gibbsSample of the class LdaTrainBatchOp.

/**
 * Runs the training path built from {@code EmCorpusStep} / {@code EmLogLikelihood} and publishes
 * the resulting model as this operator's output.
 *
 * <p>In order: the vector size is extracted from the summary, an {@code IterativeComQueue} is
 * executed for at most {@code numIter} iterations, its result is converted by
 * {@code BuildResModel}, and finally the model is set as output and handed to
 * {@code saveWordTopicModelAndPerplexity} with the flag {@code false}.
 *
 * @param dataAndStat      {@code f0} is the vector data set, {@code f1} its summary data set
 * @param numTopic         number of topics
 * @param numIter          maximum number of iterations of the queue
 * @param alpha            forwarded to the corpus, log-likelihood and model-building steps;
 *                         {@code -1} is replaced by {@code 50.0 / numTopic + 1}
 * @param beta             forwarded to the same steps; {@code -1} is replaced by {@code 0.01 + 1}
 * @param resDocCountModel broadcast to {@code BuildResModel} under the name "DocCountModel"
 * @param seed             forwarded to {@code EmCorpusStep} and {@code BuildResModel}
 */
private void gibbsSample(Tuple2<DataSet<Vector>, DataSet<BaseVectorSummary>> dataAndStat, int numTopic, int numIter, double alpha, double beta, DataSet<DocCountVectorizerModelData> resDocCountModel, Integer seed) {
    // Resolve the -1 placeholders once instead of reassigning the parameters.
    final double effectiveBeta = (beta == -1) ? 0.01 + 1 : beta;
    final double effectiveAlpha = (alpha == -1) ? 50.0 / numTopic + 1 : alpha;

    DataSet<Vector> docs = dataAndStat.f0;

    // Only the vector size of the summary is needed; it is broadcast as the vocabulary size.
    DataSet<Integer> vocabSize = dataAndStat.f1.map(
        new MapFunction<BaseVectorSummary, Integer>() {

            private static final long serialVersionUID = -7170259222827300492L;

            @Override
            public Integer map(BaseVectorSummary summary) {
                return summary.vectorSize();
            }
        });

    // The order of the queue items is significant and is kept exactly as before.
    DataSet<Row> trained = new IterativeComQueue()
        .initWithPartitionedData(LdaVariable.data, docs)
        .initWithBroadcastData(LdaVariable.vocabularySize, vocabSize)
        .add(new EmCorpusStep(numTopic, effectiveAlpha, effectiveBeta, seed))
        .add(new AllReduce(LdaVariable.nWordTopics))
        .add(new EmLogLikelihood(numTopic, effectiveAlpha, effectiveBeta, numIter))
        .add(new AllReduce(LdaVariable.logLikelihood))
        .closeWith(new BuildEmLdaModel(numTopic, effectiveAlpha, effectiveBeta))
        .setMaxIter(numIter)
        .exec();

    DataSet<Row> resultModel = trained
        .flatMap(new BuildResModel(seed))
        .withBroadcastSet(resDocCountModel, "DocCountModel");

    setOutput(resultModel, new LdaModelDataConverter().getModelSchema());
    saveWordTopicModelAndPerplexity(resultModel, numTopic, false);
}
Also used : IterativeComQueue(com.alibaba.alink.common.comqueue.IterativeComQueue) AllReduce(com.alibaba.alink.common.comqueue.communication.AllReduce) BuildEmLdaModel(com.alibaba.alink.operator.common.clustering.lda.BuildEmLdaModel) EmCorpusStep(com.alibaba.alink.operator.common.clustering.lda.EmCorpusStep) EmLogLikelihood(com.alibaba.alink.operator.common.clustering.lda.EmLogLikelihood) LdaModelDataConverter(com.alibaba.alink.operator.common.clustering.LdaModelDataConverter) BaseVectorSummary(com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary) Row(org.apache.flink.types.Row) Vector(com.alibaba.alink.common.linalg.Vector) SparseVector(com.alibaba.alink.common.linalg.SparseVector)

Aggregations

IterativeComQueue (com.alibaba.alink.common.comqueue.IterativeComQueue)2 AllReduce (com.alibaba.alink.common.comqueue.communication.AllReduce)2 SparseVector (com.alibaba.alink.common.linalg.SparseVector)2 Vector (com.alibaba.alink.common.linalg.Vector)2 LdaModelDataConverter (com.alibaba.alink.operator.common.clustering.LdaModelDataConverter)2 BaseVectorSummary (com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary)2 Row (org.apache.flink.types.Row)2 BuildEmLdaModel (com.alibaba.alink.operator.common.clustering.lda.BuildEmLdaModel)1 BuildOnlineLdaModel (com.alibaba.alink.operator.common.clustering.lda.BuildOnlineLdaModel)1 EmCorpusStep (com.alibaba.alink.operator.common.clustering.lda.EmCorpusStep)1 EmLogLikelihood (com.alibaba.alink.operator.common.clustering.lda.EmLogLikelihood)1 OnlineCorpusStep (com.alibaba.alink.operator.common.clustering.lda.OnlineCorpusStep)1 OnlineLogLikelihood (com.alibaba.alink.operator.common.clustering.lda.OnlineLogLikelihood)1 UpdateLambdaAndAlpha (com.alibaba.alink.operator.common.clustering.lda.UpdateLambdaAndAlpha)1 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)1