Use of com.alibaba.alink.common.comqueue.IterativeComQueue in project Alink by alibaba.
The class BaseGbdtTrainBatchOp, method linkFrom.
@Override
public T linkFrom(BatchOperator<?>... inputs) {
BatchOperator<?> in = checkAndGetFirst(inputs);
LOG.info("gbdt train start");
if (!Preprocessing.isSparse(getParams())) {
getParams().set(
    HasCategoricalCols.CATEGORICAL_COLS,
    TableUtil.getCategoricalCols(
        in.getSchema(),
        getParams().get(GbdtTrainParams.FEATURE_COLS),
        getParams().contains(GbdtTrainParams.CATEGORICAL_COLS)
            ? getParams().get(GbdtTrainParams.CATEGORICAL_COLS) : null
    )
);
}
LossType loss = getParams().get(LossUtils.LOSS_TYPE);
getParams().set(ALGO_TYPE, LossUtils.lossTypeToInt(loss));
rewriteLabelType(in.getSchema(), getParams());
if (!Preprocessing.isSparse(getParams())) {
getParams().set(
    ModelParamName.FEATURE_TYPES,
    FlinkTypeConverter.getTypeString(
        TableUtil.findColTypes(in.getSchema(), getParams().get(GbdtTrainParams.FEATURE_COLS))
    )
);
}
if (LossUtils.isRanking(getParams().get(LossUtils.LOSS_TYPE))) {
if (!getParams().contains(LambdaMartNdcgParams.GROUP_COL)) {
throw new IllegalArgumentException("Group column should be set in ranking loss function.");
}
}
String[] trainColNames = trainColsWithGroup();
// Check that the label column contains no null values.
final String labelColName = this.getParams().get(HasLabelCol.LABEL_COL);
final int labelColIdx = TableUtil.findColIndex(in.getSchema(), labelColName);
in = new TableSourceBatchOp(
    DataSetConversionUtil.toTable(
        in.getMLEnvironmentId(),
        in.getDataSet().map(new MapFunction<Row, Row>() {
            @Override
            public Row map(Row row) throws Exception {
                if (null == row.getField(labelColIdx)) {
                    throw new RuntimeException("label col has null values.");
                }
                return row;
            }
        }),
        in.getSchema()
    )
).setMLEnvironmentId(in.getMLEnvironmentId());
in = Preprocessing.select(in, trainColNames);
DataSet<Object[]> labels = Preprocessing.generateLabels(in, getParams(), LossUtils.isRegression(loss) || LossUtils.isRanking(loss));
if (LossUtils.isClassification(loss)) {
labels = labels.map(new CheckNumLabels4BinaryClassifier());
}
DataSet<Row> trainDataSet;
BatchOperator<?> stringIndexerModel;
BatchOperator<?> quantileModel;
if (getParams().get(USE_ONEHOT)) {
    // create empty string indexer model.
    stringIndexerModel = Preprocessing.generateStringIndexerModel(in, new Params());
    // create empty quantile model.
    quantileModel = Preprocessing.generateQuantileDiscretizerModel(
        in,
        new Params()
            .set(HasFeatureCols.FEATURE_COLS, new String[] {})
            .set(HasCategoricalCols.CATEGORICAL_COLS, new String[] {})
    );
    trainDataSet = Preprocessing.castLabel(
        in, getParams(), labels,
        LossUtils.isRegression(loss) || LossUtils.isRanking(loss)
    ).getDataSet();
} else if (getParams().get(USE_EPSILON_APPRO_QUANTILE)) {
    // create string indexer model
    stringIndexerModel = Preprocessing.generateStringIndexerModel(in, getParams());
    // create empty quantile model
    quantileModel = Preprocessing.generateQuantileDiscretizerModel(
        in,
        new Params()
            .set(HasFeatureCols.FEATURE_COLS, new String[] {})
            .set(HasCategoricalCols.CATEGORICAL_COLS, new String[] {})
    );
    trainDataSet = Preprocessing.castLabel(
        Preprocessing.isSparse(getParams())
            ? in
            : Preprocessing.castContinuousCols(
                Preprocessing.castCategoricalCols(in, stringIndexerModel, getParams()),
                getParams()),
        getParams(), labels,
        LossUtils.isRegression(loss) || LossUtils.isRanking(loss)
    ).getDataSet();
} else {
    stringIndexerModel = Preprocessing.generateStringIndexerModel(in, getParams());
    quantileModel = Preprocessing.generateQuantileDiscretizerModel(in, getParams());
    trainDataSet = Preprocessing.castLabel(
        Preprocessing.castToQuantile(
            Preprocessing.isSparse(getParams())
                ? in
                : Preprocessing.castContinuousCols(
                    Preprocessing.castCategoricalCols(in, stringIndexerModel, getParams()),
                    getParams()),
            quantileModel, getParams()),
        getParams(), labels,
        LossUtils.isRegression(loss) || LossUtils.isRanking(loss)
    ).getDataSet();
}
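// For ranking losses, rows are custom-partitioned on key field 0 (presumably the group
// column that trainColsWithGroup places first) so that every row of one query group
// lands on the same worker.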
if (LossUtils.isRanking(getParams().get(LossUtils.LOSS_TYPE))) {
trainDataSet = trainDataSet.partitionCustom(new Partitioner<Number>() {
private static final long serialVersionUID = -7790649477852624964L;
@Override
public int partition(Number key, int numPartitions) {
return (int) (key.longValue() % numPartitions);
}
}, 0);
}
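// Aggregate the global (label sum, row count) pair across all partitions; presumably
// this seeds the initial boosting prediction, e.g. the mean label for regression.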
DataSet<Tuple2<Double, Long>> sum = trainDataSet.mapPartition(new MapPartitionFunction<Row, Tuple2<Double, Long>>() {
private static final long serialVersionUID = -8333738060239409640L;
@Override
public void mapPartition(Iterable<Row> iterable, Collector<Tuple2<Double, Long>> collector) throws Exception {
double sum = 0.;
long cnt = 0;
for (Row row : iterable) {
sum += ((Number) row.getField(row.getArity() - 1)).doubleValue();
cnt++;
}
collector.collect(Tuple2.of(sum, cnt));
}
}).reduce(new ReduceFunction<Tuple2<Double, Long>>() {
private static final long serialVersionUID = -6464200385237876961L;
@Override
public Tuple2<Double, Long> reduce(Tuple2<Double, Long> t0, Tuple2<Double, Long> t1) throws Exception {
return Tuple2.of(t0.f0 + t1.f0, t0.f1 + t1.f1);
}
});
DataSet<FeatureMeta> featureMetas;
if (getParams().get(USE_ONEHOT)) {
featureMetas = DataUtil.createOneHotFeatureMeta(trainDataSet, getParams(), trainColNames);
} else if (getParams().get(USE_EPSILON_APPRO_QUANTILE)) {
featureMetas = DataUtil.createEpsilonApproQuantileFeatureMeta(trainDataSet, stringIndexerModel.getDataSet(), getParams(), trainColNames, getMLEnvironmentId());
} else {
featureMetas = DataUtil.createFeatureMetas(quantileModel.getDataSet(), stringIndexerModel.getDataSet(), getParams());
}
{
getParams().set(BoosterType.BOOSTER_TYPE, BoosterType.HESSION_BASE);
getParams().set(CriteriaType.CRITERIA_TYPE, CriteriaType.valueOf(getParams().get(GbdtTrainParams.CRITERIA).toString()));
if (getParams().get(GbdtTrainParams.NEWTON_STEP)) {
getParams().set(LeafScoreUpdaterType.LEAF_SCORE_UPDATER_TYPE, LeafScoreUpdaterType.NEWTON_SINGLE_STEP_UPDATER);
} else {
getParams().set(LeafScoreUpdaterType.LEAF_SCORE_UPDATER_TYPE, LeafScoreUpdaterType.WEIGHT_AVG_UPDATER);
}
}
IterativeComQueue comQueue = new IterativeComQueue()
    .initWithPartitionedData("trainData", trainDataSet)
    .initWithBroadcastData("gbdt.y.sum", sum)
    .initWithBroadcastData("quantileModel", quantileModel.getDataSet())
    .initWithBroadcastData("stringIndexerModel", stringIndexerModel.getDataSet())
    .initWithBroadcastData("labels", labels)
    .initWithBroadcastData("featureMetas", featureMetas)
    .add(new InitBoostingObjs(getParams()))
    .add(new Boosting())
    .add(new Bagging())
    .add(new InitTreeObjs());
if (getParams().get(USE_EPSILON_APPRO_QUANTILE)) {
    comQueue
        .add(new BuildLocalSketch())
        .add(new AllReduceT<>(BuildLocalSketch.SKETCH, BuildLocalSketch.FEATURE_SKETCH_LENGTH,
            new BuildLocalSketch.SketchReducer(getParams()), EpsilonApproQuantile.WQSummary.class))
        .add(new FinalizeBuildSketch());
}
comQueue
    .add(new ConstructLocalHistogram())
    .add(new ReduceScatter("histogram", "histogram", "recvcnts", AllReduce.SUM))
    .add(new CalcFeatureGain())
    .add(new AllReduceT<>("best", "bestLength", new NodeReducer(), Node.class))
    .add(new SplitInstances())
    .add(new UpdateLeafScore())
    .add(new UpdatePredictionScore())
    .setCompareCriterionOfNode0(new TerminateCriterion())
    .closeWith(new SaveModel(getParams()));
DataSet<Row> model = comQueue.exec();
setOutput(model, new TreeModelDataConverter(FlinkTypeConverter.getFlinkType(getParams().get(ModelParamName.LABEL_TYPE_NAME))).getModelSchema());
this.setSideOutputTables(new Table[] {
    DataSetConversionUtil.toTable(
        getMLEnvironmentId(),
        model.reduceGroup(new TreeModelDataConverter.FeatureImportanceReducer()),
        new String[] {
            getParams().get(TreeModelDataConverter.IMPORTANCE_FIRST_COL),
            getParams().get(TreeModelDataConverter.IMPORTANCE_SECOND_COL)
        },
        new TypeInformation[] { Types.STRING, Types.DOUBLE })
});
return (T) this;
}
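The chain above wires many GBDT-specific compute functions into the queue. To make the communication pattern easier to see in isolation, here is a minimal, hedged sketch of a one-round IterativeComQueue job that sums a partitioned dataset with AllReduce. Only IterativeComQueue, AllReduce, and the builder methods visible in the snippets on this page are taken from the source; ComputeFunction.calc(ComContext), CompleteResultFunction, and the ComContext accessors are assumptions about the com.alibaba.alink.common.comqueue package and may differ in detail (imports elided, as in the surrounding snippets).
// Hedged sketch: distributed sum of the first field of each row, one iteration.
// ComputeFunction / CompleteResultFunction / ComContext usage is assumed, not verified.
DataSet<Row> result = new IterativeComQueue()
    .initWithPartitionedData("data", data) // data: DataSet<Row>, split across workers
    .add(new ComputeFunction() {
        @Override
        public void calc(ComContext context) {
            List<Row> rows = context.getObj("data"); // assumed accessor for the local partition
            double[] partial = new double[] { 0.0 };
            for (Row row : rows) {
                partial[0] += ((Number) row.getField(0)).doubleValue();
            }
            context.putObj("sum", partial); // buffer name consumed by AllReduce below
        }
    })
    .add(new AllReduce("sum")) // element-wise sum of the "sum" buffer across workers
    .closeWith(new CompleteResultFunction() {
        @Override
        public List<Row> calc(ComContext context) {
            if (context.getTaskId() != 0) {
                return null; // only worker 0 emits the result
            }
            double[] sum = context.getObj("sum");
            return Collections.singletonList(Row.of(sum[0]));
        }
    })
    .setMaxIter(1)
    .exec();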
Use of com.alibaba.alink.common.comqueue.IterativeComQueue in project Alink by alibaba.
The class LdaTrainBatchOp, method online.
private void online(
        Tuple2<DataSet<Vector>, DataSet<BaseVectorSummary>> dataAndStat,
        int numTopic, int numIter, double alpha, double beta,
        DataSet<DocCountVectorizerModelData> resDocCountModel,
        int gammaShape, Integer seed) {
if (beta == -1) {
beta = 1.0 / numTopic;
}
if (alpha == -1) {
alpha = 1.0 / numTopic;
}
double learningOffset = getParams().get(ONLINE_LEARNING_OFFSET);
double learningDecay = getParams().get(LEARNING_DECAY);
double subSamplingRate = getParams().get(SUBSAMPLING_RATE);
boolean optimizeDocConcentration = getParams().get(OPTIMIZE_DOC_CONCENTRATION);
DataSet<Vector> data = dataAndStat.f0;
DataSet<Tuple2<Long, Integer>> shape = dataAndStat.f1.map(new MapFunction<BaseVectorSummary, Tuple2<Long, Integer>>() {
private static final long serialVersionUID = 1305270477796787466L;
@Override
public Tuple2<Long, Integer> map(BaseVectorSummary srt) {
return new Tuple2<>(srt.count(), srt.vectorSize());
}
});
DataSet<Tuple2<DenseMatrix, DenseMatrix>> initModel = data
    .mapPartition(new OnlineInit(numTopic, gammaShape, alpha, seed))
    .name("init lambda")
    .withBroadcastSet(shape, LdaVariable.shape);
DataSet<Row> ldaModelData = new IterativeComQueue()
    .initWithPartitionedData(LdaVariable.data, data)
    .initWithBroadcastData(LdaVariable.shape, shape)
    .initWithBroadcastData(LdaVariable.initModel, initModel)
    .add(new OnlineCorpusStep(numTopic, subSamplingRate, gammaShape, seed))
    .add(new AllReduce(LdaVariable.wordTopicStat))
    .add(new AllReduce(LdaVariable.logPhatPart))
    .add(new AllReduce(LdaVariable.nonEmptyWordCount))
    .add(new AllReduce(LdaVariable.nonEmptyDocCount))
    .add(new UpdateLambdaAndAlpha(numTopic, learningOffset, learningDecay,
        subSamplingRate, optimizeDocConcentration, beta))
    .add(new OnlineLogLikelihood(beta, numTopic, numIter, gammaShape, seed))
    .add(new AllReduce(LdaVariable.logLikelihood))
    .closeWith(new BuildOnlineLdaModel(numTopic, beta))
    .setMaxIter(numIter)
    .exec();
DataSet<Row> model = ldaModelData.flatMap(new BuildResModel(seed)).withBroadcastSet(resDocCountModel, "DocCountModel");
setOutput(model, new LdaModelDataConverter().getModelSchema());
saveWordTopicModelAndPerplexity(model, numTopic, true);
}
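The learningOffset and learningDecay parameters above play the roles of tau_0 and kappa in the online variational Bayes scheme of Hoffman et al., which this pipeline appears to follow. As a hedged reference (standard in the literature, not verified against UpdateLambdaAndAlpha's internals), each iteration t blends the mini-batch estimate of the topic-word statistics into the running estimate with a decaying weight:

\rho_t = (\tau_0 + t)^{-\kappa}, \qquad \lambda_{t+1} = (1 - \rho_t)\,\lambda_t + \rho_t\,\hat{\lambda}_t

where hat-lambda_t is computed from the documents sampled at rate subSamplingRate and rescaled to the full corpus size.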
Use of com.alibaba.alink.common.comqueue.IterativeComQueue in project Alink by alibaba.
The class Lbfgs, method optimize.
/**
 * Optimizer API.
 *
 * @return the coefficients of the linear problem.
 */
@Override
public DataSet<Tuple2<DenseVector, double[]>> optimize() {
// get parameters.
int maxIter = params.get(LinearTrainParams.MAX_ITER);
int numSearchStep = params.get(HasNumSearchStepDv4.NUM_SEARCH_STEP);
checkInitCoef();
/**
 * Solve the problem iteratively:
 * trainData is the distributed set of samples;
 * initCoef is the initial model coefficient vector, which is broadcast to every worker;
 * objFuncSet is the objective function in DataSet format.
 * .add(new PreallocateCoefficient(OptimName.currentCoef))       allocate memory for the current coefficients
 * .add(new PreallocateCoefficient(OptimName.minCoef))           allocate memory for the minimum-loss coefficients
 * .add(new PreallocateLossCurve(OptimVariable.convergenceInfo)) allocate memory for the loss curve
 * .add(new PreallocateVector(OptimName.dir ...))                allocate memory for the descent direction
 * .add(new PreallocateVector(OptimName.grad))                   allocate memory for the gradient
 * .add(new PreallocateSkyk())                                   allocate memory for the sK / yK correction pairs
 * .add(new CalcGradient(objFunc))                               compute the local sub-gradient
 * .add(new AllReduce(OptimName.gradAllReduce))                  sum the sub-gradients with allReduce
 * .add(new CalDirection())                                      use the summed gradient to compute the descent direction
 * .add(new CalcLosses(objFunc, OptimMethod.LBFGS))              compute the local losses for the line search
 * .add(new AllReduce(OptimName.lossAllReduce))                  sum the losses with allReduce
 * .add(new UpdateModel(maxIter, epsilon ...))                   update the coefficients
 * .setCompareCriterionOfNode0(new IterTermination())            decide whether to stop iterating
 */
DataSet<Row> model = new IterativeComQueue()
    .initWithPartitionedData(OptimVariable.trainData, trainData)
    .initWithBroadcastData(OptimVariable.model, coefVec)
    .initWithBroadcastData(OptimVariable.objFunc, objFuncSet)
    .add(new PreallocateCoefficient(OptimVariable.currentCoef))
    .add(new PreallocateCoefficient(OptimVariable.minCoef))
    .add(new PreallocateConvergenceInfo(OptimVariable.convergenceInfo, maxIter))
    .add(new PreallocateVector(OptimVariable.dir, new double[] { 0.0, OptimVariable.learningRate }))
    .add(new PreallocateVector(OptimVariable.grad))
    .add(new PreallocateSkyk(OptimVariable.numCorrections))
    .add(new CalcGradient())
    .add(new AllReduce(OptimVariable.gradAllReduce))
    .add(new CalDirection(OptimVariable.numCorrections))
    .add(new CalcLosses(LinearTrainParams.OptimMethod.LBFGS, numSearchStep))
    .add(new AllReduce(OptimVariable.lossAllReduce))
    .add(new UpdateModel(params, OptimVariable.grad, LinearTrainParams.OptimMethod.LBFGS, numSearchStep))
    .setCompareCriterionOfNode0(new IterTermination())
    .closeWith(new OutputModel())
    .setMaxIter(maxIter)
    .exec();
return model.mapPartition(new ParseRowModel());
}
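For reference, the quantities the queue preallocates map directly onto the standard L-BFGS recursion; a hedged summary (textbook L-BFGS, not verified against the CalDirection / UpdateModel internals):

s_k = x_{k+1} - x_k, \qquad y_k = \nabla f(x_{k+1}) - \nabla f(x_k)
d_k = -H_k \nabla f(x_k) \quad \text{(two-loop recursion over the last } m \text{ pairs } (s_i, y_i)\text{)}
x_{k+1} = x_k + \alpha_k d_k

Here m is OptimVariable.numCorrections (the sK / yK history preallocated by PreallocateSkyk), and alpha_k is chosen by evaluating numSearchStep candidate step sizes in CalcLosses.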
Use of com.alibaba.alink.common.comqueue.IterativeComQueue in project Alink by alibaba.
The class Newton, method optimize.
/**
 * Optimizer API.
 *
 * @return the coefficients of the linear problem.
 */
@Override
public DataSet<Tuple2<DenseVector, double[]>> optimize() {
// get parameters.
int maxIter = params.get(LinearTrainParams.MAX_ITER);
double epsilon = params.get(LinearTrainParams.EPSILON);
checkInitCoef();
/**
 * Solve the problem iteratively:
 * trainData is the distributed set of samples;
 * initCoef is the initial model coefficient vector, which is broadcast to every worker;
 * objFuncSet is the objective function in DataSet format.
 *
 * .add(new PreallocateCoefficient(OptimName.currentCoef))       allocate memory for the current coefficients
 * .add(new PreallocateCoefficient(OptimName.minCoef))           allocate memory for the minimum-loss coefficients
 * .add(new PreallocateLossCurve(OptimVariable.convergenceInfo)) allocate memory for the loss curve
 * .add(new PreallocateVector(OptimName.dir ...))                allocate memory for the direction vector
 * .add(new PreallocateMatrix(OptimName.hessian, ...))           allocate memory for the Hessian matrix
 * .add(new CalcGradientAndHessian(objFunc))                     compute the local sub-gradient and Hessian
 * .add(new AllReduce(OptimName.gradAllReduce))                  sum the sub-gradients and Hessians with allReduce
 * .add(new GetGradientAndHessian())                             fetch the summed gradient and Hessian
 * .add(new UpdateModel(maxIter, epsilon ...))                   update the coefficients with gradient and Hessian
 * .setCompareCriterionOfNode0(new IterTermination())            decide whether to stop iterating
 */
DataSet<Row> model = new IterativeComQueue()
    .initWithPartitionedData(OptimVariable.trainData, trainData)
    .initWithBroadcastData(OptimVariable.model, coefVec)
    .initWithBroadcastData(OptimVariable.objFunc, objFuncSet)
    .add(new PreallocateCoefficient(OptimVariable.currentCoef))
    .add(new PreallocateCoefficient(OptimVariable.minCoef))
    .add(new PreallocateConvergenceInfo(OptimVariable.convergenceInfo, maxIter))
    .add(new PreallocateVector(OptimVariable.dir, new double[2]))
    .add(new PreallocateMatrix(OptimVariable.hessian, MAX_FEATURE_NUM))
    .add(new CalcGradientAndHessian())
    .add(new AllReduce(OptimVariable.gradHessAllReduce))
    .add(new GetGradeintAndHessian())
    .add(new UpdateModel(maxIter, epsilon))
    .setCompareCriterionOfNode0(new IterTermination())
    .closeWith(new OutputModel())
    .setMaxIter(maxIter)
    .exec();
return model.mapPartition(new ParseRowModel());
}
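For reference, the queue above distributes the classical Newton update; a hedged summary (standard Newton's method, not verified against the Alink internals):

x_{k+1} = x_k - H(x_k)^{-1}\,\nabla f(x_k)

Each worker computes its local gradient and Hessian contributions, AllReduce sums them, and UpdateModel applies the step. PreallocateMatrix bounds the dense d-by-d Hessian by MAX_FEATURE_NUM, which is why this optimizer is only practical for modest feature counts.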
Use of com.alibaba.alink.common.comqueue.IterativeComQueue in project Alink by alibaba.
The class Owlqn, method optimize.
/**
 * Optimizer API.
 *
 * @return the coefficients of the linear problem.
 */
@Override
public DataSet<Tuple2<DenseVector, double[]>> optimize() {
// get parameters.
int maxIter = params.get(LinearTrainParams.MAX_ITER);
checkInitCoef();
int numSearchStep = params.get(HasNumSearchStepDv4.NUM_SEARCH_STEP);
/**
 * Solve the problem iteratively:
 * trainData is the distributed set of samples;
 * initCoef is the initial model coefficient vector, which is broadcast to every worker;
 * objFuncSet is the objective function in DataSet format.
 *
 * .add(new PreallocateCoefficient(OptimName.currentCoef))       allocate memory for the current coefficients
 * .add(new PreallocateCoefficient(OptimName.minCoef))           allocate memory for the minimum-loss coefficients
 * .add(new PreallocateLossCurve(OptimVariable.convergenceInfo)) allocate memory for the loss curve
 * .add(new PreallocateVector(OptimName.dir ...))                allocate memory for the descent direction
 * .add(new PreallocateVector(OptimName.grad))                   allocate memory for the gradient
 * .add(new PreallocateVector(OptimName.pseGrad))                allocate memory for the pseudo-gradient
 * .add(new PreallocateSkyk())                                   allocate memory for the sK / yK correction pairs
 * .add(new CalcGradient(objFunc))                               compute the local sub-gradient
 * .add(new AllReduce(OptimName.gradAllReduce))                  sum the sub-gradients with allReduce
 * .add(new CalDirection())                                      use the summed gradient to compute the descent direction
 * .add(new CalcLosses(objFunc, OptimMethod.OWLQN))              compute the local losses for the line search
 * .add(new AllReduce(OptimName.lossAllReduce))                  sum the losses with allReduce
 * .add(new UpdateModel(maxIter, epsilon ...))                   update the coefficients
 * .setCompareCriterionOfNode0(new IterTermination())            decide whether to stop iterating
 */
DataSet<Row> model = new IterativeComQueue()
    .initWithPartitionedData(OptimVariable.trainData, trainData)
    .initWithBroadcastData(OptimVariable.model, coefVec)
    .initWithBroadcastData(OptimVariable.objFunc, objFuncSet)
    .add(new PreallocateCoefficient(OptimVariable.currentCoef))
    .add(new PreallocateCoefficient(OptimVariable.minCoef))
    .add(new PreallocateConvergenceInfo(OptimVariable.convergenceInfo, maxIter))
    .add(new PreallocateVector(OptimVariable.dir, new double[] { 0.0, OptimVariable.learningRate }))
    .add(new PreallocateVector(OptimVariable.grad))
    .add(new PreallocateVector(OptimVariable.pseGrad))
    .add(new PreallocateSkyk(OptimVariable.numCorrections))
    .add(new CalcGradient())
    .add(new AllReduce(OptimVariable.gradAllReduce))
    .add(new CalDirection(params.get(HasL1.L_1), OptimVariable.numCorrections))
    .add(new CalcLosses(LinearTrainParams.OptimMethod.OWLQN, numSearchStep))
    .add(new AllReduce(OptimVariable.lossAllReduce))
    .add(new UpdateModel(params, OptimVariable.grad, LinearTrainParams.OptimMethod.OWLQN, numSearchStep))
    .setCompareCriterionOfNode0(new IterTermination())
    .closeWith(new OutputModel())
    .setMaxIter(maxIter)
    .exec();
return model.mapPartition(new ParseRowModel());
}
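The extra pseGrad buffer is what distinguishes OWL-QN from plain L-BFGS: it presumably holds the pseudo-gradient of the L1-regularized objective f(w) = L(w) + lambda * ||w||_1, with lambda being HasL1.L_1. As a hedged reference, the standard OWL-QN definition (Andrew & Gao, 2007), not verified against the Alink internals:

\diamond_i f(w) =
\begin{cases}
\partial_i L(w) + \lambda\,\mathrm{sgn}(w_i), & w_i \neq 0 \\
\partial_i L(w) + \lambda, & w_i = 0 \text{ and } \partial_i L(w) + \lambda < 0 \\
\partial_i L(w) - \lambda, & w_i = 0 \text{ and } \partial_i L(w) - \lambda > 0 \\
0, & \text{otherwise}
\end{cases}

The pseudo-gradient substitutes for the true gradient at non-differentiable points, and the line search is restricted to the orthant of the current iterate.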