use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by alibaba.
the class Preprocessing method generateQuantileDiscretizerModel.
/**
 * Builds the operator that yields the quantile discretizer model for the given input.
 *
 * <p>Three cases are handled:
 * <ul>
 *   <li>a vector column is configured: the result of {@code sample(input, params)} is linked to a
 *       {@code VectorTrain};</li>
 *   <li>at least one feature column is left after removing the categorical columns: the result of
 *       {@code sample(input, params)} is linked to a {@code QuantileDiscretizerTrainBatchOp} on
 *       those columns;</li>
 *   <li>otherwise: an operator that emits no rows but carries the model schema is returned.</li>
 * </ul>
 *
 * @param input  operator supplying the data and the ML environment id
 * @param params parameters read for the vector column, feature/categorical columns, bucket count
 *               and the zero-as-missing setting
 * @return the operator producing the quantile discretizer model
 */
public static BatchOperator<?> generateQuantileDiscretizerModel(BatchOperator<?> input, Params params) {
    // Vector input takes precedence over column-wise features.
    if (params.contains(HasVectorCol.VECTOR_COL)) {
        return sample(input, params).linkTo(
            new VectorTrain(new Params().set(ZERO_AS_MISSING, params.get(ZERO_AS_MISSING)))
                .setMLEnvironmentId(input.getMLEnvironmentId())
                .setVectorCol(params.get(HasVectorCol.VECTOR_COL))
                .setNumBuckets(params.get(HasMaxBins.MAX_BINS)));
    }

    // Feature columns that are not categorical are the ones that get discretized.
    String[] numericCols = ArrayUtils.removeElements(
        params.get(HasFeatureCols.FEATURE_COLS),
        params.get(HasCategoricalCols.CATEGORICAL_COLS));

    if (numericCols == null || numericCols.length == 0) {
        // Nothing to discretize: return an operator with the model schema and no rows.
        QuantileDiscretizerModelDataConverter schemaSource = new QuantileDiscretizerModelDataConverter();
        return new DataSetWrapperBatchOp(
            MLEnvironmentFactory.get(input.getMLEnvironmentId())
                .getExecutionEnvironment()
                .fromElements(1)
                .mapPartition(new MapPartitionFunction<Integer, Row>() {
                    private static final long serialVersionUID = 2328781103352773618L;

                    @Override
                    public void mapPartition(Iterable<Integer> values, Collector<Row> out) throws Exception {
                        // Intentionally emits nothing.
                    }
                }),
            schemaSource.getModelSchema().getFieldNames(),
            schemaSource.getModelSchema().getFieldTypes())
            .setMLEnvironmentId(input.getMLEnvironmentId());
    }

    return sample(input, params).linkTo(
        new QuantileDiscretizerTrainBatchOp(new Params().set(ZERO_AS_MISSING, params.get(ZERO_AS_MISSING)))
            .setMLEnvironmentId(input.getMLEnvironmentId())
            .setSelectedCols(numericCols)
            .setNumBuckets(params.get(HasMaxBins.MAX_BINS)));
}
use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by alibaba.
the class PreprocessingTest method sparse.
/**
 * Loads a two-feature quantile discretizer model from serialized rows and checks the feature
 * sizes, the missing index of feature "0" and the zero index of both features.
 */
@Test
public void sparse() {
    Row[] rows = new Row[] {
        Row.of(0L, "{\"vectorCol\":\"\\\"vector\\\"\",\"MLEnvironmentId\":\"0\",\"version\":\"\\\"v2\\\"\"," + "\"numBuckets\":\"128\"}\n"),
        Row.of(1048576L, "[{\"featureName\":\"0\",\"featureType\":\"DOUBLE\",\"splitsArray\":[0.0,1.0,4.0,5.0]," + "\"isLeftOpen\":true}]\n"),
        Row.of(2097152L, "[{\"featureName\":\"1\",\"featureType\":\"DOUBLE\",\"splitsArray\":[2.0,3.0,4.0]," + "\"isLeftOpen\":true}]\n")
    };
    List<Row> model = Arrays.asList(rows);
    QuantileDiscretizerModelDataConverter quantileModel = new QuantileDiscretizerModelDataConverter();
    quantileModel.load(model);

    // JUnit's contract is assertEquals(expected, actual); keeping that order makes
    // failure messages report the right value as "expected".
    Assert.assertEquals(5, quantileModel.getFeatureSize("0"));
    Assert.assertEquals(5, quantileModel.missingIndex("0"));
    // NOTE(review): this assertion appeared twice in a row; the second copy may have been
    // meant to check missingIndex("1") -- confirm the expected value before adding it.
    Assert.assertEquals(4, quantileModel.getFeatureSize("1"));
    Assert.assertEquals(0, Preprocessing.zeroIndex(quantileModel, "0"));
    Assert.assertEquals(0, Preprocessing.zeroIndex(quantileModel, "1"));
}
use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by alibaba.
the class TreeInitObj method initialMapping.
/**
 * Loads the quantile discretizer model from its serialized rows.
 *
 * @param quantileModel serialized model rows
 * @return the loaded model converter, or {@code null} when {@code quantileModel} is empty
 */
private static QuantileDiscretizerModelDataConverter initialMapping(List<Row> quantileModel) {
    // No rows means there is no model to load.
    if (quantileModel.isEmpty()) {
        return null;
    }

    QuantileDiscretizerModelDataConverter converter = new QuantileDiscretizerModelDataConverter();
    converter.load(quantileModel);
    return converter;
}
use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by alibaba.
the class TreeInitObj method calc.
/**
 * Builds the tree training object for this task and stores it in the context.
 *
 * <p>Runs only when the step number is 1. It reads the "treeInput", "quantileModel",
 * "stringIndexerModel" and "labels" objects from the context, creates a {@code RegObj} or
 * {@code ClassifierObj} depending on the tree type, copies the local rows into a feature array
 * and a label array, allocates the histogram buffer and initializes the root. The histogram
 * buffer is stored under "allReduce" and the tree object under "treeObj".
 *
 * @param context the communication context providing the inputs and receiving the outputs
 */
@Override
public void calc(ComContext context) {
    if (context.getStepNo() != 1) {
        return;
    }

    List<Row> dataRows = context.getObj("treeInput");
    List<Row> quantileModel = context.getObj("quantileModel");
    List<Row> stringIndexerModel = context.getObj("stringIndexerModel");
    List<Object[]> labels = context.getObj("labels");

    // A missing input is treated as zero local rows.
    int nLocalRow = dataRows == null ? 0 : dataRows.size();

    Params localParams = params.clone();
    localParams.set(TASK_ID, context.getTaskId());
    localParams.set(NUM_OF_SUBTASKS, context.getNumTask());
    localParams.set(N_LOCAL_ROW, nLocalRow);

    QuantileDiscretizerModelDataConverter quantileDiscretizerModel = initialMapping(quantileModel);

    List<String> lookUpColNames = new ArrayList<>();
    if (params.get(RandomForestTrainParams.CATEGORICAL_COLS) != null) {
        lookUpColNames.addAll(Arrays.asList(params.get(RandomForestTrainParams.CATEGORICAL_COLS)));
    }

    Map<String, Integer> categoricalColsSize = TreeUtil.extractCategoricalColsSize(stringIndexerModel, lookUpColNames.toArray(new String[0]));

    // The tree type does not change during this call, so evaluate the check once
    // instead of repeating it for every branch and every row.
    final boolean regression = Criteria.isRegression(params.get(TreeUtil.TREE_TYPE));

    if (!regression) {
        // For classification the label column is recorded with the size of the first labels entry.
        categoricalColsSize.put(params.get(RandomForestTrainParams.LABEL_COL), labels.get(0).length);
    }

    FeatureMeta[] featureMetas = TreeUtil.getFeatureMeta(params.get(RandomForestTrainParams.FEATURE_COLS), categoricalColsSize);
    FeatureMeta labelMeta = TreeUtil.getLabelMeta(params.get(RandomForestTrainParams.LABEL_COL), params.get(RandomForestTrainParams.FEATURE_COLS).length, categoricalColsSize);

    TreeObj treeObj;
    if (regression) {
        treeObj = new RegObj(localParams, quantileDiscretizerModel, featureMetas, labelMeta);
    } else {
        treeObj = new ClassifierObj(localParams, quantileDiscretizerModel, featureMetas, labelMeta);
    }

    int nFeatureCol = localParams.get(RandomForestTrainParams.FEATURE_COLS).length;
    int[] data = new int[nFeatureCol * nLocalRow];
    double[] regLabels = null;
    int[] classifyLabels = null;
    if (regression) {
        regLabels = new double[nLocalRow];
    } else {
        classifyLabels = new int[nLocalRow];
    }

    for (int iter = 0; iter < nLocalRow; ++iter) {
        // Fetch the row once; it is read for every feature and for the label.
        Row row = dataRows.get(iter);

        // Features are laid out feature-by-feature: all rows of feature 0, then feature 1, ...
        for (int i = 0; i < nFeatureCol; ++i) {
            data[i * nLocalRow + iter] = (int) row.getField(i);
        }

        // The label is the field right after the feature fields.
        if (regression) {
            regLabels[iter] = (double) row.getField(nFeatureCol);
        } else {
            classifyLabels[iter] = (int) row.getField(nFeatureCol);
        }
    }

    treeObj.setFeatures(data);
    if (regression) {
        treeObj.setLabels(regLabels);
    } else {
        treeObj.setLabels(classifyLabels);
    }

    // The same array is registered under "allReduce" and handed to the tree object.
    double[] histBuffer = new double[treeObj.getMaxHistBufferSize()];
    context.putObj("allReduce", histBuffer);
    treeObj.setHist(histBuffer);
    treeObj.initialRoot();
    context.putObj("treeObj", treeObj);
}
use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by alibaba.
the class QuantileDiscretizerTrainBatchOp method transformFeatureBinsToModel.
/**
 * Converts the given feature bin calculators into quantile discretizer model rows.
 *
 * <p>Each calculator is transformed into a continuous feature interval keyed by its feature
 * name; the feature names, in iteration order, become the selected columns of the model meta.
 *
 * @param values feature bin calculators to convert
 * @param out    collector that receives the serialized model rows
 */
public static void transformFeatureBinsToModel(Iterable<FeatureBinsCalculator> values, Collector<Row> out) {
    Map<String, ContinuousRanges> rangesByFeature = new HashMap<>();
    List<String> featureNames = new ArrayList<>();

    for (FeatureBinsCalculator calculator : values) {
        String featureName = calculator.getFeatureName();
        rangesByFeature.put(featureName, FeatureBinsCalculatorTransformer.toContinuousFeatureInterval(calculator));
        featureNames.add(featureName);
    }

    String[] selectedCols = featureNames.toArray(new String[0]);
    Params meta = new Params().set(QuantileDiscretizerTrainParams.SELECTED_COLS, selectedCols);

    QuantileDiscretizerModelDataConverter converter = new QuantileDiscretizerModelDataConverter(rangesByFeature, meta);
    converter.save(converter, out);
}
Aggregations