use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.
the class Preprocessing method generateStringIndexerModel.
/**
 * Produces the string-indexer model operator for the categorical columns configured in
 * {@code params}.
 *
 * <p>When {@link HasCategoricalCols#CATEGORICAL_COLS} is set to a non-empty array, a
 * {@link MultiStringIndexerTrainBatchOp} is trained on {@code input} over those columns with
 * {@code ALPHABET_ASC} ordering. Otherwise an operator carrying zero rows is returned, typed with
 * the schema reported by {@link MultiStringIndexerModelDataConverter#getModelSchema()}.
 *
 * @param input  the training data; also supplies the ML environment id used by the result
 * @param params parameters that may contain the categorical column names
 * @return the trained model operator, or a row-less operator with the model schema
 */
public static BatchOperator<?> generateStringIndexerModel(BatchOperator<?> input, Params params) {
    final String[] categoricalColNames = params.contains(HasCategoricalCols.CATEGORICAL_COLS)
        ? params.get(HasCategoricalCols.CATEGORICAL_COLS)
        : null;
    final boolean hasCategoricalCols = categoricalColNames != null && categoricalColNames.length > 0;

    if (hasCategoricalCols) {
        return new MultiStringIndexerTrainBatchOp()
            .setMLEnvironmentId(input.getMLEnvironmentId())
            .setSelectedCols(categoricalColNames)
            .setStringOrderType(HasStringOrderTypeDefaultAsRandom.StringOrderType.ALPHABET_ASC)
            .linkFrom(input);
    }

    // Nothing to index: build a data set that never emits a row but still has the model schema.
    MultiStringIndexerModelDataConverter schemaSource = new MultiStringIndexerModelDataConverter();
    return new DataSetWrapperBatchOp(
        MLEnvironmentFactory
            .get(input.getMLEnvironmentId())
            .getExecutionEnvironment()
            .fromElements(1)
            .mapPartition(new MapPartitionFunction<Integer, Row>() {
                private static final long serialVersionUID = -7481931851291494026L;

                @Override
                public void mapPartition(Iterable<Integer> ignoredSeed, Collector<Row> out) throws Exception {
                    // Intentionally emits nothing.
                }
            }),
        schemaSource.getModelSchema().getFieldNames(),
        schemaSource.getModelSchema().getFieldTypes()
    ).setMLEnvironmentId(input.getMLEnvironmentId());
}
use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.
the class CrossFeatureTrainBatchOp method linkFrom.
/**
 * Builds the cross-feature model from the first input operator.
 *
 * <p>The selected columns are indexed through {@code StringIndexerUtil.indexRandom}, and the
 * resulting (column position, token, index) triples are serialized as model rows by
 * {@link MultiStringIndexerModelDataConverter}. Only the subtask with index 0 attaches the model
 * meta (selected column names and their type strings); other subtasks pass {@code null} meta.
 *
 * <p>A single side-output table with columns {@code index} (INT) and {@code value} (STRING) is
 * derived from the model rows by {@code BuildSideOutput}, running with parallelism 1.
 *
 * @param inputs the input operators; only the first one is used
 * @return this operator, with its output and side output set
 */
@Override
public CrossFeatureTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    // Wildcard instead of the raw type keeps select(...).getDataSet() type-checked.
    BatchOperator<?> in = checkAndGetFirst(inputs);
    long mlEnvId = getMLEnvironmentId();
    String[] selectedCols = getSelectedCols();

    final String[] selectedColType = new String[selectedCols.length];
    for (int i = 0; i < selectedCols.length; i++) {
        selectedColType[i] = FlinkTypeConverter.getTypeString(
            TableUtil.findColTypeWithAssertAndHint(in.getSchema(), selectedCols[i]));
    }

    // NOTE(review): the meaning of the 0L / false arguments is defined by indexRandom, not visible here.
    DataSet<Tuple3<Integer, String, Long>> indexedToken =
        StringIndexerUtil.indexRandom(in.select(selectedCols).getDataSet(), 0L, false);

    DataSet<Row> values = indexedToken
        .mapPartition(new RichMapPartitionFunction<Tuple3<Integer, String, Long>, Row>() {
            private static final long serialVersionUID = 2876851020570715540L;

            @Override
            public void mapPartition(Iterable<Tuple3<Integer, String, Long>> tokens, Collector<Row> out)
                throws Exception {
                // The meta is written once, by subtask 0 only.
                Params meta = null;
                if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
                    meta = new Params()
                        .set(HasSelectedCols.SELECTED_COLS, selectedCols)
                        .set(HasSelectedColTypes.SELECTED_COL_TYPES, selectedColType);
                }
                new MultiStringIndexerModelDataConverter().save(Tuple2.of(meta, tokens), out);
            }
        })
        .name("build_model");
    this.setOutput(values, new MultiStringIndexerModelDataConverter().getModelSchema());

    DataSet<Row> sideDataSet = values.mapPartition(new BuildSideOutput()).setParallelism(1);
    Table sideModel = DataSetConversionUtil.toTable(
        mlEnvId,
        sideDataSet,
        new String[] { "index", "value" },
        new TypeInformation[] { Types.INT, Types.STRING });
    this.setSideOutputTables(new Table[] { sideModel });
    return this;
}
use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.
the class MultiStringIndexerTrainBatchOp method linkFrom.
/**
 * Trains a multi-column string indexer model on the first input operator.
 *
 * <p>Every non-null field of the selected columns is flattened into a
 * (column position, {@code String.valueOf(value)}) pair. The pairs are indexed by
 * {@code HugeStringIndexerUtil.indexTokens} using the configured string order type, and the
 * indexed triples are serialized into model rows by {@link MultiStringIndexerModelDataConverter}.
 * Only subtask 0 attaches the model meta (selected column names and their type strings).
 *
 * @param inputs the input operators; only the first one is used
 * @return this operator, with the model rows set as its output
 */
@Override
public MultiStringIndexerTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> in = checkAndGetFirst(inputs);
    final String[] selectedColNames = getSelectedCols();
    final HasStringOrderTypeDefaultAsRandom.StringOrderType orderType = getStringOrderType();

    // Resolve the type string of each selected column up front; it is stored in the model meta.
    final String[] selectedColSqlType = new String[selectedColNames.length];
    int col = 0;
    for (String colName : selectedColNames) {
        selectedColSqlType[col++] = FlinkTypeConverter.getTypeString(
            TableUtil.findColTypeWithAssertAndHint(in.getSchema(), colName));
    }

    DataSet<Tuple2<Integer, String>> columnTokens = in
        .select(selectedColNames)
        .getDataSet()
        .flatMap(new FlatMapFunction<Row, Tuple2<Integer, String>>() {
            @Override
            public void flatMap(Row row, Collector<Tuple2<Integer, String>> collector) throws Exception {
                for (int pos = 0; pos < selectedColNames.length; pos++) {
                    Object field = row.getField(pos);
                    if (field == null) {
                        // Null fields contribute no token.
                        continue;
                    }
                    collector.collect(Tuple2.of(pos, String.valueOf(field)));
                }
            }
        })
        .returns(new TupleTypeInfo<>(Types.INT, Types.STRING));

    DataSet<Tuple3<Integer, String, Long>> indexedToken =
        HugeStringIndexerUtil.indexTokens(columnTokens, orderType, 0L);

    DataSet<Row> values = indexedToken
        .mapPartition(new RichMapPartitionFunction<Tuple3<Integer, String, Long>, Row>() {
            private static final long serialVersionUID = 2876851020570715540L;

            @Override
            public void mapPartition(Iterable<Tuple3<Integer, String, Long>> tokens, Collector<Row> out)
                throws Exception {
                final boolean isFirstSubtask = getRuntimeContext().getIndexOfThisSubtask() == 0;
                Params meta = isFirstSubtask
                    ? new Params()
                        .set(HasSelectedCols.SELECTED_COLS, selectedColNames)
                        .set(HasSelectedColTypes.SELECTED_COL_TYPES, selectedColSqlType)
                    : null;
                new MultiStringIndexerModelDataConverter().save(Tuple2.of(meta, tokens), out);
            }
        })
        .name("build_model")
        .returns(new RowTypeInfo(new MultiStringIndexerModelDataConverter().getModelSchema().getFieldTypes()));

    this.setOutput(values, new MultiStringIndexerModelDataConverter().getModelSchema());
    return this;
}
use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.
the class NaiveBayesModelInfo method getCategoryFeatureInfo.
/**
 * Summarizes the categorical features of the model.
 *
 * <p>The result is keyed as feature name -&gt; label -&gt; feature value -&gt; proportion, where the
 * proportion of a value under a label is that label's stored weight for the value divided by the
 * weight of the same value summed over all labels. Feature values are translated from their
 * indices back to tokens through the serialized string indexer model.
 *
 * <p>As a side effect, the set of values seen for each categorical feature is recorded in
 * {@code cateFeatureValue}.
 *
 * @return the nested proportion map; an empty map when the string indexer model carries no
 *         selected columns
 */
public HashMap<Object, HashMap<Object, HashMap<Object, Double>>> getCategoryFeatureInfo() {
    MultiStringIndexerModelData indexerModel =
        new MultiStringIndexerModelDataConverter().load(stringIndexerModelSerialized);
    boolean hasCategoricalCols =
        indexerModel.meta != null && indexerModel.meta.contains(HasSelectedCols.SELECTED_COLS);
    if (!hasCategoricalCols) {
        return new HashMap<>(0);
    }

    // Per categorical column: token index -> token text.
    String[] cateCols = indexerModel.meta.get(HasSelectedCols.SELECTED_COLS);
    HashMap<Long, String>[] indexToToken = new HashMap[cateCols.length];
    for (int c = 0; c < cateCols.length; c++) {
        indexToToken[c] = new HashMap<>((int) indexerModel.getNumberOfTokensOfColumn(cateCols[c]));
    }
    for (Tuple3<Integer, String, Long> entry : indexerModel.tokenAndIndex) {
        indexToToken[entry.f0].put(entry.f2, entry.f1);
    }

    // label -> feature name -> feature value -> proportion
    HashMap<Object, HashMap<Object, HashMap<Object, Double>>> byLabel = new HashMap<>(labelSize);
    // Names of the categorical features, in feature order.
    List<String> categoricalNames = new ArrayList<>();
    int cateIndex = 0;
    for (int i = 0; i < featureSize; i++) {
        if (!isCategorical[i]) {
            continue;
        }
        String featureName = featureNames[i];
        categoricalNames.add(featureName);

        // Total weight of every feature value, accumulated over all labels.
        int tokenCount = Math.toIntExact(indexerModel.getNumberOfTokensOfColumn(cateCols[cateIndex]));
        double[] totalPerValue = new double[tokenCount];
        for (int j = 0; j < labelSize; j++) {
            SparseVector weights = featureInfo[j][i];
            int[] idx = weights.getIndices();
            double[] vals = weights.getValues();
            for (int k = 0; k < idx.length; k++) {
                totalPerValue[idx[k]] += vals[k];
            }
        }

        // Proportion of each feature value under each label.
        HashSet<Object> seenValues = new HashSet<>();
        for (int j = 0; j < labelSize; j++) {
            SparseVector weights = featureInfo[j][i];
            int[] idx = weights.getIndices();
            double[] vals = weights.getValues();
            HashMap<Object, Double> proportions = new HashMap<>();
            for (int k = 0; k < idx.length; k++) {
                Object token = indexToToken[cateIndex].get((long) idx[k]);
                seenValues.add(token);
                proportions.put(token, vals[k] / totalPerValue[idx[k]]);
            }
            byLabel.computeIfAbsent(labels[j], unused -> new HashMap<>()).put(featureName, proportions);
        }

        cateIndex++;
        cateFeatureValue.put(featureName, seenValues);
    }

    // Re-key the result from label -> feature to feature -> label.
    HashMap<Object, HashMap<Object, HashMap<Object, Double>>> byFeature = new HashMap<>(featureSize);
    for (String featureName : categoricalNames) {
        HashMap<Object, HashMap<Object, Double>> perLabel = new HashMap<>(labelSize);
        for (Object label : labels) {
            perLabel.put(label, byLabel.get(label).get(featureName));
        }
        byFeature.put(featureName, perLabel);
    }
    return byFeature;
}
use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.
the class CrossFeatureModelMapper method loadModel.
/**
 * Loads the multi-string-indexer model rows and prepares the lookup state used for crossing
 * features.
 *
 * <p>For every indexed column this sets up a token -&gt; index map ({@code tokenAndIndex}), the
 * index assigned to the {@code null} token ({@code nullIndex}, -1 when the model has none), and a
 * multiplier ({@code carry}) equal to the product of the token counts of all preceding columns.
 * {@code svLength} is the product of the token counts of all columns.
 *
 * <p>The products are computed with overflow checks: a silently wrapped {@code int} would yield
 * wrong multipliers and a wrong vector length, so loading fails instead.
 *
 * <p>Assumes the model contains at least one indexed column ({@code carry[0]} is written
 * unconditionally).
 *
 * @param modelRows the serialized model rows
 * @throws ArithmeticException if a token count or a product of token counts does not fit in an
 *                             {@code int}
 */
@Override
public void loadModel(List<Row> modelRows) {
    MultiStringIndexerModelData data = new MultiStringIndexerModelDataConverter().load(modelRows);
    String[] selectedCols = data.meta.get(CrossFeatureTrainParams.SELECTED_COLS);
    selectedColIndices = TableUtil.findColIndices(dataColNames, selectedCols);

    int featureNumber = data.tokenNumber.size();
    tokenAndIndex = new HashMap[featureNumber];
    nullIndex = new int[featureNumber];
    // -1 marks "no null token recorded for this column".
    Arrays.fill(nullIndex, -1);

    // carry[i] = product of the token counts of columns 0..i-1 (mixed-radix place value).
    carry = new int[featureNumber];
    carry[0] = 1;
    for (int i = 0; i < featureNumber - 1; i++) {
        long tokenCount = data.tokenNumber.get(i).longValue();
        carry[i + 1] = Math.toIntExact(Math.multiplyExact(tokenCount, (long) carry[i]));
    }
    int lastTokenCount = Math.toIntExact(data.tokenNumber.get(featureNumber - 1).longValue());
    svLength = Math.multiplyExact(carry[featureNumber - 1], lastTokenCount);

    for (int i = 0; i < featureNumber; i++) {
        int thisSize = data.tokenNumber.get(i).intValue();
        tokenAndIndex[i] = new HashMap<>(thisSize);
    }
    for (Tuple3<Integer, String, Long> tuple3 : data.tokenAndIndex) {
        if (tuple3.f1 == null) {
            // The null token is tracked separately from the token map.
            nullIndex[tuple3.f0] = tuple3.f2.intValue();
        } else {
            tokenAndIndex[tuple3.f0].put(tuple3.f1, tuple3.f2.intValue());
        }
    }
    dataIndices = new int[featureNumber];
}
Aggregations