Search in sources:

Example 1 with MultiStringIndexerModelDataConverter

use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.

In the class Preprocessing, method generateStringIndexerModel.

/**
 * Produces the string-indexer model operator for the categorical columns named in {@code params}.
 *
 * <p>If {@code params} holds a non-empty {@code CATEGORICAL_COLS} array, a
 * {@link MultiStringIndexerTrainBatchOp} is linked from {@code input} over those columns using
 * {@code ALPHABET_ASC} ordering. Otherwise the result wraps a data set that emits no rows and
 * carries the model schema reported by {@link MultiStringIndexerModelDataConverter}.
 *
 * @param input  operator supplying the training data and the ML environment id
 * @param params parameters that may contain {@code HasCategoricalCols.CATEGORICAL_COLS}
 * @return the operator holding the (possibly empty) string-indexer model
 */
public static BatchOperator<?> generateStringIndexerModel(BatchOperator<?> input, Params params) {
    final String[] cateCols = params.contains(HasCategoricalCols.CATEGORICAL_COLS)
        ? params.get(HasCategoricalCols.CATEGORICAL_COLS)
        : null;

    final boolean hasCategoricalCols = cateCols != null && cateCols.length > 0;
    if (hasCategoricalCols) {
        return new MultiStringIndexerTrainBatchOp()
            .setMLEnvironmentId(input.getMLEnvironmentId())
            .setSelectedCols(cateCols)
            .setStringOrderType(HasStringOrderTypeDefaultAsRandom.StringOrderType.ALPHABET_ASC)
            .linkFrom(input);
    }

    // No categorical columns: build a model operator with the right schema but no rows.
    MultiStringIndexerModelDataConverter schemaSource = new MultiStringIndexerModelDataConverter();
    return new DataSetWrapperBatchOp(
        MLEnvironmentFactory.get(input.getMLEnvironmentId())
            .getExecutionEnvironment()
            .fromElements(1)
            .mapPartition(new MapPartitionFunction<Integer, Row>() {

                private static final long serialVersionUID = -7481931851291494026L;

                @Override
                public void mapPartition(Iterable<Integer> values, Collector<Row> out) throws Exception {
                    // Deliberately collects nothing so the resulting data set is empty.
                }
            }),
        schemaSource.getModelSchema().getFieldNames(),
        schemaSource.getModelSchema().getFieldTypes()
    ).setMLEnvironmentId(input.getMLEnvironmentId());
}
Also used : MultiStringIndexerModelDataConverter(com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter) RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) MapPartitionFunction(org.apache.flink.api.common.functions.MapPartitionFunction) MultiStringIndexerTrainBatchOp(com.alibaba.alink.operator.batch.dataproc.MultiStringIndexerTrainBatchOp) Collector(org.apache.flink.util.Collector) DataSetWrapperBatchOp(com.alibaba.alink.operator.batch.source.DataSetWrapperBatchOp)

Example 2 with MultiStringIndexerModelDataConverter

use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.

In the class CrossFeatureTrainBatchOp, method linkFrom.

/**
 * Builds the cross-feature model from the operator returned by {@code checkAndGetFirst(inputs)}.
 *
 * <p>The selected columns are passed to {@code StringIndexerUtil.indexRandom}; the resulting
 * (column index, token, token index) tuples are written out as model rows by
 * {@link MultiStringIndexerModelDataConverter}. The model rows are also mapped through
 * {@code BuildSideOutput} at parallelism 1 and exposed as a side-output table with the columns
 * {@code index} (INT) and {@code value} (STRING).
 *
 * @param inputs input operators handed to {@code checkAndGetFirst}
 * @return this operator, with its output and side-output tables set
 */
@Override
public CrossFeatureTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    // Wildcard type instead of the raw BatchOperator keeps generic type checking enabled.
    BatchOperator<?> in = checkAndGetFirst(inputs);
    long mlEnvId = getMLEnvironmentId();
    final String[] selectedCols = getSelectedCols();
    final String[] selectedColType = new String[selectedCols.length];
    for (int i = 0; i < selectedCols.length; i++) {
        selectedColType[i] = FlinkTypeConverter.getTypeString(TableUtil.findColTypeWithAssertAndHint(in.getSchema(), selectedCols[i]));
    }
    DataSet<Tuple3<Integer, String, Long>> indexedToken = StringIndexerUtil.indexRandom(in.select(selectedCols).getDataSet(), 0L, false);
    DataSet<Row> values = indexedToken.mapPartition(new RichMapPartitionFunction<Tuple3<Integer, String, Long>, Row>() {

        private static final long serialVersionUID = 2876851020570715540L;

        @Override
        public void mapPartition(Iterable<Tuple3<Integer, String, Long>> values, Collector<Row> out) throws Exception {
            // Only subtask 0 attaches the meta, so it is handed to the converter exactly once.
            Params meta = null;
            if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
                meta = new Params().set(HasSelectedCols.SELECTED_COLS, selectedCols).set(HasSelectedColTypes.SELECTED_COL_TYPES, selectedColType);
            }
            new MultiStringIndexerModelDataConverter().save(Tuple2.of(meta, values), out);
        }
    }).name("build_model");
    this.setOutput(values, new MultiStringIndexerModelDataConverter().getModelSchema());
    DataSet<Row> sideDataSet = values.mapPartition(new BuildSideOutput()).setParallelism(1);
    Table sideModel = DataSetConversionUtil.toTable(mlEnvId, sideDataSet, new String[] { "index", "value" }, new TypeInformation[] { Types.INT, Types.STRING });
    this.setSideOutputTables(new Table[] { sideModel });
    return this;
}
Also used : MultiStringIndexerModelDataConverter(com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter) Table(org.apache.flink.table.api.Table) RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) CrossFeatureTrainParams(com.alibaba.alink.params.feature.CrossFeatureTrainParams) Params(org.apache.flink.ml.api.misc.param.Params) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Collector(org.apache.flink.util.Collector) Row(org.apache.flink.types.Row)

Example 3 with MultiStringIndexerModelDataConverter

use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.

In the class MultiStringIndexerTrainBatchOp, method linkFrom.

/**
 * Trains a multi-column string indexer on the operator returned by {@code checkAndGetFirst(inputs)}.
 *
 * <p>Every non-null cell of the selected columns is emitted as a (column position, string value)
 * pair, the pairs are indexed by {@code HugeStringIndexerUtil.indexTokens} using the configured
 * order type, and the indexed tokens are serialized into model rows by
 * {@link MultiStringIndexerModelDataConverter}.
 *
 * @param inputs input operators handed to {@code checkAndGetFirst}
 * @return this operator, with the model rows set as its output
 */
@Override
public MultiStringIndexerTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> source = checkAndGetFirst(inputs);
    final String[] cols = getSelectedCols();
    final HasStringOrderTypeDefaultAsRandom.StringOrderType order = getStringOrderType();

    // Resolve the type string of each selected column; it is stored in the model meta below.
    final String[] colTypes = new String[cols.length];
    for (int pos = 0; pos < cols.length; pos++) {
        colTypes[pos] = FlinkTypeConverter.getTypeString(
            TableUtil.findColTypeWithAssertAndHint(source.getSchema(), cols[pos]));
    }

    DataSet<Row> selectedRows = source.select(cols).getDataSet();
    DataSet<Tuple2<Integer, String>> tokens = selectedRows
        .flatMap(new FlatMapFunction<Row, Tuple2<Integer, String>>() {

            @Override
            public void flatMap(Row row, Collector<Tuple2<Integer, String>> collector) throws Exception {
                for (int pos = 0; pos < cols.length; pos++) {
                    Object cell = row.getField(pos);
                    if (cell == null) {
                        // Null cells produce no token.
                        continue;
                    }
                    collector.collect(Tuple2.of(pos, String.valueOf(cell)));
                }
            }
        })
        .returns(new TupleTypeInfo<>(Types.INT, Types.STRING));

    DataSet<Tuple3<Integer, String, Long>> indexed = HugeStringIndexerUtil.indexTokens(tokens, order, 0L);

    DataSet<Row> modelRows = indexed
        .mapPartition(new RichMapPartitionFunction<Tuple3<Integer, String, Long>, Row>() {

            private static final long serialVersionUID = 2876851020570715540L;

            @Override
            public void mapPartition(Iterable<Tuple3<Integer, String, Long>> values, Collector<Row> out) throws Exception {
                // Only subtask 0 supplies the meta; every other subtask passes null.
                final boolean firstSubtask = getRuntimeContext().getIndexOfThisSubtask() == 0;
                Params meta = firstSubtask
                    ? new Params()
                        .set(HasSelectedCols.SELECTED_COLS, cols)
                        .set(HasSelectedColTypes.SELECTED_COL_TYPES, colTypes)
                    : null;
                new MultiStringIndexerModelDataConverter().save(Tuple2.of(meta, values), out);
            }
        })
        .name("build_model")
        .returns(new RowTypeInfo(new MultiStringIndexerModelDataConverter().getModelSchema().getFieldTypes()));

    this.setOutput(modelRows, new MultiStringIndexerModelDataConverter().getModelSchema());
    return this;
}
Also used : MultiStringIndexerModelDataConverter(com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo) HasStringOrderTypeDefaultAsRandom(com.alibaba.alink.params.dataproc.HasStringOrderTypeDefaultAsRandom) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) Collector(org.apache.flink.util.Collector) MultiStringIndexerTrainParams(com.alibaba.alink.params.dataproc.MultiStringIndexerTrainParams) Params(org.apache.flink.ml.api.misc.param.Params) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Row(org.apache.flink.types.Row)

Example 4 with MultiStringIndexerModelDataConverter

use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.

In the class NaiveBayesModelInfo, method getCategoryFeatureInfo.

/**
 * Reports, for every categorical feature, how each feature value is distributed over the labels.
 *
 * <p>The returned map is keyed by feature name, then by label, then by feature value. Each leaf is
 * the value stored for that feature value under that label divided by the sum of the stored values
 * for the same feature value across all labels.
 *
 * <p>As a side effect, the set of values seen for each categorical feature is written into
 * {@code cateFeatureValue}.
 *
 * @return the nested feature / label / value map; an empty map when the string-indexer model has
 *         no meta or its meta holds no selected columns
 */
public HashMap<Object, HashMap<Object, HashMap<Object, Double>>> getCategoryFeatureInfo() {
    MultiStringIndexerModelData indexerModel = new MultiStringIndexerModelDataConverter().load(stringIndexerModelSerialized);
    boolean hasSelectedCols = indexerModel.meta != null && indexerModel.meta.contains(HasSelectedCols.SELECTED_COLS);
    if (!hasSelectedCols) {
        return new HashMap<>(0);
    }

    HashMap<Object, HashMap<Object, HashMap<Object, Double>>> byLabel = new HashMap<>(labelSize);
    String[] cateCols = indexerModel.meta.get(HasSelectedCols.SELECTED_COLS);

    // Per categorical column: token index -> token.
    int cateColCount = cateCols.length;
    HashMap<Long, String>[] indexToToken = new HashMap[cateColCount];
    for (int col = 0; col < cateColCount; col++) {
        indexToToken[col] = new HashMap<>((int) indexerModel.getNumberOfTokensOfColumn(cateCols[col]));
    }
    for (Tuple3<Integer, String, Long> entry : indexerModel.tokenAndIndex) {
        indexToToken[entry.f0].put(entry.f2, entry.f1);
    }

    // cateCursor walks cateCols in step with the categorical features met in feature order.
    int cateCursor = 0;
    for (int f = 0; f < featureSize; f++) {
        if (!isCategorical[f]) {
            continue;
        }
        String name = featureNames[f];
        HashSet<Object> seenValues = new HashSet<>();

        // First pass: total per token index, summed over all labels.
        double[] totals = new double[Math.toIntExact(indexerModel.getNumberOfTokensOfColumn(cateCols[cateCursor]))];
        for (int l = 0; l < labelSize; l++) {
            SparseVector stored = featureInfo[l][f];
            int[] idx = stored.getIndices();
            double[] vals = stored.getValues();
            for (int k = 0; k < idx.length; k++) {
                totals[idx[k]] += vals[k];
            }
        }

        // Second pass: each label's share of every token's total.
        for (int l = 0; l < labelSize; l++) {
            SparseVector stored = featureInfo[l][f];
            int[] idx = stored.getIndices();
            double[] vals = stored.getValues();

            HashMap<Object, HashMap<Object, Double>> perFeature = byLabel.get(labels[l]);
            if (perFeature == null) {
                perFeature = new HashMap<>();
            }

            HashMap<Object, Double> shares = new HashMap<>();
            for (int k = 0; k < idx.length; k++) {
                Object token = indexToToken[cateCursor].get((long) idx[k]);
                seenValues.add(token);
                shares.put(token, vals[k] / totals[idx[k]]);
            }
            perFeature.put(name, shares);
            byLabel.put(labels[l], perFeature);
        }

        cateCursor++;
        cateFeatureValue.put(name, seenValues);
    }

    // Re-key from label -> feature -> value to feature -> label -> value.
    HashMap<Object, HashMap<Object, HashMap<Object, Double>>> byFeature = new HashMap<>(featureSize);
    for (int f = 0; f < featureSize; f++) {
        if (!isCategorical[f]) {
            continue;
        }
        String name = featureNames[f];
        HashMap<Object, HashMap<Object, Double>> perLabel = new HashMap<>(labelSize);
        for (Object label : labels) {
            perLabel.put(label, byLabel.get(label).get(name));
        }
        byFeature.put(name, perLabel);
    }
    return byFeature;
}
Also used : MultiStringIndexerModelDataConverter(com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SparseVector(com.alibaba.alink.common.linalg.SparseVector) MultiStringIndexerModelData(com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelData) HashSet(java.util.HashSet)

Example 5 with MultiStringIndexerModelDataConverter

use of com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter in project Alink by alibaba.

In the class CrossFeatureModelMapper, method loadModel.

/**
 * Loads the serialized string-indexer model and prepares the lookup tables used by this mapper.
 *
 * <p>After this call:
 * <ul>
 *   <li>{@code selectedColIndices} holds the positions of the model's selected columns within
 *       {@code dataColNames};</li>
 *   <li>{@code carry[i]} is the product of the token counts of features {@code 0..i-1}
 *       ({@code carry[0]} is 1);</li>
 *   <li>{@code svLength} is the product of the token counts of all features;</li>
 *   <li>{@code tokenAndIndex[i]} maps each non-null token of feature {@code i} to its index, and
 *       {@code nullIndex[i]} is the index of the null token, or -1 if the model has none.</li>
 * </ul>
 *
 * @param modelRows the serialized model rows
 * @throws ArithmeticException if a product of token counts does not fit in an {@code int};
 *                             previously such a product wrapped around silently
 */
@Override
public void loadModel(List<Row> modelRows) {
    MultiStringIndexerModelData data = new MultiStringIndexerModelDataConverter().load(modelRows);
    String[] selectedCols = data.meta.get(CrossFeatureTrainParams.SELECTED_COLS);
    selectedColIndices = TableUtil.findColIndices(dataColNames, selectedCols);
    int featureNumber = data.tokenNumber.size();
    tokenAndIndex = new HashMap[featureNumber];
    nullIndex = new int[featureNumber];
    Arrays.fill(nullIndex, -1);
    carry = new int[featureNumber];
    carry[0] = 1;
    for (int i = 0; i < featureNumber - 1; i++) {
        // Exact arithmetic: fail fast rather than keep a wrapped-around multiplier.
        long tokenCount = data.tokenNumber.get(i);
        carry[i + 1] = Math.toIntExact(Math.multiplyExact(tokenCount, (long) carry[i]));
    }
    long lastTokenCount = data.tokenNumber.get(featureNumber - 1);
    svLength = Math.toIntExact(Math.multiplyExact(lastTokenCount, (long) carry[featureNumber - 1]));
    for (int i = 0; i < featureNumber; i++) {
        int thisSize = data.tokenNumber.get(i).intValue();
        tokenAndIndex[i] = new HashMap<>(thisSize);
    }
    for (Tuple3<Integer, String, Long> tuple3 : data.tokenAndIndex) {
        if (tuple3.f1 == null) {
            // The null token cannot be a map key here, so its index is kept separately.
            nullIndex[tuple3.f0] = tuple3.f2.intValue();
        } else {
            tokenAndIndex[tuple3.f0].put(tuple3.f1, tuple3.f2.intValue());
        }
    }
    dataIndices = new int[featureNumber];
}
Also used : MultiStringIndexerModelDataConverter(com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter) MultiStringIndexerModelData(com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelData)

Aggregations

MultiStringIndexerModelDataConverter (com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelDataConverter)5 Collector (org.apache.flink.util.Collector)3 MultiStringIndexerModelData (com.alibaba.alink.operator.common.dataproc.MultiStringIndexerModelData)2 RichMapPartitionFunction (org.apache.flink.api.common.functions.RichMapPartitionFunction)2 Tuple3 (org.apache.flink.api.java.tuple.Tuple3)2 Params (org.apache.flink.ml.api.misc.param.Params)2 Row (org.apache.flink.types.Row)2 SparseVector (com.alibaba.alink.common.linalg.SparseVector)1 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)1 MultiStringIndexerTrainBatchOp (com.alibaba.alink.operator.batch.dataproc.MultiStringIndexerTrainBatchOp)1 DataSetWrapperBatchOp (com.alibaba.alink.operator.batch.source.DataSetWrapperBatchOp)1 HasStringOrderTypeDefaultAsRandom (com.alibaba.alink.params.dataproc.HasStringOrderTypeDefaultAsRandom)1 MultiStringIndexerTrainParams (com.alibaba.alink.params.dataproc.MultiStringIndexerTrainParams)1 CrossFeatureTrainParams (com.alibaba.alink.params.feature.CrossFeatureTrainParams)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 FlatMapFunction (org.apache.flink.api.common.functions.FlatMapFunction)1 MapPartitionFunction (org.apache.flink.api.common.functions.MapPartitionFunction)1 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)1