Search in sources :

Example 1 with TableSummary

use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.

The example is the method testPackBatchOps of the class PackBatchOperatorUtilTest.

/**
 * Verifies a packed model: checks the schema's field names and types, the row count,
 * and the number of missing values per column as reported by a {@code TableSummarizer}
 * that visits every row.
 *
 * @param rows        rows to verify; exactly 7 are expected
 * @param modelSchema schema whose field names and types are verified
 */
public void testPackBatchOps(List<Row> rows, TableSchema modelSchema) {
    final String[] expectedNames = { "id", "p0", "p1", "p2", "p3", "p4", "p5" };
    final TypeInformation[] expectedTypes = {
        Types.LONG, Types.STRING, Types.INT, Types.INT, Types.INT, Types.DOUBLE, Types.DOUBLE
    };

    // Schema and size checks.
    assertArrayEquals(expectedNames, modelSchema.getFieldNames());
    assertArrayEquals(expectedTypes, modelSchema.getFieldTypes());
    assertEquals(7, rows.size());

    // Index 1 (the STRING column "p0") is the only one left out of this index list.
    final int[] summarizedIndices = { 0, 2, 3, 4, 5, 6 };
    TableSummarizer collector = new TableSummarizer(modelSchema.getFieldNames(), summarizedIndices, false);
    rows.forEach(collector::visit);
    TableSummary stats = collector.toSummary();

    // Per-column missing-value counts.
    assertEquals(6, stats.numMissingValue("p0"));
    assertEquals(1, stats.numMissingValue("p1"));
    assertEquals(1, stats.numMissingValue("p2"));
    assertEquals(3, stats.numMissingValue("p3"));
    assertEquals(1, stats.numMissingValue("p4"));
    assertEquals(5, stats.numMissingValue("p5"));
}
Also used : TableSummarizer(com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummarizer) Row(org.apache.flink.types.Row) TableSummary(com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary)

Example 2 with TableSummary

use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.

The example is the method serializeModel of the class ImputerModelDataConverter.

/**
 * Serializes the imputer model into a {@code Tuple3<Params, Iterable<String>, Iterable<Row>>}.
 *
 * <p>The meta {@code Params} always carries the strategy and the selected column names.
 * For the MIN, MAX and MEAN strategies the matching statistic of every selected column is
 * read from the summary and stored as a JSON array; for any other strategy the fill value
 * is written to the meta instead and the values array stays {@code null} (it is still
 * passed through {@code JsonConverter.toJson}).
 *
 * @param modelData The model data to serialize: (strategy, table summary, fill value).
 * @return The serialization result: (meta, one JSON string, an empty row list).
 */
@Override
public Tuple3<Params, Iterable<String>, Iterable<Row>> serializeModel(Tuple3<Strategy, TableSummary, String> modelData) {
    final Strategy strategy = modelData.f0;
    final TableSummary summary = modelData.f1;
    final String fillValue = modelData.f2;

    Params meta = new Params().set(STRATEGY, strategy).set(SELECTED_COLS, selectedColNames);

    // Remains null unless the strategy is one of the statistic-based ones below.
    double[] values = null;
    switch (strategy) {
        case MIN: {
            values = new double[selectedColNames.length];
            int pos = 0;
            for (String colName : selectedColNames) {
                values[pos++] = summary.min(colName);
            }
            break;
        }
        case MAX: {
            values = new double[selectedColNames.length];
            int pos = 0;
            for (String colName : selectedColNames) {
                values[pos++] = summary.max(colName);
            }
            break;
        }
        case MEAN: {
            values = new double[selectedColNames.length];
            int pos = 0;
            for (String colName : selectedColNames) {
                values[pos++] = summary.mean(colName);
            }
            break;
        }
        default:
            // No statistic to compute: record the user-supplied fill value in the meta.
            meta.set(FILL_VALUE, fillValue);
    }

    List<String> data = new ArrayList<>();
    data.add(JsonConverter.toJson(values));
    return Tuple3.of(meta, data, new ArrayList<>());
}
Also used : ArrayList(java.util.ArrayList) Strategy(com.alibaba.alink.params.dataproc.ImputerTrainParams.Strategy) Params(org.apache.flink.ml.api.misc.param.Params) TableSummary(com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary)

Example 3 with TableSummary

use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.

The example is the method linkFrom of the class CorrelationBatchOp.

/**
 * Links this operator to its input and sets the correlation result as output.
 *
 * <p>The selected columns default to all input columns and must be numerical. When the
 * configured method is {@code Method.PEARSON}, the result of
 * {@code StatisticsHelper.pearsonCorrelation} is saved through a
 * {@code CorrelationDataConverter}. Otherwise the selected columns are ranked with
 * {@code SpearmanCorrelation.calcRank}, the ranks are wrapped in a table of DOUBLE columns,
 * and a fresh {@code CorrelationBatchOp} is linked to that table; its output becomes the
 * output of this operator.
 *
 * @param inputs the input operators; the first one is used
 * @return this operator
 */
@Override
public CorrelationBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> in = checkAndGetFirst(inputs);

    String[] chosenCols = this.getParams().get(SELECTED_COLS);
    if (chosenCols == null) {
        chosenCols = in.getColNames();
    }

    // check col types must be double or bigint
    TableUtil.assertNumericalCols(in.getSchema(), chosenCols);

    if (Method.PEARSON != getMethod()) {
        // Rank-based path: compute ranks, then run a nested correlation op on the rank table.
        DataSet<Row> rawRows = inputs[0].select(chosenCols).getDataSet();
        DataSet<Row> rankedRows = SpearmanCorrelation.calcRank(rawRows, false);

        // Every rank column is declared as DOUBLE.
        TypeInformation[] rankTypes = new TypeInformation[chosenCols.length];
        for (int k = 0; k < rankTypes.length; k++) {
            rankTypes[k] = Types.DOUBLE;
        }

        BatchOperator rankSource = new TableSourceBatchOp(
            DataSetConversionUtil.toTable(getMLEnvironmentId(), rankedRows, chosenCols, rankTypes))
            .setMLEnvironmentId(getMLEnvironmentId());
        CorrelationBatchOp nestedCorr = new CorrelationBatchOp()
            .setMLEnvironmentId(getMLEnvironmentId())
            .setSelectedCols(chosenCols);
        rankSource.link(nestedCorr);

        this.setOutput(nestedCorr.getDataSet(), nestedCorr.getSchema());
        return this;
    }

    // Pearson path: only the correlation result (f1) of each tuple is serialized.
    DataSet<Tuple2<TableSummary, CorrelationResult>> pearson = StatisticsHelper.pearsonCorrelation(in, chosenCols);
    DataSet<Row> modelRows = pearson.flatMap(new FlatMapFunction<Tuple2<TableSummary, CorrelationResult>, Row>() {

        private static final long serialVersionUID = -4498296161046449646L;

        @Override
        public void flatMap(Tuple2<TableSummary, CorrelationResult> pair, Collector<Row> out) {
            new CorrelationDataConverter().save(pair.f1, out);
        }
    });
    this.setOutput(modelRows, new CorrelationDataConverter().getModelSchema());
    return this;
}
Also used : CorrelationDataConverter(com.alibaba.alink.operator.common.statistics.basicstatistic.CorrelationDataConverter) CorrelationResult(com.alibaba.alink.operator.common.statistics.basicstatistic.CorrelationResult) TableSourceBatchOp(com.alibaba.alink.operator.batch.source.TableSourceBatchOp) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Row(org.apache.flink.types.Row) TableSummary(com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary)

Example 4 with TableSummary

use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.

The example is the method testLazy of the class SummarizerBatchOpTest.

/**
 * Runs a {@code SummarizerBatchOp} over a small in-memory table and uses the lazy
 * print/collect hooks to check that the mean of {@code f_double} is 1/3.
 */
@Test
public void testLazy() throws Exception {
    final String[] colNames = { "f_string", "f_long", "f_int", "f_double", "f_boolean" };
    final Row[] sampleRows = {
        Row.of("a", 1L, 1, 2.0, true),
        Row.of(null, 2L, 2, -3.0, true),
        Row.of("c", null, null, 2.0, false),
        Row.of("a", 0L, 0, null, null)
    };
    MemSourceBatchOp source = new MemSourceBatchOp(Arrays.asList(sampleRows), colNames);

    SummarizerBatchOp summarizer = new SummarizerBatchOp().setSelectedCols("f_string", "f_double", "f_int");
    summarizer.linkFrom(source);
    summarizer.lazyPrintSummary();

    // Non-null f_double values are 2.0, -3.0 and 2.0, so the expected mean is 1/3.
    Consumer<TableSummary> meanCheck =
        summary -> Assert.assertEquals(0.3333333333333333, summary.mean("f_double"), 10e-8);
    summarizer.lazyCollectSummary(meanCheck);

    // The lazy callbacks are registered before this execute call.
    BatchOperator.execute();
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) Row(org.apache.flink.types.Row) TableSummary(com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary) Test(org.junit.Test)

Example 5 with TableSummary

use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.

The example is the method linkFrom of the class SummarizerBatchOp.

/**
 * Links this operator to exactly one input and sets the table summary as output.
 *
 * <p>The summarized columns are those given by {@code SummarizerParams.SELECTED_COLS}
 * when that parameter is present, and all input columns otherwise.
 *
 * @param inputs the input operators; exactly one is expected
 * @return this operator
 */
@Override
public SummarizerBatchOp linkFrom(BatchOperator<?>... inputs) {
    checkOpSize(1, inputs);
    final BatchOperator<?> in = inputs[0];

    final String[] allCols = in.getColNames();
    final String[] colsToSummarize = this.getParams().contains(SummarizerParams.SELECTED_COLS)
        ? this.getParams().get(SummarizerParams.SELECTED_COLS)
        : allCols;

    DataSet<TableSummary> summaries = StatisticsHelper.summary(in, colsToSummarize);

    // Convert each summary into model rows matching the SummaryDataConverter schema.
    DataSet<Row> modelRows = summaries.flatMap(new TableSummaryBuildModel());
    this.setOutput(modelRows, new SummaryDataConverter().getModelSchema());
    return this;
}
Also used : SummaryDataConverter(com.alibaba.alink.operator.common.statistics.basicstatistic.SummaryDataConverter) Row(org.apache.flink.types.Row) TableSummary(com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary)

Aggregations

TableSummary (com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary)11 Row (org.apache.flink.types.Row)5 Test (org.junit.Test)4 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)3 ArrayList (java.util.ArrayList)3 Params (org.apache.flink.ml.api.misc.param.Params)3 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)2 CorrelationResult (com.alibaba.alink.operator.common.statistics.basicstatistic.CorrelationResult)2 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)2 Tuple3 (org.apache.flink.api.java.tuple.Tuple3)2 CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)1 TableSourceBatchOp (com.alibaba.alink.operator.batch.source.TableSourceBatchOp)1 SummarizerBatchOp (com.alibaba.alink.operator.batch.statistics.SummarizerBatchOp)1 CorrelationDataConverter (com.alibaba.alink.operator.common.statistics.basicstatistic.CorrelationDataConverter)1 SummaryDataConverter (com.alibaba.alink.operator.common.statistics.basicstatistic.SummaryDataConverter)1 TableSummarizer (com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummarizer)1 Strategy (com.alibaba.alink.params.dataproc.ImputerTrainParams.Strategy)1 MaxAbsScalerTrainParams (com.alibaba.alink.params.dataproc.MaxAbsScalerTrainParams)1 MinMaxScalerTrainParams (com.alibaba.alink.params.dataproc.MinMaxScalerTrainParams)1 StandardTrainParams (com.alibaba.alink.params.dataproc.StandardTrainParams)1