use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.
the class PackBatchOperatorUtilTest method testPackBatchOps.
/**
 * Checks the schema of a packed table and the number of missing values in each of its
 * {@code p0}..{@code p5} columns.
 *
 * @param rows        rows of the packed table; exactly seven are expected.
 * @param modelSchema schema of the packed table.
 */
public void testPackBatchOps(List<Row> rows, TableSchema modelSchema) {
    String[] expectedNames = { "id", "p0", "p1", "p2", "p3", "p4", "p5" };
    TypeInformation[] expectedTypes = { Types.LONG, Types.STRING, Types.INT, Types.INT, Types.INT, Types.DOUBLE, Types.DOUBLE };

    assertArrayEquals(expectedNames, modelSchema.getFieldNames());
    assertArrayEquals(expectedTypes, modelSchema.getFieldTypes());
    assertEquals(7, rows.size());

    // Every column except the STRING column at index 1 is handed to the summarizer by index.
    int[] columnIndices = { 0, 2, 3, 4, 5, 6 };
    TableSummarizer summarizer = new TableSummarizer(modelSchema.getFieldNames(), columnIndices, false);
    for (Row current : rows) {
        summarizer.visit(current);
    }
    TableSummary summary = summarizer.toSummary();

    // Expected missing-value count, listed column by column.
    String[] checkedCols = { "p0", "p1", "p2", "p3", "p4", "p5" };
    int[] expectedMissing = { 6, 1, 1, 3, 1, 5 };
    for (int k = 0; k < checkedCols.length; k++) {
        assertEquals(expectedMissing[k], summary.numMissingValue(checkedCols[k]));
    }
}
use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.
the class ImputerModelDataConverter method serializeModel.
/**
 * Serializes imputer model data into a {@code Tuple3<Params, Iterable<String>, Iterable<Row>>}.
 *
 * <p>For the MIN, MAX and MEAN strategies the matching statistic of every selected column is
 * read from the summary and collected into an array. For any other strategy the fill value is
 * stored in the meta instead and the array stays {@code null}. In both cases the array is
 * passed through {@code JsonConverter.toJson} and becomes the single data entry.
 *
 * @param modelData The model data to serialize: strategy (f0), table summary (f1) and
 *                  fill value (f2).
 * @return The meta params, a one-element list holding the JSON of the values, and an empty
 *         row list.
 */
@Override
public Tuple3<Params, Iterable<String>, Iterable<Row>> serializeModel(Tuple3<Strategy, TableSummary, String> modelData) {
    final Strategy strategy = modelData.f0;
    final TableSummary summary = modelData.f1;
    final String fillValue = modelData.f2;

    Params meta = new Params()
        .set(STRATEGY, strategy)
        .set(SELECTED_COLS, selectedColNames);

    // Stays null unless the strategy is one of the per-column statistics below.
    double[] values = null;
    switch (strategy) {
        case MIN: {
            values = new double[selectedColNames.length];
            int pos = 0;
            for (String colName : selectedColNames) {
                values[pos++] = summary.min(colName);
            }
            break;
        }
        case MAX: {
            values = new double[selectedColNames.length];
            int pos = 0;
            for (String colName : selectedColNames) {
                values[pos++] = summary.max(colName);
            }
            break;
        }
        case MEAN: {
            values = new double[selectedColNames.length];
            int pos = 0;
            for (String colName : selectedColNames) {
                values[pos++] = summary.mean(colName);
            }
            break;
        }
        default:
            // No statistic to compute: the user-supplied fill value travels in the meta.
            meta.set(FILL_VALUE, fillValue);
    }

    List<String> data = new ArrayList<>();
    data.add(JsonConverter.toJson(values));
    return Tuple3.of(meta, data, new ArrayList<>());
}
use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.
the class CorrelationBatchOp method linkFrom.
/**
 * Computes the correlation of the selected columns of the first input and sets it as this
 * operator's output.
 *
 * <p>When no columns are configured, all columns of the input are used; every selected column
 * must pass {@code TableUtil.assertNumericalCols}. For the PEARSON method the result of
 * {@code StatisticsHelper.pearsonCorrelation} is saved through a
 * {@code CorrelationDataConverter}. For any other method the data is first ranked with
 * {@code SpearmanCorrelation.calcRank}, exposed as a table of DOUBLE columns, and linked to a
 * freshly created {@code CorrelationBatchOp} whose output is reused here.
 *
 * @param inputs the input operators; the first one supplies the data.
 * @return this operator.
 */
@Override
public CorrelationBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> in = checkAndGetFirst(inputs);

    String[] selectedColNames = this.getParams().get(SELECTED_COLS);
    if (selectedColNames == null) {
        selectedColNames = in.getColNames();
    }

    // check col types must be double or bigint
    TableUtil.assertNumericalCols(in.getSchema(), selectedColNames);

    Method corrType = getMethod();
    if (Method.PEARSON != corrType) {
        // Rank the selected columns, then run a nested correlation op on the ranks.
        DataSet<Row> selected = inputs[0].select(selectedColNames).getDataSet();
        DataSet<Row> ranked = SpearmanCorrelation.calcRank(selected, false);

        // The rank table is declared with DOUBLE for every column.
        TypeInformation[] rankTypes = new TypeInformation[selectedColNames.length];
        java.util.Arrays.fill(rankTypes, Types.DOUBLE);

        BatchOperator rankOp = new TableSourceBatchOp(DataSetConversionUtil.toTable(getMLEnvironmentId(), ranked, selectedColNames, rankTypes)).setMLEnvironmentId(getMLEnvironmentId());
        CorrelationBatchOp nestedCorr = new CorrelationBatchOp().setMLEnvironmentId(getMLEnvironmentId()).setSelectedCols(selectedColNames);
        rankOp.link(nestedCorr);

        this.setOutput(nestedCorr.getDataSet(), nestedCorr.getSchema());
    } else {
        DataSet<Tuple2<TableSummary, CorrelationResult>> pearson = StatisticsHelper.pearsonCorrelation(in, selectedColNames);

        // Only the correlation result (f1) is written out; the summary (f0) is not used here.
        DataSet<Row> modelRows = pearson.flatMap(new FlatMapFunction<Tuple2<TableSummary, CorrelationResult>, Row>() {
            private static final long serialVersionUID = -4498296161046449646L;

            @Override
            public void flatMap(Tuple2<TableSummary, CorrelationResult> summary, Collector<Row> collector) {
                new CorrelationDataConverter().save(summary.f1, collector);
            }
        });

        this.setOutput(modelRows, new CorrelationDataConverter().getModelSchema());
    }
    return this;
}
use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.
the class SummarizerBatchOpTest method testLazy.
/**
 * Runs a lazy summary over a small in-memory table and checks the mean of {@code f_double}.
 *
 * <p>The non-null {@code f_double} values in the fixture are 2.0, -3.0 and 2.0.
 */
@Test
public void testLazy() throws Exception {
    String[] colNames = new String[] { "f_string", "f_long", "f_int", "f_double", "f_boolean" };
    MemSourceBatchOp source = new MemSourceBatchOp(
        Arrays.asList(
            Row.of("a", 1L, 1, 2.0, true),
            Row.of(null, 2L, 2, -3.0, true),
            Row.of("c", null, null, 2.0, false),
            Row.of("a", 0L, 0, null, null)),
        colNames);

    SummarizerBatchOp summarizer = new SummarizerBatchOp().setSelectedCols("f_string", "f_double", "f_int");
    summarizer.linkFrom(source);
    summarizer.lazyPrintSummary();

    // The callbacks are only registered here; BatchOperator.execute() below triggers the job.
    Consumer<TableSummary> meanCheck =
        summary -> Assert.assertEquals(0.3333333333333333, summary.mean("f_double"), 10e-8);
    summarizer.lazyCollectSummary(meanCheck);

    BatchOperator.execute();
}
use of com.alibaba.alink.operator.common.statistics.basicstatistic.TableSummary in project Alink by alibaba.
the class SummarizerBatchOp method linkFrom.
/**
 * Builds the summary of the single input operator and sets it as this operator's output.
 *
 * <p>The columns named by {@code SummarizerParams.SELECTED_COLS} are summarized when that
 * parameter is present; otherwise all columns of the input are used.
 *
 * @param inputs the input operators; exactly one is expected.
 * @return this operator.
 */
@Override
public SummarizerBatchOp linkFrom(BatchOperator<?>... inputs) {
    checkOpSize(1, inputs);
    BatchOperator<?> in = inputs[0];

    final String[] allColNames = in.getColNames();
    final String[] selectedColNames = this.getParams().contains(SummarizerParams.SELECTED_COLS)
        ? this.getParams().get(SummarizerParams.SELECTED_COLS)
        : allColNames;

    DataSet<TableSummary> summaries = StatisticsHelper.summary(in, selectedColNames);

    // Turn each TableSummary into output rows matching SummaryDataConverter's model schema.
    DataSet<Row> modelRows = summaries.flatMap(new TableSummaryBuildModel());
    this.setOutput(modelRows, new SummaryDataConverter().getModelSchema());
    return this;
}
Aggregations