Use of org.talend.dataprep.dataset.StatisticsAdapter in project data-prep by Talend.
The class ActionTestWorkbench, method test.
public static void test(Collection<DataSetRow> input, AnalyzerService analyzerService, ActionRegistry actionRegistry, RunnableAction... actions) {
    final List<RunnableAction> allActions = new ArrayList<>();
    Collections.addAll(allActions, actions);
    final DataSet dataSet = new DataSet();
    final RowMetadata rowMetadata = input.iterator().next().getRowMetadata();
    final DataSetMetadata dataSetMetadata = new DataSetMetadata();
    dataSetMetadata.setRowMetadata(rowMetadata);
    dataSet.setMetadata(dataSetMetadata);
    dataSet.setRecords(input.stream());
    final TestOutputNode outputNode = new TestOutputNode(input);
    Pipeline pipeline = Pipeline.Builder.builder() //
            .withActionRegistry(actionRegistry) //
            .withInitialMetadata(rowMetadata, true) //
            .withActions(allActions) //
            .withAnalyzerService(analyzerService) //
            .withStatisticsAdapter(new StatisticsAdapter(40)) //
            .withOutput(() -> outputNode) //
            .build();
    pipeline.execute(dataSet);
    // Some tests rely on changes made to the provided metadata, so set the modified columns back on the row metadata
    // (although this should be avoided in tests).
    // TODO Make this method return the modified metadata instead of setting the modified columns.
    rowMetadata.setColumns(outputNode.getMetadata().getColumns());
    for (DataSetRow dataSetRow : input) {
        dataSetRow.setRowMetadata(rowMetadata);
    }
}
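Behind withStatisticsAdapter(new StatisticsAdapter(40)), the pipeline recomputes column statistics once the actions have run; the integer argument is the semantic-analysis threshold used throughout the project. The adapter can also be driven directly. The sketch below is not taken from the project: it assumes an AnalyzerService instance, a single ColumnMetadata named column, and a List<String> of its values, and SEMANTIC is the same statically imported constant used in the DataSetService snippet further down.

    // Sketch only: analyze one column's values, then copy the results onto the column metadata.
    final Analyzer<Analyzers.Result> analyzer = analyzerService.build(column, SEMANTIC);
    analyzer.init();
    values.forEach(analyzer::analyze);
    analyzer.end();
    new StatisticsAdapter(40).adapt(Collections.singletonList(column), analyzer.getResult());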
Use of org.talend.dataprep.dataset.StatisticsAdapter in project data-prep by Talend.
The class StatisticsUtilsTest, method adaptColumn.
private void adaptColumn(final ColumnMetadata column, final DataTypeEnum type) {
    Analyzers.Result result = new Analyzers.Result();
    // Data type
    DataTypeOccurences dataType = new DataTypeOccurences();
    dataType.increment(type);
    result.add(dataType);
    // Semantic type
    SemanticType semanticType = new SemanticType();
    CategoryFrequency category1 = new CategoryFrequency("category 1", "category 1");
    category1.setScore(99);
    semanticType.increment(category1, 1);
    result.add(semanticType);
    // Suggested types
    CategoryFrequency category2 = new CategoryFrequency("category 2", "category 2");
    category2.setScore(81);
    semanticType.increment(category2, 1);
    CategoryFrequency category3 = new CategoryFrequency("category 3", "category 3");
    category3.setScore(50);
    semanticType.increment(category3, 1);
    // Value quality
    ValueQualityStatistics valueQualityStatistics = new ValueQualityStatistics();
    valueQualityStatistics.setEmptyCount(10);
    valueQualityStatistics.setInvalidCount(20);
    valueQualityStatistics.setValidCount(30);
    result.add(valueQualityStatistics);
    // Cardinality
    CardinalityStatistics cardinalityStatistics = new CardinalityStatistics();
    cardinalityStatistics.incrementCount();
    cardinalityStatistics.add("distinctValue");
    result.add(cardinalityStatistics);
    // Data frequency
    DataTypeFrequencyStatistics dataFrequencyStatistics = new DataTypeFrequencyStatistics();
    dataFrequencyStatistics.add("frequentValue1");
    dataFrequencyStatistics.add("frequentValue1");
    dataFrequencyStatistics.add("frequentValue2");
    dataFrequencyStatistics.add("frequentValue2");
    result.add(dataFrequencyStatistics);
    // Pattern frequency
    PatternFrequencyStatistics patternFrequencyStatistics = new PatternFrequencyStatistics();
    patternFrequencyStatistics.add("999a999");
    patternFrequencyStatistics.add("999a999");
    patternFrequencyStatistics.add("999aaaa");
    patternFrequencyStatistics.add("999aaaa");
    result.add(patternFrequencyStatistics);
    // Quantiles
    QuantileStatistics quantileStatistics = new QuantileStatistics();
    quantileStatistics.add(1d);
    quantileStatistics.add(2d);
    quantileStatistics.endAddValue();
    result.add(quantileStatistics);
    // Summary
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    summaryStatistics.addData(1d);
    summaryStatistics.addData(2d);
    result.add(summaryStatistics);
    // Histogram
    StreamNumberHistogramStatistics histogramStatistics = new StreamNumberHistogramStatistics();
    histogramStatistics.setNumberOfBins(2);
    histogramStatistics.add(1);
    histogramStatistics.add(2);
    result.add(histogramStatistics);
    // Text length
    TextLengthStatistics textLengthStatistics = new TextLengthStatistics();
    textLengthStatistics.setMaxTextLength(30);
    textLengthStatistics.setMinTextLength(10);
    textLengthStatistics.setSumTextLength(40);
    textLengthStatistics.setCount(5);
    result.add(textLengthStatistics);
    StatisticsAdapter adapter = new StatisticsAdapter(40);
    adapter.adapt(Collections.singletonList(integerColumn), Collections.singletonList(result));
    adapter.adapt(Collections.singletonList(stringColumn), Collections.singletonList(result));
}
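Note that the same Analyzers.Result is adapted onto two different columns: StatisticsAdapter decides per column which parts of the result apply, which is presumably why the test feeds both an integer and a string column. A hedged sketch of what a test could read back after adapt(), assuming the usual ColumnMetadata accessors (getStatistics(), plus the getSemanticDomains() call seen in DataSetService below):

    // Sketch only: inspect the adapted metadata (accessor names assumed, not shown in this page).
    Statistics integerStats = integerColumn.getStatistics();          // quantiles, histogram, value quality...
    List<SemanticDomain> domains = stringColumn.getSemanticDomains(); // built from the CategoryFrequency entries above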
Use of org.talend.dataprep.dataset.StatisticsAdapter in project data-prep by Talend.
The class DataSetService, method getDataSetColumnSemanticCategories.
/**
 * Return the semantic types for a given dataset / column.
 *
 * @param datasetId the dataset id.
 * @param columnId the column id.
 * @return the semantic types for the given dataset / column.
 */
@RequestMapping(value = "/datasets/{datasetId}/columns/{columnId}/types", method = GET)
@ApiOperation(value = "list the types of the wanted column", notes = "This list can be used by user to change the column type.")
@Timed
@PublicAPI
public List<SemanticDomain> getDataSetColumnSemanticCategories(@ApiParam(value = "The dataset id") @PathVariable String datasetId, @ApiParam(value = "The column id") @PathVariable String columnId) {
    LOG.debug("listing semantic categories for dataset #{} column #{}", datasetId, columnId);
    final DataSetMetadata metadata = dataSetMetadataRepository.get(datasetId);
    if (metadata == null) {
        throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, ExceptionContext.withBuilder().put("id", datasetId).build());
    } else {
        try (final Stream<DataSetRow> records = contentStore.stream(metadata)) {
            final ColumnMetadata columnMetadata = metadata.getRowMetadata().getById(columnId);
            final Analyzer<Analyzers.Result> analyzer = analyzerService.build(columnMetadata, SEMANTIC);
            analyzer.init();
            records.map(r -> r.get(columnId)).forEach(analyzer::analyze);
            analyzer.end();
            final List<Analyzers.Result> analyzerResult = analyzer.getResult();
            final StatisticsAdapter statisticsAdapter = new StatisticsAdapter(40);
            statisticsAdapter.adapt(singletonList(columnMetadata), analyzerResult);
            LOG.debug("found {} for dataset #{}, column #{}", columnMetadata.getSemanticDomains(), datasetId, columnId);
            return columnMetadata.getSemanticDomains();
        }
    }
}
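This endpoint streams only the values of the requested column through a semantic analyzer, then reuses StatisticsAdapter (again with the threshold 40) to attach the resulting SemanticDomain suggestions to the column metadata before returning them. For reference, and with purely hypothetical ids, a client fetches these suggestions with a plain GET on the path declared above:

    GET /datasets/myDatasetId/columns/0000/types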