Use of org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics in the data-prep project by Talend.
Class StatisticsUtilsTest, method adaptColumn.
/**
 * Builds one analysis result filled with fixed sample statistics (data type, semantic type, value quality,
 * cardinality, value and pattern frequencies, quantiles, summary, histogram and text length) and runs a
 * {@code StatisticsAdapter} with it on the test columns.
 *
 * NOTE(review): the {@code column} argument is not read by this method; the adapter is always applied to the
 * {@code integerColumn} and {@code stringColumn} fields. Confirm this is intended.
 *
 * @param column the column metadata (currently unused, see note above).
 * @param type the data type recorded once in the data type occurrences.
 */
private void adaptColumn(final ColumnMetadata column, final DataTypeEnum type) {
    final Analyzers.Result analysis = new Analyzers.Result();

    // --- data type: a single occurrence of the requested type
    final DataTypeOccurences typeOccurrences = new DataTypeOccurences();
    typeOccurrences.increment(type);
    analysis.add(typeOccurrences);

    // --- semantic type: main category (score 99)
    final SemanticType semantic = new SemanticType();
    final CategoryFrequency mainCategory = new CategoryFrequency("category 1", "category 1");
    mainCategory.setScore(99);
    semantic.increment(mainCategory, 1);
    analysis.add(semantic);

    // --- suggested types: counted on the same SemanticType instance after it was added to the result
    final CategoryFrequency secondCategory = new CategoryFrequency("category 2", "category 2");
    secondCategory.setScore(81);
    semantic.increment(secondCategory, 1);
    final CategoryFrequency thirdCategory = new CategoryFrequency("category 3", "category 3");
    thirdCategory.setScore(50);
    semantic.increment(thirdCategory, 1);

    // --- value quality: 10 empty, 20 invalid, 30 valid
    final ValueQualityStatistics quality = new ValueQualityStatistics();
    quality.setEmptyCount(10);
    quality.setInvalidCount(20);
    quality.setValidCount(30);
    analysis.add(quality);

    // --- cardinality: one counted value, one distinct value
    final CardinalityStatistics cardinality = new CardinalityStatistics();
    cardinality.incrementCount();
    cardinality.add("distinctValue");
    analysis.add(cardinality);

    // --- value frequencies: two values, each added twice
    final DataTypeFrequencyStatistics valueFrequencies = new DataTypeFrequencyStatistics();
    valueFrequencies.add("frequentValue1");
    valueFrequencies.add("frequentValue1");
    valueFrequencies.add("frequentValue2");
    valueFrequencies.add("frequentValue2");
    analysis.add(valueFrequencies);

    // --- pattern frequencies: two patterns, each added twice
    final PatternFrequencyStatistics patternFrequencies = new PatternFrequencyStatistics();
    patternFrequencies.add("999a999");
    patternFrequencies.add("999a999");
    patternFrequencies.add("999aaaa");
    patternFrequencies.add("999aaaa");
    analysis.add(patternFrequencies);

    // --- quantiles over the values 1 and 2
    final QuantileStatistics quantiles = new QuantileStatistics();
    quantiles.add(1d);
    quantiles.add(2d);
    quantiles.endAddValue();
    analysis.add(quantiles);

    // --- summary over the values 1 and 2
    final SummaryStatistics summary = new SummaryStatistics();
    summary.addData(1d);
    summary.addData(2d);
    analysis.add(summary);

    // --- histogram: two bins, values 1 and 2
    final StreamNumberHistogramStatistics histogram = new StreamNumberHistogramStatistics();
    histogram.setNumberOfBins(2);
    histogram.add(1);
    histogram.add(2);
    analysis.add(histogram);

    // --- text length: max 30, min 10, sum 40, count 5
    final TextLengthStatistics textLength = new TextLengthStatistics();
    textLength.setMaxTextLength(30);
    textLength.setMinTextLength(10);
    textLength.setSumTextLength(40);
    textLength.setCount(5);
    analysis.add(textLength);

    // Apply the same result to both test columns, integer column first.
    final StatisticsAdapter statisticsAdapter = new StatisticsAdapter(40);
    statisticsAdapter.adapt(Collections.singletonList(integerColumn), Collections.singletonList(analysis));
    statisticsAdapter.adapt(Collections.singletonList(stringColumn), Collections.singletonList(analysis));
}
Use of org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics in the data-prep project by Talend.
Class DateParser, method guessPattern.
/**
 * Guess the date pattern of the given value.
 *
 * The value is analyzed with a pattern analyzer built for the column; the most frequent pattern found is then
 * handed to {@code getPatterns} and the first resulting date pattern is returned.
 *
 * @param value the value to guess the date pattern from.
 * @param column the column metadata, used to build the pattern analyzer.
 * @return the first date pattern returned by {@code getPatterns} for the top pattern detected on the value.
 * @throws DateTimeException if the value is empty, if no pattern can be found for it, or if the analyzer fails
 * (in the latter case the original exception is kept as the cause).
 */
DatePattern guessPattern(String value, ColumnMetadata column) {
    if (StringUtils.isEmpty(value)) {
        throw new DateTimeException("No pattern can be found out of '" + value + "'");
    }
    // call DQ on the given value
    try (Analyzer<Analyzers.Result> analyzer = analyzerService.build(column, AnalyzerService.Analysis.PATTERNS)) {
        analyzer.analyze(value);
        analyzer.end();
        // only one value --> only one result
        final Analyzers.Result result = analyzer.getResult().get(0);
        if (!result.exist(PatternFrequencyStatistics.class)) {
            throw new DateTimeException("DQ did not find any pattern for '" + value + "'");
        }
        final PatternFrequencyStatistics patternFrequencyStatistics = result.get(PatternFrequencyStatistics.class);
        final Map<String, Long> topTerms = patternFrequencyStatistics.getTopK(1);
        if (topTerms == null) {
            // no top terms at all: report it as "no pattern" instead of failing on a null dereference
            throw new DateTimeException("DQ did not find any pattern for '" + value + "'");
        }
        final List<PatternFrequency> patterns = new ArrayList<>(topTerms.size());
        topTerms.forEach((pattern, occurrences) -> patterns.add(new PatternFrequency(pattern, occurrences)));
        // get & check the results (computed once and reused for the return value)
        final List<DatePattern> results = getPatterns(patterns);
        if (results.isEmpty()) {
            throw new DateTimeException("DQ did not find any pattern for '" + value + "'");
        }
        // as Christopher L. said : "there can be only one" :-)
        return results.get(0);
    } catch (DateTimeException e) {
        // already carries the precise reason (e.g. no pattern found): do not re-wrap it as an analyzer failure
        throw e;
    } catch (Exception e) {
        throw new DateTimeException("Unable to close analyzer after analyzing value '" + value + "'", e);
    }
}
Use of org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics in the data-prep project by Talend.
Class StatisticsAdapter, method injectPatternFrequency.
/**
 * Replaces the pattern frequencies of the column statistics with the top 15 patterns found in the analysis
 * result. Nothing is changed when the result holds no pattern frequency statistics or when no top terms are
 * available.
 *
 * @param column the column whose statistics are updated.
 * @param result the analysis result to read the pattern frequencies from.
 */
private void injectPatternFrequency(final ColumnMetadata column, final Analyzers.Result result) {
    if (!result.exist(PatternFrequencyStatistics.class)) {
        return;
    }
    final Statistics columnStatistics = column.getStatistics();
    final PatternFrequencyStatistics patternStatistics = result.get(PatternFrequencyStatistics.class);
    final Map<String, Long> mostFrequent = patternStatistics.getTopK(15);
    if (mostFrequent == null) {
        return;
    }
    // drop the previous values before adding the fresh ones
    columnStatistics.getPatternFrequencies().clear();
    for (final Map.Entry<String, Long> entry : mostFrequent.entrySet()) {
        columnStatistics.getPatternFrequencies().add(new PatternFrequency(entry.getKey(), entry.getValue()));
    }
}
Aggregations