Search in sources :

Example 1 with PatternFrequencyStatistics

use of org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics in project data-prep by Talend.

the class StatisticsUtilsTest method adaptColumn.

/**
 * Builds a fully populated analysis result (one statistic of each supported kind, with fixed
 * sample values) and runs it through a {@code StatisticsAdapter} for both {@code integerColumn}
 * and {@code stringColumn}.
 *
 * NOTE(review): the {@code column} parameter is not read anywhere in this method; the adapter is
 * always applied to the {@code integerColumn} and {@code stringColumn} fields. Confirm whether
 * that is intentional.
 *
 * @param column the column metadata (currently unused, see note above).
 * @param type the data type recorded once in the data type occurrences.
 */
private void adaptColumn(final ColumnMetadata column, final DataTypeEnum type) {
    final Analyzers.Result analysis = new Analyzers.Result();

    // Data type: a single occurrence of the requested type.
    final DataTypeOccurences typeOccurrences = new DataTypeOccurences();
    typeOccurrences.increment(type);
    analysis.add(typeOccurrences);

    // Semantic type: the best scoring category is registered before the result is stored...
    final SemanticType semantics = new SemanticType();
    final CategoryFrequency bestCategory = new CategoryFrequency("category 1", "category 1");
    bestCategory.setScore(99);
    semantics.increment(bestCategory, 1);
    analysis.add(semantics);

    // ...and the lower scoring categories are added to the same instance afterwards.
    final CategoryFrequency secondCategory = new CategoryFrequency("category 2", "category 2");
    secondCategory.setScore(81);
    semantics.increment(secondCategory, 1);
    final CategoryFrequency thirdCategory = new CategoryFrequency("category 3", "category 3");
    thirdCategory.setScore(50);
    semantics.increment(thirdCategory, 1);

    // Value quality: 10 empty, 20 invalid, 30 valid.
    final ValueQualityStatistics quality = new ValueQualityStatistics();
    quality.setEmptyCount(10);
    quality.setInvalidCount(20);
    quality.setValidCount(30);
    analysis.add(quality);

    // Cardinality: one counted value, one distinct value.
    final CardinalityStatistics cardinality = new CardinalityStatistics();
    cardinality.incrementCount();
    cardinality.add("distinctValue");
    analysis.add(cardinality);

    // Data frequency: two values, each seen twice.
    final DataTypeFrequencyStatistics valueFrequencies = new DataTypeFrequencyStatistics();
    valueFrequencies.add("frequentValue1");
    valueFrequencies.add("frequentValue1");
    valueFrequencies.add("frequentValue2");
    valueFrequencies.add("frequentValue2");
    analysis.add(valueFrequencies);

    // Pattern frequency: two patterns, each seen twice.
    final PatternFrequencyStatistics patternFrequencies = new PatternFrequencyStatistics();
    patternFrequencies.add("999a999");
    patternFrequencies.add("999a999");
    patternFrequencies.add("999aaaa");
    patternFrequencies.add("999aaaa");
    analysis.add(patternFrequencies);

    // Quantiles over the values 1 and 2.
    final QuantileStatistics quantiles = new QuantileStatistics();
    quantiles.add(1d);
    quantiles.add(2d);
    quantiles.endAddValue();
    analysis.add(quantiles);

    // Summary over the values 1 and 2.
    final SummaryStatistics summary = new SummaryStatistics();
    summary.addData(1d);
    summary.addData(2d);
    analysis.add(summary);

    // Histogram with two bins over the values 1 and 2.
    final StreamNumberHistogramStatistics histogram = new StreamNumberHistogramStatistics();
    histogram.setNumberOfBins(2);
    histogram.add(1);
    histogram.add(2);
    analysis.add(histogram);

    // Text length: min 10, max 30, sum 40 over 5 values.
    final TextLengthStatistics textLength = new TextLengthStatistics();
    textLength.setMaxTextLength(30);
    textLength.setMinTextLength(10);
    textLength.setSumTextLength(40);
    textLength.setCount(5);
    analysis.add(textLength);

    // Apply the same result to both columns.
    final StatisticsAdapter statisticsAdapter = new StatisticsAdapter(40);
    statisticsAdapter.adapt(Collections.singletonList(integerColumn), Collections.singletonList(analysis));
    statisticsAdapter.adapt(Collections.singletonList(stringColumn), Collections.singletonList(analysis));
}
Also used : SemanticType(org.talend.dataquality.semantic.statistics.SemanticType) DataTypeFrequencyStatistics(org.talend.dataquality.statistics.frequency.DataTypeFrequencyStatistics) Analyzers(org.talend.dataquality.common.inference.Analyzers) CategoryFrequency(org.talend.dataquality.semantic.recognizer.CategoryFrequency) ValueQualityStatistics(org.talend.dataquality.common.inference.ValueQualityStatistics) SummaryStatistics(org.talend.dataquality.statistics.numeric.summary.SummaryStatistics) QuantileStatistics(org.talend.dataquality.statistics.numeric.quantile.QuantileStatistics) TextLengthStatistics(org.talend.dataquality.statistics.text.TextLengthStatistics) StatisticsAdapter(org.talend.dataprep.dataset.StatisticsAdapter) CardinalityStatistics(org.talend.dataquality.statistics.cardinality.CardinalityStatistics) StreamNumberHistogramStatistics(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramStatistics) DataTypeOccurences(org.talend.dataquality.statistics.type.DataTypeOccurences) PatternFrequencyStatistics(org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics)

Example 2 with PatternFrequencyStatistics

use of org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics in project data-prep by Talend.

the class DateParser method guessPattern.

/**
 * Guess the date pattern of the given value.
 *
 * <p>
 * The value is run through a pattern analyzer built for the column, the most frequent pattern
 * reported by the analysis is converted with {@code getPatterns}, and the first resulting
 * pattern is returned.
 *
 * @param value the value to guess the date pattern from.
 * @param column the column metadata used to build the analyzer.
 * @return the first date pattern found for the given value.
 * @throws DateTimeException if the value is empty, if no pattern could be found for it, or if
 * the analysis (including closing the analyzer) fails.
 */
DatePattern guessPattern(String value, ColumnMetadata column) {
    if (StringUtils.isEmpty(value)) {
        throw new DateTimeException("No pattern can be found out of '" + value + "'");
    }
    // call DQ on the given value
    try (Analyzer<Analyzers.Result> analyzer = analyzerService.build(column, AnalyzerService.Analysis.PATTERNS)) {
        analyzer.analyze(value);
        analyzer.end();
        // only one value --> only one result
        final Analyzers.Result result = analyzer.getResult().get(0);
        if (!result.exist(PatternFrequencyStatistics.class)) {
            throw new DateTimeException("DQ did not find any pattern for '" + value + "'");
        }
        final PatternFrequencyStatistics patternFrequencyStatistics = result.get(PatternFrequencyStatistics.class);
        final Map<String, Long> topTerms = patternFrequencyStatistics.getTopK(1);
        final List<PatternFrequency> patterns = new ArrayList<>(1);
        for (Map.Entry<String, Long> term : topTerms.entrySet()) {
            patterns.add(new PatternFrequency(term.getKey(), term.getValue()));
        }
        // get & check the results (computed once and reused for the return value)
        final List<DatePattern> results = getPatterns(patterns);
        if (results.isEmpty()) {
            throw new DateTimeException("DQ did not find any pattern for '" + value + "'");
        }
        // as Christopher L. said : "there can be only one" :-)
        return results.get(0);
    } catch (DateTimeException e) {
        // "no pattern found" errors raised above must reach the caller as they are, not be
        // re-wrapped by the generic handler below under an unrelated message.
        throw e;
    } catch (Exception e) {
        // any other failure while analyzing the value or closing the analyzer
        throw new DateTimeException("Unable to close analyzer after analyzing value '" + value + "'", e);
    }
}
Also used : Analyzers(org.talend.dataquality.common.inference.Analyzers) DateTimeException(java.time.DateTimeException) PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) PatternFrequencyStatistics(org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics)

Example 3 with PatternFrequencyStatistics

use of org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics in project data-prep by Talend.

the class StatisticsAdapter method injectPatternFrequency.

/**
 * Replaces the pattern frequencies of the column statistics with the 15 most frequent patterns
 * found in the analysis result.
 *
 * <p>
 * Nothing is changed when the result holds no {@code PatternFrequencyStatistics} or when the
 * top patterns map is {@code null}.
 *
 * @param column the column whose statistics are updated.
 * @param result the analysis result to read the pattern frequencies from.
 */
private void injectPatternFrequency(final ColumnMetadata column, final Analyzers.Result result) {
    if (!result.exist(PatternFrequencyStatistics.class)) {
        return;
    }
    final Statistics columnStatistics = column.getStatistics();
    final Map<String, Long> mostFrequentPatterns = result.get(PatternFrequencyStatistics.class).getTopK(15);
    if (mostFrequentPatterns == null) {
        return;
    }
    // Drop previously stored frequencies before injecting the fresh ones.
    columnStatistics.getPatternFrequencies().clear();
    for (Map.Entry<String, Long> entry : mostFrequentPatterns.entrySet()) {
        columnStatistics.getPatternFrequencies().add(new PatternFrequency(entry.getKey(), entry.getValue()));
    }
}
Also used : PatternFrequencyStatistics(org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics) CardinalityStatistics(org.talend.dataquality.statistics.cardinality.CardinalityStatistics) DataTypeFrequencyStatistics(org.talend.dataquality.statistics.frequency.DataTypeFrequencyStatistics) StreamNumberHistogramStatistics(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramStatistics) ValueQualityStatistics(org.talend.dataquality.common.inference.ValueQualityStatistics) SummaryStatistics(org.talend.dataquality.statistics.numeric.summary.SummaryStatistics) StreamDateHistogramStatistics(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics) TextLengthStatistics(org.talend.dataquality.statistics.text.TextLengthStatistics) QuantileStatistics(org.talend.dataquality.statistics.numeric.quantile.QuantileStatistics)

Aggregations

PatternFrequencyStatistics (org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics): 3
StreamNumberHistogramStatistics (org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramStatistics): 2
Analyzers (org.talend.dataquality.common.inference.Analyzers): 2
ValueQualityStatistics (org.talend.dataquality.common.inference.ValueQualityStatistics): 2
CardinalityStatistics (org.talend.dataquality.statistics.cardinality.CardinalityStatistics): 2
DataTypeFrequencyStatistics (org.talend.dataquality.statistics.frequency.DataTypeFrequencyStatistics): 2
QuantileStatistics (org.talend.dataquality.statistics.numeric.quantile.QuantileStatistics): 2
SummaryStatistics (org.talend.dataquality.statistics.numeric.summary.SummaryStatistics): 2
TextLengthStatistics (org.talend.dataquality.statistics.text.TextLengthStatistics): 2
DateTimeException (java.time.DateTimeException): 1
PatternFrequency (org.talend.dataprep.api.dataset.statistics.PatternFrequency): 1
StreamDateHistogramStatistics (org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics): 1
StatisticsAdapter (org.talend.dataprep.dataset.StatisticsAdapter): 1
CategoryFrequency (org.talend.dataquality.semantic.recognizer.CategoryFrequency): 1
SemanticType (org.talend.dataquality.semantic.statistics.SemanticType): 1
DataTypeOccurences (org.talend.dataquality.statistics.type.DataTypeOccurences): 1