Search in sources :

Example 1 with SemanticAnalyzer

Use of org.talend.dataquality.semantic.statistics.SemanticAnalyzer in project data-prep by Talend.

The method build of the class AnalyzerService.

/**
 * Build a {@link Analyzer} to analyze records with columns (in <code>columns</code>). <code>settings</code> give
 * all the wanted analysis settings for the analyzer.
 * <p>
 * The requested settings are first reduced to a set: <code>null</code> entries are ignored, duplicates are
 * collapsed and the declared dependencies of each requested {@link Analysis} are added. One analyzer (or group of
 * analyzers) is then created per element of that set, the analyzers are merged into a single one and
 * {@link Analyzer#init()} is called on the merged analyzer before it is returned.
 * <p>
 * When the logger of this class has debug enabled, the returned analyzer is wrapped in a
 * {@link ResourceMonitoredAnalyzer}.
 *
 * @param columns  A list of columns, may be null or empty (a {@link NullAnalyzer} based analyzer is then returned).
 * @param settings A varargs with {@link Analysis}. Duplicates and <code>null</code> values are possible in varargs:
 *                 duplicates will be considered only once, <code>null</code> values are skipped.
 * @return A ready to use {@link Analyzer}.
 * @throws IllegalArgumentException if a requested {@link Analysis} has no matching analyzer implementation here.
 */
public Analyzer<Analyzers.Result> build(List<ColumnMetadata> columns, Analysis... settings) {
    if (columns == null || columns.isEmpty() || settings == null) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Get all needed analysis (null-safe, de-duplicated, with dependencies)
    final Set<Analysis> all = EnumSet.noneOf(Analysis.class);
    for (Analysis setting : settings) {
        if (setting != null) {
            all.add(setting);
            all.addAll(Arrays.asList(setting.dependencies));
        }
    }
    if (all.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Column types
    final DataTypeEnum[] types = TypeUtils.convert(columns);
    // Semantic domains (columns without a domain fall back to the UNKNOWN category id)
    final String[] domains = columns.stream()
            .map(ColumnMetadata::getDomain)
            .map(d -> StringUtils.isBlank(d) ? SemanticCategoryEnum.UNKNOWN.getId() : d)
            .toArray(String[]::new);
    final DictionarySnapshot dictionarySnapshot = dictionarySnapshotProvider.get();
    // Build all analyzers.
    // Iterate over the computed set (not the raw varargs): a null entry would make the switch throw a
    // NullPointerException, a duplicate entry would register the same analyzer twice and dependencies would be
    // silently dropped.
    // NOTE(review): analyzers are now created in enum declaration order instead of caller order - confirm that
    // no consumer relies on the position of an analyzer in the merged result.
    final List<Analyzer> analyzers = new ArrayList<>();
    for (Analysis setting : all) {
        switch (setting) {
            case SEMANTIC:
                final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(dictionarySnapshot);
                semanticAnalyzer.setLimit(Integer.MAX_VALUE);
                semanticAnalyzer.setMetadata(Metadata.HEADER_NAME, extractColumnNames(columns));
                analyzers.add(semanticAnalyzer);
                break;
            case HISTOGRAM:
                analyzers.add(new StreamDateHistogramAnalyzer(columns, types, dateParser));
                analyzers.add(new StreamNumberHistogramAnalyzer(types));
                break;
            case QUALITY:
                final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(types);
                columns.forEach(c -> dataTypeQualityAnalyzer
                        .addCustomDateTimePattern(RowMetadataUtils.getMostUsedDatePattern(c)));
                analyzers.add(new ValueQualityAnalyzer(dataTypeQualityAnalyzer,
                        new SemanticQualityAnalyzer(dictionarySnapshot, domains, false), true)); // NOSONAR
                break;
            case CARDINALITY:
                analyzers.add(new CardinalityAnalyzer());
                break;
            case PATTERNS:
                analyzers.add(buildPatternAnalyzer(columns));
                break;
            case LENGTH:
                analyzers.add(new TextLengthAnalyzer());
                break;
            case QUANTILES:
                // Quantiles are only built when at least one column is numeric.
                final boolean acceptQuantiles = Arrays.stream(types)
                        .anyMatch(type -> type == DataTypeEnum.INTEGER || type == DataTypeEnum.DOUBLE);
                if (acceptQuantiles) {
                    analyzers.add(new QuantileAnalyzer(types));
                }
                break;
            case SUMMARY:
                analyzers.add(new SummaryAnalyzer(types));
                break;
            case TYPE:
                // TYPE is skipped whenever QUALITY is part of the effective analysis set.
                if (all.contains(Analysis.QUALITY)) {
                    LOGGER.warn("Disabled {} analysis (conflicts with {}).", setting, Analysis.QUALITY);
                } else {
                    final List<String> mostUsedDatePatterns = getMostUsedDatePatterns(columns);
                    analyzers.add(new DataTypeAnalyzer(mostUsedDatePatterns));
                }
                break;
            case FREQUENCY:
                analyzers.add(new DataTypeFrequencyAnalyzer());
                break;
            default:
                throw new IllegalArgumentException("Missing support for '" + setting + "'.");
        }
    }
    // Merge all analyzers into one
    final Analyzer<Analyzers.Result> analyzer = Analyzers.with(analyzers.toArray(new Analyzer[0]));
    analyzer.init();
    if (LOGGER.isDebugEnabled()) {
        // Wrap analyzer for usage monitoring (to diagnose non-closed analyzer issues).
        return new ResourceMonitoredAnalyzer(analyzer);
    }
    return analyzer;
}
Also used : Analyzers(org.talend.dataquality.common.inference.Analyzers) java.util(java.util) StringUtils(org.apache.commons.lang.StringUtils) CardinalityStatistics(org.talend.dataquality.statistics.cardinality.CardinalityStatistics) TypeUtils(org.talend.dataprep.api.type.TypeUtils) Metadata(org.talend.dataquality.common.inference.Metadata) DateParser(org.talend.dataprep.transformation.actions.date.DateParser) DataTypeFrequencyStatistics(org.talend.dataquality.statistics.frequency.DataTypeFrequencyStatistics) LoggerFactory(org.slf4j.LoggerFactory) SemanticCategoryEnum(org.talend.dataquality.semantic.classifier.SemanticCategoryEnum) TextLengthAnalyzer(org.talend.dataquality.statistics.text.TextLengthAnalyzer) DateTimePatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.DateTimePatternRecognizer) DataTypeEnum(org.talend.dataquality.statistics.type.DataTypeEnum) ValueQualityStatistics(org.talend.dataquality.common.inference.ValueQualityStatistics) ValueQualityAnalyzer(org.talend.dataquality.statistics.quality.ValueQualityAnalyzer) SummaryStatistics(org.talend.dataquality.statistics.numeric.summary.SummaryStatistics) AbstractFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.AbstractFrequencyAnalyzer) SemanticQualityAnalyzer(org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer) StreamNumberHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer) EmptyPatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.EmptyPatternRecognizer) SemanticType(org.talend.dataquality.semantic.statistics.SemanticType) PrintWriter(java.io.PrintWriter) DictionarySnapshotProvider(org.talend.dataquality.semantic.snapshot.DictionarySnapshotProvider) DataTypeFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer) Logger(org.slf4j.Logger) 
LatinExtendedCharPatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.LatinExtendedCharPatternRecognizer) QuantileAnalyzer(org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer) StringWriter(java.io.StringWriter) RowMetadataUtils(org.talend.dataprep.api.dataset.row.RowMetadataUtils) StreamDateHistogramStatistics(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics) StandardDictionarySnapshotProvider(org.talend.dataquality.semantic.snapshot.StandardDictionarySnapshotProvider) StreamDateHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer) NullAnalyzer(org.talend.dataprep.transformation.api.transformer.json.NullAnalyzer) DictionarySnapshot(org.talend.dataquality.semantic.snapshot.DictionarySnapshot) CardinalityAnalyzer(org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer) TextLengthStatistics(org.talend.dataquality.statistics.text.TextLengthStatistics) DataTypeAnalyzer(org.talend.dataquality.statistics.type.DataTypeAnalyzer) Collectors(java.util.stream.Collectors) SummaryAnalyzer(org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer) SemanticAnalyzer(org.talend.dataquality.semantic.statistics.SemanticAnalyzer) PatternFrequencyStatistics(org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics) Analyzer(org.talend.dataquality.common.inference.Analyzer) QuantileStatistics(org.talend.dataquality.statistics.numeric.quantile.QuantileStatistics) AbstractPatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.AbstractPatternRecognizer) DataTypeQualityAnalyzer(org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) CompositePatternFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.pattern.CompositePatternFrequencyAnalyzer) DataTypeOccurences(org.talend.dataquality.statistics.type.DataTypeOccurences) 
ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) TextLengthAnalyzer(org.talend.dataquality.statistics.text.TextLengthAnalyzer) ValueQualityAnalyzer(org.talend.dataquality.statistics.quality.ValueQualityAnalyzer) AbstractFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.AbstractFrequencyAnalyzer) SemanticQualityAnalyzer(org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer) StreamNumberHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer) DataTypeFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer) QuantileAnalyzer(org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer) StreamDateHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer) NullAnalyzer(org.talend.dataprep.transformation.api.transformer.json.NullAnalyzer) CardinalityAnalyzer(org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer) DataTypeAnalyzer(org.talend.dataquality.statistics.type.DataTypeAnalyzer) SummaryAnalyzer(org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer) SemanticAnalyzer(org.talend.dataquality.semantic.statistics.SemanticAnalyzer) Analyzer(org.talend.dataquality.common.inference.Analyzer) DataTypeQualityAnalyzer(org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer) CompositePatternFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.pattern.CompositePatternFrequencyAnalyzer) DataTypeQualityAnalyzer(org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer) DataTypeEnum(org.talend.dataquality.statistics.type.DataTypeEnum) StreamNumberHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer) SummaryAnalyzer(org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer) CardinalityAnalyzer(org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer) 
DataTypeFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer) TextLengthAnalyzer(org.talend.dataquality.statistics.text.TextLengthAnalyzer) SemanticAnalyzer(org.talend.dataquality.semantic.statistics.SemanticAnalyzer) QuantileAnalyzer(org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer) ValueQualityAnalyzer(org.talend.dataquality.statistics.quality.ValueQualityAnalyzer) SemanticQualityAnalyzer(org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer) StreamDateHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer) DictionarySnapshot(org.talend.dataquality.semantic.snapshot.DictionarySnapshot) DataTypeAnalyzer(org.talend.dataquality.statistics.type.DataTypeAnalyzer)

Aggregations

PrintWriter (java.io.PrintWriter)1 StringWriter (java.io.StringWriter)1 java.util (java.util)1 Collectors (java.util.stream.Collectors)1 StringUtils (org.apache.commons.lang.StringUtils)1 Logger (org.slf4j.Logger)1 LoggerFactory (org.slf4j.LoggerFactory)1 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)1 RowMetadataUtils (org.talend.dataprep.api.dataset.row.RowMetadataUtils)1 StreamDateHistogramAnalyzer (org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer)1 StreamDateHistogramStatistics (org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics)1 StreamNumberHistogramAnalyzer (org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer)1 TypeUtils (org.talend.dataprep.api.type.TypeUtils)1 DateParser (org.talend.dataprep.transformation.actions.date.DateParser)1 NullAnalyzer (org.talend.dataprep.transformation.api.transformer.json.NullAnalyzer)1 Analyzer (org.talend.dataquality.common.inference.Analyzer)1 Analyzers (org.talend.dataquality.common.inference.Analyzers)1 Metadata (org.talend.dataquality.common.inference.Metadata)1 ValueQualityStatistics (org.talend.dataquality.common.inference.ValueQualityStatistics)1 SemanticCategoryEnum (org.talend.dataquality.semantic.classifier.SemanticCategoryEnum)1