Search in sources :

Example 1 with Analyzers

use of org.talend.dataquality.common.inference.Analyzers in project data-prep by Talend.

The method build of the class AnalyzerService.

/**
 * Builds an {@link Analyzer} to analyze records with columns (in <code>columns</code>). <code>settings</code> lists
 * the analyses the returned analyzer must perform.
 * <p>
 * When there is no column, or no non-null setting, an analyzer wrapping {@link NullAnalyzer#INSTANCE} is returned.
 * Otherwise one analyzer is created per distinct requested {@link Analysis} (in order of first appearance in
 * <code>settings</code>), all of them are merged into a single analyzer and that analyzer is initialized before
 * being returned. When debug logging is enabled, the result is wrapped in a {@link ResourceMonitoredAnalyzer}.
 * </p>
 *
 * @param columns  A list of columns, may be null or empty.
 * @param settings A varargs with {@link Analysis}. Null entries are ignored and duplicates are considered only
 *                 once.
 * @return A ready to use {@link Analyzer}.
 * @throws IllegalArgumentException If a requested {@link Analysis} has no matching analyzer in this method.
 */
public Analyzer<Analyzers.Result> build(List<ColumnMetadata> columns, Analysis... settings) {
    if (columns == null || columns.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Get all needed analysis (requested ones plus their declared dependencies)
    final Set<Analysis> all = EnumSet.noneOf(Analysis.class);
    for (Analysis setting : settings) {
        if (setting != null) {
            all.add(setting);
            all.addAll(Arrays.asList(setting.dependencies));
        }
    }
    if (all.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // NOTE(review): 'all' (which includes dependencies) is only used for the emptiness check above; analyzers
    // below are built from the explicitly requested settings only. Confirm whether dependencies should also get
    // an analyzer.

    // Column types
    DataTypeEnum[] types = TypeUtils.convert(columns);
    // Semantic domains (blank domains are replaced with the UNKNOWN category id)
    final String[] domains = columns.stream() //
            .map(ColumnMetadata::getDomain) //
            .map(d -> StringUtils.isBlank(d) ? SemanticCategoryEnum.UNKNOWN.getId() : d) //
            .toArray(String[]::new);
    DictionarySnapshot dictionarySnapshot = dictionarySnapshotProvider.get();
    // Build all analyzers
    List<Analyzer> analyzers = new ArrayList<>();
    // Tracks settings that already got their analyzer(s), so duplicates in the varargs are considered only once.
    final Set<Analysis> handled = EnumSet.noneOf(Analysis.class);
    for (Analysis setting : settings) {
        // Null entries are ignored (same as when collecting 'all'): switching on a null enum would throw a NPE.
        if (setting == null || !handled.add(setting)) {
            continue;
        }
        switch (setting) {
            case SEMANTIC:
                final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(dictionarySnapshot);
                semanticAnalyzer.setLimit(Integer.MAX_VALUE);
                semanticAnalyzer.setMetadata(Metadata.HEADER_NAME, extractColumnNames(columns));
                analyzers.add(semanticAnalyzer);
                break;
            case HISTOGRAM:
                // Histogram is split in two analyzers: one for dates, one for numbers.
                analyzers.add(new StreamDateHistogramAnalyzer(columns, types, dateParser));
                analyzers.add(new StreamNumberHistogramAnalyzer(types));
                break;
            case QUALITY:
                final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(types);
                columns.forEach(c -> dataTypeQualityAnalyzer.addCustomDateTimePattern(RowMetadataUtils.getMostUsedDatePattern(c)));
                analyzers.add(new ValueQualityAnalyzer(dataTypeQualityAnalyzer, new SemanticQualityAnalyzer(dictionarySnapshot, domains, false), // NOSONAR
                true));
                break;
            case CARDINALITY:
                analyzers.add(new CardinalityAnalyzer());
                break;
            case PATTERNS:
                analyzers.add(buildPatternAnalyzer(columns));
                break;
            case LENGTH:
                analyzers.add(new TextLengthAnalyzer());
                break;
            case QUANTILES:
                // Quantiles are only computed when at least one column is numeric (INTEGER or DOUBLE).
                boolean acceptQuantiles = false;
                for (DataTypeEnum type : types) {
                    if (type == DataTypeEnum.INTEGER || type == DataTypeEnum.DOUBLE) {
                        acceptQuantiles = true;
                        break;
                    }
                }
                if (acceptQuantiles) {
                    analyzers.add(new QuantileAnalyzer(types));
                }
                break;
            case SUMMARY:
                analyzers.add(new SummaryAnalyzer(types));
                break;
            case TYPE:
                // TYPE analysis is disabled when QUALITY is explicitly requested as well.
                if (Arrays.asList(settings).contains(Analysis.QUALITY)) {
                    LOGGER.warn("Disabled {} analysis (conflicts with {}).", setting, Analysis.QUALITY);
                } else {
                    final List<String> mostUsedDatePatterns = getMostUsedDatePatterns(columns);
                    analyzers.add(new DataTypeAnalyzer(mostUsedDatePatterns));
                }
                break;
            case FREQUENCY:
                analyzers.add(new DataTypeFrequencyAnalyzer());
                break;
            default:
                throw new IllegalArgumentException("Missing support for '" + setting + "'.");
        }
    }
    // Merge all analyzers into one
    final Analyzer<Analyzers.Result> analyzer = Analyzers.with(analyzers.toArray(new Analyzer[analyzers.size()]));
    analyzer.init();
    if (LOGGER.isDebugEnabled()) {
        // Wrap analyzer for usage monitoring (to diagnose non-closed analyzer issues).
        return new ResourceMonitoredAnalyzer(analyzer);
    } else {
        return analyzer;
    }
}
Also used : Analyzers(org.talend.dataquality.common.inference.Analyzers) java.util(java.util) StringUtils(org.apache.commons.lang.StringUtils) CardinalityStatistics(org.talend.dataquality.statistics.cardinality.CardinalityStatistics) TypeUtils(org.talend.dataprep.api.type.TypeUtils) Metadata(org.talend.dataquality.common.inference.Metadata) DateParser(org.talend.dataprep.transformation.actions.date.DateParser) DataTypeFrequencyStatistics(org.talend.dataquality.statistics.frequency.DataTypeFrequencyStatistics) LoggerFactory(org.slf4j.LoggerFactory) SemanticCategoryEnum(org.talend.dataquality.semantic.classifier.SemanticCategoryEnum) TextLengthAnalyzer(org.talend.dataquality.statistics.text.TextLengthAnalyzer) DateTimePatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.DateTimePatternRecognizer) DataTypeEnum(org.talend.dataquality.statistics.type.DataTypeEnum) ValueQualityStatistics(org.talend.dataquality.common.inference.ValueQualityStatistics) ValueQualityAnalyzer(org.talend.dataquality.statistics.quality.ValueQualityAnalyzer) SummaryStatistics(org.talend.dataquality.statistics.numeric.summary.SummaryStatistics) AbstractFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.AbstractFrequencyAnalyzer) SemanticQualityAnalyzer(org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer) StreamNumberHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer) EmptyPatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.EmptyPatternRecognizer) SemanticType(org.talend.dataquality.semantic.statistics.SemanticType) PrintWriter(java.io.PrintWriter) DictionarySnapshotProvider(org.talend.dataquality.semantic.snapshot.DictionarySnapshotProvider) DataTypeFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer) Logger(org.slf4j.Logger) 
LatinExtendedCharPatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.LatinExtendedCharPatternRecognizer) QuantileAnalyzer(org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer) StringWriter(java.io.StringWriter) RowMetadataUtils(org.talend.dataprep.api.dataset.row.RowMetadataUtils) StreamDateHistogramStatistics(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics) StandardDictionarySnapshotProvider(org.talend.dataquality.semantic.snapshot.StandardDictionarySnapshotProvider) StreamDateHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer) NullAnalyzer(org.talend.dataprep.transformation.api.transformer.json.NullAnalyzer) DictionarySnapshot(org.talend.dataquality.semantic.snapshot.DictionarySnapshot) CardinalityAnalyzer(org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer) TextLengthStatistics(org.talend.dataquality.statistics.text.TextLengthStatistics) DataTypeAnalyzer(org.talend.dataquality.statistics.type.DataTypeAnalyzer) Collectors(java.util.stream.Collectors) SummaryAnalyzer(org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer) SemanticAnalyzer(org.talend.dataquality.semantic.statistics.SemanticAnalyzer) PatternFrequencyStatistics(org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics) Analyzer(org.talend.dataquality.common.inference.Analyzer) QuantileStatistics(org.talend.dataquality.statistics.numeric.quantile.QuantileStatistics) AbstractPatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.AbstractPatternRecognizer) DataTypeQualityAnalyzer(org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) CompositePatternFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.pattern.CompositePatternFrequencyAnalyzer) DataTypeOccurences(org.talend.dataquality.statistics.type.DataTypeOccurences) 
ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) TextLengthAnalyzer(org.talend.dataquality.statistics.text.TextLengthAnalyzer) ValueQualityAnalyzer(org.talend.dataquality.statistics.quality.ValueQualityAnalyzer) AbstractFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.AbstractFrequencyAnalyzer) SemanticQualityAnalyzer(org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer) StreamNumberHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer) DataTypeFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer) QuantileAnalyzer(org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer) StreamDateHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer) NullAnalyzer(org.talend.dataprep.transformation.api.transformer.json.NullAnalyzer) CardinalityAnalyzer(org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer) DataTypeAnalyzer(org.talend.dataquality.statistics.type.DataTypeAnalyzer) SummaryAnalyzer(org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer) SemanticAnalyzer(org.talend.dataquality.semantic.statistics.SemanticAnalyzer) Analyzer(org.talend.dataquality.common.inference.Analyzer) DataTypeQualityAnalyzer(org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer) CompositePatternFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.pattern.CompositePatternFrequencyAnalyzer) DataTypeQualityAnalyzer(org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer) DataTypeEnum(org.talend.dataquality.statistics.type.DataTypeEnum) StreamNumberHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer) SummaryAnalyzer(org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer) CardinalityAnalyzer(org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer) 
DataTypeFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer) TextLengthAnalyzer(org.talend.dataquality.statistics.text.TextLengthAnalyzer) SemanticAnalyzer(org.talend.dataquality.semantic.statistics.SemanticAnalyzer) QuantileAnalyzer(org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer) ValueQualityAnalyzer(org.talend.dataquality.statistics.quality.ValueQualityAnalyzer) SemanticQualityAnalyzer(org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer) StreamDateHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer) DictionarySnapshot(org.talend.dataquality.semantic.snapshot.DictionarySnapshot) DataTypeAnalyzer(org.talend.dataquality.statistics.type.DataTypeAnalyzer)

Example 2 with Analyzers

use of org.talend.dataquality.common.inference.Analyzers in project data-prep by Talend.

The method analyze of the class SchemaAnalysis.

/**
 * Runs the schema analysis of a data set while holding the data set metadata lock.
 * <p>
 * At most 100 rows of the content are fed to the analyzer returned by
 * <code>analyzerService.schemaAnalysis(columns)</code>; the analyzer result is then applied to the columns through
 * <code>adapter</code>, the lifecycle is flagged as schema analyzed and the metadata is saved. If the metadata can
 * not be found, the method only logs and returns.
 * </p>
 *
 * @param dataSetId The id of the data set to analyze, cannot be null or empty.
 * @throws IllegalArgumentException If <code>dataSetId</code> is null or empty.
 */
@Override
public void analyze(String dataSetId) {
    if (StringUtils.isEmpty(dataSetId)) {
        throw new IllegalArgumentException("Data set id cannot be null or empty.");
    }
    final DistributedLock lock = repository.createDatasetMetadataLock(dataSetId);
    lock.lock();
    try {
        final DataSetMetadata dataSetMetadata = repository.get(dataSetId);
        if (dataSetMetadata == null) {
            // Nothing to analyze: the metadata is gone.
            LOGGER.info("Unable to analyze schema of data set #{}: seems to be removed.", dataSetId);
            return;
        }
        // Schema analysis (content stream and analyzer are both closed by the try-with-resources blocks)
        try (Stream<DataSetRow> rows = store.stream(dataSetMetadata, 100)) {
            LOGGER.info("Analyzing schema in dataset #{}...", dataSetId);
            // Configure analyzers
            final List<ColumnMetadata> columnList = dataSetMetadata.getRowMetadata().getColumns();
            try (Analyzer<Analyzers.Result> schemaAnalyzer = analyzerService.schemaAnalysis(columnList)) {
                // Determine schema for the content (row values without the TDP id).
                rows.limit(100) //
                        .map(dataSetRow -> dataSetRow.toArray(DataSetRow.SKIP_TDP_ID)) //
                        .forEach(schemaAnalyzer::analyze);
                // Find the best suitable type
                adapter.adapt(columnList, schemaAnalyzer.getResult());
                LOGGER.info("Analyzed schema in dataset #{}.", dataSetId);
                dataSetMetadata.getLifecycle().schemaAnalyzed(true);
                repository.save(dataSetMetadata);
            }
        } catch (Exception e) {
            LOGGER.error("Unable to analyse schema for dataset " + dataSetId + ".", e);
            TDPException.rethrowOrWrap(e, UNABLE_TO_ANALYZE_COLUMN_TYPES);
        }
    } finally {
        // Always release the lock, whatever the outcome of the analysis.
        lock.unlock();
    }
}
Also used : Analyzers(org.talend.dataquality.common.inference.Analyzers) StringUtils(org.apache.commons.lang.StringUtils) TDPException(org.talend.dataprep.exception.TDPException) Logger(org.slf4j.Logger) DataSetMetadataRepository(org.talend.dataprep.dataset.store.metadata.DataSetMetadataRepository) LoggerFactory(org.slf4j.LoggerFactory) Autowired(org.springframework.beans.factory.annotation.Autowired) AnalyzerService(org.talend.dataprep.quality.AnalyzerService) List(java.util.List) Component(org.springframework.stereotype.Component) Stream(java.util.stream.Stream) UNABLE_TO_ANALYZE_COLUMN_TYPES(org.talend.dataprep.exception.error.DataSetErrorCodes.UNABLE_TO_ANALYZE_COLUMN_TYPES) DistributedLock(org.talend.dataprep.lock.DistributedLock) StatisticsAdapter(org.talend.dataprep.dataset.StatisticsAdapter) DataSetRow(org.talend.dataprep.api.dataset.row.DataSetRow) Analyzer(org.talend.dataquality.common.inference.Analyzer) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) ContentStoreRouter(org.talend.dataprep.dataset.store.content.ContentStoreRouter) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) DistributedLock(org.talend.dataprep.lock.DistributedLock) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) DataSetRow(org.talend.dataprep.api.dataset.row.DataSetRow) TDPException(org.talend.dataprep.exception.TDPException)

Aggregations

StringUtils (org.apache.commons.lang.StringUtils)2 Logger (org.slf4j.Logger)2 LoggerFactory (org.slf4j.LoggerFactory)2 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)2 Analyzer (org.talend.dataquality.common.inference.Analyzer)2 Analyzers (org.talend.dataquality.common.inference.Analyzers)2 PrintWriter (java.io.PrintWriter)1 StringWriter (java.io.StringWriter)1 java.util (java.util)1 List (java.util.List)1 Collectors (java.util.stream.Collectors)1 Stream (java.util.stream.Stream)1 Autowired (org.springframework.beans.factory.annotation.Autowired)1 Component (org.springframework.stereotype.Component)1 DataSetMetadata (org.talend.dataprep.api.dataset.DataSetMetadata)1 DataSetRow (org.talend.dataprep.api.dataset.row.DataSetRow)1 RowMetadataUtils (org.talend.dataprep.api.dataset.row.RowMetadataUtils)1 StreamDateHistogramAnalyzer (org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer)1 StreamDateHistogramStatistics (org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics)1 StreamNumberHistogramAnalyzer (org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer)1