Use of org.talend.dataquality.common.inference.Analyzers in project data-prep by Talend.
The class AnalyzerService, method build:
/**
 * Builds an {@link Analyzer} to analyze records with the given <code>columns</code>. <code>settings</code>
 * lists all the analyses the analyzer should perform.
 *
 * @param columns A list of columns; may be null or empty.
 * @param settings A varargs of {@link Analysis}. Duplicates are possible in the varargs but are considered
 * only once.
 * @return A ready-to-use {@link Analyzer}.
 */
public Analyzer<Analyzers.Result> build(List<ColumnMetadata> columns, Analysis... settings) {
    if (columns == null || columns.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Get all needed analyses (the EnumSet de-duplicates and pulls in each setting's dependencies)
    final Set<Analysis> all = EnumSet.noneOf(Analysis.class);
    for (Analysis setting : settings) {
        if (setting != null) {
            all.add(setting);
            all.addAll(Arrays.asList(setting.dependencies));
        }
    }
    if (all.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Column types
    DataTypeEnum[] types = TypeUtils.convert(columns);
    // Semantic domains (blank domains fall back to UNKNOWN)
    List<String> domainList = columns.stream() //
            .map(ColumnMetadata::getDomain) //
            .map(d -> StringUtils.isBlank(d) ? SemanticCategoryEnum.UNKNOWN.getId() : d) //
            .collect(Collectors.toList());
    final String[] domains = domainList.toArray(new String[domainList.size()]);
    DictionarySnapshot dictionarySnapshot = dictionarySnapshotProvider.get();
    // Build all analyzers
    List<Analyzer> analyzers = new ArrayList<>();
    for (Analysis setting : all) { // iterate the de-duplicated set (not the raw varargs), per the javadoc contract
        switch (setting) {
        case SEMANTIC:
            final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(dictionarySnapshot);
            semanticAnalyzer.setLimit(Integer.MAX_VALUE);
            semanticAnalyzer.setMetadata(Metadata.HEADER_NAME, extractColumnNames(columns));
            analyzers.add(semanticAnalyzer);
            break;
        case HISTOGRAM:
            analyzers.add(new StreamDateHistogramAnalyzer(columns, types, dateParser));
            analyzers.add(new StreamNumberHistogramAnalyzer(types));
            break;
        case QUALITY:
            final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(types);
            columns.forEach(c -> dataTypeQualityAnalyzer
                    .addCustomDateTimePattern(RowMetadataUtils.getMostUsedDatePattern(c)));
            analyzers.add(new ValueQualityAnalyzer(dataTypeQualityAnalyzer,
                    new SemanticQualityAnalyzer(dictionarySnapshot, domains, false), true)); // NOSONAR
            break;
        case CARDINALITY:
            analyzers.add(new CardinalityAnalyzer());
            break;
        case PATTERNS:
            analyzers.add(buildPatternAnalyzer(columns));
            break;
        case LENGTH:
            analyzers.add(new TextLengthAnalyzer());
            break;
        case QUANTILES:
            boolean acceptQuantiles = false;
            for (DataTypeEnum type : types) {
                if (type == DataTypeEnum.INTEGER || type == DataTypeEnum.DOUBLE) {
                    acceptQuantiles = true;
                    break;
                }
            }
            if (acceptQuantiles) {
                analyzers.add(new QuantileAnalyzer(types));
            }
            break;
        case SUMMARY:
            analyzers.add(new SummaryAnalyzer(types));
            break;
        case TYPE:
            boolean shouldUseTypeAnalysis = true;
            for (Analysis analysis : settings) {
                if (analysis == Analysis.QUALITY) {
                    shouldUseTypeAnalysis = false;
                    break;
                }
            }
            if (shouldUseTypeAnalysis) {
                final List<String> mostUsedDatePatterns = getMostUsedDatePatterns(columns);
                analyzers.add(new DataTypeAnalyzer(mostUsedDatePatterns));
            } else {
                LOGGER.warn("Disabled {} analysis (conflicts with {}).", setting, Analysis.QUALITY);
            }
            break;
        case FREQUENCY:
            analyzers.add(new DataTypeFrequencyAnalyzer());
            break;
        default:
            throw new IllegalArgumentException("Missing support for '" + setting + "'.");
        }
    }
    // Merge all analyzers into one
    final Analyzer<Analyzers.Result> analyzer = Analyzers.with(analyzers.toArray(new Analyzer[analyzers.size()]));
    analyzer.init();
    if (LOGGER.isDebugEnabled()) {
        // Wrap analyzer for usage monitoring (to diagnose non-closed analyzer issues).
        return new ResourceMonitoredAnalyzer(analyzer);
    } else {
        return analyzer;
    }
}
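
For context, a minimal usage sketch. It is an illustration only: the analyzerService field, the columns list, and the sample record values are assumptions, not part of the project code above. Analyzer is used with try-with-resources elsewhere in data-prep (see SchemaAnalysis below), which also pairs well with the ResourceMonitoredAnalyzer diagnostics:

// Hypothetical caller (assumes an injected AnalyzerService and a List<ColumnMetadata> named columns).
try (Analyzer<Analyzers.Result> analyzer = analyzerService.build(columns, //
        AnalyzerService.Analysis.QUALITY, AnalyzerService.Analysis.PATTERNS)) {
    analyzer.analyze("john@example.com", "42"); // one call per record, one String per column
    analyzer.end(); // flush any pending computation
    final List<Analyzers.Result> results = analyzer.getResult(); // one Result per column
}

Note that build() already calls init(), so the caller only feeds records and reads results.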
Use of org.talend.dataquality.common.inference.Analyzers in project data-prep by Talend.
The class SchemaAnalysis, method analyze:
@Override
public void analyze(String dataSetId) {
    if (StringUtils.isEmpty(dataSetId)) {
        throw new IllegalArgumentException("Data set id cannot be null or empty.");
    }
    DistributedLock datasetLock = repository.createDatasetMetadataLock(dataSetId);
    datasetLock.lock();
    try {
        DataSetMetadata metadata = repository.get(dataSetId);
        if (metadata == null) {
            LOGGER.info("Unable to analyze schema of data set #{}: seems to be removed.", dataSetId);
            return;
        }
        // Schema analysis (on a sample of at most 100 rows)
        try (Stream<DataSetRow> stream = store.stream(metadata, 100)) {
            LOGGER.info("Analyzing schema in dataset #{}...", dataSetId);
            // Configure analyzers
            final List<ColumnMetadata> columns = metadata.getRowMetadata().getColumns();
            try (Analyzer<Analyzers.Result> analyzer = analyzerService.schemaAnalysis(columns)) {
                // Determine schema for the content.
                stream.limit(100).map(row -> row.toArray(DataSetRow.SKIP_TDP_ID)).forEach(analyzer::analyze);
                // Find the best suitable type
                adapter.adapt(columns, analyzer.getResult());
                LOGGER.info("Analyzed schema in dataset #{}.", dataSetId);
                metadata.getLifecycle().schemaAnalyzed(true);
                repository.save(metadata);
            }
        } catch (Exception e) {
            LOGGER.error("Unable to analyze schema for dataset " + dataSetId + ".", e);
            TDPException.rethrowOrWrap(e, UNABLE_TO_ANALYZE_COLUMN_TYPES);
        }
    } finally {
        datasetLock.unlock();
    }
}
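
For context, a minimal invocation sketch (the schemaAnalysis bean and the dataset id are assumptions for illustration):

// Hypothetical caller: trigger schema analysis once the dataset content is stored.
// The distributed lock above makes concurrent calls for the same id safe; the method
// returns quietly if the dataset was removed in the meantime.
schemaAnalysis.analyze("my-dataset-id"); // samples at most 100 rows via store.stream(metadata, 100)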