Usage of org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer in the Talend data-prep project: the build method of the AnalyzerService class.
/**
 * Build a {@link Analyzer} to analyze records with columns (in <code>columns</code>). <code>settings</code> give
 * all the wanted analysis settings for the analyzer.
 *
 * @param columns A list of columns, may be null or empty.
 * @param settings A varargs with {@link Analysis}. Duplicates are possible in varargs but will be considered only
 * once. Null entries are ignored. Dependencies declared by each {@link Analysis} are included as well.
 * @return A ready to use {@link Analyzer}.
 */
public Analyzer<Analyzers.Result> build(List<ColumnMetadata> columns, Analysis... settings) {
    if (columns == null || columns.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Gather requested analyses plus their declared dependencies, deduplicated: EnumSet keeps
    // each Analysis at most once, which is what the "considered only once" contract requires.
    final Set<Analysis> all = EnumSet.noneOf(Analysis.class);
    for (Analysis setting : settings) {
        if (setting != null) {
            all.add(setting);
            all.addAll(Arrays.asList(setting.dependencies));
        }
    }
    if (all.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Column types
    final DataTypeEnum[] types = TypeUtils.convert(columns);
    // Semantic domains (blank domain falls back to the UNKNOWN category id).
    final String[] domains = columns.stream() //
            .map(ColumnMetadata::getDomain) //
            .map(d -> StringUtils.isBlank(d) ? SemanticCategoryEnum.UNKNOWN.getId() : d) //
            .toArray(String[]::new);
    final DictionarySnapshot dictionarySnapshot = dictionarySnapshotProvider.get();
    // Build all analyzers.
    // FIX: iterate over "all" (deduplicated set + dependencies) rather than the raw varargs:
    // the raw array may contain duplicates (analyzer built twice, breaking the Javadoc contract),
    // null entries (NPE in the switch), and omits the dependencies collected above.
    final List<Analyzer> analyzers = new ArrayList<>();
    for (Analysis setting : all) {
        switch (setting) {
        case SEMANTIC:
            final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(dictionarySnapshot);
            semanticAnalyzer.setLimit(Integer.MAX_VALUE);
            semanticAnalyzer.setMetadata(Metadata.HEADER_NAME, extractColumnNames(columns));
            analyzers.add(semanticAnalyzer);
            break;
        case HISTOGRAM:
            analyzers.add(new StreamDateHistogramAnalyzer(columns, types, dateParser));
            analyzers.add(new StreamNumberHistogramAnalyzer(types));
            break;
        case QUALITY:
            final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(types);
            columns.forEach(c -> dataTypeQualityAnalyzer.addCustomDateTimePattern(RowMetadataUtils.getMostUsedDatePattern(c)));
            analyzers.add(new ValueQualityAnalyzer(dataTypeQualityAnalyzer,
                    new SemanticQualityAnalyzer(dictionarySnapshot, domains, false), // NOSONAR
                    true));
            break;
        case CARDINALITY:
            analyzers.add(new CardinalityAnalyzer());
            break;
        case PATTERNS:
            analyzers.add(buildPatternAnalyzer(columns));
            break;
        case LENGTH:
            analyzers.add(new TextLengthAnalyzer());
            break;
        case QUANTILES:
            // Quantiles only make sense when at least one column is numeric.
            boolean acceptQuantiles = false;
            for (DataTypeEnum type : types) {
                if (type == DataTypeEnum.INTEGER || type == DataTypeEnum.DOUBLE) {
                    acceptQuantiles = true;
                    break;
                }
            }
            if (acceptQuantiles) {
                analyzers.add(new QuantileAnalyzer(types));
            }
            break;
        case SUMMARY:
            analyzers.add(new SummaryAnalyzer(types));
            break;
        case TYPE:
            // QUALITY already performs a type analysis; running both would conflict. Consult
            // "all" (not the raw varargs) so QUALITY pulled in as a dependency is detected too.
            if (all.contains(Analysis.QUALITY)) {
                LOGGER.warn("Disabled {} analysis (conflicts with {}).", setting, Analysis.QUALITY);
            } else {
                final List<String> mostUsedDatePatterns = getMostUsedDatePatterns(columns);
                analyzers.add(new DataTypeAnalyzer(mostUsedDatePatterns));
            }
            break;
        case FREQUENCY:
            analyzers.add(new DataTypeFrequencyAnalyzer());
            break;
        default:
            throw new IllegalArgumentException("Missing support for '" + setting + "'.");
        }
    }
    // Merge all analyzers into one
    final Analyzer<Analyzers.Result> analyzer = Analyzers.with(analyzers.toArray(new Analyzer[analyzers.size()]));
    analyzer.init();
    if (LOGGER.isDebugEnabled()) {
        // Wrap analyzer for usage monitoring (to diagnose non-closed analyzer issues).
        return new ResourceMonitoredAnalyzer(analyzer);
    }
    return analyzer;
}
Aggregations