use of org.talend.dataquality.statistics.type.DataTypeEnum in project data-prep by Talend.
the class TypeUtilsTest method testConvertDouble.
/**
 * Checks that {@link TypeUtils#convert} maps columns typed {@link Type#DOUBLE} and {@link Type#FLOAT} to
 * {@link DataTypeEnum#DOUBLE}.
 */
@Test
public void testConvertDouble() throws Exception {
    // Column declared as DOUBLE
    final ColumnMetadata doubleColumn = column().id(1).type(Type.DOUBLE).build();
    final DataTypeEnum[] convertedFromDouble = TypeUtils.convert(Collections.singletonList(doubleColumn));
    assertThat(convertedFromDouble[0], is(DataTypeEnum.DOUBLE));

    // Column declared as FLOAT is expected to yield the same data quality type
    final ColumnMetadata floatColumn = column().id(2).type(Type.FLOAT).build();
    final DataTypeEnum[] convertedFromFloat = TypeUtils.convert(Collections.singletonList(floatColumn));
    assertThat(convertedFromFloat[0], is(DataTypeEnum.DOUBLE));
}
use of org.talend.dataquality.statistics.type.DataTypeEnum in project data-prep by Talend.
the class AnalyzerService method build.
/**
 * Build a {@link Analyzer} to analyze records with columns (in <code>columns</code>). <code>settings</code> give
 * all the wanted analysis settings for the analyzer.
 * <p>
 * Each requested {@link Analysis} is completed with its declared dependencies. <code>null</code> entries are
 * ignored and an analysis that is requested (or implied by a dependency) several times is built only once.
 * Analyzers are created following the {@link Analysis} declaration order.
 * </p>
 *
 * @param columns A list of columns, may be null or empty.
 * @param settings A varargs with {@link Analysis}. Duplicates and <code>null</code> values are possible in varargs:
 * duplicates are considered only once and <code>null</code> values are skipped.
 * @return A ready to use {@link Analyzer}. When there is no column or no analysis to perform, an analyzer based on
 * {@link NullAnalyzer#INSTANCE} is returned.
 * @throws IllegalArgumentException If one of the analysis has no matching analyzer implementation.
 */
public Analyzer<Analyzers.Result> build(List<ColumnMetadata> columns, Analysis... settings) {
    if (columns == null || columns.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Get all needed analysis (requested ones + their dependencies), free of null values and duplicates
    final Set<Analysis> all = EnumSet.noneOf(Analysis.class);
    if (settings != null) {
        for (Analysis setting : settings) {
            if (setting != null) {
                all.add(setting);
                all.addAll(Arrays.asList(setting.dependencies));
            }
        }
    }
    if (all.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Column types
    final DataTypeEnum[] types = TypeUtils.convert(columns);
    // Semantic domains (columns without domain fall back to the UNKNOWN category id)
    final String[] domains = columns.stream() //
            .map(ColumnMetadata::getDomain) //
            .map(d -> StringUtils.isBlank(d) ? SemanticCategoryEnum.UNKNOWN.getId() : d) //
            .toArray(String[]::new);
    final DictionarySnapshot dictionarySnapshot = dictionarySnapshotProvider.get();
    // Build all analyzers: iterate over the consolidated set (not the raw varargs) so null values, duplicates and
    // dependencies are handled consistently.
    final List<Analyzer> analyzers = new ArrayList<>();
    for (Analysis setting : all) {
        switch (setting) {
        case SEMANTIC:
            final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(dictionarySnapshot);
            semanticAnalyzer.setLimit(Integer.MAX_VALUE);
            semanticAnalyzer.setMetadata(Metadata.HEADER_NAME, extractColumnNames(columns));
            analyzers.add(semanticAnalyzer);
            break;
        case HISTOGRAM:
            analyzers.add(new StreamDateHistogramAnalyzer(columns, types, dateParser));
            analyzers.add(new StreamNumberHistogramAnalyzer(types));
            break;
        case QUALITY:
            final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(types);
            columns.forEach(c -> dataTypeQualityAnalyzer.addCustomDateTimePattern(RowMetadataUtils.getMostUsedDatePattern(c)));
            analyzers.add(new ValueQualityAnalyzer(dataTypeQualityAnalyzer, new SemanticQualityAnalyzer(dictionarySnapshot, domains, false), // NOSONAR
                    true));
            break;
        case CARDINALITY:
            analyzers.add(new CardinalityAnalyzer());
            break;
        case PATTERNS:
            analyzers.add(buildPatternAnalyzer(columns));
            break;
        case LENGTH:
            analyzers.add(new TextLengthAnalyzer());
            break;
        case QUANTILES:
            // Quantile analysis is only added when at least one column is INTEGER or DOUBLE
            if (Arrays.stream(types).anyMatch(type -> type == DataTypeEnum.INTEGER || type == DataTypeEnum.DOUBLE)) {
                analyzers.add(new QuantileAnalyzer(types));
            }
            break;
        case SUMMARY:
            analyzers.add(new SummaryAnalyzer(types));
            break;
        case TYPE:
            // Type analysis is skipped whenever quality analysis is part of the analysis to run
            if (all.contains(Analysis.QUALITY)) {
                LOGGER.warn("Disabled {} analysis (conflicts with {}).", setting, Analysis.QUALITY);
            } else {
                final List<String> mostUsedDatePatterns = getMostUsedDatePatterns(columns);
                analyzers.add(new DataTypeAnalyzer(mostUsedDatePatterns));
            }
            break;
        case FREQUENCY:
            analyzers.add(new DataTypeFrequencyAnalyzer());
            break;
        default:
            throw new IllegalArgumentException("Missing support for '" + setting + "'.");
        }
    }
    // Merge all analyzers into one
    final Analyzer<Analyzers.Result> analyzer = Analyzers.with(analyzers.toArray(new Analyzer[0]));
    analyzer.init();
    if (LOGGER.isDebugEnabled()) {
        // Wrap analyzer for usage monitoring (to diagnose non-closed analyzer issues).
        return new ResourceMonitoredAnalyzer(analyzer);
    }
    return analyzer;
}
use of org.talend.dataquality.statistics.type.DataTypeEnum in project data-prep by Talend.
the class StatisticsAdapter method injectDataTypeAnalysis.
/**
 * Sets the column type from the suggested type found in the {@link DataTypeOccurences} result (when such a result
 * exists and the column type is not forced), then delegates to {@link #injectValueQuality}.
 *
 * @param column the column to update.
 * @param result the analysis result to read the data type occurrences from.
 */
private void injectDataTypeAnalysis(final ColumnMetadata column, final Analyzers.Result result) {
    final boolean typeCanBeUpdated = result.exist(DataTypeOccurences.class) && !column.isTypeForced();
    if (typeCanBeUpdated) {
        final DataTypeEnum suggestion = result.get(DataTypeOccurences.class).getSuggestedType();
        // Not necessarily the final type: #injectValueQuality (called below) can still modify it
        column.setType(Type.get(suggestion.name()).getName());
    }
    // Value quality is injected in every case, whether the type was updated or not
    injectValueQuality(column, result);
}
Aggregations