Use of org.talend.dataquality.common.inference.Analyzer in the Talend data-prep project.
Example: the analyze method of the SchemaAnalysis class.
/**
 * Runs schema analysis on the data set identified by {@code dataSetId}: samples the
 * first rows of the content, infers the most suitable type for each column, then marks
 * the metadata lifecycle as schema-analyzed and persists it.
 * <p>
 * The whole operation is performed under a distributed lock on the data set metadata,
 * so concurrent analyses of the same data set cannot interleave.
 *
 * @param dataSetId id of the data set to analyze; must not be null or empty.
 * @throws IllegalArgumentException if {@code dataSetId} is null or empty.
 * @throws TDPException if the analysis fails (wrapped via
 *         {@link TDPException#rethrowOrWrap} with {@code UNABLE_TO_ANALYZE_COLUMN_TYPES}).
 */
@Override
public void analyze(String dataSetId) {
    if (StringUtils.isEmpty(dataSetId)) {
        throw new IllegalArgumentException("Data set id cannot be null or empty.");
    }
    // Serialize schema analysis per data set across nodes.
    DistributedLock datasetLock = repository.createDatasetMetadataLock(dataSetId);
    datasetLock.lock();
    try {
        DataSetMetadata metadata = repository.get(dataSetId);
        if (metadata == null) {
            // Data set was deleted between scheduling and execution: nothing to do.
            LOGGER.info("Unable to analyze schema of data set #{}: seems to be removed.", dataSetId);
            return;
        }
        // Schema analysis on a bounded sample of the content.
        try (Stream<DataSetRow> stream = store.stream(metadata, 100)) {
            LOGGER.info("Analyzing schema in dataset #{}...", dataSetId);
            // Configure analyzers for the current column layout.
            final List<ColumnMetadata> columns = metadata.getRowMetadata().getColumns();
            try (Analyzer<Analyzers.Result> analyzer = analyzerService.schemaAnalysis(columns)) {
                // Determine schema for the content. The extra limit(100) is defensive:
                // store.stream(metadata, 100) presumably already caps the sample — TODO confirm.
                stream.limit(100).map(row -> row.toArray(DataSetRow.SKIP_TDP_ID)).forEach(analyzer::analyze);
                // Find the best suitable type for each column from the analyzer results.
                adapter.adapt(columns, analyzer.getResult());
                LOGGER.info("Analyzed schema in dataset #{}.", dataSetId);
                metadata.getLifecycle().schemaAnalyzed(true);
                repository.save(metadata);
            }
        } catch (Exception e) {
            // Use parameterized logging (consistent with the other log calls above)
            // instead of string concatenation; the cause is attached for the stack trace.
            LOGGER.error("Unable to analyse schema for dataset {}.", dataSetId, e);
            TDPException.rethrowOrWrap(e, UNABLE_TO_ANALYZE_COLUMN_TYPES);
        }
    } finally {
        datasetLock.unlock();
    }
}
Aggregations