use of org.talend.dataquality.common.inference.Analyzer in project data-prep by Talend.
the class DataSetContentStore method stream.
/**
* Similar to {@link #get(DataSetMetadata)}, returns the content of the data set but as a {@link Stream stream} of
* {@link DataSetRow rows} instead of JSON content.
*
* @param dataSetMetadata The {@link DataSetMetadata data set} to read rows from.
* @param limit A limit to pass to the raw content supplier (use -1 for "no limit"). Used as parameter to call
* {@link #get(DataSetMetadata, long)}.
* @return A valid <b>{@link DataSetRow}</b> stream.
*/
public Stream<DataSetRow> stream(DataSetMetadata dataSetMetadata, long limit) {
final InputStream inputStream = get(dataSetMetadata, limit);
final DataSetRowIterator iterator = new DataSetRowIterator(inputStream);
final Iterable<DataSetRow> rowIterable = () -> iterator;
Stream<DataSetRow> dataSetRowStream = StreamSupport.stream(rowIterable.spliterator(), false);
AtomicLong tdpId = new AtomicLong(1);
final List<ColumnMetadata> columns = dataSetMetadata.getRowMetadata().getColumns();
final Analyzer<Analyzers.Result> analyzer = service.build(columns, AnalyzerService.Analysis.QUALITY);
dataSetRowStream = dataSetRowStream.filter(r -> !r.isEmpty()).map(r -> {
// Feed each row to the quality analyzer before it is emitted.
final String[] values = r.order(columns).toArray(DataSetRow.SKIP_TDP_ID);
analyzer.analyze(values);
return r;
}).map(new InvalidMarker(columns, analyzer)) // Mark invalid columns as detected by the provided analyzer.
.map(r -> {
// Assign a technical (TDP) id to each row.
r.setTdpId(tdpId.getAndIncrement());
return r;
}).onClose(() -> {
// Make sure to close the original input stream when closing this one.
try {
inputStream.close();
} catch (Exception e) {
throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
}
});
return dataSetRowStream;
}
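For context, a caller would normally consume this stream in a try-with-resources block so the onClose() hook above runs and releases the raw content stream. A minimal sketch, assuming a store instance and a dataSetMetadata already loaded from the repository (both names are illustrative, not taken from the snippet):

try (Stream<DataSetRow> rows = store.stream(dataSetMetadata, 100)) {
    // Each row already carries its TDP id and the invalid-cell marks computed by the quality analyzer.
    rows.forEach(row -> System.out.println(row.getTdpId()));
} // closing the stream triggers onClose(), which closes the underlying input stream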
use of org.talend.dataquality.common.inference.Analyzer in project data-prep by Talend.
the class TransformationService method getSemanticDomains.
/**
* Return the semantic domains for the given parameters.
*
* @param metadata the dataset metadata.
* @param columnId the column id to analyze.
* @param records the dataset records.
* @return the semantic domains for the given parameters.
* @throws IOException if the dataset records cannot be read.
*/
private List<SemanticDomain> getSemanticDomains(DataSetMetadata metadata, String columnId, InputStream records) throws IOException {
// Copy the column metadata and reset the "semantic domain forced" flag so the statistics adapter sets all
// available domains.
final ColumnMetadata columnMetadata = column().copy(metadata.getRowMetadata().getById(columnId))
.semanticDomainForce(false).build();
final Analyzer<Analyzers.Result> analyzer = analyzerService.build(columnMetadata, SEMANTIC);
analyzer.init();
try (final JsonParser parser = mapper.getFactory().createParser(new InputStreamReader(records, UTF_8))) {
final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(parser);
dataSet.getRecords().map(r -> r.get(columnId)).forEach(analyzer::analyze);
analyzer.end();
}
final List<Analyzers.Result> analyzerResult = analyzer.getResult();
statisticsAdapter.adapt(singletonList(columnMetadata), analyzerResult);
return columnMetadata.getSemanticDomains();
}
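A sketch of how such a helper might be invoked; the way the records stream is obtained (getAsRaw) and the column id are assumptions for illustration, not taken from the snippet above:

final DataSetMetadata metadata = dataSetMetadataRepository.get(datasetId);
try (InputStream records = contentStore.getAsRaw(metadata)) { // assumed accessor for the raw dataset content
    final List<SemanticDomain> domains = getSemanticDomains(metadata, "0000", records);
    LOG.debug("Semantic domains for column 0000: {}", domains);
}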
use of org.talend.dataquality.common.inference.Analyzer in project data-prep by Talend.
the class AnalyzerService method build.
/**
* Build an {@link Analyzer} to analyze records described by the given <code>columns</code>. <code>settings</code>
* lists all the analyses the returned analyzer should perform.
*
* @param columns A list of columns, may be null or empty.
* @param settings A varargs of {@link Analysis}. Duplicates are allowed but each analysis is only considered
* once.
* @return A ready to use {@link Analyzer}.
*/
public Analyzer<Analyzers.Result> build(List<ColumnMetadata> columns, Analysis... settings) {
if (columns == null || columns.isEmpty()) {
return Analyzers.with(NullAnalyzer.INSTANCE);
}
// Get all needed analyses (requested settings plus their dependencies)
final Set<Analysis> all = EnumSet.noneOf(Analysis.class);
for (Analysis setting : settings) {
if (setting != null) {
all.add(setting);
all.addAll(Arrays.asList(setting.dependencies));
}
}
if (all.isEmpty()) {
return Analyzers.with(NullAnalyzer.INSTANCE);
}
// Column types
DataTypeEnum[] types = TypeUtils.convert(columns);
// Semantic domains
List<String> domainList = columns.stream().map(ColumnMetadata::getDomain)
.map(d -> StringUtils.isBlank(d) ? SemanticCategoryEnum.UNKNOWN.getId() : d)
.collect(Collectors.toList());
final String[] domains = domainList.toArray(new String[domainList.size()]);
DictionarySnapshot dictionarySnapshot = dictionarySnapshotProvider.get();
// Build all analyzers
List<Analyzer> analyzers = new ArrayList<>();
for (Analysis setting : settings) {
switch(setting) {
case SEMANTIC:
final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(dictionarySnapshot);
semanticAnalyzer.setLimit(Integer.MAX_VALUE);
semanticAnalyzer.setMetadata(Metadata.HEADER_NAME, extractColumnNames(columns));
analyzers.add(semanticAnalyzer);
break;
case HISTOGRAM:
analyzers.add(new StreamDateHistogramAnalyzer(columns, types, dateParser));
analyzers.add(new StreamNumberHistogramAnalyzer(types));
break;
case QUALITY:
final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(types);
columns.forEach(c -> dataTypeQualityAnalyzer.addCustomDateTimePattern(RowMetadataUtils.getMostUsedDatePattern(c)));
analyzers.add(new ValueQualityAnalyzer(dataTypeQualityAnalyzer,
new SemanticQualityAnalyzer(dictionarySnapshot, domains, false), true)); // NOSONAR
break;
case CARDINALITY:
analyzers.add(new CardinalityAnalyzer());
break;
case PATTERNS:
analyzers.add(buildPatternAnalyzer(columns));
break;
case LENGTH:
analyzers.add(new TextLengthAnalyzer());
break;
case QUANTILES:
boolean acceptQuantiles = false;
for (DataTypeEnum type : types) {
if (type == DataTypeEnum.INTEGER || type == DataTypeEnum.DOUBLE) {
acceptQuantiles = true;
break;
}
}
if (acceptQuantiles) {
analyzers.add(new QuantileAnalyzer(types));
}
break;
case SUMMARY:
analyzers.add(new SummaryAnalyzer(types));
break;
case TYPE:
boolean shouldUseTypeAnalysis = true;
for (Analysis analysis : settings) {
if (analysis == Analysis.QUALITY) {
shouldUseTypeAnalysis = false;
break;
}
}
if (shouldUseTypeAnalysis) {
final List<String> mostUsedDatePatterns = getMostUsedDatePatterns(columns);
analyzers.add(new DataTypeAnalyzer(mostUsedDatePatterns));
} else {
LOGGER.warn("Disabled {} analysis (conflicts with {}).", setting, Analysis.QUALITY);
}
break;
case FREQUENCY:
analyzers.add(new DataTypeFrequencyAnalyzer());
break;
default:
throw new IllegalArgumentException("Missing support for '" + setting + "'.");
}
}
// Merge all analyzers into one
final Analyzer<Analyzers.Result> analyzer = Analyzers.with(analyzers.toArray(new Analyzer[analyzers.size()]));
analyzer.init();
if (LOGGER.isDebugEnabled()) {
// Wrap analyzer for usage monitoring (to diagnose non-closed analyzer issues).
return new ResourceMonitoredAnalyzer(analyzer);
} else {
return analyzer;
}
}
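The analyzers returned by this method follow the analyze/end/getResult lifecycle and should be closed once the results have been read. A minimal sketch of that lifecycle, assuming analyzerService, columns, a rows stream and a statisticsAdapter are available in the calling code:

try (Analyzer<Analyzers.Result> analyzer = analyzerService.build(columns,
        AnalyzerService.Analysis.QUALITY, AnalyzerService.Analysis.SEMANTIC)) {
    rows.map(row -> row.toArray(DataSetRow.SKIP_TDP_ID)).forEach(analyzer::analyze);
    analyzer.end();
    final List<Analyzers.Result> results = analyzer.getResult(); // one Result per analyzed column
    statisticsAdapter.adapt(columns, results); // push the results back into the column metadata
} catch (Exception e) {
    throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
}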
use of org.talend.dataquality.common.inference.Analyzer in project data-prep by Talend.
the class DataSetService method getDataSetColumnSemanticCategories.
/**
* Return the semantic types for a given dataset / column.
*
* @param datasetId the dataset id.
* @param columnId the column id.
* @return the semantic types for a given dataset / column.
*/
@RequestMapping(value = "/datasets/{datasetId}/columns/{columnId}/types", method = GET)
@ApiOperation(value = "list the types of the wanted column", notes = "This list can be used by user to change the column type.")
@Timed
@PublicAPI
public List<SemanticDomain> getDataSetColumnSemanticCategories(@ApiParam(value = "The dataset id") @PathVariable String datasetId, @ApiParam(value = "The column id") @PathVariable String columnId) {
LOG.debug("listing semantic categories for dataset #{} column #{}", datasetId, columnId);
final DataSetMetadata metadata = dataSetMetadataRepository.get(datasetId);
if (metadata == null) {
throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, ExceptionContext.withBuilder().put("id", datasetId).build());
} else {
try (final Stream<DataSetRow> records = contentStore.stream(metadata)) {
final ColumnMetadata columnMetadata = metadata.getRowMetadata().getById(columnId);
final Analyzer<Analyzers.Result> analyzer = analyzerService.build(columnMetadata, SEMANTIC);
analyzer.init();
records.map(r -> r.get(columnId)).forEach(analyzer::analyze);
analyzer.end();
final List<Analyzers.Result> analyzerResult = analyzer.getResult();
final StatisticsAdapter statisticsAdapter = new StatisticsAdapter(40);
statisticsAdapter.adapt(singletonList(columnMetadata), analyzerResult);
LOG.debug("found {} for dataset #{}, column #{}", columnMetadata.getSemanticDomains(), datasetId, columnId);
return columnMetadata.getSemanticDomains();
}
}
}
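Since the method is exposed under /datasets/{datasetId}/columns/{columnId}/types, a client can retrieve the domains with a plain GET. A hypothetical client-side sketch using Spring's RestTemplate (the base URL and the response deserialization setup are assumptions):

final RestTemplate restTemplate = new RestTemplate();
final SemanticDomain[] domains = restTemplate.getForObject(
        "http://localhost:8080/datasets/{datasetId}/columns/{columnId}/types",
        SemanticDomain[].class, "my-dataset-id", "0001");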
use of org.talend.dataquality.common.inference.Analyzer in project data-prep by Talend.
the class QualityAnalysis method computeQuality.
/**
* Compute the quality (count, valid, invalid and empty) of the given dataset.
*
* @param dataset the dataset metadata.
* @param records the dataset records
* @param limit indicates how many records will be read from the stream. Use a number < 0 to perform a full scan of the dataset.
*/
public void computeQuality(DataSetMetadata dataset, Stream<DataSetRow> records, long limit) {
// Compute valid / invalid / empty count, need data types for analyzer first
final List<ColumnMetadata> columns = dataset.getRowMetadata().getColumns();
if (columns.isEmpty()) {
LOGGER.debug("Skip analysis of {} (no column information).", dataset.getId());
return;
}
try (Analyzer<Analyzers.Result> analyzer = analyzerService.qualityAnalysis(columns)) {
if (limit > 0) {
// Only limit the number of rows if limit > 0 (use limit to speed up sync analysis).
LOGGER.debug("Limit analysis to the first {}.", limit);
records = records.limit(limit);
} else {
LOGGER.debug("Performing full analysis.");
}
records.map(row -> row.toArray(DataSetRow.SKIP_TDP_ID)).forEach(analyzer::analyze);
// Determine content size
final List<Analyzers.Result> result = analyzer.getResult();
adapter.adapt(columns, result);
// Remember the number of records
if (!result.isEmpty()) {
final long recordCount = result.get(0).get(ValueQualityStatistics.class).getCount();
dataset.getContent().setNbRecords((int) recordCount);
}
} catch (Exception e) {
throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
}
}
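A sketch of how this analysis might be driven, reusing the DataSetContentStore#stream method shown earlier; qualityAnalysis, contentStore and metadata are assumed to be available in the calling code:

try (Stream<DataSetRow> rows = contentStore.stream(metadata, -1)) {
    // Full scan: a negative limit asks computeQuality to read every record.
    qualityAnalysis.computeQuality(metadata, rows, -1);
}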