Use of org.talend.dataprep.lock.DistributedLock in project data-prep by Talend: class FormatAnalysis, method analyze().
/**
 * @see SynchronousDataSetAnalyzer#analyze(String)
 */
@Override
public void analyze(String dataSetId) {
    if (StringUtils.isEmpty(dataSetId)) {
        throw new IllegalArgumentException("Data set id cannot be null or empty.");
    }
    final Marker marker = Markers.dataset(dataSetId);
    DistributedLock datasetLock = repository.createDatasetMetadataLock(dataSetId);
    datasetLock.lock();
    try {
        DataSetMetadata metadata = repository.get(dataSetId);
        if (metadata != null) {
            Format detectedFormat = null;
            for (byte[] bom : BOMS) {
                try (InputStream content = store.getAsRaw(metadata, 10)) {
                    // 10 lines should be enough to detect the format
                    detectedFormat = detector.detect(addBOM(content, bom));
                } catch (IOException e) {
                    throw new TDPException(DataSetErrorCodes.UNABLE_TO_READ_DATASET_CONTENT, e);
                }
                if (detectedFormat != null && !(detectedFormat.getFormatFamily() instanceof UnsupportedFormatFamily)) {
                    break;
                }
            }
            LOG.debug(marker, "using {} to parse the dataset", detectedFormat);
            verifyFormat(detectedFormat);
            internalUpdateMetadata(metadata, detectedFormat);
            LOG.debug(marker, "format analysed for dataset");
        } else {
            LOG.info(marker, "Data set no longer exists.");
        }
    } finally {
        datasetLock.unlock();
    }
}
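Every snippet on this page follows the same acquire/try/finally shape around the dataset metadata lock. As a reading aid, here is that recurring pattern factored into a minimal helper; this is a sketch, not project code: the name withDatasetLock is hypothetical, and only the createDatasetMetadataLock()/lock()/unlock() contract visible above is assumed.

private void withDatasetLock(String dataSetId, Runnable action) {
    final DistributedLock lock = repository.createDatasetMetadataLock(dataSetId);
    lock.lock();
    try {
        action.run();
    } finally {
        lock.unlock(); // always release, even when the action throws
    }
}

With such a helper, the method above would reduce to withDatasetLock(dataSetId, () -> { /* format detection body */ }).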
Use of org.talend.dataprep.lock.DistributedLock in project data-prep by Talend: class ObjectDataSetMetadataRepository, method clear().
@Override
public void clear() {
    // Remove all data sets (but take each dataset's lock to protect against remaining asynchronous processes).
    list().forEach(m -> {
        if (m != null) {
            final DistributedLock lock = createDatasetMetadataLock(m.getId());
            lock.lock(); // acquire before the try block, so unlock() only runs once the lock is held
            try {
                remove(m.getId());
            } finally {
                lock.unlock();
            }
        }
    });
    LOGGER.debug("dataset metadata repository cleared.");
}
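For context, a hypothetical driver showing why clear() bothers taking each dataset's lock: an analysis (such as the QualityAnalysis example further down this page) may still hold the lock for one of the datasets, and clear() must wait for it rather than remove metadata mid-analysis. The names below ("ds-42", clearWhileAnalysisRuns) are illustrative assumptions, not project code.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

void clearWhileAnalysisRuns(QualityAnalysis qualityAnalysis,
        ObjectDataSetMetadataRepository repository) throws InterruptedException {
    ExecutorService pool = Executors.newFixedThreadPool(2);
    pool.submit(() -> qualityAnalysis.analyze("ds-42")); // holds the "ds-42" lock while analyzing
    pool.submit(repository::clear); // blocks on each dataset's lock in turn
    pool.shutdown();
    pool.awaitTermination(1, TimeUnit.MINUTES);
}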
Use of org.talend.dataprep.lock.DistributedLock in project data-prep by Talend: class QualityAnalysis, method analyze().
/**
 * Analyse the dataset metadata quality.
 *
 * @param dataSetId the dataset id.
 */
@Override
public void analyze(String dataSetId) {
    if (StringUtils.isEmpty(dataSetId)) {
        throw new IllegalArgumentException("Data set id cannot be null or empty.");
    }
    DistributedLock datasetLock = repository.createDatasetMetadataLock(dataSetId);
    datasetLock.lock();
    try {
        DataSetMetadata metadata = repository.get(dataSetId);
        if (metadata == null) {
            LOGGER.info("Unable to analyze quality of data set #{}: seems to be removed.", dataSetId);
            return;
        }
        // e.g. an Excel multi-sheet data set when the user has not chosen a sheet yet
        if (!metadata.getLifecycle().isInProgress()) {
            LOGGER.debug("No need to recompute quality of data set #{} (statistics are completed).", dataSetId);
            return;
        }
        try (Stream<DataSetRow> stream = store.stream(metadata)) {
            if (!metadata.getLifecycle().schemaAnalyzed()) {
                LOGGER.debug("Schema information must be computed before quality analysis can be performed, ignoring message");
                // no acknowledge, to allow re-poll
                return;
            }
            LOGGER.debug("Analyzing quality of dataset #{}...", metadata.getId());
            // New data set, or the max record limit for synchronous analysis was reached:
            // trigger a full scan (but asynchronously).
            final long dataSetSize = metadata.getContent().getNbRecords();
            final boolean isNewDataSet = dataSetSize == 0;
            if (isNewDataSet || dataSetSize == maxRecord) {
                // If the data set size is exactly maxRecord, perform a full scan,
                // otherwise only take the first maxRecord records.
                computeQuality(metadata, stream, dataSetSize == maxRecord ? -1 : maxRecord);
            }
            // Turn the "in progress" flag on or off.
            if (isNewDataSet && metadata.getContent().getNbRecords() >= maxRecord) {
                metadata.getLifecycle().setInProgress(true);
            } else {
                metadata.getLifecycle().setInProgress(false);
            }
            // ... all quality is now analyzed, mark it so.
            metadata.getLifecycle().qualityAnalyzed(true);
            repository.save(metadata);
            LOGGER.debug("Analyzed quality of dataset #{}.", dataSetId);
        } catch (Exception e) {
            LOGGER.warn("dataset '{}' generated an error, message: {}", dataSetId, e.getMessage());
            throw new TDPException(DataSetErrorCodes.UNABLE_TO_ANALYZE_DATASET_QUALITY, e);
        }
    } finally {
        datasetLock.unlock();
    }
}
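The flag handling above is dense; restated as a pure helper (hypothetical name, same logic as the branch in the code):

static boolean stillInProgress(boolean isNewDataSet, long nbRecords, long maxRecord) {
    // A brand-new data set whose record count reached the synchronous cap
    // still needs the asynchronous full scan, so it stays "in progress".
    return isNewDataSet && nbRecords >= maxRecord;
}

In every other case, including existing data sets and small new ones, the analysis is complete and the flag is turned off.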
Use of org.talend.dataprep.lock.DistributedLock in project data-prep by Talend: class SchemaAnalysis, method analyze().
@Override
public void analyze(String dataSetId) {
    if (StringUtils.isEmpty(dataSetId)) {
        throw new IllegalArgumentException("Data set id cannot be null or empty.");
    }
    DistributedLock datasetLock = repository.createDatasetMetadataLock(dataSetId);
    datasetLock.lock();
    try {
        DataSetMetadata metadata = repository.get(dataSetId);
        if (metadata == null) {
            LOGGER.info("Unable to analyze schema of data set #{}: seems to be removed.", dataSetId);
            return;
        }
        // Schema analysis
        try (Stream<DataSetRow> stream = store.stream(metadata, 100)) {
            LOGGER.info("Analyzing schema in dataset #{}...", dataSetId);
            // Configure analyzers
            final List<ColumnMetadata> columns = metadata.getRowMetadata().getColumns();
            try (Analyzer<Analyzers.Result> analyzer = analyzerService.schemaAnalysis(columns)) {
                // Determine the schema for the content.
                stream.limit(100).map(row -> row.toArray(DataSetRow.SKIP_TDP_ID)).forEach(analyzer::analyze);
                // Find the best suitable type.
                adapter.adapt(columns, analyzer.getResult());
                LOGGER.info("Analyzed schema in dataset #{}.", dataSetId);
                metadata.getLifecycle().schemaAnalyzed(true);
                repository.save(metadata);
            }
        } catch (Exception e) {
            LOGGER.error("Unable to analyse schema for dataset " + dataSetId + ".", e);
            TDPException.rethrowOrWrap(e, UNABLE_TO_ANALYZE_COLUMN_TYPES);
        }
    } finally {
        datasetLock.unlock();
    }
}
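For readability, here is the sampling step isolated into a helper; a sketch reusing only the identifiers visible in the snippet above, with a hypothetical method name:

static List<Analyzers.Result> analyzeSample(Stream<DataSetRow> rows,
        Analyzer<Analyzers.Result> analyzer) {
    rows.limit(100) // sample at most 100 rows
        .map(row -> row.toArray(DataSetRow.SKIP_TDP_ID)) // drop the technical TDP id
        .forEach(analyzer::analyze); // feed each row's values to the analyzer
    return analyzer.getResult(); // per-column results, consumed by adapter.adapt(...)
}

The original method then calls adapter.adapt(columns, result) to write the inferred types back onto the column metadata before marking the schema as analyzed.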