Search in sources:

Example 6 with DistributedLock

Use of org.talend.dataprep.lock.DistributedLock in project data-prep by Talend.

From the class ContentAnalysis, method analyze.

/**
 * Indexes the content of the data set identified by the given id.
 *
 * @param dataSetId id of the data set whose content must be indexed; must not be null or empty.
 * @throws IllegalArgumentException if the given id is null or empty.
 * @see DataSetAnalyzer#analyze(String)
 */
@Override
public void analyze(String dataSetId) {
    // Defensive programming: a missing id is a caller bug, fail fast.
    if (StringUtils.isEmpty(dataSetId)) {
        throw new IllegalArgumentException("Data set id cannot be null or empty.");
    }
    // Hold the distributed lock so no other node mutates this metadata while indexing.
    final DistributedLock lock = repository.createDatasetMetadataLock(dataSetId);
    lock.lock();
    try {
        final DataSetMetadata dataSetMetadata = repository.get(dataSetId);
        if (dataSetMetadata == null) {
            // Data set was removed between the analysis request and now: nothing to do.
            LOG.info("Data set #{} no longer exists.", dataSetId);
            return;
        }
        LOG.info("Indexing content of data set #{}...", dataSetMetadata.getId());
        updateHeaderAndFooter(dataSetMetadata);
        updateLimit(dataSetMetadata);
        // Flag the lifecycle so downstream consumers know the content index is available.
        dataSetMetadata.getLifecycle().contentIndexed(true);
        repository.save(dataSetMetadata);
        LOG.info("Indexed content of data set #{}.", dataSetId);
    } finally {
        lock.unlock();
    }
}
Also used : DistributedLock(org.talend.dataprep.lock.DistributedLock) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata)

Example 7 with DistributedLock

Use of org.talend.dataprep.lock.DistributedLock in project data-prep by Talend.

From the class DataSetService, method updateDataSet.

/**
 * Updates a data set metadata. If no data set exists for given id, a {@link TDPException} is thrown.
 *
 * @param dataSetId The id of data set to be updated.
 * @param dataSetMetadata The new content for the data set. If empty, existing content will <b>not</b> be replaced.
 * For delete operation, look at {@link #delete(String)}.
 * @throws TDPException with {@code DATASET_DOES_NOT_EXIST} when no metadata exists for the id, or
 * {@code UNABLE_TO_CREATE_OR_UPDATE_DATASET} when the update fails for any other reason.
 */
@RequestMapping(value = "/datasets/{id}", method = PUT)
@ApiOperation(value = "Update a data set metadata by id", notes = "Update a data set metadata according to the content of the PUT body. Id should be a UUID returned by the list operation. Not valid or non existing data set id return an error response.")
@Timed
public void updateDataSet(@PathVariable(value = "id") @ApiParam(name = "id", value = "Id of the data set to update") String dataSetId, @RequestBody DataSetMetadata dataSetMetadata) {
    if (dataSetMetadata != null && dataSetMetadata.getName() != null) {
        checkDataSetName(dataSetMetadata.getName());
    }
    // Hold the distributed lock so concurrent updates of the same data set are serialized.
    final DistributedLock lock = dataSetMetadataRepository.createDatasetMetadataLock(dataSetId);
    lock.lock();
    try {
        DataSetMetadata metadataForUpdate = dataSetMetadataRepository.get(dataSetId);
        if (metadataForUpdate == null) {
            // No need to silently create the data set metadata: associated content will most likely not exist.
            throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, build().put("id", dataSetId));
        }
        LOG.debug("updateDataSet: {}", dataSetMetadata);
        // NOTE(review): the update event is published before the new metadata is validated and
        // saved — listeners may observe the event for an update that subsequently fails. Confirm
        // whether publishing after the save would be more appropriate.
        publisher.publishEvent(new DatasetUpdatedEvent(dataSetMetadata));
        //
        // Only part of the metadata can be updated, so the original dataset metadata is loaded and updated
        //
        DataSetMetadata original = metadataBuilder.metadata().copy(metadataForUpdate).build();
        try {
            // update the name
            metadataForUpdate.setName(dataSetMetadata.getName());
            // update the sheet content (in case of a multi-sheet excel file)
            if (metadataForUpdate.getSchemaParserResult() != null) {
                // Guard against a null sheet name in the incoming metadata: the previous code
                // called getSheetName().equals(...) and threw a NullPointerException in that case.
                final String newSheetName = dataSetMetadata.getSheetName();
                Optional<Schema.SheetContent> sheetContentFound = metadataForUpdate
                        .getSchemaParserResult()
                        .getSheetContents()
                        .stream()
                        .filter(sheetContent -> newSheetName != null && newSheetName.equals(sheetContent.getName()))
                        .findFirst();
                if (sheetContentFound.isPresent()) {
                    List<ColumnMetadata> columnMetadatas = sheetContentFound.get().getColumnMetadatas();
                    if (metadataForUpdate.getRowMetadata() == null) {
                        metadataForUpdate.setRowMetadata(new RowMetadata(emptyList()));
                    }
                    metadataForUpdate.getRowMetadata().setColumns(columnMetadatas);
                }
                metadataForUpdate.setSheetName(dataSetMetadata.getSheetName());
                metadataForUpdate.setSchemaParserResult(null);
            }
            // Location updates
            metadataForUpdate.setLocation(dataSetMetadata.getLocation());
            // update parameters & encoding (so that user can change import parameters for CSV)
            metadataForUpdate.getContent().setParameters(dataSetMetadata.getContent().getParameters());
            metadataForUpdate.setEncoding(dataSetMetadata.getEncoding());
            // update limit
            final Optional<Long> newLimit = dataSetMetadata.getContent().getLimit();
            newLimit.ifPresent(limit -> metadataForUpdate.getContent().setLimit(limit));
            // Validate that the new data set metadata and removes the draft status
            final String formatFamilyId = dataSetMetadata.getContent().getFormatFamilyId();
            if (formatFamilyFactory.hasFormatFamily(formatFamilyId)) {
                FormatFamily format = formatFamilyFactory.getFormatFamily(formatFamilyId);
                try {
                    DraftValidator draftValidator = format.getDraftValidator();
                    DraftValidator.Result result = draftValidator.validate(dataSetMetadata);
                    if (result.isDraft()) {
                        // This is not an exception case: data set may remain a draft after update (although rather
                        // unusual)
                        LOG.warn("Data set #{} is still a draft after update.", dataSetId);
                        return;
                    }
                    // Data set metadata to update is no longer a draft
                    metadataForUpdate.setDraft(false);
                } catch (UnsupportedOperationException ignored) {
                    // Deliberately swallowed: this format family does not support draft validation.
                }
            }
            // update schema
            formatAnalyzer.update(original, metadataForUpdate);
            // save the result
            dataSetMetadataRepository.save(metadataForUpdate);
            // all good mate!! so send that to jms
            // Asks for a in depth schema analysis (for column type information).
            analyzeDataSet(dataSetId, true, singletonList(FormatAnalysis.class));
        } catch (TDPException e) {
            // Business exceptions already carry the right error code: rethrow untouched.
            throw e;
        } catch (Exception e) {
            throw new TDPException(UNABLE_TO_CREATE_OR_UPDATE_DATASET, e);
        }
    } finally {
        lock.unlock();
    }
}
Also used : VolumeMetered(org.talend.dataprep.metrics.VolumeMetered) RequestParam(org.springframework.web.bind.annotation.RequestParam) ImportBuilder(org.talend.dataprep.api.dataset.Import.ImportBuilder) FormatFamilyFactory(org.talend.dataprep.schema.FormatFamilyFactory) Autowired(org.springframework.beans.factory.annotation.Autowired) ApiParam(io.swagger.annotations.ApiParam) StringUtils(org.apache.commons.lang3.StringUtils) TEXT_PLAIN_VALUE(org.springframework.http.MediaType.TEXT_PLAIN_VALUE) SortAndOrderHelper.getDataSetMetadataComparator(org.talend.dataprep.util.SortAndOrderHelper.getDataSetMetadataComparator) Collections.singletonList(java.util.Collections.singletonList) SemanticDomain(org.talend.dataprep.api.dataset.statistics.SemanticDomain) BeanConversionService(org.talend.dataprep.conversions.BeanConversionService) PipedInputStream(java.io.PipedInputStream) DistributedLock(org.talend.dataprep.lock.DistributedLock) Arrays.asList(java.util.Arrays.asList) Map(java.util.Map) DataprepBundle.message(org.talend.dataprep.i18n.DataprepBundle.message) UserData(org.talend.dataprep.api.user.UserData) TaskExecutor(org.springframework.core.task.TaskExecutor) MAX_STORAGE_MAY_BE_EXCEEDED(org.talend.dataprep.exception.error.DataSetErrorCodes.MAX_STORAGE_MAY_BE_EXCEEDED) DataSet(org.talend.dataprep.api.dataset.DataSet) LocalStoreLocation(org.talend.dataprep.api.dataset.location.LocalStoreLocation) FormatFamily(org.talend.dataprep.schema.FormatFamily) Resource(javax.annotation.Resource) Set(java.util.Set) DatasetUpdatedEvent(org.talend.dataprep.dataset.event.DatasetUpdatedEvent) RestController(org.springframework.web.bind.annotation.RestController) QuotaService(org.talend.dataprep.dataset.store.QuotaService) Stream(java.util.stream.Stream) StreamSupport.stream(java.util.stream.StreamSupport.stream) FlagNames(org.talend.dataprep.api.dataset.row.FlagNames) UNEXPECTED_CONTENT(org.talend.dataprep.exception.error.CommonErrorCodes.UNEXPECTED_CONTENT) 
Analyzers(org.talend.dataquality.common.inference.Analyzers) DataSetLocatorService(org.talend.dataprep.api.dataset.location.locator.DataSetLocatorService) Callable(java.util.concurrent.Callable) Schema(org.talend.dataprep.schema.Schema) ArrayList(java.util.ArrayList) Value(org.springframework.beans.factory.annotation.Value) RequestBody(org.springframework.web.bind.annotation.RequestBody) DataSetLocationService(org.talend.dataprep.api.dataset.location.DataSetLocationService) AnalyzerService(org.talend.dataprep.quality.AnalyzerService) UserDataRepository(org.talend.dataprep.user.store.UserDataRepository) Markers(org.talend.dataprep.log.Markers) Api(io.swagger.annotations.Api) DraftValidator(org.talend.dataprep.schema.DraftValidator) HttpResponseContext(org.talend.dataprep.http.HttpResponseContext) Sort(org.talend.dataprep.util.SortAndOrderHelper.Sort) IOException(java.io.IOException) PipedOutputStream(java.io.PipedOutputStream) FormatAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.FormatAnalysis) ContentAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.ContentAnalysis) SchemaAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.SchemaAnalysis) HttpStatus(org.springframework.http.HttpStatus) FilterService(org.talend.dataprep.api.filter.FilterService) Marker(org.slf4j.Marker) NullOutputStream(org.apache.commons.io.output.NullOutputStream) StatisticsAdapter(org.talend.dataprep.dataset.StatisticsAdapter) Timed(org.talend.dataprep.metrics.Timed) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) PathVariable(org.springframework.web.bind.annotation.PathVariable) DataSetMetadataBuilder(org.talend.dataprep.dataset.DataSetMetadataBuilder) URLDecoder(java.net.URLDecoder) DataSetErrorCodes(org.talend.dataprep.exception.error.DataSetErrorCodes) PUT(org.springframework.web.bind.annotation.RequestMethod.PUT) LoggerFactory(org.slf4j.LoggerFactory) SEMANTIC(org.talend.dataprep.quality.AnalyzerService.Analysis.SEMANTIC) 
ApiOperation(io.swagger.annotations.ApiOperation) UNABLE_TO_CREATE_OR_UPDATE_DATASET(org.talend.dataprep.exception.error.DataSetErrorCodes.UNABLE_TO_CREATE_OR_UPDATE_DATASET) DataSetRow(org.talend.dataprep.api.dataset.row.DataSetRow) StrictlyBoundedInputStream(org.talend.dataprep.dataset.store.content.StrictlyBoundedInputStream) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) UNSUPPORTED_CONTENT(org.talend.dataprep.exception.error.DataSetErrorCodes.UNSUPPORTED_CONTENT) TimeToLive(org.talend.dataprep.cache.ContentCache.TimeToLive) Order(org.talend.dataprep.util.SortAndOrderHelper.Order) Collections.emptyList(java.util.Collections.emptyList) PublicAPI(org.talend.dataprep.security.PublicAPI) RequestMethod(org.springframework.web.bind.annotation.RequestMethod) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) ContentCache(org.talend.dataprep.cache.ContentCache) INVALID_DATASET_NAME(org.talend.dataprep.exception.error.DataSetErrorCodes.INVALID_DATASET_NAME) List(java.util.List) Optional(java.util.Optional) Analyzer(org.talend.dataquality.common.inference.Analyzer) RequestHeader(org.springframework.web.bind.annotation.RequestHeader) Pattern(java.util.regex.Pattern) Security(org.talend.dataprep.security.Security) Spliterator(java.util.Spliterator) RowMetadata(org.talend.dataprep.api.dataset.RowMetadata) ComponentProperties(org.talend.dataprep.parameters.jsonschema.ComponentProperties) TDPException(org.talend.dataprep.exception.TDPException) JsonErrorCodeDescription(org.talend.dataprep.exception.json.JsonErrorCodeDescription) RequestMapping(org.springframework.web.bind.annotation.RequestMapping) UNABLE_CREATE_DATASET(org.talend.dataprep.exception.error.DataSetErrorCodes.UNABLE_CREATE_DATASET) HashMap(java.util.HashMap) GET(org.springframework.web.bind.annotation.RequestMethod.GET) Import(org.talend.dataprep.api.dataset.Import) ExceptionContext.build(org.talend.daikon.exception.ExceptionContext.build) 
ExceptionContext(org.talend.daikon.exception.ExceptionContext) Charset(java.nio.charset.Charset) UpdateColumnParameters(org.talend.dataprep.dataset.service.api.UpdateColumnParameters) VersionService(org.talend.dataprep.api.service.info.VersionService) POST(org.springframework.web.bind.annotation.RequestMethod.POST) OutputStream(java.io.OutputStream) DataSetLocation(org.talend.dataprep.api.dataset.DataSetLocation) Logger(org.slf4j.Logger) LocaleContextHolder.getLocale(org.springframework.context.i18n.LocaleContextHolder.getLocale) UpdateDataSetCacheKey(org.talend.dataprep.dataset.service.cache.UpdateDataSetCacheKey) IOUtils(org.apache.commons.compress.utils.IOUtils) APPLICATION_JSON_VALUE(org.springframework.http.MediaType.APPLICATION_JSON_VALUE) ResponseBody(org.springframework.web.bind.annotation.ResponseBody) Certification(org.talend.dataprep.api.dataset.DataSetGovernance.Certification) EncodingSupport(org.talend.dataprep.configuration.EncodingSupport) Comparator(java.util.Comparator) InputStream(java.io.InputStream) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) FormatAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.FormatAnalysis) FormatFamily(org.talend.dataprep.schema.FormatFamily) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) IOException(java.io.IOException) TDPException(org.talend.dataprep.exception.TDPException) TDPException(org.talend.dataprep.exception.TDPException) DistributedLock(org.talend.dataprep.lock.DistributedLock) DatasetUpdatedEvent(org.talend.dataprep.dataset.event.DatasetUpdatedEvent) DraftValidator(org.talend.dataprep.schema.DraftValidator) RowMetadata(org.talend.dataprep.api.dataset.RowMetadata) Timed(org.talend.dataprep.metrics.Timed) ApiOperation(io.swagger.annotations.ApiOperation) RequestMapping(org.springframework.web.bind.annotation.RequestMapping)

Example 8 with DistributedLock

Use of org.talend.dataprep.lock.DistributedLock in project data-prep by Talend.

From the class DataSetService, method updateDatasetColumn.

/**
 * Updates the type and/or semantic domain of a single column of a data set, then triggers the
 * analyses needed to refresh the computed metadata.
 *
 * @param dataSetId the dataset id.
 * @param columnId the column id.
 * @param parameters the new type and domain.
 */
@RequestMapping(value = "/datasets/{datasetId}/column/{columnId}", method = POST)
@ApiOperation(value = "Update a column type and/or domain")
@Timed
public void updateDatasetColumn(@PathVariable(value = "datasetId") @ApiParam(name = "datasetId", value = "Id of the dataset") final String dataSetId, @PathVariable(value = "columnId") @ApiParam(name = "columnId", value = "Id of the column") final String columnId, @RequestBody final UpdateColumnParameters parameters) {
    final DistributedLock datasetLock = dataSetMetadataRepository.createDatasetMetadataLock(dataSetId);
    datasetLock.lock();
    try {
        // The data set must exist before any of its columns can be updated.
        final DataSetMetadata dataSetMetadata = dataSetMetadataRepository.get(dataSetId);
        if (dataSetMetadata == null) {
            throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, build().put("id", dataSetId));
        }
        LOG.debug("update dataset column for #{} with type {} and/or domain {}", dataSetId, parameters.getType(), parameters.getDomain());
        // Locate the target column inside the data set's row metadata.
        final ColumnMetadata column = dataSetMetadata.getRowMetadata().getById(columnId);
        if (column == null) {
            throw new TDPException(DataSetErrorCodes.COLUMN_DOES_NOT_EXIST,
                    build().put("id", dataSetId).put("columnid", columnId));
        }
        // Apply the requested type change, when one was supplied.
        if (parameters.getType() != null) {
            column.setType(parameters.getType());
        }
        // Apply the requested domain change, when one was supplied.
        final String requestedDomain = parameters.getDomain();
        if (requestedDomain != null) {
            if (requestedDomain.isEmpty()) {
                // An empty domain erases the current one, leaving only the type.
                column.setDomain("");
                column.setDomainLabel("");
                column.setDomainFrequency(0);
            } else {
                // Switch to the matching known semantic domain, when one exists for this column.
                column.getSemanticDomains()
                        .stream()
                        .filter(dom -> StringUtils.equals(dom.getId(), requestedDomain))
                        .findFirst()
                        .ifPresent(semanticDomain -> {
                            column.setDomain(semanticDomain.getId());
                            column.setDomainLabel(semanticDomain.getLabel());
                            column.setDomainFrequency(semanticDomain.getScore());
                        });
            }
        }
        // Persist the change, then re-run the subset of analyses impacted by it.
        dataSetMetadataRepository.save(dataSetMetadata);
        analyzeDataSet(dataSetId, false, asList(ContentAnalysis.class, FormatAnalysis.class, SchemaAnalysis.class));
    } finally {
        datasetLock.unlock();
    }
}
Also used : TDPException(org.talend.dataprep.exception.TDPException) DistributedLock(org.talend.dataprep.lock.DistributedLock) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) FormatAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.FormatAnalysis) SemanticDomain(org.talend.dataprep.api.dataset.statistics.SemanticDomain) SchemaAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.SchemaAnalysis) ContentAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.ContentAnalysis) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) Timed(org.talend.dataprep.metrics.Timed) ApiOperation(io.swagger.annotations.ApiOperation) RequestMapping(org.springframework.web.bind.annotation.RequestMapping)

Example 9 with DistributedLock

Use of org.talend.dataprep.lock.DistributedLock in project data-prep by Talend.

From the class BackgroundAnalysis, method analyze.

/**
 * Runs the base and advanced statistics analyses on the given data set, then flags it as
 * quality-analyzed.
 *
 * @param dataSetId id of the data set to analyze; must not be null or empty.
 * @throws IllegalArgumentException if the given id is null or empty.
 * @throws TDPException with {@code UNABLE_TO_ANALYZE_DATASET_QUALITY} when either analysis pass fails.
 * @see DataSetAnalyzer#analyze
 */
public void analyze(String dataSetId) {
    if (StringUtils.isEmpty(dataSetId)) {
        throw new IllegalArgumentException("Data set id cannot be null or empty.");
    }
    LOGGER.debug("Statistics analysis starts for {}", dataSetId);
    DataSetMetadata metadata = repository.get(dataSetId);
    if (metadata != null) {
        if (!metadata.getLifecycle().schemaAnalyzed()) {
            LOGGER.debug("Dataset {}, schema information must be computed before quality analysis can be performed, ignoring message", metadata.getId());
            // no acknowledge to allow re-poll.
            return;
        }
        final List<ColumnMetadata> columns = metadata.getRowMetadata().getColumns();
        if (columns.isEmpty()) {
            LOGGER.debug("Skip statistics of {} (no column information).", metadata.getId());
        } else {
            // base analysis
            try (final Stream<DataSetRow> stream = store.stream(metadata)) {
                try (Analyzer<Analyzers.Result> analyzer = analyzerService.schemaAnalysis(columns)) {
                    computeStatistics(analyzer, columns, stream);
                    LOGGER.debug("Base statistics analysis done for{}", dataSetId);
                    // Save base analysis
                    saveAnalyzerResults(dataSetId, analyzer);
                }
            } catch (Exception e) {
                LOGGER.warn("Base statistics analysis, dataset {} generates an error", dataSetId, e);
                throw new TDPException(UNABLE_TO_ANALYZE_DATASET_QUALITY, e);
            }
            // advanced analysis
            try (final Stream<DataSetRow> stream = store.stream(metadata)) {
                try (Analyzer<Analyzers.Result> analyzer = analyzerService.full(columns)) {
                    computeStatistics(analyzer, columns, stream);
                    updateNbRecords(metadata, analyzer.getResult());
                    LOGGER.debug("Advanced statistics analysis done for{}", dataSetId);
                    // Save advanced analysis
                    saveAnalyzerResults(dataSetId, analyzer);
                }
            } catch (Exception e) {
                LOGGER.warn("Advanced statistics analysis, dataset {} generates an error", dataSetId, e);
                throw new TDPException(UNABLE_TO_ANALYZE_DATASET_QUALITY, e);
            }
            // Tag data set quality: now analyzed
            DistributedLock datasetLock = repository.createDatasetMetadataLock(metadata.getId());
            // Acquire the lock BEFORE entering the try block, so that a failed lock() cannot
            // lead to unlock() being called on a lock that was never held.
            datasetLock.lock();
            try {
                // Re-read under the lock: the metadata may have changed since the analyses ran.
                final DataSetMetadata dataSetMetadata = repository.get(dataSetId);
                if (dataSetMetadata != null) {
                    dataSetMetadata.getLifecycle().qualityAnalyzed(true);
                    // BUGFIX: save the freshly fetched, flagged metadata — the previous code saved
                    // the stale 'metadata' instance, losing the qualityAnalyzed flag and any
                    // concurrent updates made since the initial read.
                    repository.save(dataSetMetadata);
                }
            } finally {
                datasetLock.unlock();
            }
            LOGGER.info("Statistics analysis done for {}", dataSetId);
        }
    } else {
        LOGGER.info("Unable to analyze quality of data set #{}: seems to be removed.", dataSetId);
    }
}
Also used : TDPException(org.talend.dataprep.exception.TDPException) DistributedLock(org.talend.dataprep.lock.DistributedLock) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) DataSetRow(org.talend.dataprep.api.dataset.row.DataSetRow) TDPException(org.talend.dataprep.exception.TDPException)

Example 10 with DistributedLock

Use of org.talend.dataprep.lock.DistributedLock in project data-prep by Talend.

From the class BackgroundAnalysis, method saveAnalyzerResults.

/**
 * Adapts the given analyzer results onto the data set's column metadata and persists it,
 * under the data set's distributed lock.
 *
 * @param id id of the data set to update; nothing happens if no metadata exists for it.
 * @param analyzer the analyzer whose results should be folded into the column metadata.
 */
private void saveAnalyzerResults(String id, Analyzer<Analyzers.Result> analyzer) {
    DistributedLock datasetLock = repository.createDatasetMetadataLock(id);
    // Acquire the lock BEFORE entering the try block: the previous code called lock() inside
    // try, so a failed lock() would have triggered unlock() on a lock that was never held.
    datasetLock.lock();
    try {
        final DataSetMetadata dataSetMetadata = repository.get(id);
        if (dataSetMetadata != null) {
            adapter.adapt(dataSetMetadata.getRowMetadata().getColumns(), analyzer.getResult());
            repository.save(dataSetMetadata);
        }
    } finally {
        datasetLock.unlock();
    }
}
Also used : DistributedLock(org.talend.dataprep.lock.DistributedLock) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata)

Aggregations

DistributedLock (org.talend.dataprep.lock.DistributedLock)14 DataSetMetadata (org.talend.dataprep.api.dataset.DataSetMetadata)13 TDPException (org.talend.dataprep.exception.TDPException)8 ApiOperation (io.swagger.annotations.ApiOperation)5 RequestMapping (org.springframework.web.bind.annotation.RequestMapping)5 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)5 Timed (org.talend.dataprep.metrics.Timed)5 InputStream (java.io.InputStream)4 DataSetRow (org.talend.dataprep.api.dataset.row.DataSetRow)4 IOException (java.io.IOException)3 PipedInputStream (java.io.PipedInputStream)3 Marker (org.slf4j.Marker)3 SemanticDomain (org.talend.dataprep.api.dataset.statistics.SemanticDomain)3 OutputStream (java.io.OutputStream)2 PipedOutputStream (java.io.PipedOutputStream)2 List (java.util.List)2 Stream (java.util.stream.Stream)2 Logger (org.slf4j.Logger)2 LoggerFactory (org.slf4j.LoggerFactory)2 Autowired (org.springframework.beans.factory.annotation.Autowired)2