use of org.talend.dataprep.api.dataset.DataSet in project data-prep by Talend.
the class DataSetService method getDataSetColumnSemanticCategories.
/**
 * Return the semantic types for a given dataset / column.
 *
 * @param datasetId the dataset id.
 * @param columnId the column id.
 * @return the semantic types for a given dataset / column.
 */
@RequestMapping(value = "/datasets/{datasetId}/columns/{columnId}/types", method = GET)
@ApiOperation(value = "list the types of the wanted column", notes = "This list can be used by user to change the column type.")
@Timed
@PublicAPI
public List<SemanticDomain> getDataSetColumnSemanticCategories(@ApiParam(value = "The dataset id") @PathVariable String datasetId, @ApiParam(value = "The column id") @PathVariable String columnId) {
LOG.debug("listing semantic categories for dataset #{} column #{}", datasetId, columnId);
final DataSetMetadata metadata = dataSetMetadataRepository.get(datasetId);
if (metadata == null) {
throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, ExceptionContext.withBuilder().put("id", datasetId).build());
} else {
try (final Stream<DataSetRow> records = contentStore.stream(metadata)) {
final ColumnMetadata columnMetadata = metadata.getRowMetadata().getById(columnId);
final Analyzer<Analyzers.Result> analyzer = analyzerService.build(columnMetadata, SEMANTIC);
analyzer.init();
records.map(r -> r.get(columnId)).forEach(analyzer::analyze);
analyzer.end();
final List<Analyzers.Result> analyzerResult = analyzer.getResult();
final StatisticsAdapter statisticsAdapter = new StatisticsAdapter(40);
statisticsAdapter.adapt(singletonList(columnMetadata), analyzerResult);
LOG.debug("found {} for dataset #{}, column #{}", columnMetadata.getSemanticDomains(), datasetId, columnId);
return columnMetadata.getSemanticDomains();
}
}
}
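A minimal client-side sketch of calling this endpoint, written in the REST-assured style the project's own tests use further down; both ids are hypothetical placeholders, not values from the source:

// Sketch (assumptions: REST-assured pointed at a running data-prep instance;
// the ids below are hypothetical).
String datasetId = "myDatasetId"; // as returned by POST /datasets
String columnId = "0000";         // a column id from the dataset's row metadata
String semanticTypes = when() //
        .get("/datasets/{datasetId}/columns/{columnId}/types", datasetId, columnId) //
        .asString();
// semanticTypes holds a JSON array of SemanticDomain entries that a UI can
// offer when the user changes the column type.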
use of org.talend.dataprep.api.dataset.DataSet in project data-prep by Talend.
the class DataSetService method updateDataSet.
/**
 * Updates the metadata of a data set. If no data set exists for the given id, a {@link TDPException} is thrown.
 *
 * @param dataSetId The id of the data set to be updated.
 * @param dataSetMetadata The new content for the data set. If empty, existing content will <b>not</b> be replaced.
 * For a delete operation, see {@link #delete(String)}.
 */
@RequestMapping(value = "/datasets/{id}", method = PUT)
@ApiOperation(value = "Update a data set metadata by id", notes = "Update a data set metadata according to the content of the PUT body. Id should be a UUID returned by the list operation. Not valid or non existing data set id return an error response.")
@Timed
public void updateDataSet(@PathVariable(value = "id") @ApiParam(name = "id", value = "Id of the data set to update") String dataSetId, @RequestBody DataSetMetadata dataSetMetadata) {
    if (dataSetMetadata != null && dataSetMetadata.getName() != null) {
        checkDataSetName(dataSetMetadata.getName());
    }
    final DistributedLock lock = dataSetMetadataRepository.createDatasetMetadataLock(dataSetId);
    lock.lock();
    try {
        DataSetMetadata metadataForUpdate = dataSetMetadataRepository.get(dataSetId);
        if (metadataForUpdate == null) {
            // No need to silently create the data set metadata: associated content will most likely not exist.
            throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, build().put("id", dataSetId));
        }
        LOG.debug("updateDataSet: {}", dataSetMetadata);
        publisher.publishEvent(new DatasetUpdatedEvent(dataSetMetadata));
        //
        // Only part of the metadata can be updated, so the original dataset metadata is loaded and updated
        //
        DataSetMetadata original = metadataBuilder.metadata().copy(metadataForUpdate).build();
        try {
            // update the name
            metadataForUpdate.setName(dataSetMetadata.getName());
            // update the sheet content (in case of a multi-sheet excel file)
            if (metadataForUpdate.getSchemaParserResult() != null) {
                Optional<Schema.SheetContent> sheetContentFound = metadataForUpdate.getSchemaParserResult().getSheetContents().stream() //
                        .filter(sheetContent -> dataSetMetadata.getSheetName().equals(sheetContent.getName())) //
                        .findFirst();
                if (sheetContentFound.isPresent()) {
                    List<ColumnMetadata> columnMetadatas = sheetContentFound.get().getColumnMetadatas();
                    if (metadataForUpdate.getRowMetadata() == null) {
                        metadataForUpdate.setRowMetadata(new RowMetadata(emptyList()));
                    }
                    metadataForUpdate.getRowMetadata().setColumns(columnMetadatas);
                }
                metadataForUpdate.setSheetName(dataSetMetadata.getSheetName());
                metadataForUpdate.setSchemaParserResult(null);
            }
            // Location updates
            metadataForUpdate.setLocation(dataSetMetadata.getLocation());
            // update parameters & encoding (so that the user can change import parameters for CSV)
            metadataForUpdate.getContent().setParameters(dataSetMetadata.getContent().getParameters());
            metadataForUpdate.setEncoding(dataSetMetadata.getEncoding());
            // update limit
            final Optional<Long> newLimit = dataSetMetadata.getContent().getLimit();
            newLimit.ifPresent(limit -> metadataForUpdate.getContent().setLimit(limit));
            // Validate the new data set metadata and remove the draft status
            final String formatFamilyId = dataSetMetadata.getContent().getFormatFamilyId();
            if (formatFamilyFactory.hasFormatFamily(formatFamilyId)) {
                FormatFamily format = formatFamilyFactory.getFormatFamily(formatFamilyId);
                try {
                    DraftValidator draftValidator = format.getDraftValidator();
                    DraftValidator.Result result = draftValidator.validate(dataSetMetadata);
                    if (result.isDraft()) {
                        // This is not an exception case: a data set may remain a draft after update (although rather unusual)
                        LOG.warn("Data set #{} is still a draft after update.", dataSetId);
                        return;
                    }
                    // Data set metadata to update is no longer a draft
                    metadataForUpdate.setDraft(false);
                } catch (UnsupportedOperationException e) {
                    // no need to validate draft here
                }
            }
            // update schema
            formatAnalyzer.update(original, metadataForUpdate);
            // save the result
            dataSetMetadataRepository.save(metadataForUpdate);
            // All good, so send that to JMS:
            // ask for an in-depth schema analysis (for column type information).
            analyzeDataSet(dataSetId, true, singletonList(FormatAnalysis.class));
        } catch (TDPException e) {
            throw e;
        } catch (Exception e) {
            throw new TDPException(UNABLE_TO_CREATE_OR_UPDATE_DATASET, e);
        }
    } finally {
        lock.unlock();
    }
}
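For reference, a sketch of driving this endpoint the way the tests below do: fetch the metadata, change the part you care about, and PUT it back. REST-assured and a Jackson mapper are assumed, as in DataSetServiceTest; the id and the new name are hypothetical:

// Sketch (assumptions: REST-assured + Jackson ObjectMapper, hypothetical values).
DataSet dataSet = mapper.readerFor(DataSet.class) //
        .readValue(when().get("/datasets/{id}/metadata", dataSetId).asString());
DataSetMetadata metadata = dataSet.getMetadata();
metadata.setName("renamed dataset"); // hypothetical; only part of the metadata is honored
given().contentType(JSON) //
        .body(mapper.writer().writeValueAsString(metadata)) //
        .when().put("/datasets/{id}", dataSetId) //
        .then().statusCode(200);

Note the DistributedLock around the read-modify-write above: concurrent updates to the same dataset are serialized, so partial updates from two clients cannot interleave.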
use of org.talend.dataprep.api.dataset.DataSet in project data-prep by Talend.
the class DataSetService method getMetadata.
/**
 * Returns the data set {@link DataSetMetadata metadata} for the given <code>dataSetId</code>.
 *
 * @param dataSetId A data set id. If <code>null</code>, the operation returns
 * {@link org.apache.http.HttpStatus#SC_NO_CONTENT}; if no data set with the provided id exists, a {@link TDPException} is thrown.
 */
@RequestMapping(value = "/datasets/{id}/metadata", method = RequestMethod.GET)
@ApiOperation(value = "Get metadata information of a data set by id", notes = "Get metadata information of a data set by id. Not valid or non existing data set id returns empty content.")
@Timed
@ResponseBody
public DataSet getMetadata(@PathVariable(value = "id") @ApiParam(name = "id", value = "Id of the data set metadata") String dataSetId) {
    if (dataSetId == null) {
        HttpResponseContext.status(HttpStatus.NO_CONTENT);
        return null;
    }
    LOG.debug("get dataset metadata for {}", dataSetId);
    DataSetMetadata metadata = dataSetMetadataRepository.get(dataSetId);
    if (metadata == null) {
        throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, build().put("id", dataSetId));
    }
    if (!metadata.getLifecycle().schemaAnalyzed()) {
        HttpResponseContext.status(HttpStatus.ACCEPTED);
        return DataSet.empty();
    }
    DataSet dataSet = new DataSet();
    dataSet.setMetadata(conversionService.convert(metadata, UserDataSetMetadata.class));
    LOG.info("found dataset {} for #{}", dataSet.getMetadata().getName(), dataSetId);
    return dataSet;
}
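Because the method answers 202 ACCEPTED with an empty DataSet while schema analysis is still running, callers should be prepared to poll. A minimal sketch (REST-assured Response API assumed; the id is hypothetical):

// Sketch: distinguish "analysis pending" from actual metadata.
Response response = when().get("/datasets/{id}/metadata", dataSetId);
if (response.getStatusCode() == 202) {
    // Body is DataSet.empty(): schema analysis not finished, retry later.
} else {
    DataSet dataSet = mapper.readerFor(DataSet.class).readValue(response.asString());
    // dataSet.getMetadata() now carries the analyzed metadata.
}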
use of org.talend.dataprep.api.dataset.DataSet in project data-prep by Talend.
the class DataSetServiceTest method invalid_us_states.
@Test
public void invalid_us_states() throws Exception {
    String dataSetId = given().body(IOUtils.toString(this.getClass().getResourceAsStream("../invalid_us_states.csv"), UTF_8)).queryParam(CONTENT_TYPE, "text/csv").when().post("/datasets").asString();
    assertQueueMessages(dataSetId);
    InputStream content = when().get("/datasets/{id}/content?metadata=true", dataSetId).asInputStream();
    String contentAsString = IOUtils.toString(content, UTF_8);
    final DataSet dataset = mapper.readerFor(DataSet.class).readValue(contentAsString);
    assertThat(dataset, is(notNullValue()));
    assertThat(dataset.getMetadata().getRowMetadata().getColumns().isEmpty(), is(false));
    final ColumnMetadata column = dataset.getMetadata().getRowMetadata().getColumns().get(0);
    // the column is detected as a US state code
    assertThat(column.getDomain(), is("US_STATE_CODE"));
    // and it contains 2 invalid values
    assertThat(column.getQuality().getInvalid(), is(2));
}
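The final assertion reads the per-column quality counters that dataset analysis fills in. A small sketch of what the Quality bean exposes (getInvalid() is confirmed by the test above; the sibling getters are assumed from the same bean):

// Sketch: reading the quality counters computed during analysis.
Quality quality = column.getQuality();
int invalid = quality.getInvalid(); // 2 for this fixture, per the assertion above
int valid = quality.getValid();     // assumed getter: values matching the detected domain
int empty = quality.getEmpty();     // assumed getter: blank cells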
use of org.talend.dataprep.api.dataset.DataSet in project data-prep by Talend.
the class DataSetServiceTest method shouldUpdateSeparatorWithHeader.
/**
 * see https://jira.talendforge.org/browse/TDP-1066
 */
@Test
public void shouldUpdateSeparatorWithHeader() throws Exception {
    // given
    String dataSetId = createCSVDataSet(this.getClass().getResourceAsStream("../avengers.psv"), "tpd-1066");
    InputStream metadataInput = when().get("/datasets/{id}/metadata", dataSetId).asInputStream();
    DataSet dataSet = mapper.readerFor(DataSet.class).readValue(metadataInput);
    DataSetMetadata metadata = dataSet.getMetadata();
    // when
    final Map<String, String> parameters = metadata.getContent().getParameters();
    parameters.put(CSVFormatFamily.SEPARATOR_PARAMETER, "|");
    parameters.remove(CSVFormatFamily.HEADER_COLUMNS_PARAMETER);
    final int statusCode = given() //
            .contentType(JSON) //
            .body(mapper.writer().writeValueAsString(metadata)) //
            .expect().statusCode(200).log().ifError() //
            .when().put("/datasets/{id}", dataSetId) //
            .getStatusCode();
    assertThat(statusCode, is(200));
    assertQueueMessages(dataSetId);
    // then
    InputStream expected = this.getClass().getResourceAsStream("../avengers_expected.json");
    String datasetContent = given().when().get("/datasets/{id}/content?metadata=true", dataSetId).asString();
    assertThat(datasetContent, sameJSONAsFile(expected));
}
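The separator is not the only import setting this PUT round-trip can change: updateDataSet above also copies the content parameters and the encoding. A sketch of the same fetch/modify/PUT pattern for a charset change (the encoding value is a hypothetical example):

// Sketch: reuse the metadata fetched in the test above, change the encoding,
// and PUT it back the same way.
metadata.setEncoding("ISO-8859-1"); // hypothetical new charset
given().contentType(JSON) //
        .body(mapper.writer().writeValueAsString(metadata)) //
        .when().put("/datasets/{id}", dataSetId) //
        .then().statusCode(200);
assertQueueMessages(dataSetId); // wait for the re-analysis to finish, as above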