use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class CSVSchemaParserTest method should_parse_csv.
@Test
public void should_parse_csv() throws IOException {
    try (InputStream inputStream = this.getClass().getResourceAsStream("simple.csv")) {
        final String[] columns = { "first name", "last name" };
        DataSetMetadata datasetMetadata = ioTestUtils.getSimpleDataSetMetadata(columns);
        resetParameters(datasetMetadata, ";", Arrays.asList(columns), 1, false);
        Schema result = csvSchemaParser.parse(new SchemaParser.Request(inputStream, datasetMetadata));
        List<ColumnMetadata> actual = result.getSheetContents().get(0).getColumnMetadatas();
        Assert.assertEquals(datasetMetadata.getRowMetadata().getColumns(), actual);
    }
}
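The content of the simple.csv fixture is not shown on this page. Given the parameters passed to resetParameters (";" as separator, the two declared columns, a header on line 1), an equivalent in-memory stream can stand in for the classpath resource when experimenting outside the test harness. This is a minimal sketch: the data rows are invented, and it relies on java.io.ByteArrayInputStream and java.nio.charset.StandardCharsets.

// Assumed fixture shape, not the actual simple.csv: a semicolon-separated
// header line followed by invented data rows.
String assumedCsv = "first name;last name\n" //
        + "Jean;Dupont\n" //
        + "Ada;Lovelace\n";
InputStream inMemory = new ByteArrayInputStream(assumedCsv.getBytes(StandardCharsets.UTF_8));
// inMemory could replace the classpath stream in the try-with-resources above.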
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class DataSetService method updateDataSet.
/**
 * Updates a data set's metadata. If no data set exists for the given id, a {@link TDPException} is thrown.
 *
 * @param dataSetId The id of the data set to update.
 * @param dataSetMetadata The new content for the data set. If empty, the existing content will <b>not</b> be
 * replaced. For the delete operation, see {@link #delete(String)}.
 */
@RequestMapping(value = "/datasets/{id}", method = PUT)
@ApiOperation(value = "Update a data set metadata by id", notes = "Update a data set metadata according to the content of the PUT body. Id should be a UUID returned by the list operation. An invalid or non-existing data set id returns an error response.")
@Timed
public void updateDataSet(@PathVariable(value = "id") @ApiParam(name = "id", value = "Id of the data set to update") String dataSetId,
        @RequestBody DataSetMetadata dataSetMetadata) {
    if (dataSetMetadata != null && dataSetMetadata.getName() != null) {
        checkDataSetName(dataSetMetadata.getName());
    }
    final DistributedLock lock = dataSetMetadataRepository.createDatasetMetadataLock(dataSetId);
    lock.lock();
    try {
        DataSetMetadata metadataForUpdate = dataSetMetadataRepository.get(dataSetId);
        if (metadataForUpdate == null) {
            // No need to silently create the data set metadata: associated content will most likely not exist.
            throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, build().put("id", dataSetId));
        }
        LOG.debug("updateDataSet: {}", dataSetMetadata);
        publisher.publishEvent(new DatasetUpdatedEvent(dataSetMetadata));
        //
        // Only part of the metadata can be updated, so the original data set metadata is loaded and then updated.
        //
        DataSetMetadata original = metadataBuilder.metadata().copy(metadataForUpdate).build();
        try {
            // update the name
            metadataForUpdate.setName(dataSetMetadata.getName());
            // update the sheet content (in case of a multi-sheet Excel file)
            if (metadataForUpdate.getSchemaParserResult() != null) {
                Optional<Schema.SheetContent> sheetContentFound = metadataForUpdate.getSchemaParserResult().getSheetContents().stream() //
                        .filter(sheetContent -> dataSetMetadata.getSheetName().equals(sheetContent.getName())) //
                        .findFirst();
                if (sheetContentFound.isPresent()) {
                    List<ColumnMetadata> columnMetadatas = sheetContentFound.get().getColumnMetadatas();
                    if (metadataForUpdate.getRowMetadata() == null) {
                        metadataForUpdate.setRowMetadata(new RowMetadata(emptyList()));
                    }
                    metadataForUpdate.getRowMetadata().setColumns(columnMetadatas);
                }
                metadataForUpdate.setSheetName(dataSetMetadata.getSheetName());
                metadataForUpdate.setSchemaParserResult(null);
            }
            // location updates
            metadataForUpdate.setLocation(dataSetMetadata.getLocation());
            // update parameters & encoding (so that the user can change import parameters, e.g. for CSV)
            metadataForUpdate.getContent().setParameters(dataSetMetadata.getContent().getParameters());
            metadataForUpdate.setEncoding(dataSetMetadata.getEncoding());
            // update the limit
            final Optional<Long> newLimit = dataSetMetadata.getContent().getLimit();
            newLimit.ifPresent(limit -> metadataForUpdate.getContent().setLimit(limit));
            // validate the new data set metadata and remove the draft status
            final String formatFamilyId = dataSetMetadata.getContent().getFormatFamilyId();
            if (formatFamilyFactory.hasFormatFamily(formatFamilyId)) {
                FormatFamily format = formatFamilyFactory.getFormatFamily(formatFamilyId);
                try {
                    DraftValidator draftValidator = format.getDraftValidator();
                    DraftValidator.Result result = draftValidator.validate(dataSetMetadata);
                    if (result.isDraft()) {
                        // This is not an exception case: a data set may remain a draft after update (although rather unusual).
                        LOG.warn("Data set #{} is still a draft after update.", dataSetId);
                        return;
                    }
                    // the data set metadata to update is no longer a draft
                    metadataForUpdate.setDraft(false);
                } catch (UnsupportedOperationException e) {
                    // no need to validate the draft here
                }
            }
            // update the schema
            formatAnalyzer.update(original, metadataForUpdate);
            // save the result
            dataSetMetadataRepository.save(metadataForUpdate);
            // all good, so notify via JMS:
            // ask for an in-depth schema analysis (for column type information)
            analyzeDataSet(dataSetId, true, singletonList(FormatAnalysis.class));
        } catch (TDPException e) {
            throw e;
        } catch (Exception e) {
            throw new TDPException(UNABLE_TO_CREATE_OR_UPDATE_DATASET, e);
        }
    } finally {
        lock.unlock();
    }
}
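Because the method above is exposed as a plain HTTP PUT endpoint, it can be exercised without the Spring context. Below is a minimal sketch using the JDK 11 HttpClient; the host, port, data set id, and JSON body shape are assumptions for illustration, not values taken from the source.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Hypothetical standalone client; host, port, id, and body are assumptions.
public class UpdateDataSetExample {

    public static void main(String[] args) throws Exception {
        String dataSetId = "some-dataset-id"; // would normally come from the list operation
        String body = "{\"name\":\"my dataset renamed\"}"; // partial metadata: only supported fields are merged
        HttpRequest put = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost:8080/datasets/" + dataSetId))
                .header("Content-Type", "application/json")
                .PUT(HttpRequest.BodyPublishers.ofString(body))
                .build();
        HttpResponse<Void> response = HttpClient.newHttpClient()
                .send(put, HttpResponse.BodyHandlers.discarding());
        System.out.println("HTTP " + response.statusCode()); // 2xx expected; a server-side TDPException maps to an error response
    }
}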
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class HtmlSchemaParserTest method read_html_TDP_1136.
@Test
public void read_html_TDP_1136() throws Exception {
    try (InputStream inputStream = this.getClass().getResourceAsStream("sales-force.xls")) {
        // We already know the format, so we go directly to HTML schema guessing.
        SchemaParser.Request request = getRequest(inputStream, "#1");
        request.getMetadata().setEncoding("UTF-16");
        Schema result = parser.parse(request);
        Assertions.assertThat(result).isNotNull();
        Assertions.assertThat(result.getSheetContents()).isNotNull().isNotEmpty().hasSize(1);
        List<ColumnMetadata> columnMetadatas = result.getSheetContents().get(0).getColumnMetadatas();
        Assertions.assertThat(columnMetadatas).isNotNull().isNotEmpty().hasSize(7);
        //
        Assertions.assertThat(columnMetadatas.get(0)).isEqualToComparingOnlyGivenFields( //
                ColumnMetadata.Builder.column().type(Type.STRING).id(0).name("UID").build(), //
                "id", "name", "type");
        //
        Assertions.assertThat(columnMetadatas.get(1)).isEqualToComparingOnlyGivenFields( //
                ColumnMetadata.Builder.column().type(Type.STRING).id(1).name("Team Member: Name").build(), //
                "id", "name", "type");
        //
        Assertions.assertThat(columnMetadatas.get(2)).isEqualToComparingOnlyGivenFields( //
                ColumnMetadata.Builder.column().type(Type.STRING).id(2).name("Country").build(), //
                "id", "name", "type");
    }
}
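The test pins down only the first three of the seven detected columns, field by field. With the same AssertJ API, the leading header names can also be checked in one compact assertion; this is a hedged alternative using only the three names shown above, inventing no new column names.

// A hedged alternative to the field-by-field assertions above: extract the
// "name" property of each column and verify the leading values in one call.
Assertions.assertThat(columnMetadatas).extracting("name") //
        .startsWith("UID", "Team Member: Name", "Country");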
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class HtmlSerializerTest method html_serializer.
@Test
public void html_serializer() throws Exception {
    final SchemaParser.Request request;
    final Schema result;
    try (InputStream inputStream = this.getClass().getResourceAsStream("sales-force.xls")) {
        // We already know the format, so we go directly to HTML schema guessing.
        request = getRequest(inputStream, "#2");
        request.getMetadata().setEncoding("UTF-16");
        result = htmlSchemaGuesser.parse(request);
    }
    try (InputStream inputStream = this.getClass().getResourceAsStream("sales-force.xls")) {
        request.getMetadata().getRowMetadata().setColumns(result.getSheetContents().get(0).getColumnMetadatas());
        InputStream jsonStream = htmlSerializer.serialize(inputStream, request.getMetadata(), -1);
        String json = IOUtils.toString(jsonStream, UTF_8);
        logger.debug("json: {}", json);
        ObjectMapper mapper = new ObjectMapper();
        CollectionType collectionType = mapper.getTypeFactory().constructCollectionType(ArrayList.class, TreeMap.class);
        List<Map<String, String>> values = mapper.readValue(json, collectionType);
        logger.debug("values: {}", values);
        Map<String, String> row0 = values.get(0);
        //
        Assertions.assertThat(row0).contains( //
                MapEntry.entry("0000", "000001"), //
                MapEntry.entry("0001", "Jennifer BOS"), //
                MapEntry.entry("0002", "France"), //
                MapEntry.entry("0003", "jbos@talend.com"));
    }
}
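The -1 passed to serialize presumably acts as a "no row limit" marker; the source does not say so explicitly. Under that assumption, capping the serialized output is a one-argument change:

// Assumption: the third parameter of serialize() is a maximum row count and
// -1 means "serialize all rows" (inferred from the call in the test above).
InputStream firstTenRows = htmlSerializer.serialize(inputStream, request.getMetadata(), 10);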