use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class SuggestDataSetActions method onExecute.
/**
 * Builds the HTTP request that submits the input dataset metadata to the transformation service.
 *
 * @return a POST request on <code>/suggest/dataset</code> carrying the JSON-serialized dataset metadata as body.
 * @throws TDPException if the dataset metadata cannot be serialized to JSON.
 */
private HttpRequestBase onExecute() {
    final HttpPost suggestRequest;
    try {
        // the command input is the dataset metadata to send
        final DataSetMetadata dataSetMetadata = getInput();
        // prepare the call to the suggestion endpoint, the body being sent as JSON
        suggestRequest = new HttpPost(transformationServiceUrl + "/suggest/dataset");
        suggestRequest.setHeader(new BasicHeader("Content-Type", MediaType.APPLICATION_JSON_VALUE));
        final byte[] payload = objectMapper.writer().writeValueAsBytes(dataSetMetadata);
        suggestRequest.setEntity(new ByteArrayEntity(payload));
    } catch (JsonProcessingException e) {
        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
    }
    return suggestRequest;
}
use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class CSVSchemaParser method parse.
/**
 * Builds the schema of a CSV content: a single sheet named {@code META_KEY} holding one column per header
 * entry, every column being typed as {@code Type.STRING}.
 * <p>
 * The guessed parsing parameters are also stored in the content of the metadata carried by the request.
 *
 * @param request container with information needed to parse the raw data.
 * @return a non-draft schema containing the single sheet and its columns.
 * @throws TDPException with {@code CommonErrorCodes.UNABLE_TO_READ_CONTENT} if anything fails while reading
 * the content, including when no header can be retrieved.
 */
@Override
public Schema parse(Request request) {
    // Keep a direct reference on the only sheet so columns can be appended to it without searching the list
    // again for each column.
    final Schema.SheetContent metaSheet = new Schema.SheetContent(META_KEY, new ArrayList<>());
    final List<Schema.SheetContent> sheetContents = new ArrayList<>();
    sheetContents.add(metaSheet);
    try {
        final DataSetMetadata metadata = request.getMetadata();
        final Map<String, String> parameters = guess(request, metadata.getEncoding());
        metadata.getContent().setParameters(parameters);
        final List<String> header = csvFormatUtils.retrieveHeader(parameters);
        if (header == null || header.isEmpty()) {
            // NOTE(review): this exception is caught by the catch block below and re-wrapped as
            // UNABLE_TO_READ_CONTENT -- confirm this is the intended error code for callers.
            throw new TDPException(DataSetErrorCodes.UNABLE_TO_READ_DATASET_CONTENT);
        }
        LOGGER.debug("Columns found: {}", header);
        // By default, consider all columns as Strings (to be refined by deeper analysis).
        LOGGER.debug("Setting default type for columns...");
        int i = 0;
        for (String column : header) {
            metaSheet.getColumnMetadatas().add(column().id(i++).name(column).type(Type.STRING).build());
        }
    } catch (Exception e) {
        throw new TDPException(CommonErrorCodes.UNABLE_TO_READ_CONTENT, e);
    }
    return Schema.Builder.parserResult() //
            .sheetContents(sheetContents) //
            .draft(false) //
            .build();
}
use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class ActionTestWorkbench method test.
/**
 * Runs the given actions on the input rows through a {@link Pipeline}, then writes the columns of the
 * resulting metadata back into the row metadata of the first input row and sets that row metadata on every
 * input row.
 *
 * @param input the rows to process; the row metadata of the first row is used as initial metadata.
 * @param analyzerService the analyzer service handed to the pipeline.
 * @param actionRegistry the action registry handed to the pipeline.
 * @param actions the actions to execute.
 */
public static void test(Collection<DataSetRow> input, AnalyzerService analyzerService, ActionRegistry actionRegistry, RunnableAction... actions) {
    final List<RunnableAction> actionList = new ArrayList<>();
    Collections.addAll(actionList, actions);

    // Data set wrapping the input rows, described by the row metadata of the first row.
    final RowMetadata initialRowMetadata = input.iterator().next().getRowMetadata();
    final DataSetMetadata metadata = new DataSetMetadata();
    metadata.setRowMetadata(initialRowMetadata);
    final DataSet inputDataSet = new DataSet();
    inputDataSet.setMetadata(metadata);
    inputDataSet.setRecords(input.stream());

    final TestOutputNode output = new TestOutputNode(input);
    final Pipeline actionPipeline = Pipeline.Builder.builder() //
            .withActionRegistry(actionRegistry) //
            .withInitialMetadata(initialRowMetadata, true) //
            .withActions(actionList) //
            .withAnalyzerService(analyzerService) //
            .withStatisticsAdapter(new StatisticsAdapter(40)) //
            .withOutput(() -> output) //
            .build();
    actionPipeline.execute(inputDataSet);

    // Some tests rely on the metadata changes in the provided metadata so set back modified columns in row metadata
    // (although this should be avoided in tests).
    // TODO Make this method return the modified metadata instead of setting modified columns.
    initialRowMetadata.setColumns(output.getMetadata().getColumns());
    input.forEach(row -> row.setRowMetadata(initialRowMetadata));
}
use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class DataSetContentStore method stream.
/**
 * Returns the content of the data set as a {@link Stream stream} of {@link DataSetRow rows}, where
 * {@link #get(DataSetMetadata)} returns it as JSON content.
 * <p>
 * Empty rows are skipped. Each remaining row is passed to a quality analyzer, then to an
 * {@link InvalidMarker}, and finally receives an incrementing TDP id starting at 1. Closing the returned
 * stream closes the underlying input stream.
 *
 * @param dataSetMetadata The {@link DataSetMetadata data set} to read rows from.
 * @param limit A limit to pass to raw content supplier (use -1 for "no limit"). Used as parameter to call
 * {@link #get(DataSetMetadata, long)}.
 * @return A valid <b>{@link DataSetRow}</b> stream.
 */
public Stream<DataSetRow> stream(DataSetMetadata dataSetMetadata, long limit) {
    final InputStream rawContent = get(dataSetMetadata, limit);
    final DataSetRowIterator rowIterator = new DataSetRowIterator(rawContent);
    final Iterable<DataSetRow> rows = () -> rowIterator;
    final Stream<DataSetRow> allRows = StreamSupport.stream(rows.spliterator(), false);

    final AtomicLong nextTdpId = new AtomicLong(1);
    final List<ColumnMetadata> columns = dataSetMetadata.getRowMetadata().getColumns();
    final Analyzer<Analyzers.Result> qualityAnalyzer = service.build(columns, AnalyzerService.Analysis.QUALITY);

    // Skip empty rows and feed the remaining ones to the quality analyzer.
    final Stream<DataSetRow> analyzedRows = allRows //
            .filter(row -> !row.isEmpty()) //
            .map(row -> {
                final String[] orderedValues = row.order(columns).toArray(DataSetRow.SKIP_TDP_ID);
                qualityAnalyzer.analyze(orderedValues);
                return row;
            });

    // Mark invalid columns as detected by provided analyzer.
    final Stream<DataSetRow> markedRows = analyzedRows.map(new InvalidMarker(columns, qualityAnalyzer));

    final Stream<DataSetRow> identifiedRows = markedRows.map(row -> {
        row.setTdpId(nextTdpId.getAndIncrement());
        return row;
    });

    // make sure to close the original input stream when closing this one
    return identifiedRows.onClose(() -> {
        try {
            rawContent.close();
        } catch (Exception e) {
            throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
        }
    });
}
use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class DataSetAPITest method testDataSetCreateWithSpace.
/**
 * Checks that a data set created with a name containing spaces can be read back from the metadata
 * repository with that same name.
 */
@Test
public void testDataSetCreateWithSpace() throws Exception {
    // given
    final String expectedName = "Test with spaces";
    final String createdId = testClient.createDataset("dataset/dataset.csv", expectedName);
    // when
    final DataSetMetadata storedMetadata = dataSetMetadataRepository.get(createdId);
    // then
    assertNotNull(storedMetadata);
    assertEquals(expectedName, storedMetadata.getName());
}
Aggregations