Use of org.talend.dataprep.api.dataset.RowMetadata in project data-prep by Talend.
From the class PipelineDiffTransformer, method buildExecutable.
/**
* Starts the transformation in preview mode.
*
* @param input the dataset content.
* @param configuration The {@link Configuration configuration} for this transformation.
*/
@Override
public ExecutableTransformer buildExecutable(DataSet input, Configuration configuration) {
    Validate.notNull(input, "Input cannot be null.");
    final PreviewConfiguration previewConfiguration = (PreviewConfiguration) configuration;
    final RowMetadata rowMetadata = input.getMetadata().getRowMetadata();
    final TransformerWriter writer =
            writerRegistrationService.getWriter(configuration.formatId(), configuration.output(), configuration.getArguments());
    // Build diff pipeline
    final Node diffWriterNode = new DiffWriterNode(writer);
    final String referenceActions = previewConfiguration.getReferenceActions();
    final String previewActions = previewConfiguration.getPreviewActions();
    final Pipeline referencePipeline = buildPipeline(rowMetadata, referenceActions);
    final Pipeline previewPipeline = buildPipeline(rowMetadata, previewActions);
    // Filter source records (extract TDP ids information)
    final List<Long> indexes = previewConfiguration.getIndexes();
    final boolean isIndexLimited = indexes != null && !indexes.isEmpty();
    final Long minIndex = isIndexLimited ? indexes.stream().mapToLong(Long::longValue).min().getAsLong() : 0L;
    final Long maxIndex = isIndexLimited ? indexes.stream().mapToLong(Long::longValue).max().getAsLong() : Long.MAX_VALUE;
    final Predicate<DataSetRow> filter = isWithinWantedIndexes(minIndex, maxIndex);
    // Build diff pipeline
    final Node diffPipeline = NodeBuilder
            .filteredSource(filter)
            .dispatchTo(referencePipeline, previewPipeline)
            .zipTo(diffWriterNode)
            .build();
    // wrap this transformer into an ExecutableTransformer
    return new ExecutableTransformer() {

        @Override
        public void execute() {
            // Run diff
            try {
                // Print pipeline before execution (for debug purposes).
                diffPipeline.logStatus(LOGGER, "Before execution: {}");
                input.getRecords().forEach(r -> diffPipeline.exec().receive(r, rowMetadata));
                diffPipeline.exec().signal(Signal.END_OF_STREAM);
            } finally {
                // Print pipeline after execution (for debug purposes).
                diffPipeline.logStatus(LOGGER, "After execution: {}");
            }
        }

        @Override
        public void signal(Signal signal) {
            diffPipeline.exec().signal(signal);
        }
    };
}
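The RowMetadata-specific point in this method is that a single rowMetadata instance, read once from the input DataSet, travels with every record pushed into the pipeline. Below is a minimal sketch of that feeding pattern, using only the calls visible above plus the column()/RowMetadata/DataSetRow constructors that appear in the CSVWriterTest helper further down; the column names and values are made up for illustration.

// Sketch only: push one hand-built row through an already-built pipeline, then close the stream.
private static void feedOneRow(Node pipeline) {
    final List<ColumnMetadata> columns = new ArrayList<>();
    columns.add(column().id(1).name("id").type(Type.STRING).build());        // paired with key "0001" below (as in CSVWriterTest)
    columns.add(column().id(2).name("firstname").type(Type.STRING).build()); // paired with key "0002" below
    final RowMetadata rowMetadata = new RowMetadata(columns);

    final Map<String, String> values = new HashMap<>();
    values.put("0001", "1");
    values.put("0002", "Clark");
    final DataSetRow row = new DataSetRow(rowMetadata, values);

    // Each record is received together with the shared row metadata...
    pipeline.exec().receive(row, rowMetadata);
    // ...and the pipeline is then told that no more records will arrive.
    pipeline.exec().signal(Signal.END_OF_STREAM);
}

In buildExecutable the rows come from input.getRecords() instead, but the same shared metadata accompanies each of them, which is presumably what lets the reference and preview branches be zipped into the DiffWriterNode on rows with a matching column layout.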
Use of org.talend.dataprep.api.dataset.RowMetadata in project data-prep by Talend.
From the class AggregationService, method aggregate.
/**
* Process an aggregation.
*
* @param parameters the aggregation parameters.
* @param dataset the dataset input.
* @return the aggregation result.
*/
public AggregationResult aggregate(AggregationParameters parameters, DataSet dataset) {
    // check the parameters
    if (parameters.getOperations().isEmpty() || parameters.getGroupBy().isEmpty()) {
        throw new TDPException(CommonErrorCodes.BAD_AGGREGATION_PARAMETERS);
    }
    AggregationResult result = new AggregationResult(parameters.getOperations().get(0).getOperator());
    // get the aggregator
    Aggregator aggregator = factory.get(parameters);
    // Build optional filter
    final DataSetMetadata metadata = dataset.getMetadata();
    final RowMetadata rowMetadata = metadata != null ? metadata.getRowMetadata() : new RowMetadata();
    final Predicate<DataSetRow> filter = filterService.build(parameters.getFilter(), rowMetadata);
    // process the dataset
    dataset.getRecords().filter(filter).forEach(row -> aggregator.accept(row, result));
    // Normalize result (perform clean / optimization now that all input was processed).
    aggregator.normalize(result);
    return result;
}
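The RowMetadata here exists only so the filter can be compiled: when the dataset carries no metadata, an empty RowMetadata is used as a safe stand-in. As a rough sketch of the same filter-then-accumulate idea, here is a hand-written predicate in place of filterService.build(...); the column id "0002" and the DataSetRow.get(String) accessor are illustrative assumptions, not taken from AggregationService itself.

// Sketch only: a hand-rolled stand-in for filterService.build(...), counting rows whose
// column "0002" holds a non-empty value. The column id and accessor are assumptions.
private static long countNonEmptyRows(DataSet dataset) {
    final Predicate<DataSetRow> nonEmpty = row -> {
        final String value = row.get("0002"); // assumed accessor for the raw column value
        return value != null && !value.isEmpty();
    };
    return dataset.getRecords().filter(nonEmpty).count();
}

The real method does the same single traversal, but instead of counting it forwards each matching row to aggregator.accept(row, result) and normalizes the result once all input has been processed.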
Use of org.talend.dataprep.api.dataset.RowMetadata in project data-prep by Talend.
From the class DataSetServiceTest, method updateDatasetColumn_should_update_domain.
@Test
public void updateDatasetColumn_should_update_domain() throws Exception {
    // given
    final String dataSetId = given()
            .body(IOUtils.toString(this.getClass().getResourceAsStream(TAGADA_CSV), UTF_8))
            .queryParam(CONTENT_TYPE, "text/csv")
            .when()
            .post("/datasets")
            .asString();
    final ColumnMetadata column;
    // update the metadata in the repository (lock mechanism is needed otherwise semantic domain will be erased by analysis)
    final DistributedLock lock = dataSetMetadataRepository.createDatasetMetadataLock(dataSetId);
    DataSetMetadata dataSetMetadata;
    RowMetadata row;
    lock.lock();
    try {
        dataSetMetadata = dataSetMetadataRepository.get(dataSetId);
        assertNotNull(dataSetMetadata);
        row = dataSetMetadata.getRowMetadata();
        assertNotNull(row);
        column = row.getById("0002");
        final SemanticDomain jsoDomain = new SemanticDomain("JSO", "JSO label", 1.0F);
        column.getSemanticDomains().add(jsoDomain);
        dataSetMetadataRepository.save(dataSetMetadata);
    } finally {
        lock.unlock();
    }
    assertThat(column.getDomain(), is("FIRST_NAME"));
    assertThat(column.getDomainLabel(), is("First Name"));
    assertThat(column.getDomainFrequency(), is(100.0F));
    // when
    final Response res = given()
            .body("{\"domain\": \"JSO\"}")
            .when()
            .contentType(JSON)
            .post("/datasets/{dataSetId}/column/{columnId}", dataSetId, "0002");
    // then
    res.then().statusCode(200);
    dataSetMetadata = dataSetMetadataRepository.get(dataSetId);
    assertNotNull(dataSetMetadata);
    row = dataSetMetadata.getRowMetadata();
    assertNotNull(row);
    final ColumnMetadata actual = row.getById("0002");
    assertThat(actual.getDomain(), is("JSO"));
    assertThat(actual.getDomainLabel(), is("JSO label"));
    assertThat(actual.getDomainFrequency(), is(1.0F));
}
Use of org.talend.dataprep.api.dataset.RowMetadata in project data-prep by Talend.
From the class DataSetServiceTest, method updateDatasetColumn_should_update_type.
@Test
public void updateDatasetColumn_should_update_type() throws Exception {
    // given
    final String dataSetId = given()
            .body(IOUtils.toString(this.getClass().getResourceAsStream(TAGADA_CSV), UTF_8))
            .queryParam(CONTENT_TYPE, "text/csv")
            .when()
            .post("/datasets")
            .asString();
    DataSetMetadata dataSetMetadata = dataSetMetadataRepository.get(dataSetId);
    Assert.assertNotNull(dataSetMetadata);
    RowMetadata row = dataSetMetadata.getRowMetadata();
    assertNotNull(row);
    final ColumnMetadata column = row.getById("0002");
    assertThat(column.getDomain(), is("FIRST_NAME"));
    assertThat(column.getDomainLabel(), is("First Name"));
    assertThat(column.getDomainFrequency(), is(100.0F));
    assertThat(column.getType(), is("string"));
    // when
    final Response res = given()
            .body("{\"type\": \"integer\"}")
            .when()
            .contentType(JSON)
            .post("/datasets/{dataSetId}/column/{columnId}", dataSetId, "0002");
    // then
    res.then().statusCode(200);
    dataSetMetadata = dataSetMetadataRepository.get(dataSetId);
    Assert.assertNotNull(dataSetMetadata);
    row = dataSetMetadata.getRowMetadata();
    assertNotNull(row);
    final ColumnMetadata actual = row.getById("0002");
    assertThat(actual.getDomain(), is("FIRST_NAME"));
    assertThat(actual.getDomainLabel(), is("First Name"));
    assertThat(actual.getDomainFrequency(), is(100.0F));
    assertThat(actual.getType(), is("integer"));
}
Use of org.talend.dataprep.api.dataset.RowMetadata in project data-prep by Talend.
From the class CSVWriterTest, method buildSimpleRow.
private DataSetRow buildSimpleRow() {
    final List<ColumnMetadata> columns = new ArrayList<>();
    columns.add(column().id(1).name("id").type(Type.STRING).build());
    columns.add(column().id(2).name("firstname").type(Type.STRING).build());
    Map<String, String> values = new HashMap<>();
    values.put("0001", "64a5456ac148b64524ef165");
    values.put("0002", "Superman");
    return new DataSetRow(new RowMetadata(columns), values);
}