Use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in the data-prep project by Talend.
In class CopyColumnTest, method test_TDP_567_with_force_false.
/**
 * TDP-567, "not forced" variant: runs the action under test on a row with a single column whose
 * domain and type are not forced, and checks that the second column of the resulting row carries
 * the same statistics, domain fields, forced flags and semantic domains as the original column.
 */
@Test
public void test_TDP_567_with_force_false() throws Exception {
    // given: one column with a domain, non-forced flags and a single semantic domain
    List<ColumnMetadata> input = new ArrayList<>();
    final ColumnMetadata original = createMetadata("0001", "column");
    original.setStatistics(new Statistics());
    SemanticDomain semanticDomain = new SemanticDomain("mountain_goat", "Mountain goat pale pale", 1);
    original.setDomain("beer");
    original.setDomainFrequency(1);
    original.setDomainLabel("the best beer");
    original.setDomainForced(false);
    original.setTypeForced(false);
    original.setSemanticDomains(Collections.singletonList(semanticDomain));
    input.add(original);

    RowMetadata rowMetadata = new RowMetadata(input);
    assertThat(rowMetadata.getColumns()).isNotNull().isNotEmpty().hasSize(1);

    // when: the action is applied to a row built on that metadata
    final DataSetRow row = new DataSetRow(rowMetadata);
    ActionTestWorkbench.test(row, actionRegistry, factory.create(action, parameters));

    // then: a second column exists and mirrors the original one
    List<ColumnMetadata> actual = row.getRowMetadata().getColumns();
    assertThat(actual).isNotNull().isNotEmpty().hasSize(2);
    // JUnit convention is (expected, actual): keeps the failure message the right way round
    assertEquals(original.getStatistics(), actual.get(1).getStatistics());
    assertThat(actual.get(1)).isEqualToComparingOnlyGivenFields(original, "domain", "domainLabel", "domainFrequency", "domainForced", "typeForced");
    assertThat(actual.get(1).getSemanticDomains()).isNotNull().isNotEmpty().contains(semanticDomain);
}
Use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in the data-prep project by Talend.
In class CopyColumnTest, method test_TDP_567_with_force_true.
/**
 * TDP-567, "forced" variant: runs the action under test on a row with a single column whose
 * domain and type are both forced, and checks that the second column of the resulting row carries
 * the same statistics, domain fields, forced flags and semantic domains as the original column.
 */
@Test
public void test_TDP_567_with_force_true() throws Exception {
    // given: one column with a domain, forced flags and a single semantic domain
    List<ColumnMetadata> input = new ArrayList<>();
    final ColumnMetadata original = createMetadata("0001", "column");
    original.setStatistics(new Statistics());
    SemanticDomain semanticDomain = new SemanticDomain("mountain_goat", "Mountain goat pale pale", 1);
    original.setDomain("beer");
    original.setDomainFrequency(1);
    original.setDomainLabel("the best beer");
    original.setDomainForced(true);
    original.setTypeForced(true);
    original.setSemanticDomains(Collections.singletonList(semanticDomain));
    input.add(original);

    RowMetadata rowMetadata = new RowMetadata(input);
    assertThat(rowMetadata.getColumns()).isNotNull().isNotEmpty().hasSize(1);

    // when: the action is applied to a row built on that metadata
    final DataSetRow row = new DataSetRow(rowMetadata);
    ActionTestWorkbench.test(row, actionRegistry, factory.create(action, parameters));

    // then: a second column exists and mirrors the original one
    List<ColumnMetadata> actual = row.getRowMetadata().getColumns();
    assertThat(actual).isNotNull().isNotEmpty().hasSize(2);
    // JUnit convention is (expected, actual): keeps the failure message the right way round
    assertEquals(original.getStatistics(), actual.get(1).getStatistics());
    assertThat(actual.get(1)).isEqualToComparingOnlyGivenFields(original, "domain", "domainLabel", "domainFrequency", "domainForced", "typeForced");
    assertThat(actual.get(1).getSemanticDomains()).isNotNull().isNotEmpty().contains(semanticDomain);
}
Use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in the data-prep project by Talend.
In class CopyColumnTest, method should_copy_semantic.
/**
 * Runs the action under test on a row with a single column that has a domain and one semantic
 * domain, and checks that the second column of the resulting row carries the same statistics,
 * domain fields and semantic domains as the original column.
 */
@Test
public void should_copy_semantic() throws Exception {
    // given: one column with a domain and a single semantic domain
    List<ColumnMetadata> input = new ArrayList<>();
    final ColumnMetadata original = createMetadata("0001", "column");
    original.setStatistics(new Statistics());
    SemanticDomain semanticDomain = new SemanticDomain("mountain_goat", "Mountain goat pale pale", 1);
    original.setDomain("beer");
    original.setDomainFrequency(1);
    original.setDomainLabel("the best beer");
    original.setSemanticDomains(Collections.singletonList(semanticDomain));
    input.add(original);

    RowMetadata rowMetadata = new RowMetadata(input);
    assertThat(rowMetadata.getColumns()).isNotNull().isNotEmpty().hasSize(1);

    // when: the action is applied to a row built on that metadata
    final DataSetRow row = new DataSetRow(rowMetadata);
    ActionTestWorkbench.test(row, actionRegistry, factory.create(action, parameters));

    // then: a second column exists and mirrors the original one
    List<ColumnMetadata> actual = row.getRowMetadata().getColumns();
    assertThat(actual).isNotNull().isNotEmpty().hasSize(2);
    // JUnit convention is (expected, actual): keeps the failure message the right way round
    assertEquals(original.getStatistics(), actual.get(1).getStatistics());
    assertThat(actual.get(1)).isEqualToComparingOnlyGivenFields(original, "domain", "domainLabel", "domainFrequency");
    assertThat(actual.get(1).getSemanticDomains()).isNotNull().isNotEmpty().contains(semanticDomain);
}
Use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in the data-prep project by Talend.
In class DataSetService, method getDataSetColumnSemanticCategories.
/**
 * Return the semantic types for a given dataset / column.
 * <p>
 * The column values are streamed from the content store and fed to a semantic analyzer; the
 * analysis result is then adapted onto the column metadata, whose semantic domains are returned.
 *
 * @param datasetId the dataset id.
 * @param columnId the column id.
 * @return the semantic types for a given dataset / column.
 * @throws TDPException if the dataset or the column does not exist.
 */
@RequestMapping(value = "/datasets/{datasetId}/columns/{columnId}/types", method = GET)
@ApiOperation(value = "list the types of the wanted column", notes = "This list can be used by user to change the column type.")
@Timed
@PublicAPI
public List<SemanticDomain> getDataSetColumnSemanticCategories(@ApiParam(value = "The dataset id") @PathVariable String datasetId, @ApiParam(value = "The column id") @PathVariable String columnId) {
    LOG.debug("listing semantic categories for dataset #{} column #{}", datasetId, columnId);

    final DataSetMetadata metadata = dataSetMetadataRepository.get(datasetId);
    if (metadata == null) {
        throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, ExceptionContext.withBuilder().put("id", datasetId).build());
    }

    // Resolve the column before opening the content stream: an unknown column id yields null and
    // must be reported as a missing column rather than failing later with a NullPointerException.
    final ColumnMetadata columnMetadata = metadata.getRowMetadata().getById(columnId);
    if (columnMetadata == null) {
        throw new TDPException(DataSetErrorCodes.COLUMN_DOES_NOT_EXIST, ExceptionContext.withBuilder().put("id", datasetId).put("columnid", columnId).build());
    }

    try (final Stream<DataSetRow> records = contentStore.stream(metadata)) {
        // run the semantic analysis over every value of the requested column
        final Analyzer<Analyzers.Result> analyzer = analyzerService.build(columnMetadata, SEMANTIC);
        analyzer.init();
        records.map(r -> r.get(columnId)).forEach(analyzer::analyze);
        analyzer.end();

        // push the analysis result onto the column metadata
        // NOTE(review): the meaning of the constant 40 is not visible from here - confirm against StatisticsAdapter
        final List<Analyzers.Result> analyzerResult = analyzer.getResult();
        final StatisticsAdapter statisticsAdapter = new StatisticsAdapter(40);
        statisticsAdapter.adapt(singletonList(columnMetadata), analyzerResult);

        LOG.debug("found {} for dataset #{}, column #{}", columnMetadata.getSemanticDomains(), datasetId, columnId);
        return columnMetadata.getSemanticDomains();
    }
}
Use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in the data-prep project by Talend.
In class DataSetService, method updateDatasetColumn.
/**
 * Update the type and/or semantic domain of a dataset column, save the dataset metadata and
 * trigger a partial re-analysis of the dataset. The whole operation runs under the dataset
 * metadata lock.
 *
 * @param dataSetId the dataset id.
 * @param columnId the column id.
 * @param parameters the new type and domain.
 */
@RequestMapping(value = "/datasets/{datasetId}/column/{columnId}", method = POST)
@ApiOperation(value = "Update a column type and/or domain")
@Timed
public void updateDatasetColumn(@PathVariable(value = "datasetId") @ApiParam(name = "datasetId", value = "Id of the dataset") final String dataSetId, @PathVariable(value = "columnId") @ApiParam(name = "columnId", value = "Id of the column") final String columnId, @RequestBody final UpdateColumnParameters parameters) {
    final DistributedLock metadataLock = dataSetMetadataRepository.createDatasetMetadataLock(dataSetId);
    metadataLock.lock();
    try {
        // the dataset must exist
        final DataSetMetadata metadata = dataSetMetadataRepository.get(dataSetId);
        if (metadata == null) {
            throw new TDPException(DataSetErrorCodes.DATASET_DOES_NOT_EXIST, build().put("id", dataSetId));
        }
        LOG.debug("update dataset column for #{} with type {} and/or domain {}", dataSetId, parameters.getType(), parameters.getDomain());

        // the column must exist as well
        final ColumnMetadata targetColumn = metadata.getRowMetadata().getById(columnId);
        if (targetColumn == null) {
            throw new TDPException(DataSetErrorCodes.COLUMN_DOES_NOT_EXIST, build().put("id", dataSetId).put("columnid", columnId));
        }

        // apply the requested type, if any
        if (parameters.getType() != null) {
            targetColumn.setType(parameters.getType());
        }

        // apply the requested domain, if any
        final String requestedDomain = parameters.getDomain();
        if (requestedDomain != null) {
            if (requestedDomain.isEmpty()) {
                // an empty domain wipes the domain information so that only the type remains
                targetColumn.setDomain("");
                targetColumn.setDomainLabel("");
                targetColumn.setDomainFrequency(0);
            } else {
                // switch to the first known semantic domain with that id; unknown ids leave the column untouched
                targetColumn.getSemanticDomains().stream()
                        .filter(candidate -> StringUtils.equals(candidate.getId(), requestedDomain))
                        .findFirst()
                        .ifPresent(match -> {
                            targetColumn.setDomain(match.getId());
                            targetColumn.setDomainLabel(match.getLabel());
                            targetColumn.setDomainFrequency(match.getScore());
                        });
            }
        }

        // persist the change
        dataSetMetadataRepository.save(metadata);

        // re-run only the listed analyses on the updated dataset
        analyzeDataSet(dataSetId, false, asList(ContentAnalysis.class, FormatAnalysis.class, SchemaAnalysis.class));
    } finally {
        metadataLock.unlock();
    }
}
Aggregations